Changeset 3521ba


Ignore:
Timestamp:
05/24/10 13:31:50 (4 years ago)
Author:
Darius Buntinas <buntinas@…>
Branches:
master
Children:
c6c76f
Parents:
89ead9
Message:

[svn-r6716] added suffix to checkpoint context file to keep multiple proxies from clobbering each other's files

Location:
src/pm/hydra
Files:
5 edited

Legend:

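Unmodified (line shows both the old and the new line number)
Added (line shows only the new line number)
Removed (line shows only the old line number)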
  • src/pm/hydra/pm/pmiserv/pmip_cb.c

    r8d94fbc r3521ba  
    511511 
    512512        /* Restart the proxy.  Specify stdin fd only if pmi_rank 0 is in this proxy. */ 
    513         status = HYDT_ckpoint_restart(env, HYD_pmcd_pmip.local.proxy_process_count, 
     513        status = HYDT_ckpoint_restart(HYD_pmcd_pmip.local.pgid, HYD_pmcd_pmip.local.id, 
     514                                      env, HYD_pmcd_pmip.local.proxy_process_count, 
    514515                                      pmi_ranks, 
    515516                                      pmi_ranks[0] ? NULL : 
     
    893894    else if (cmd == CKPOINT) { 
    894895        HYD_pmcd_pmi_proxy_dump(status, STDOUT_FILENO, "requesting checkpoint\n"); 
    895         status = HYDT_ckpoint_suspend(); 
     896        status = HYDT_ckpoint_suspend(HYD_pmcd_pmip.local.pgid, HYD_pmcd_pmip.local.id); 
    896897        HYDU_ERR_POP(status, "checkpoint suspend failed\n"); 
    897898        HYD_pmcd_pmi_proxy_dump(status, STDOUT_FILENO, "checkpoint completed\n"); 
  • src/pm/hydra/tools/ckpoint/blcr/ckpoint_blcr.c

    r35929a r3521ba  
    131131} 
    132132 
    133 HYD_status HYDT_ckpoint_blcr_suspend(const char *prefix) 
     133HYD_status HYDT_ckpoint_blcr_suspend(const char *prefix, int pgid, int id) 
    134134{ 
    135135    HYD_status status = HYD_SUCCESS; 
     
    143143 
    144144    /* build the checkpoint filename */ 
    145     snprintf(filename, sizeof(filename), "%s/context", prefix); 
     145    snprintf(filename, sizeof(filename), "%s/context-%d-%d", prefix, pgid, id); 
    146146 
    147147    /* remove existing checkpoint file, if any */ 
     
    202202} 
    203203 
    204 HYD_status HYDT_ckpoint_blcr_restart(const char *prefix, struct HYD_env *envlist, 
     204HYD_status HYDT_ckpoint_blcr_restart(const char *prefix, int pgid, int id, struct HYD_env *envlist, 
    205205                                     int num_ranks, int ranks[], int *in, int *out, int *err) 
    206206{ 
     
    225225        HYDU_ERR_POP(status, "blcr restart\n"); 
    226226 
    227     snprintf(filename, sizeof(filename), "%s/context", prefix); 
     227    snprintf(filename, sizeof(filename), "%s/context-%d-%d", prefix, pgid, id); 
    228228 
    229229    context_fd = open(filename, O_RDONLY /* | O_LARGEFILE */); 
  • src/pm/hydra/tools/ckpoint/blcr/ckpoint_blcr.h

    r46bc89 r3521ba  
    99 
    1010HYD_status HYDT_ckpoint_blcr_init(void); 
    11 HYD_status HYDT_ckpoint_blcr_suspend(const char *prefix); 
    12 HYD_status HYDT_ckpoint_blcr_restart(const char *prefix, struct HYD_env *envlist, 
     11HYD_status HYDT_ckpoint_blcr_suspend(const char *prefix, int pgid, int id); 
     12HYD_status HYDT_ckpoint_blcr_restart(const char *prefix, int pgid, int id, struct HYD_env *envlist, 
    1313                                     int num_ranks, int ranks[], int *in, int *out, int *err); 
    1414 
  • src/pm/hydra/tools/ckpoint/ckpoint.c

    readc1f r3521ba  
    5151} 
    5252 
    53 HYD_status HYDT_ckpoint_suspend(void) 
     53HYD_status HYDT_ckpoint_suspend(int pgid, int id) 
    5454{ 
    5555    HYD_status status = HYD_SUCCESS; 
     
    6262#if defined HAVE_BLCR 
    6363    if (!strcmp(HYDT_ckpoint_info.ckpointlib, "blcr")) { 
    64         status = HYDT_ckpoint_blcr_suspend(HYDT_ckpoint_info.ckpoint_prefix); 
     64        status = HYDT_ckpoint_blcr_suspend(HYDT_ckpoint_info.ckpoint_prefix, pgid, id); 
    6565        HYDU_ERR_POP(status, "blcr checkpoint returned error\n"); 
    6666    } 
     
    7575} 
    7676 
    77 HYD_status HYDT_ckpoint_restart(struct HYD_env *envlist, int num_ranks, int ranks[], int *in, 
    78                                 int *out, int *err) 
     77HYD_status HYDT_ckpoint_restart(int pgid, int id, struct HYD_env *envlist, int num_ranks, int ranks[], int *in, int *out, int *err) 
    7978{ 
    8079    HYD_status status = HYD_SUCCESS; 
     
    8887    if (!strcmp(HYDT_ckpoint_info.ckpointlib, "blcr")) { 
    8988        status = 
    90             HYDT_ckpoint_blcr_restart(HYDT_ckpoint_info.ckpoint_prefix, envlist, num_ranks, 
    91                                       ranks, in, out, err); 
     89            HYDT_ckpoint_blcr_restart(HYDT_ckpoint_info.ckpoint_prefix, pgid, id, envlist, num_ranks, ranks, in, out, err); 
    9290        HYDU_ERR_POP(status, "blcr checkpoint returned error\n"); 
    9391    } 
  • src/pm/hydra/tools/ckpoint/ckpoint.h

    r46bc89 r3521ba  
    5050 * \brief HYDT_ckpoint_suspend - Initiate suspend of child processes 
    5151 * 
     52 * \param[in] pgid  process group id 
     53 * \param[in] id    proxy id 
     54 * 
    5255 * This function is called by a proxy to suspend all of its child 
    5356 * processes. 
    5457 */ 
    55 HYD_status HYDT_ckpoint_suspend(void); 
     58HYD_status HYDT_ckpoint_suspend(int pgid, int id); 
    5659 
    5760 
     
    5962 * \brief HYDT_ckpoint_restart - Restart child processes 
    6063 * 
     64 * \param[in] pgid       process group id 
     65 * \param[in] id         proxy id 
    6166 * \param[in] envlist    Environment setup from before the checkpoint 
    6267 * \param[in] num_ranks  Number of child processes to restart 
     
    7176 * each process. 
    7277 */ 
    73 HYD_status HYDT_ckpoint_restart(struct HYD_env *envlist, int num_ranks, int ranks[], int *in, 
     78HYD_status HYDT_ckpoint_restart(int pgid, int id, struct HYD_env *envlist, int num_ranks, int ranks[], int *in, 
    7479                                int *out, int *err); 
    7580 
Note: See TracChangeset for help on using the changeset viewer.