Hi all, I'm not sure whether this is relevant to this specific commit, but it is relevant to some of the epoch changes. I was not able to compile the latest trunk version on our Cray system; the failure was in the ess/alps component, and to me it looks like a simple typo. I have not had a chance to check my fix on our system, because I have been fighting with the compilation of the Open MPI VT component on Cray. Please let me know if the patch is OK.
Please see the patch below: Index: orte/mca/ess/alps/ess_alps_module.c =================================================================== --- orte/mca/ess/alps/ess_alps_module.c (revision 25108) +++ orte/mca/ess/alps/ess_alps_module.c (working copy) @@ -363,8 +363,7 @@ ORTE_PROC_MY_NAME->jobid = jobid; ORTE_PROC_MY_NAME->vpid = (orte_vpid_t) cnos_get_rank() + starting_vpid; - ORTE_EPOCH_PRINT(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_INVALID); - ORTE_EPOCH_PRINT(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); + ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, "ess:alps set name to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); Pavel (Pasha) Shamis --- Application Performance Tools Group Computer Science and Math Division Oak Ridge National Laboratory On Aug 26, 2011, at 6:18 PM, Wesley Bland wrote: > The epoch and resilient orte code is now macro'd away. To enable use > > --enable-resilient-orte > > which defines: > > ORTE_ENABLE_EPOCH > ORTE_RESIL_ORTE > > -- > > Wesley > > On Aug 26, 2011, at 6:16 PM, wbl...@osl.iu.edu wrote: > >> Author: wbland >> Date: 2011-08-26 18:16:14 EDT (Fri, 26 Aug 2011) >> New Revision: 25093 >> URL: hxxps://svn.open-mpi.org/trac/ompi/changeset/25093 >> >> Log: >> By popular demand the epoch code is now disabled by default. 
>> >> To enable the epochs and the resilient orte code, use the configure flag: >> >> --enable-resilient-orte >> >> This will define both: >> >> ORTE_ENABLE_EPOCH >> ORTE_RESIL_ORTE >> >> Text files modified: >> trunk/ompi/mca/btl/openib/connect/btl_openib_connect_xoob.c | 12 ++++ >> trunk/ompi/mca/coll/sm2/coll_sm2_module.c | 3 >> trunk/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c | 49 >> ++++++++---------- >> trunk/ompi/mca/dpm/orte/dpm_orte.c | 2 >> trunk/ompi/mca/pml/bfo/pml_bfo_failover.c | 10 +-- >> trunk/ompi/mca/pml/bfo/pml_bfo_hdr.h | 6 -- >> trunk/ompi/proc/proc.c | 6 +- >> trunk/opal/config/opal_configure_options.m4 | 8 +++ >> trunk/orte/include/orte/types.h | 24 >> +++++++++ >> trunk/orte/mca/db/daemon/db_daemon.c | 2 >> trunk/orte/mca/errmgr/app/errmgr_app.c | 19 ++++++- >> trunk/orte/mca/errmgr/base/errmgr_base_fns.c | 12 ++-- >> trunk/orte/mca/errmgr/base/errmgr_base_tool.c | 6 +- >> trunk/orte/mca/errmgr/hnp/errmgr_hnp.c | 99 >> +++++++++++++++++++++++++++------------ >> trunk/orte/mca/errmgr/hnp/errmgr_hnp_autor.c | 6 +- >> trunk/orte/mca/errmgr/hnp/errmgr_hnp_crmig.c | 6 +- >> trunk/orte/mca/errmgr/orted/errmgr_orted.c | 71 >> +++++++++++++++++++++------- >> trunk/orte/mca/ess/alps/ess_alps_module.c | 4 >> trunk/orte/mca/ess/base/base.h | 4 + >> trunk/orte/mca/ess/base/ess_base_select.c | 14 ++--- >> trunk/orte/mca/ess/env/ess_env_module.c | 3 >> trunk/orte/mca/ess/ess.h | 4 + >> trunk/orte/mca/ess/generic/ess_generic_module.c | 6 +- >> trunk/orte/mca/ess/hnp/ess_hnp_module.c | 2 >> trunk/orte/mca/ess/lsf/ess_lsf_module.c | 3 >> trunk/orte/mca/ess/singleton/ess_singleton_module.c | 2 >> trunk/orte/mca/ess/slave/ess_slave_module.c | 3 >> trunk/orte/mca/ess/slurm/ess_slurm_module.c | 3 >> trunk/orte/mca/ess/slurmd/ess_slurmd_module.c | 4 >> trunk/orte/mca/ess/tm/ess_tm_module.c | 2 >> trunk/orte/mca/filem/rsh/filem_rsh_module.c | 6 +- >> trunk/orte/mca/grpcomm/base/grpcomm_base_coll.c | 21 ++----- >> trunk/orte/mca/grpcomm/hier/grpcomm_hier_module.c | 8 +- 
>> trunk/orte/mca/iof/base/base.h | 8 +- >> trunk/orte/mca/iof/base/iof_base_open.c | 2 >> trunk/orte/mca/iof/hnp/iof_hnp.c | 7 +- >> trunk/orte/mca/iof/hnp/iof_hnp_receive.c | 6 +- >> trunk/orte/mca/iof/orted/iof_orted.c | 2 >> trunk/orte/mca/odls/base/odls_base_default_fns.c | 7 +- >> trunk/orte/mca/odls/base/odls_base_open.c | 5 - >> trunk/orte/mca/odls/base/odls_base_state.c | 6 +- >> trunk/orte/mca/oob/tcp/oob_tcp_msg.c | 2 >> trunk/orte/mca/oob/tcp/oob_tcp_peer.c | 5 ++ >> trunk/orte/mca/plm/base/plm_base_jobid.c | 4 >> trunk/orte/mca/plm/base/plm_base_launch_support.c | 3 >> trunk/orte/mca/plm/base/plm_base_orted_cmds.c | 8 +-- >> trunk/orte/mca/plm/base/plm_base_receive.c | 7 ++ >> trunk/orte/mca/plm/base/plm_base_rsh_support.c | 4 + >> trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c | 23 >> +++++---- >> trunk/orte/mca/rmaps/rank_file/rmaps_rank_file.c | 3 >> trunk/orte/mca/rmaps/seq/rmaps_seq.c | 3 >> trunk/orte/mca/rmcast/base/rmcast_base_open.c | 6 +- >> trunk/orte/mca/rmcast/tcp/rmcast_tcp.c | 4 >> trunk/orte/mca/rmcast/udp/rmcast_udp.c | 4 >> trunk/orte/mca/rml/base/rml_base_components.c | 5 + >> trunk/orte/mca/rml/rml_types.h | 6 + >> trunk/orte/mca/routed/base/routed_base_components.c | 6 +- >> trunk/orte/mca/routed/base/routed_base_register_sync.c | 4 + >> trunk/orte/mca/routed/binomial/routed_binomial.c | 54 >> ++++++++++++--------- >> trunk/orte/mca/routed/cm/routed_cm.c | 19 +++---- >> trunk/orte/mca/routed/direct/routed_direct.c | 3 >> trunk/orte/mca/routed/linear/routed_linear.c | 17 +++--- >> trunk/orte/mca/routed/radix/routed_radix.c | 22 >> ++++---- >> trunk/orte/mca/routed/slave/routed_slave.c | 6 +- >> trunk/orte/mca/sensor/file/sensor_file.c | 2 >> trunk/orte/mca/snapc/base/snapc_base_fns.c | 4 >> trunk/orte/mca/snapc/full/snapc_full_global.c | 12 ++-- >> trunk/orte/mca/snapc/full/snapc_full_local.c | 6 +- >> trunk/orte/mca/snapc/full/snapc_full_module.c | 4 >> trunk/orte/mca/sstore/base/sstore_base_fns.c | 6 +- >> 
trunk/orte/mca/sstore/central/sstore_central_global.c | 3 >> trunk/orte/mca/sstore/central/sstore_central_local.c | 6 +- >> trunk/orte/mca/sstore/stage/sstore_stage_global.c | 7 +- >> trunk/orte/mca/sstore/stage/sstore_stage_local.c | 12 ++-- >> trunk/orte/orted/orted_comm.c | 20 >> ++++---- >> trunk/orte/orted/orted_main.c | 7 +- >> trunk/orte/runtime/data_type_support/orte_dt_compare_fns.c | 4 + >> trunk/orte/runtime/data_type_support/orte_dt_copy_fns.c | 4 + >> trunk/orte/runtime/data_type_support/orte_dt_packing_fns.c | 6 ++ >> trunk/orte/runtime/data_type_support/orte_dt_print_fns.c | 19 +++++++ >> trunk/orte/runtime/data_type_support/orte_dt_size_fns.c | 2 >> trunk/orte/runtime/data_type_support/orte_dt_support.h | 11 ++++ >> trunk/orte/runtime/data_type_support/orte_dt_unpacking_fns.c | 10 +++ >> trunk/orte/runtime/orte_data_server.c | 2 >> trunk/orte/runtime/orte_globals.c | 4 + >> trunk/orte/runtime/orte_init.c | 9 +++ >> trunk/orte/runtime/orte_wait.h | 6 +- >> trunk/orte/test/system/oob_stress.c | 3 >> trunk/orte/test/system/orte_ring.c | 6 - >> trunk/orte/test/system/orte_spawn.c | 4 >> trunk/orte/tools/orte-ps/orte-ps.c | 10 +++ >> trunk/orte/tools/orte-top/orte-top.c | 2 >> trunk/orte/util/comm/comm.c | 7 ++ >> trunk/orte/util/comm/comm.h | 5 + >> trunk/orte/util/hnp_contact.c | 3 >> trunk/orte/util/name_fns.c | 47 >> ++++++++++++++---- >> trunk/orte/util/name_fns.h | 30 >> ++++++++++++ >> trunk/orte/util/nidmap.c | 13 ++++ >> trunk/orte/util/nidmap.h | 11 ++++ >> trunk/orte/util/proc_info.c | 14 ++++- >> trunk/test/util/orte_session_dir.c | 2 >> 101 files changed, 652 insertions(+), 362 deletions(-) >> >> Modified: trunk/ompi/mca/btl/openib/connect/btl_openib_connect_xoob.c >> ============================================================================== >> --- trunk/ompi/mca/btl/openib/connect/btl_openib_connect_xoob.c >> (original) >> +++ trunk/ompi/mca/btl/openib/connect/btl_openib_connect_xoob.c >> 2011-08-26 18:16:14 EDT (Fri, 26 Aug 2011) >> @@ 
-693,8 +693,16 @@ >> bool found = false; >> >> BTL_VERBOSE(("Searching for ep and proc with follow parameters:" >> - "jobid %d, vpid %d, epoch %d, sid %" PRIx64 ", lid %d", >> - process_name->jobid, process_name->vpid, >> process_name->epoch, subnet_id, lid)); >> + "jobid %d, vpid %d, " >> +#if ORTE_ENABLE_EPOCH >> + "epoch %d, " >> +#endif >> + "sid %" PRIx64 ", lid %d", >> + process_name->jobid, process_name->vpid, >> +#if ORTE_ENABLE_EPOCH >> + process_name->epoch, >> +#endif >> + subnet_id, lid)); >> /* find ibproc */ >> OPAL_THREAD_LOCK(&mca_btl_openib_component.ib_lock); >> for (ib_proc = (mca_btl_openib_proc_t*) >> >> Modified: trunk/ompi/mca/coll/sm2/coll_sm2_module.c >> ============================================================================== >> --- trunk/ompi/mca/coll/sm2/coll_sm2_module.c (original) >> +++ trunk/ompi/mca/coll/sm2/coll_sm2_module.c 2011-08-26 18:16:14 EDT (Fri, >> 26 Aug 2011) >> @@ -1208,7 +1208,8 @@ >> peer = OBJ_NEW(orte_namelist_t); >> peer->name.jobid = >> comm->c_local_group->grp_proc_pointers[i]->proc_name.jobid; >> peer->name.vpid = >> comm->c_local_group->grp_proc_pointers[i]->proc_name.vpid; >> - peer->name.epoch = >> comm->c_local_group->grp_proc_pointers[i]->proc_name.epoch; >> + >> ORTE_EPOCH_SET(peer->name.epoch,comm->c_local_group->grp_proc_pointers[i]->proc_name.epoch); >> + >> opal_list_append(&peers, &peer->item); >> } >> /* prepare send data */ >> >> Modified: trunk/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c >> ============================================================================== >> --- trunk/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c (original) >> +++ trunk/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c 2011-08-26 18:16:14 >> EDT (Fri, 26 Aug 2011) >> @@ -702,7 +702,7 @@ >> void ompi_crcp_bkmrk_pml_peer_ref_construct(ompi_crcp_bkmrk_pml_peer_ref_t >> *peer_ref) { >> peer_ref->proc_name.jobid = ORTE_JOBID_INVALID; >> peer_ref->proc_name.vpid = ORTE_VPID_INVALID; >> - peer_ref->proc_name.epoch = ORTE_EPOCH_MIN; >> + 
ORTE_EPOCH_SET(peer_ref->proc_name.epoch,ORTE_EPOCH_MIN); >> >> OBJ_CONSTRUCT(&peer_ref->send_list, opal_list_t); >> OBJ_CONSTRUCT(&peer_ref->isend_list, opal_list_t); >> @@ -730,7 +730,7 @@ >> >> peer_ref->proc_name.jobid = ORTE_JOBID_INVALID; >> peer_ref->proc_name.vpid = ORTE_VPID_INVALID; >> - peer_ref->proc_name.epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(peer_ref->proc_name.epoch,ORTE_EPOCH_MIN); >> >> while( NULL != (item = opal_list_remove_first(&peer_ref->send_list)) ) { >> HOKE_TRAFFIC_MSG_REF_RETURN(item); >> @@ -840,7 +840,7 @@ >> >> msg_ref->proc_name.jobid = ORTE_JOBID_INVALID; >> msg_ref->proc_name.vpid = ORTE_VPID_INVALID; >> - msg_ref->proc_name.epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(msg_ref->proc_name.epoch,ORTE_EPOCH_MIN); >> >> msg_ref->matched = INVALID_INT; >> msg_ref->done = INVALID_INT; >> @@ -868,7 +868,7 @@ >> >> msg_ref->proc_name.jobid = ORTE_JOBID_INVALID; >> msg_ref->proc_name.vpid = ORTE_VPID_INVALID; >> - msg_ref->proc_name.epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(msg_ref->proc_name.epoch,ORTE_EPOCH_MIN); >> >> msg_ref->matched = INVALID_INT; >> msg_ref->done = INVALID_INT; >> @@ -902,7 +902,7 @@ >> >> msg_ref->proc_name.jobid = ORTE_JOBID_INVALID; >> msg_ref->proc_name.vpid = ORTE_VPID_INVALID; >> - msg_ref->proc_name.epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(msg_ref->proc_name.epoch,ORTE_EPOCH_MIN); >> >> msg_ref->done = INVALID_INT; >> msg_ref->active = INVALID_INT; >> @@ -934,7 +934,7 @@ >> >> msg_ref->proc_name.jobid = ORTE_JOBID_INVALID; >> msg_ref->proc_name.vpid = ORTE_VPID_INVALID; >> - msg_ref->proc_name.epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(msg_ref->proc_name.epoch,ORTE_EPOCH_MIN); >> >> msg_ref->done = INVALID_INT; >> msg_ref->active = INVALID_INT; >> @@ -954,7 +954,7 @@ >> >> msg_ack_ref->peer.jobid = ORTE_JOBID_INVALID; >> msg_ack_ref->peer.vpid = ORTE_VPID_INVALID; >> - msg_ack_ref->peer.epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(msg_ack_ref->peer.epoch,ORTE_EPOCH_MIN); >> } >> >> void 
ompi_crcp_bkmrk_pml_drain_message_ack_ref_destruct( >> ompi_crcp_bkmrk_pml_drain_message_ack_ref_t *msg_ack_ref) { >> @@ -962,7 +962,7 @@ >> >> msg_ack_ref->peer.jobid = ORTE_JOBID_INVALID; >> msg_ack_ref->peer.vpid = ORTE_VPID_INVALID; >> - msg_ack_ref->peer.epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(msg_ack_ref->peer.epoch,ORTE_EPOCH_MIN); >> } >> >> >> @@ -1015,7 +1015,7 @@ >> } >> >> >> -#define CREATE_NEW_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, v_rank, >> v_comm, p_jobid, p_vpid, p_epoch) \ >> +#define CREATE_NEW_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, v_rank, >> v_comm, p_jobid, p_vpid) \ >> { \ >> HOKE_TRAFFIC_MSG_REF_ALLOC(msg_ref, ret); \ >> \ >> @@ -1034,7 +1034,7 @@ >> \ >> msg_ref->proc_name.jobid = p_jobid; \ >> msg_ref->proc_name.vpid = p_vpid; \ >> - msg_ref->proc_name.epoch = p_epoch; \ >> + >> ORTE_EPOCH_SET(msg_ref->proc_name.epoch,orte_ess.proc_get_epoch(&(msg_ref->proc_name))); >> \ >> \ >> msg_ref->matched = 0; \ >> msg_ref->done = 0; \ >> @@ -1043,7 +1043,7 @@ >> msg_ref->active_drain = 0; \ >> } >> >> -#define CREATE_NEW_DRAIN_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, >> v_rank, v_comm, p_jobid, p_vpid, p_epoch) \ >> +#define CREATE_NEW_DRAIN_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, >> v_rank, v_comm, p_jobid, p_vpid) \ >> { \ >> HOKE_DRAIN_MSG_REF_ALLOC(msg_ref, ret); \ >> \ >> @@ -1063,7 +1063,7 @@ >> \ >> msg_ref->proc_name.jobid = p_jobid; \ >> msg_ref->proc_name.vpid = p_vpid; \ >> - msg_ref->proc_name.epoch = p_epoch; \ >> + >> ORTE_EPOCH_SET(msg_ref->proc_name.epoch,orte_ess.proc_get_epoch(&(msg_ref->proc_name))); >> \ >> } >> >> >> @@ -1466,7 +1466,7 @@ >> >> new_peer_ref->proc_name.jobid = procs[i]->proc_name.jobid; >> new_peer_ref->proc_name.vpid = procs[i]->proc_name.vpid; >> - new_peer_ref->proc_name.epoch = procs[i]->proc_name.epoch; >> + >> ORTE_EPOCH_SET(new_peer_ref->proc_name.epoch,procs[i]->proc_name.epoch); >> >> opal_list_append(&ompi_crcp_bkmrk_pml_peer_refs, >> &(new_peer_ref->super)); 
>> } >> @@ -3237,13 +3237,11 @@ >> CREATE_NEW_MSG((*msg_ref), msg_type, >> count, ddt_size, tag, dest, comm, >> peer_ref->proc_name.jobid, >> - peer_ref->proc_name.vpid, >> - peer_ref->proc_name.epoch); >> + peer_ref->proc_name.vpid); >> } else { >> CREATE_NEW_MSG((*msg_ref), msg_type, >> count, ddt_size, tag, dest, comm, >> - ORTE_JOBID_INVALID, ORTE_VPID_INVALID, >> - ORTE_EPOCH_INVALID); >> + ORTE_JOBID_INVALID, ORTE_VPID_INVALID); >> } >> >> if( msg_type == COORD_MSG_TYPE_P_SEND || >> @@ -3377,7 +3375,7 @@ >> if( NULL == from_peer_ref && NULL != to_peer_ref ) { >> (*new_msg_ref)->proc_name.jobid = to_peer_ref->proc_name.jobid; >> (*new_msg_ref)->proc_name.vpid = to_peer_ref->proc_name.vpid; >> - (*new_msg_ref)->proc_name.epoch = to_peer_ref->proc_name.epoch; >> + >> ORTE_EPOCH_SET((*new_msg_ref)->proc_name.epoch,to_peer_ref->proc_name.epoch); >> } >> >> return exit_status; >> @@ -3808,8 +3806,7 @@ >> CREATE_NEW_DRAIN_MSG((*msg_ref), msg_type, >> count, NULL, tag, dest, comm, >> peer_ref->proc_name.jobid, >> - peer_ref->proc_name.vpid, >> - peer_ref->proc_name.epoch); >> + peer_ref->proc_name.vpid); >> >> (*msg_ref)->done = 0; >> (*msg_ref)->active = 0; >> @@ -5284,8 +5281,7 @@ >> */ >> peer_name.jobid = ORTE_PROC_MY_NAME->jobid; >> peer_name.vpid = peer_idx; >> - peer_name.epoch = ORTE_EPOCH_INVALID; >> - peer_name.epoch = orte_ess.proc_get_epoch(&peer_name); >> + ORTE_EPOCH_SET(peer_name.epoch,orte_ess.proc_get_epoch(&peer_name)); >> >> if( NULL == (peer_ref = find_peer(peer_name))) { >> opal_output(mca_crcp_bkmrk_component . super . 
output_handle, >> @@ -5346,8 +5342,7 @@ >> >> peer_name.jobid = ORTE_PROC_MY_NAME->jobid; >> peer_name.vpid = peer_idx; >> - peer_name.epoch = ORTE_EPOCH_INVALID; >> - peer_name.epoch = orte_ess.proc_get_epoch(&peer_name); >> + ORTE_EPOCH_SET(peer_name.epoch,orte_ess.proc_get_epoch(&peer_name)); >> >> if ( 0 > (ret = orte_rml.recv_buffer_nb(&peer_name, >> OMPI_CRCP_COORD_BOOKMARK_TAG, >> @@ -5529,7 +5524,8 @@ >> HOKE_DRAIN_ACK_MSG_REF_ALLOC(d_msg_ack, ret); >> d_msg_ack->peer.jobid = peer_ref->proc_name.jobid; >> d_msg_ack->peer.vpid = peer_ref->proc_name.vpid; >> - d_msg_ack->peer.epoch = peer_ref->proc_name.epoch; >> + ORTE_EPOCH_SET(d_msg_ack->peer.epoch,peer_ref->proc_name.epoch); >> + >> d_msg_ack->complete = false; >> opal_list_append(&drained_msg_ack_list, &(d_msg_ack->super)); >> OPAL_OUTPUT_VERBOSE((10, mca_crcp_bkmrk_component . super . output_handle, >> @@ -6169,8 +6165,7 @@ >> count, datatype_size, tag, rank, >> ompi_comm_lookup(comm_id), >> peer_ref->proc_name.jobid, >> - peer_ref->proc_name.vpid, >> - peer_ref->proc_name.epoch); >> + peer_ref->proc_name.vpid); >> >> traffic_message_create_drain_message(true, num_left_unresolved, >> peer_ref, >> >> Modified: trunk/ompi/mca/dpm/orte/dpm_orte.c >> ============================================================================== >> --- trunk/ompi/mca/dpm/orte/dpm_orte.c (original) >> +++ trunk/ompi/mca/dpm/orte/dpm_orte.c 2011-08-26 18:16:14 EDT (Fri, >> 26 Aug 2011) >> @@ -1130,7 +1130,7 @@ >> /* flag the identity of the remote proc */ >> carport.jobid = mev->sender.jobid; >> carport.vpid = mev->sender.vpid; >> - carport.epoch = mev->sender.epoch; >> + ORTE_EPOCH_SET(carport.epoch,mev->sender.epoch); >> >> /* release the event */ >> OBJ_RELEASE(mev); >> >> Modified: trunk/ompi/mca/pml/bfo/pml_bfo_failover.c >> ============================================================================== >> --- trunk/ompi/mca/pml/bfo/pml_bfo_failover.c (original) >> +++ trunk/ompi/mca/pml/bfo/pml_bfo_failover.c 2011-08-26 
18:16:14 EDT (Fri, >> 26 Aug 2011) >> @@ -1,8 +1,5 @@ >> /* >> * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. >> - * Copyright (c) 2004-2011 The University of Tennessee and The University >> - * of Tennessee Research Foundation. All rights >> - * reserved. >> * $COPYRIGHT$ >> * >> * Additional copyrights may follow >> @@ -398,13 +395,13 @@ >> (hdr->hdr_match.hdr_seq != (uint16_t)recvreq->req_msgseq)) { >> orte_proc.jobid = hdr->hdr_restart.hdr_jobid; >> orte_proc.vpid = hdr->hdr_restart.hdr_vpid; >> - orte_proc.epoch = hdr->hdr_restart.hdr_epoch; >> + >> ompi_proc = ompi_proc_find(&orte_proc); >> opal_output_verbose(20, mca_pml_bfo_output, >> "RNDVRESTARTNOTIFY: received: does not match >> request, sending NACK back " >> "PML:req=%d,hdr=%d CTX:req=%d,hdr=%d >> SRC:req=%d,hdr=%d " >> "RQS:req=%d,hdr=%d src_req=%p, dst_req=%p, >> peer=%d, hdr->hdr_jobid=%d, " >> - "hdr->hdr_vpid=%d, hdr->hdr_epoch=%d, >> ompi_proc->proc_hostname=%s", >> + "hdr->hdr_vpid=%d, ompi_proc->proc_hostname=%s", >> (uint16_t)recvreq->req_msgseq, >> hdr->hdr_match.hdr_seq, >> recvreq->req_recv . req_base . >> req_comm->c_contextid, hdr->hdr_match.hdr_ctx, >> recvreq->req_recv . req_base . req_ompi . >> req_status . MPI_SOURCE, >> @@ -413,7 +410,7 @@ >> recvreq->remote_req_send.pval, (void *)recvreq, >> recvreq->req_recv . req_base . req_ompi . >> req_status . MPI_SOURCE, >> hdr->hdr_restart.hdr_jobid, >> hdr->hdr_restart.hdr_vpid, >> - hdr->hdr_restart.hdr_epoch, >> ompi_proc->proc_hostname); >> + ompi_proc->proc_hostname); >> mca_pml_bfo_recv_request_rndvrestartnack(des, ompi_proc, false); >> return; >> } >> @@ -715,7 +712,6 @@ >> restart->hdr_dst_rank = sendreq->req_send . req_base . 
req_peer; /* >> Needed for NACKs */ >> restart->hdr_jobid = ORTE_PROC_MY_NAME->jobid; >> restart->hdr_vpid = ORTE_PROC_MY_NAME->vpid; >> - restart->hdr_epoch = ORTE_PROC_MY_NAME->epoch; >> >> bfo_hdr_hton(restart, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY, proc); >> >> >> Modified: trunk/ompi/mca/pml/bfo/pml_bfo_hdr.h >> ============================================================================== >> --- trunk/ompi/mca/pml/bfo/pml_bfo_hdr.h (original) >> +++ trunk/ompi/mca/pml/bfo/pml_bfo_hdr.h 2011-08-26 18:16:14 EDT (Fri, >> 26 Aug 2011) >> @@ -2,9 +2,6 @@ >> * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana >> * University Research and Technology >> * Corporation. All rights reserved. >> - * Copyright (c) 2004-2011 The University of Tennessee and The University >> - * of Tennessee Research Foundation. All rights >> - * reserved. >> * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, >> * University of Stuttgart. All rights reserved. >> * Copyright (c) 2004-2005 The Regents of the University of California. 
>> @@ -415,7 +412,6 @@ >> int32_t hdr_dst_rank; /**< needed to send NACK */ >> uint32_t hdr_jobid; /**< needed to send NACK */ >> uint32_t hdr_vpid; /**< needed to send NACK */ >> - uint32_t hdr_epoch; /**< needed to send NACK */ >> }; >> typedef struct mca_pml_bfo_restart_hdr_t mca_pml_bfo_restart_hdr_t; >> >> @@ -428,7 +424,6 @@ >> (h).hdr_dst_rank = ntohl((h).hdr_dst_rank); \ >> (h).hdr_jobid = ntohl((h).hdr_jobid); \ >> (h).hdr_vpid = ntohl((h).hdr_vpid); \ >> - (h).hdr_epoch = ntohl((h).hdr_epoch); \ >> } while (0) >> >> #define MCA_PML_BFO_RESTART_HDR_HTON(h) \ >> @@ -437,7 +432,6 @@ >> (h).hdr_dst_rank = htonl((h).hdr_dst_rank); \ >> (h).hdr_jobid = htonl((h).hdr_jobid); \ >> (h).hdr_vpid = htonl((h).hdr_vpid); \ >> - (h).hdr_epoch = htonl((h).hdr_epoch); \ >> } while (0) >> >> #endif /* PML_BFO */ >> >> Modified: trunk/ompi/proc/proc.c >> ============================================================================== >> --- trunk/ompi/proc/proc.c (original) >> +++ trunk/ompi/proc/proc.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug 2011) >> @@ -108,7 +108,8 @@ >> >> proc->proc_name.jobid = ORTE_PROC_MY_NAME->jobid; >> proc->proc_name.vpid = i; >> - proc->proc_name.epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(proc->proc_name.epoch,ORTE_EPOCH_MIN); >> + >> if (i == ORTE_PROC_MY_NAME->vpid) { >> ompi_proc_local_proc = proc; >> proc->proc_flags = OPAL_PROC_ALL_LOCAL; >> @@ -362,8 +363,7 @@ >> >> /* Does not change: proc->proc_name.vpid */ >> proc->proc_name.jobid = ORTE_PROC_MY_NAME->jobid; >> - proc->proc_name.epoch = ORTE_EPOCH_INVALID; >> - proc->proc_name.epoch = orte_ess.proc_get_epoch(&proc->proc_name); >> + >> ORTE_EPOCH_SET(proc->proc_name.epoch,orte_ess.proc_get_epoch(&proc->proc_name)); >> >> /* Make sure to clear the local flag before we set it below */ >> proc->proc_flags = 0; >> >> Modified: trunk/opal/config/opal_configure_options.m4 >> ============================================================================== >> --- 
trunk/opal/config/opal_configure_options.m4 (original) >> +++ trunk/opal/config/opal_configure_options.m4 2011-08-26 18:16:14 >> EDT (Fri, 26 Aug 2011) >> @@ -416,6 +416,14 @@ >> AM_CONDITIONAL(WANT_FT_CR, test "$opal_want_ft_cr" = "1") >> >> # >> +# Compile in resilient runtime code >> +# >> +AC_ARG_ENABLE(resilient-orte, >> + [AC_HELP_STRING([--enable-resilient-orte], [Enable the resilient >> runtime code.])]) >> +AM_CONDITIONAL(ORTE_RESIL_ORTE, [test "$enable_resilient_orte" = "yes"]) >> +AM_CONDITIONAL(ORTE_ENABLE_EPOCH, [test "$enable_resilient_orte" = "yes"]) >> + >> +# >> # Do we want to install binaries? >> # >> AC_ARG_ENABLE([binaries], >> >> Modified: trunk/orte/include/orte/types.h >> ============================================================================== >> --- trunk/orte/include/orte/types.h (original) >> +++ trunk/orte/include/orte/types.h 2011-08-26 18:16:14 EDT (Fri, 26 Aug >> 2011) >> @@ -81,24 +81,43 @@ >> #define ORTE_VPID_T OPAL_UINT32 >> #define ORTE_VPID_MAX UINT32_MAX-2 >> #define ORTE_VPID_MIN 0 >> + >> +#if ORTE_ENABLE_EPOCH >> typedef uint32_t orte_epoch_t; >> #define ORTE_EPOCH_T OPAL_UINT32 >> #define ORTE_EPOCH_MAX UINT32_MAX-2 >> #define ORTE_EPOCH_MIN 0 >> +#endif >> >> +#if ORTE_ENABLE_EPOCH >> #define ORTE_PROCESS_NAME_HTON(n) \ >> do { \ >> n.jobid = htonl(n.jobid); \ >> n.vpid = htonl(n.vpid); \ >> n.epoch = htonl(n.epoch); \ >> } while (0) >> +#else >> +#define ORTE_PROCESS_NAME_HTON(n) \ >> +do { \ >> + n.jobid = htonl(n.jobid); \ >> + n.vpid = htonl(n.vpid); \ >> +} while (0) >> +#endif >> >> +#if ORTE_ENABLE_EPOCH >> #define ORTE_PROCESS_NAME_NTOH(n) \ >> do { \ >> n.jobid = ntohl(n.jobid); \ >> n.vpid = ntohl(n.vpid); \ >> n.epoch = ntohl(n.epoch); \ >> } while (0) >> +#else >> +#define ORTE_PROCESS_NAME_NTOH(n) \ >> +do { \ >> + n.jobid = ntohl(n.jobid); \ >> + n.vpid = ntohl(n.vpid); \ >> +} while (0) >> +#endif >> >> #define ORTE_NAME_ARGS(n) \ >> (unsigned long) ((NULL == n) ? 
(unsigned long)ORTE_JOBID_INVALID : >> (unsigned long)(n)->jobid), \ >> @@ -127,6 +146,7 @@ >> struct orte_process_name_t { >> orte_jobid_t jobid; /**< Job number */ >> orte_vpid_t vpid; /**< Process id - equivalent to rank */ >> +#if ORTE_ENABLE_EPOCH >> orte_epoch_t epoch; /**< Epoch - used to measure the generation of a >> recovered process. >> * The epoch will start at ORTE_EPOCH_MIN and >> * increment every time the process is detected >> as >> @@ -135,6 +155,7 @@ >> * processes that did not directly detect the >> * failure to increment their epochs. >> */ >> +#endif >> }; >> typedef struct orte_process_name_t orte_process_name_t; >> >> @@ -157,7 +178,10 @@ >> #define ORTE_NAME (OPAL_DSS_ID_DYNAMIC + 2) /**< an >> orte_process_name_t */ >> #define ORTE_VPID (OPAL_DSS_ID_DYNAMIC + 3) /**< a vpid */ >> #define ORTE_JOBID (OPAL_DSS_ID_DYNAMIC + 4) /**< a jobid >> */ >> + >> +#if ORTE_ENABLE_EPOCH >> #define ORTE_EPOCH (OPAL_DSS_ID_DYNAMIC + 5) /**< an epoch >> */ >> +#endif >> >> #if !ORTE_DISABLE_FULL_SUPPORT >> /* State-related types */ >> >> Modified: trunk/orte/mca/db/daemon/db_daemon.c >> ============================================================================== >> --- trunk/orte/mca/db/daemon/db_daemon.c (original) >> +++ trunk/orte/mca/db/daemon/db_daemon.c 2011-08-26 18:16:14 EDT (Fri, >> 26 Aug 2011) >> @@ -386,7 +386,7 @@ >> dat = OBJ_NEW(orte_db_data_t); >> dat->name.jobid = sender->jobid; >> dat->name.vpid = sender->vpid; >> - dat->name.epoch= sender->epoch; >> + ORTE_EPOCH_SET(dat->name.epoch,sender->epoch); >> dat->key = key; >> count=1; >> opal_dss.unpack(buf, &dat->size, &count, OPAL_INT32); >> >> Modified: trunk/orte/mca/errmgr/app/errmgr_app.c >> ============================================================================== >> --- trunk/orte/mca/errmgr/app/errmgr_app.c (original) >> +++ trunk/orte/mca/errmgr/app/errmgr_app.c 2011-08-26 18:16:14 EDT (Fri, >> 26 Aug 2011) >> @@ -82,8 +82,10 @@ >> NULL, >> NULL, >> NULL, >> - 
orte_errmgr_base_register_migration_warning, >> - orte_errmgr_base_set_fault_callback >> + orte_errmgr_base_register_migration_warning >> +#if ORTE_RESIL_ORTE >> + ,orte_errmgr_base_set_fault_callback >> +#endif >> }; >> >> /************************ >> @@ -93,18 +95,23 @@ >> { >> int ret = ORTE_SUCCESS; >> >> +#if ORTE_RESIL_ORTE >> ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, >> ORTE_RML_TAG_EPOCH_CHANGE, >> ORTE_RML_PERSISTENT, >> epoch_change_recv, >> NULL); >> +#endif >> + >> return ret; >> } >> >> static int finalize(void) >> { >> +#if ORTE_RESIL_ORTE >> orte_rml.recv_cancel(ORTE_NAME_WILDCARD, >> ORTE_RML_TAG_EPOCH_CHANGE); >> +#endif >> >> return ORTE_SUCCESS; >> } >> @@ -151,6 +158,7 @@ >> return ORTE_SUCCESS; >> } >> >> +#if ORTE_RESIL_ORTE >> void epoch_change_recv(int status, >> orte_process_name_t *sender, >> opal_buffer_t *buffer, >> @@ -209,15 +217,20 @@ >> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); >> >> (*fault_cbfunc)(procs); >> + } else if (NULL == fault_cbfunc) { >> + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, >> + "%s errmgr:app Calling fault callback failed (NULL >> pointer)!", >> + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); >> } else { >> OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, >> - "%s errmgr:app Calling fault callback failed!", >> + "%s errmgr:app Calling fault callback failed (num_dead >> <= 0)!", >> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); >> } >> >> free(proc); >> OBJ_RELEASE(procs); >> } >> +#endif >> >> static int orte_errmgr_app_abort_peers(orte_process_name_t *procs, >> orte_std_cntr_t num_procs) >> { >> >> Modified: trunk/orte/mca/errmgr/base/errmgr_base_fns.c >> ============================================================================== >> --- trunk/orte/mca/errmgr/base/errmgr_base_fns.c (original) >> +++ trunk/orte/mca/errmgr/base/errmgr_base_fns.c 2011-08-26 18:16:14 >> EDT (Fri, 26 Aug 2011) >> @@ -97,13 +97,13 @@ >> { >> item->proc_name.vpid = ORTE_VPID_INVALID; >> item->proc_name.jobid = ORTE_JOBID_INVALID; >> - 
item->proc_name.epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(item->proc_name.epoch,ORTE_EPOCH_MIN); >> } >> >> void orte_errmgr_predicted_proc_destruct( orte_errmgr_predicted_proc_t *item) >> { >> item->proc_name.vpid = ORTE_VPID_INVALID; >> - item->proc_name.epoch = ORTE_EPOCH_INVALID; >> + ORTE_EPOCH_SET(item->proc_name.epoch,ORTE_EPOCH_INVALID); >> item->proc_name.jobid = ORTE_JOBID_INVALID; >> } >> >> @@ -139,13 +139,13 @@ >> void orte_errmgr_predicted_map_construct(orte_errmgr_predicted_map_t *item) >> { >> item->proc_name.vpid = ORTE_VPID_INVALID; >> - item->proc_name.epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(item->proc_name.epoch,ORTE_EPOCH_MIN); >> item->proc_name.jobid = ORTE_JOBID_INVALID; >> >> item->node_name = NULL; >> >> item->map_proc_name.vpid = ORTE_VPID_INVALID; >> - item->map_proc_name.epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(item->map_proc_name.epoch,ORTE_EPOCH_MIN); >> item->map_proc_name.jobid = ORTE_JOBID_INVALID; >> >> item->map_node_name = NULL; >> @@ -156,7 +156,7 @@ >> void orte_errmgr_predicted_map_destruct( orte_errmgr_predicted_map_t *item) >> { >> item->proc_name.vpid = ORTE_VPID_INVALID; >> - item->proc_name.epoch = ORTE_EPOCH_INVALID; >> + ORTE_EPOCH_SET(item->proc_name.epoch,ORTE_EPOCH_INVALID); >> item->proc_name.jobid = ORTE_JOBID_INVALID; >> >> if( NULL != item->node_name ) { >> @@ -165,7 +165,7 @@ >> } >> >> item->map_proc_name.vpid = ORTE_VPID_INVALID; >> - item->map_proc_name.epoch = ORTE_EPOCH_INVALID; >> + ORTE_EPOCH_SET(item->map_proc_name.epoch,ORTE_EPOCH_INVALID); >> item->map_proc_name.jobid = ORTE_JOBID_INVALID; >> >> if( NULL != item->map_node_name ) { >> >> Modified: trunk/orte/mca/errmgr/base/errmgr_base_tool.c >> ============================================================================== >> --- trunk/orte/mca/errmgr/base/errmgr_base_tool.c (original) >> +++ trunk/orte/mca/errmgr/base/errmgr_base_tool.c 2011-08-26 18:16:14 >> EDT (Fri, 26 Aug 2011) >> @@ -267,7 +267,7 @@ >> */ >> errmgr_cmdline_sender.jobid = 
ORTE_JOBID_INVALID; >> errmgr_cmdline_sender.vpid = ORTE_VPID_INVALID; >> - errmgr_cmdline_sender.epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(errmgr_cmdline_sender.epoch,ORTE_EPOCH_MIN); >> if (ORTE_SUCCESS != (ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, >> ORTE_RML_TAG_MIGRATE, >> 0, >> @@ -379,14 +379,14 @@ >> if( OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, >> ORTE_NAME_INVALID, &errmgr_cmdline_sender) ) { >> swap_dest.jobid = errmgr_cmdline_sender.jobid; >> swap_dest.vpid = errmgr_cmdline_sender.vpid; >> - swap_dest.epoch = errmgr_cmdline_sender.epoch; >> + ORTE_EPOCH_SET(swap_dest.epoch,errmgr_cmdline_sender.epoch); >> >> errmgr_cmdline_sender = *sender; >> >> orte_errmgr_base_migrate_update(ORTE_ERRMGR_MIGRATE_STATE_ERR_INPROGRESS); >> >> errmgr_cmdline_sender.jobid = swap_dest.jobid; >> errmgr_cmdline_sender.vpid = swap_dest.vpid; >> - errmgr_cmdline_sender.epoch = swap_dest.epoch; >> + ORTE_EPOCH_SET(errmgr_cmdline_sender.epoch,swap_dest.epoch); >> >> goto cleanup; >> } >> >> Modified: trunk/orte/mca/errmgr/hnp/errmgr_hnp.c >> ============================================================================== >> --- trunk/orte/mca/errmgr/hnp/errmgr_hnp.c (original) >> +++ trunk/orte/mca/errmgr/hnp/errmgr_hnp.c 2011-08-26 18:16:14 EDT (Fri, >> 26 Aug 2011) >> @@ -53,6 +53,7 @@ >> #include "orte/runtime/orte_globals.h" >> #include "orte/runtime/orte_locks.h" >> #include "orte/runtime/orte_quit.h" >> +#include "orte/runtime/data_type_support/orte_dt_support.h" >> >> #include "orte/mca/errmgr/errmgr.h" >> #include "orte/mca/errmgr/base/base.h" >> @@ -83,9 +84,11 @@ >> orte_errmgr_hnp_global_suggest_map_targets, >> /* FT Event hook */ >> orte_errmgr_hnp_global_ft_event, >> - orte_errmgr_base_register_migration_warning, >> + orte_errmgr_base_register_migration_warning >> +#if ORTE_RESIL_ORTE >> /* Set the callback */ >> - orte_errmgr_base_set_fault_callback >> + ,orte_errmgr_base_set_fault_callback >> +#endif >> }; >> >> >> @@ -97,14 +100,16 @@ 
>> static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t >> jobstate, >> orte_proc_state_t state, >> orte_exit_code_t exit_code); >> static void check_job_complete(orte_job_t *jdata); >> -static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t >> epoch); >> +static void killprocs(orte_jobid_t job, orte_vpid_t vpid); >> static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc, >> orte_proc_state_t state, orte_exit_code_t exit_code); >> static orte_odls_child_t* proc_is_local(orte_process_name_t *proc); >> +#if ORTE_RESIL_ORTE >> static int send_to_local_applications(opal_pointer_array_t *dead_names); >> static void failure_notification(int status, orte_process_name_t* sender, >> opal_buffer_t *buffer, orte_rml_tag_t tag, >> void* cbdata); >> +#endif >> >> /************************ >> * API Definitions >> @@ -380,16 +385,21 @@ >> **********************/ >> int orte_errmgr_hnp_base_global_init(void) >> { >> - int ret; >> + int ret = ORTE_SUCCESS; >> >> +#if ORTE_RESIL_ORTE >> ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, >> ORTE_RML_TAG_FAILURE_NOTICE, >> ORTE_RML_PERSISTENT, failure_notification, >> NULL); >> +#endif >> + >> return ret; >> } >> >> int orte_errmgr_hnp_base_global_finalize(void) >> { >> +#if ORTE_RESIL_ORTE >> orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_FAILURE_NOTICE); >> +#endif >> >> return ORTE_SUCCESS; >> } >> @@ -406,6 +416,7 @@ >> orte_odls_child_t *child; >> int rc; >> orte_app_context_t *app; >> + orte_proc_t *pdat; >> >> OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, >> "%s errmgr:hnp: job %s reported state %s" >> @@ -538,7 +549,7 @@ >> ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED, >> exit_code); >> /* order all local procs for this job to be killed */ >> - killprocs(jdata->jobid, ORTE_VPID_WILDCARD, >> ORTE_EPOCH_WILDCARD); >> + killprocs(jdata->jobid, ORTE_VPID_WILDCARD); >> check_job_complete(jdata); /* set the local proc states */ >> /* the job object for this job will have been 
NULL'd >> * in the array if the job was solely local. If it isn't >> @@ -550,7 +561,7 @@ >> break; >> case ORTE_JOB_STATE_COMM_FAILED: >> /* order all local procs for this job to be killed */ >> - killprocs(jdata->jobid, ORTE_VPID_WILDCARD, >> ORTE_EPOCH_WILDCARD); >> + killprocs(jdata->jobid, ORTE_VPID_WILDCARD); >> check_job_complete(jdata); /* set the local proc states */ >> /* the job object for this job will have been NULL'd >> * in the array if the job was solely local. If it isn't >> @@ -562,7 +573,7 @@ >> break; >> case ORTE_JOB_STATE_HEARTBEAT_FAILED: >> /* order all local procs for this job to be killed */ >> - killprocs(jdata->jobid, ORTE_VPID_WILDCARD, >> ORTE_EPOCH_WILDCARD); >> + killprocs(jdata->jobid, ORTE_VPID_WILDCARD); >> check_job_complete(jdata); /* set the local proc states */ >> /* the job object for this job will have been NULL'd >> * in the array if the job was solely local. If it isn't >> @@ -632,10 +643,6 @@ >> } >> } >> >> - if (ORTE_PROC_STATE_ABORTED_BY_SIG == state) { >> - exit_code = 0; >> - } >> - >> orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); >> check_job_complete(jdata); /* need to set the job state */ >> /* the job object for this job will have been NULL'd >> @@ -679,7 +686,7 @@ >> >> case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: >> if (jdata->enable_recovery) { >> - killprocs(proc->jobid, proc->vpid, proc->epoch); >> + killprocs(proc->jobid, proc->vpid); >> /* is this a local proc */ >> if (NULL != (child = proc_is_local(proc))) { >> /* local proc - see if it has reached its restart limit */ >> @@ -778,18 +785,37 @@ >> opal_output(0, "%s UNABLE TO RELOCATE PROCS FROM >> FAILED DAEMON %s", >> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), >> ORTE_NAME_PRINT(proc)); >> /* kill all local procs */ >> - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, >> ORTE_EPOCH_WILDCARD); >> + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); >> /* kill all jobs */ >> hnp_abort(ORTE_JOBID_WILDCARD, exit_code); >> /* check if all is 
complete so we can terminate */ >> check_job_complete(jdata); >> } >> } else { >> +#if !ORTE_RESIL_ORTE >> + if (NULL == (pdat = >> (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) { >> + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); >> + orte_show_help("help-orte-errmgr-hnp.txt", >> "errmgr-hnp:daemon-died", true, >> + ORTE_VPID_PRINT(proc->vpid), >> "Unknown"); >> + } else { >> + orte_show_help("help-orte-errmgr-hnp.txt", >> "errmgr-hnp:daemon-died", true, >> + ORTE_VPID_PRINT(proc->vpid), >> + (NULL == pdat->node) ? "Unknown" : >> + ((NULL == pdat->node->name) ? >> "Unknown" : pdat->node->name)); >> + } >> +#endif >> if (ORTE_SUCCESS != >> orte_errmgr_hnp_record_dead_process(proc)) { >> /* The process is already dead so don't keep trying >> to do >> * this stuff. */ >> return ORTE_SUCCESS; >> } >> + >> +#if !ORTE_RESIL_ORTE >> + /* kill all local procs */ >> + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); >> + /* kill all jobs */ >> + hnp_abort(ORTE_JOBID_WILDCARD, exit_code); >> +#endif >> /* We'll check if the job was complete when we get the >> * message back from the HNP notifying us of the dead >> * process */ >> @@ -805,7 +831,7 @@ >> } else { >> orte_errmgr_hnp_record_dead_process(proc); >> /* kill all local procs */ >> - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, >> ORTE_EPOCH_WILDCARD); >> + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); >> /* kill all jobs */ >> hnp_abort(ORTE_JOBID_WILDCARD, exit_code); >> return ORTE_ERR_UNRECOVERABLE; >> @@ -824,6 +850,7 @@ >> return ORTE_SUCCESS; >> } >> >> +#if ORTE_RESIL_ORTE >> static void failure_notification(int status, orte_process_name_t* sender, >> opal_buffer_t *buffer, orte_rml_tag_t tag, >> void* cbdata) >> @@ -984,6 +1011,7 @@ >> >> OBJ_RELEASE(dead_names); >> } >> +#endif >> >> /***************** >> * Local Functions >> @@ -1354,7 +1382,6 @@ >> ORTE_UPDATE_EXIT_STATUS(proc->exit_code); >> } >> break; >> -#if 0 >> case ORTE_PROC_STATE_ABORTED_BY_SIG: >> 
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, >> "%s errmgr:hnp:check_job_completed proc %s >> aborted by signal", >> @@ -1370,7 +1397,6 @@ >> ORTE_UPDATE_EXIT_STATUS(proc->exit_code); >> } >> break; >> -#endif >> case ORTE_PROC_STATE_TERM_WO_SYNC: >> OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, >> "%s errmgr:hnp:check_job_completed proc %s >> terminated without sync", >> @@ -1393,7 +1419,6 @@ >> } >> break; >> case ORTE_PROC_STATE_COMM_FAILED: >> -#if 1 >> if (!jdata->abort) { >> jdata->state = ORTE_JOB_STATE_COMM_FAILED; >> /* point to the lowest rank to cause the problem */ >> @@ -1403,7 +1428,6 @@ >> jdata->abort = true; >> ORTE_UPDATE_EXIT_STATUS(proc->exit_code); >> } >> -#endif >> break; >> case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: >> if (!jdata->abort) { >> @@ -1530,9 +1554,6 @@ >> */ >> CHECK_DAEMONS: >> if (jdata == NULL || jdata->jobid == ORTE_PROC_MY_NAME->jobid) { >> -#if 0 >> - if ((jdata->num_procs - 1) <= jdata->num_terminated) { /* Subtract >> one for the HNP */ >> -#endif >> if (0 == orte_routed.num_routes()) { >> /* orteds are done! */ >> OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, >> @@ -1696,7 +1717,7 @@ >> } >> } >> >> -static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t >> epoch) >> +static void killprocs(orte_jobid_t job, orte_vpid_t vpid) >> { >> opal_pointer_array_t cmd; >> orte_proc_t proc; >> @@ -1707,7 +1728,9 @@ >> orte_sensor.stop(job); >> } >> >> - if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid && >> ORTE_EPOCH_WILDCARD == epoch) { >> + if (ORTE_JOBID_WILDCARD == job >> + && ORTE_VPID_WILDCARD == vpid >> + && ORTE_EPOCH_CMP(ORTE_EPOCH_WILDCARD,epoch)) { >> if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) { >> ORTE_ERROR_LOG(rc); >> } >> @@ -1718,7 +1741,7 @@ >> OBJ_CONSTRUCT(&proc, orte_proc_t); >> proc . name . jobid = job; >> proc . name . vpid = vpid; >> - proc . name . epoch = epoch; >> + ORTE_EPOCH_SET(proc . name . 
epoch,epoch); >> opal_pointer_array_add(&cmd, &proc); >> if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) { >> ORTE_ERROR_LOG(rc); >> @@ -1913,13 +1936,15 @@ >> } >> >> if (NULL != (pdat = >> (orte_proc_t*)opal_pointer_array_get_item(jdat->procs, proc->vpid)) && >> - ORTE_PROC_STATE_TERMINATED < pdat->state) { >> + ORTE_PROC_STATE_TERMINATED > pdat->state) { >> >> +#if ORTE_ENABLE_EPOCH >> /* Make sure that the epochs match. */ >> if (proc->epoch != pdat->name.epoch) { >> opal_output(1, "The epoch does not match the current epoch. >> Throwing the request out."); >> return ORTE_SUCCESS; >> } >> +#endif >> >> dead_names = OBJ_NEW(opal_pointer_array_t); >> >> @@ -1935,6 +1960,7 @@ >> } >> } >> >> +#if ORTE_RESIL_ORTE >> if (!mca_errmgr_hnp_component.term_in_progress) { >> /* >> * Send a message to the other daemons so they know that a daemon >> has >> @@ -1949,7 +1975,7 @@ >> OBJ_RELEASE(buffer); >> } else { >> >> - /* Iterate of the list of dead procs and send them along >> with >> + /* Iterate over the list of dead procs and send them along >> with >> * the rest. The HNP needs this info so it can tell the other >> * ORTEDs and they can inform the appropriate applications. >> */ >> @@ -1973,6 +1999,9 @@ >> } else { >> orte_errmgr_hnp_global_mark_processes_as_dead(dead_names); >> } >> +#else >> + orte_errmgr_hnp_global_mark_processes_as_dead(dead_names); >> +#endif >> } >> >> return ORTE_SUCCESS; >> @@ -2011,6 +2040,7 @@ >> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), >> ORTE_NAME_PRINT(&pdat->name))); >> >> +#if ORTE_RESIL_ORTE >> /* Make sure the epochs match, if not it probably means that we >> * already reported this failure. 
*/ >> if (name_item->epoch != pdat->name.epoch) { >> @@ -2018,6 +2048,7 @@ >> } >> >> orte_util_set_epoch(name_item, name_item->epoch + 1); >> +#endif >> >> /* Remove it from the job array */ >> opal_pointer_array_set_item(jdat->procs, name_item->vpid, NULL); >> @@ -2034,6 +2065,7 @@ >> >> OBJ_RELEASE(pdat); >> >> +#if ORTE_RESIL_ORTE >> /* Create a new proc object that will keep track of the epoch >> * information */ >> pdat = OBJ_NEW(orte_proc_t); >> @@ -2041,14 +2073,15 @@ >> pdat->name.vpid = name_item->vpid; >> pdat->name.epoch = name_item->epoch + 1; >> >> - /* Set the state as terminated so we'll know the process isn't >> - * actually there. */ >> - pdat->state = ORTE_PROC_STATE_TERMINATED; >> - >> opal_pointer_array_set_item(jdat->procs, name_item->vpid, pdat); >> jdat->num_procs++; >> jdat->num_terminated++; >> +#endif >> + /* Set the state as terminated so we'll know the process isn't >> + * actually there. */ >> + pdat->state = ORTE_PROC_STATE_TERMINATED; >> } else { >> +#if ORTE_RESIL_ORTE >> opal_output(0, "Proc data not found for %s", >> ORTE_NAME_PRINT(name_item)); >> /* Create a new proc object that will keep track of the epoch >> * information */ >> @@ -2064,11 +2097,13 @@ >> opal_pointer_array_set_item(jdat->procs, name_item->vpid, pdat); >> jdat->num_procs++; >> jdat->num_terminated++; >> +#endif >> } >> >> check_job_complete(jdat); >> } >> >> +#if ORTE_RESIL_ORTE >> if (!orte_orteds_term_ordered) { >> /* Need to update the orted routing module. 
*/ >> orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid); >> @@ -2077,10 +2112,12 @@ >> (*fault_cbfunc)(dead_procs); >> } >> } >> +#endif >> >> return ORTE_SUCCESS; >> } >> >> +#if ORTE_RESIL_ORTE >> int send_to_local_applications(opal_pointer_array_t *dead_names) { >> opal_buffer_t *buf; >> int ret = ORTE_SUCCESS; >> @@ -2121,3 +2158,5 @@ >> >> return ret; >> } >> +#endif >> + >> >> Modified: trunk/orte/mca/errmgr/hnp/errmgr_hnp_autor.c >> ============================================================================== >> --- trunk/orte/mca/errmgr/hnp/errmgr_hnp_autor.c (original) >> +++ trunk/orte/mca/errmgr/hnp/errmgr_hnp_autor.c 2011-08-26 18:16:14 >> EDT (Fri, 26 Aug 2011) >> @@ -522,7 +522,7 @@ >> wp_item = OBJ_NEW(errmgr_autor_wp_item_t); >> wp_item->name.jobid = proc->jobid; >> wp_item->name.vpid = proc->vpid; >> - wp_item->name.epoch = proc->epoch; >> + ORTE_EPOCH_SET(wp_item->name.epoch,proc->epoch); >> wp_item->state = state; >> >> opal_list_append(procs_pending_recovery, &(wp_item->super)); >> @@ -626,7 +626,7 @@ >> { >> wp->name.jobid = ORTE_JOBID_INVALID; >> wp->name.vpid = ORTE_VPID_INVALID; >> - wp->name.epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(wp->name.epoch,ORTE_EPOCH_MIN); >> >> wp->state = 0; >> } >> @@ -635,7 +635,7 @@ >> { >> wp->name.jobid = ORTE_JOBID_INVALID; >> wp->name.vpid = ORTE_VPID_INVALID; >> - wp->name.epoch = ORTE_EPOCH_INVALID; >> + ORTE_EPOCH_SET(wp->name.epoch,ORTE_EPOCH_INVALID); >> >> wp->state = 0; >> } >> >> Modified: trunk/orte/mca/errmgr/hnp/errmgr_hnp_crmig.c >> ============================================================================== >> --- trunk/orte/mca/errmgr/hnp/errmgr_hnp_crmig.c (original) >> +++ trunk/orte/mca/errmgr/hnp/errmgr_hnp_crmig.c 2011-08-26 18:16:14 >> EDT (Fri, 26 Aug 2011) >> @@ -750,7 +750,7 @@ >> close_iof_stdin = true; >> iof_name.jobid = proc->name.jobid; >> iof_name.vpid = proc->name.vpid; >> - iof_name.epoch = proc->name.epoch; >> + ORTE_EPOCH_SET(iof_name.epoch,proc->name.epoch); >> 
} >> } >> } >> @@ -807,7 +807,7 @@ >> close_iof_stdin = true; >> iof_name.jobid = proc->name.jobid; >> iof_name.vpid = proc->name.vpid; >> - iof_name.epoch = proc->name.epoch; >> + ORTE_EPOCH_SET(iof_name.epoch,proc->name.epoch); >> } >> } >> } >> @@ -855,7 +855,7 @@ >> close_iof_stdin = true; >> iof_name.jobid = proc->name.jobid; >> iof_name.vpid = proc->name.vpid; >> - iof_name.epoch = proc->name.epoch; >> + ORTE_EPOCH_SET(iof_name.epoch,proc->name.epoch); >> } >> } >> } >> >> Modified: trunk/orte/mca/errmgr/orted/errmgr_orted.c >> ============================================================================== >> --- trunk/orte/mca/errmgr/orted/errmgr_orted.c (original) >> +++ trunk/orte/mca/errmgr/orted/errmgr_orted.c 2011-08-26 18:16:14 >> EDT (Fri, 26 Aug 2011) >> @@ -34,6 +34,7 @@ >> #include "orte/util/show_help.h" >> #include "orte/util/nidmap.h" >> #include "orte/runtime/orte_globals.h" >> +#include "orte/runtime/data_type_support/orte_dt_support.h" >> #include "orte/mca/rml/rml.h" >> #include "orte/mca/odls/odls.h" >> #include "orte/mca/odls/base/base.h" >> @@ -41,7 +42,9 @@ >> #include "orte/mca/plm/plm_types.h" >> #include "orte/mca/routed/routed.h" >> #include "orte/mca/sensor/sensor.h" >> +#include "orte/mca/ess/ess.h" >> #include "orte/runtime/orte_quit.h" >> +#include "orte/runtime/orte_globals.h" >> >> #include "orte/mca/errmgr/errmgr.h" >> #include "orte/mca/errmgr/base/base.h" >> @@ -59,13 +62,15 @@ >> static void update_local_children(orte_odls_job_t *jobdat, >> orte_job_state_t jobstate, >> orte_proc_state_t state); >> -static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t >> epoch); >> +static void killprocs(orte_jobid_t job, orte_vpid_t vpid); >> static int record_dead_process(orte_process_name_t *proc); >> -static int send_to_local_applications(opal_pointer_array_t *dead_names); >> static int mark_processes_as_dead(opal_pointer_array_t *dead_procs); >> +#if ORTE_RESIL_ORTE >> +static int 
send_to_local_applications(opal_pointer_array_t *dead_names); >> static void failure_notification(int status, orte_process_name_t* sender, >> opal_buffer_t *buffer, orte_rml_tag_t tag, >> void* cbdata); >> +#endif >> >> /* >> * Module functions: Global >> @@ -104,8 +109,10 @@ >> predicted_fault, >> suggest_map_targets, >> ft_event, >> - orte_errmgr_base_register_migration_warning, >> - orte_errmgr_base_set_fault_callback /* Set callback function */ >> + orte_errmgr_base_register_migration_warning >> +#if ORTE_RESIL_ORTE >> + ,orte_errmgr_base_set_fault_callback /* Set callback function */ >> +#endif >> }; >> >> /************************ >> @@ -113,16 +120,22 @@ >> ************************/ >> static int init(void) >> { >> - int ret; >> + int ret = ORTE_SUCCESS; >> >> +#if ORTE_RESIL_ORTE >> ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, >> ORTE_RML_TAG_FAILURE_NOTICE, >> ORTE_RML_PERSISTENT, failure_notification, >> NULL); >> +#endif >> + >> return ret; >> } >> >> static int finalize(void) >> { >> +#if ORTE_RESIL_ORTE >> orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_FAILURE_NOTICE); >> +#endif >> + >> return ORTE_SUCCESS; >> } >> >> @@ -228,10 +241,10 @@ >> /* update all procs in job */ >> update_local_children(jobdat, jobstate, >> ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED); >> /* order all local procs for this job to be killed */ >> - killprocs(jobdat->jobid, ORTE_VPID_WILDCARD, >> ORTE_EPOCH_WILDCARD); >> + killprocs(jobdat->jobid, ORTE_VPID_WILDCARD); >> case ORTE_JOB_STATE_COMM_FAILED: >> /* kill all local procs */ >> - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, >> ORTE_EPOCH_WILDCARD); >> + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); >> /* tell the caller we can't recover */ >> return ORTE_ERR_UNRECOVERABLE; >> break; >> @@ -276,7 +289,7 @@ >> /* see if this was a lifeline */ >> if (ORTE_SUCCESS != orte_routed.route_lost(proc)) { >> /* kill our children */ >> - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, >> ORTE_EPOCH_WILDCARD); 
>> + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); >> /* terminate - our routed children will see >> * us leave and automatically die >> */ >> @@ -290,10 +303,18 @@ >> if (0 == orte_routed.num_routes() && >> 0 == opal_list_get_size(&orte_local_children)) { >> orte_quit(); >> + } else { >> + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, >> + "%s errmgr:orted not exiting, num_routes() == >> %d, num children == %d", >> + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), >> + orte_routed.num_routes(), >> + opal_list_get_size(&orte_local_children))); >> } >> } >> >> +#if ORTE_RESIL_ORTE >> record_dead_process(proc); >> +#endif >> >> /* if not, then indicate we can continue */ >> return ORTE_SUCCESS; >> @@ -344,7 +365,7 @@ >> /* Decrement the number of local procs */ >> jobdat->num_local_procs--; >> /* kill this proc */ >> - killprocs(proc->jobid, proc->vpid, proc->epoch); >> + killprocs(proc->jobid, proc->vpid); >> } >> app = >> (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, >> child->app_idx); >> if( jobdat->enable_recovery && child->restarts < >> app->max_restarts ) { >> @@ -526,10 +547,12 @@ >> ORTE_ERROR_LOG(rc); >> goto FINAL_CLEANUP; >> } >> +#if ORTE_ENABLE_EPOCH >> if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, >> &child->name->epoch, 1, ORTE_EPOCH))) { >> ORTE_ERROR_LOG(rc); >> goto FINAL_CLEANUP; >> } >> +#endif >> } >> } >> /* pack an invalid marker */ >> @@ -660,7 +683,7 @@ >> continue; >> } >> >> - if (name_item->epoch < orte_util_lookup_epoch(name_item)) { >> + if (0 < >> ORTE_EPOCH_CMP(name_item->epoch,orte_ess.proc_get_epoch(name_item))) { >> continue; >> } >> >> @@ -669,9 +692,11 @@ >> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), >> ORTE_NAME_PRINT(name_item))); >> >> +#if ORTE_ENABLE_EPOCH >> /* Increment the epoch */ >> orte_util_set_proc_state(name_item, ORTE_PROC_STATE_TERMINATED); >> orte_util_set_epoch(name_item, name_item->epoch + 1); >> +#endif >> >> OPAL_THREAD_LOCK(&orte_odls_globals.mutex); >> >> @@ -706,6 +731,7 @@ >> return ORTE_SUCCESS; 
>> } >> >> +#if ORTE_RESIL_ORTE >> static void failure_notification(int status, orte_process_name_t* sender, >> opal_buffer_t *buffer, orte_rml_tag_t tag, >> void* cbdata) >> @@ -714,7 +740,7 @@ >> orte_std_cntr_t n; >> int ret = ORTE_SUCCESS, num_failed; >> int32_t i; >> - orte_process_name_t *name_item, proc; >> + orte_process_name_t *name_item; >> >> dead_names = OBJ_NEW(opal_pointer_array_t); >> >> @@ -746,7 +772,7 @@ >> /* There shouldn't be an issue of receiving this message multiple >> * times but it doesn't hurt to double check. >> */ >> - if (proc.epoch < orte_util_lookup_epoch(name_item)) { >> + if (0 < >> ORTE_EPOCH_CMP(name_item->epoch,orte_ess.proc_get_epoch(name_item))) { >> opal_output(1, "Received from proc %s local epoch %d", >> ORTE_NAME_PRINT(name_item), orte_util_lookup_epoch(name_item)); >> continue; >> } >> @@ -767,6 +793,7 @@ >> free(name_item); >> } >> } >> +#endif >> >> /***************** >> * Local Functions >> @@ -948,11 +975,13 @@ >> ORTE_ERROR_LOG(rc); >> return rc; >> } >> +#if ORTE_ENABLE_EPOCH >> /* Pack the child's epoch. 
*/ >> if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, >> &(child->name->epoch), 1, ORTE_EPOCH))) { >> ORTE_ERROR_LOG(rc); >> return rc; >> } >> +#endif >> /* pack the contact info */ >> if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &child->rml_uri, 1, >> OPAL_STRING))) { >> ORTE_ERROR_LOG(rc); >> @@ -1015,7 +1044,7 @@ >> } >> } >> >> -static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t >> epoch) >> +static void killprocs(orte_jobid_t job, orte_vpid_t vpid) >> { >> opal_pointer_array_t cmd; >> orte_proc_t proc; >> @@ -1026,7 +1055,9 @@ >> orte_sensor.stop(job); >> } >> >> - if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid && >> ORTE_EPOCH_WILDCARD == epoch) { >> + if (ORTE_JOBID_WILDCARD == job >> + && ORTE_VPID_WILDCARD == vpid >> + && 0 == ORTE_EPOCH_CMP(ORTE_EPOCH_WILDCARD,epoch)) { >> if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) { >> ORTE_ERROR_LOG(rc); >> } >> @@ -1037,7 +1068,7 @@ >> OBJ_CONSTRUCT(&proc, orte_proc_t); >> proc . name . jobid = job; >> proc . name . vpid = vpid; >> - proc . name . epoch = epoch; >> + ORTE_EPOCH_SET(proc . name . 
epoch,epoch); >> opal_pointer_array_add(&cmd, &proc); >> if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) { >> ORTE_ERROR_LOG(rc); >> @@ -1082,20 +1113,21 @@ >> return rc; >> } >> >> +#if ORTE_RESIL_ORTE >> int send_to_local_applications(opal_pointer_array_t *dead_names) { >> opal_buffer_t *buf; >> int ret; >> orte_process_name_t *name_item; >> int size, i; >> >> - OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, >> - "%s Sending failure to local applications.", >> - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); >> - >> buf = OBJ_NEW(opal_buffer_t); >> >> size = opal_pointer_array_get_size(dead_names); >> >> + OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, >> + "%s Sending %d failure(s) to local applications.", >> + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), size)); >> + >> if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, &size, 1, ORTE_VPID))) { >> ORTE_ERROR_LOG(ret); >> OBJ_RELEASE(buf); >> @@ -1122,4 +1154,5 @@ >> >> return ORTE_SUCCESS; >> } >> +#endif >> >> >> Modified: trunk/orte/mca/ess/alps/ess_alps_module.c >> ============================================================================== >> --- trunk/orte/mca/ess/alps/ess_alps_module.c (original) >> +++ trunk/orte/mca/ess/alps/ess_alps_module.c 2011-08-26 18:16:14 EDT (Fri, >> 26 Aug 2011) >> @@ -363,8 +363,8 @@ >> >> ORTE_PROC_MY_NAME->jobid = jobid; >> ORTE_PROC_MY_NAME->vpid = (orte_vpid_t) cnos_get_rank() + starting_vpid; >> - ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_INVALID; >> - ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME); >> + ORTE_EPOCH_PRINT(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_INVALID); >> + >> ORTE_EPOCH_PRINT(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); >> >> OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, >> "ess:alps set name to %s", >> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); >> >> Modified: trunk/orte/mca/ess/base/base.h >> ============================================================================== >> --- trunk/orte/mca/ess/base/base.h (original) 
>> +++ trunk/orte/mca/ess/base/base.h 2011-08-26 18:16:14 EDT (Fri, 26 Aug >> 2011) >> @@ -57,7 +57,11 @@ >> >> ORTE_DECLSPEC extern opal_list_t orte_ess_base_components_available; >> >> +#if ORTE_ENABLE_EPOCH >> ORTE_DECLSPEC orte_epoch_t orte_ess_base_proc_get_epoch(orte_process_name_t >> *proc); >> +#else >> +ORTE_DECLSPEC int orte_ess_base_proc_get_epoch(orte_process_name_t *proc); >> +#endif >> >> #if !ORTE_DISABLE_FULL_SUPPORT >> >> >> Modified: trunk/orte/mca/ess/base/ess_base_select.c >> ============================================================================== >> --- trunk/orte/mca/ess/base/ess_base_select.c (original) >> +++ trunk/orte/mca/ess/base/ess_base_select.c 2011-08-26 18:16:14 EDT (Fri, >> 26 Aug 2011) >> @@ -36,21 +36,19 @@ >> * Generic function to retrieve the epoch of a specific process >> * from the job data. >> */ >> +#if !ORTE_ENABLE_EPOCH >> +int orte_ess_base_proc_get_epoch(orte_process_name_t *proc) { >> + return 0; >> +} >> +#else >> orte_epoch_t orte_ess_base_proc_get_epoch(orte_process_name_t *proc) { >> orte_epoch_t epoch = ORTE_EPOCH_INVALID; >> >> -#if !ORTE_DISABLE_FULL_SUPPORT >> epoch = orte_util_lookup_epoch(proc); >> -#endif >> - >> - OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output, >> - "%s ess:generic: proc %s has epoch %d", >> - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), >> - ORTE_NAME_PRINT(proc), >> - epoch)); >> >> return epoch; >> } >> +#endif >> >> int >> orte_ess_base_select(void) >> >> Modified: trunk/orte/mca/ess/env/ess_env_module.c >> ============================================================================== >> --- trunk/orte/mca/ess/env/ess_env_module.c (original) >> +++ trunk/orte/mca/ess/env/ess_env_module.c 2011-08-26 18:16:14 EDT (Fri, >> 26 Aug 2011) >> @@ -392,8 +392,7 @@ >> >> ORTE_PROC_MY_NAME->jobid = jobid; >> ORTE_PROC_MY_NAME->vpid = vpid; >> - ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_INVALID; >> - ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME); >> + >> 
ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); >> >> OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, >> "ess:env set name to %s", >> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); >> >> Modified: trunk/orte/mca/ess/ess.h >> ============================================================================== >> --- trunk/orte/mca/ess/ess.h (original) >> +++ trunk/orte/mca/ess/ess.h 2011-08-26 18:16:14 EDT (Fri, 26 Aug 2011) >> @@ -111,7 +111,11 @@ >> * will get the most up to date version stored within the orte_proc_t struct. >> * Obviously the epoch of the proc that is passed in will be ignored. >> */ >> +#if ORTE_ENABLE_EPOCH >> typedef orte_epoch_t >> (*orte_ess_base_module_proc_get_epoch_fn_t)(orte_process_name_t *proc); >> +#else >> +typedef int (*orte_ess_base_module_proc_get_epoch_fn_t)(orte_process_name_t >> *proc); >> +#endif >> >> /** >> * Update the pidmap >> >> Modified: trunk/orte/mca/ess/generic/ess_generic_module.c >> ============================================================================== >> --- trunk/orte/mca/ess/generic/ess_generic_module.c (original) >> +++ trunk/orte/mca/ess/generic/ess_generic_module.c 2011-08-26 18:16:14 >> EDT (Fri, 26 Aug 2011) >> @@ -155,7 +155,7 @@ >> goto error; >> } >> ORTE_PROC_MY_NAME->vpid = strtol(envar, NULL, 10); >> - ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN); >> >> OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, >> "%s completed name definition", >> @@ -273,7 +273,7 @@ >> if (vpid == ORTE_PROC_MY_NAME->vpid) { >> ORTE_PROC_MY_DAEMON->jobid = 0; >> ORTE_PROC_MY_DAEMON->vpid = i; >> - ORTE_PROC_MY_DAEMON->epoch = ORTE_PROC_MY_NAME->epoch; >> + >> ORTE_EPOCH_SET(ORTE_PROC_MY_DAEMON->epoch,ORTE_PROC_MY_NAME->epoch); >> } >> OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, >> "%s node %d name %s rank %s", >> @@ -304,7 +304,7 @@ >> if (vpid == ORTE_PROC_MY_NAME->vpid) { >> ORTE_PROC_MY_DAEMON->jobid = 0; >> ORTE_PROC_MY_DAEMON->vpid = i; 
>> - ORTE_PROC_MY_DAEMON->epoch = >> ORTE_PROC_MY_NAME->epoch; >> + >> ORTE_EPOCH_SET(ORTE_PROC_MY_DAEMON->epoch,ORTE_PROC_MY_NAME->epoch); >> } >> OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, >> "%s node %d name %s rank %d", >> >> Modified: trunk/orte/mca/ess/hnp/ess_hnp_module.c >> ============================================================================== >> --- trunk/orte/mca/ess/hnp/ess_hnp_module.c (original) >> +++ trunk/orte/mca/ess/hnp/ess_hnp_module.c 2011-08-26 18:16:14 EDT (Fri, >> 26 Aug 2011) >> @@ -494,7 +494,7 @@ >> proc = OBJ_NEW(orte_proc_t); >> proc->name.jobid = ORTE_PROC_MY_NAME->jobid; >> proc->name.vpid = ORTE_PROC_MY_NAME->vpid; >> - proc->name.epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN); >> >> proc->pid = orte_process_info.pid; >> proc->rml_uri = orte_rml.get_contact_info(); >> >> Modified: trunk/orte/mca/ess/lsf/ess_lsf_module.c >> ============================================================================== >> --- trunk/orte/mca/ess/lsf/ess_lsf_module.c (original) >> +++ trunk/orte/mca/ess/lsf/ess_lsf_module.c 2011-08-26 18:16:14 EDT (Fri, >> 26 Aug 2011) >> @@ -357,8 +357,7 @@ >> >> ORTE_PROC_MY_NAME->jobid = jobid; >> ORTE_PROC_MY_NAME->vpid = vpid; >> - ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_INVALID; >> - ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME); >> + >> ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); >> >> /* fix up the base name and make it the "real" name */ >> lsf_nodeid = atoi(getenv("LSF_PM_TASKID")); >> >> Modified: trunk/orte/mca/ess/singleton/ess_singleton_module.c >> ============================================================================== >> --- trunk/orte/mca/ess/singleton/ess_singleton_module.c (original) >> +++ trunk/orte/mca/ess/singleton/ess_singleton_module.c 2011-08-26 >> 18:16:14 EDT (Fri, 26 Aug 2011) >> @@ -188,7 +188,7 @@ >> /* set the name */ >> ORTE_PROC_MY_NAME->jobid = 0xffff0000 & ((uint32_t)jobfam << 
16); >> ORTE_PROC_MY_NAME->vpid = 0; >> - ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN); >> >> } else { >> /* >> >> Modified: trunk/orte/mca/ess/slave/ess_slave_module.c >> ============================================================================== >> --- trunk/orte/mca/ess/slave/ess_slave_module.c (original) >> +++ trunk/orte/mca/ess/slave/ess_slave_module.c 2011-08-26 18:16:14 >> EDT (Fri, 26 Aug 2011) >> @@ -280,8 +280,7 @@ >> >> ORTE_PROC_MY_NAME->jobid = jobid; >> ORTE_PROC_MY_NAME->vpid = vpid; >> - ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_INVALID; >> - ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME); >> + >> ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); >> >> OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, >> "ess:slave set name to %s", >> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); >> >> Modified: trunk/orte/mca/ess/slurm/ess_slurm_module.c >> ============================================================================== >> --- trunk/orte/mca/ess/slurm/ess_slurm_module.c (original) >> +++ trunk/orte/mca/ess/slurm/ess_slurm_module.c 2011-08-26 18:16:14 >> EDT (Fri, 26 Aug 2011) >> @@ -368,8 +368,7 @@ >> /* fix up the vpid and make it the "real" vpid */ >> slurm_nodeid = atoi(getenv("SLURM_NODEID")); >> ORTE_PROC_MY_NAME->vpid = vpid + slurm_nodeid; >> - ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_INVALID; >> - ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME); >> + >> ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); >> >> OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, >> "ess:slurm set name to %s", >> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); >> >> Modified: trunk/orte/mca/ess/slurmd/ess_slurmd_module.c >> ============================================================================== >> --- trunk/orte/mca/ess/slurmd/ess_slurmd_module.c (original) >> +++ trunk/orte/mca/ess/slurmd/ess_slurmd_module.c 
2011-08-26 18:16:14 >> EDT (Fri, 26 Aug 2011) >> @@ -195,7 +195,7 @@ >> } >> ORTE_PROC_MY_NAME->vpid = strtol(envar, NULL, 10); >> #endif >> - ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN); >> /* get our local rank */ >> if (NULL == (envar = getenv("SLURM_LOCALID"))) { >> error = "could not get SLURM_LOCALID"; >> @@ -260,7 +260,7 @@ >> nodeid = strtol(envar, NULL, 10); >> ORTE_PROC_MY_DAEMON->jobid = 0; >> ORTE_PROC_MY_DAEMON->vpid = nodeid; >> - ORTE_PROC_MY_DAEMON->epoch = ORTE_PROC_MY_NAME->epoch; >> + ORTE_EPOCH_SET(ORTE_PROC_MY_DAEMON->epoch,ORTE_PROC_MY_NAME->epoch); >> >> /* get the number of ppn */ >> if (NULL == (tasks_per_node = getenv("SLURM_STEP_TASKS_PER_NODE"))) { >> >> Modified: trunk/orte/mca/ess/tm/ess_tm_module.c >> ============================================================================== >> --- trunk/orte/mca/ess/tm/ess_tm_module.c (original) >> +++ trunk/orte/mca/ess/tm/ess_tm_module.c 2011-08-26 18:16:14 EDT (Fri, >> 26 Aug 2011) >> @@ -364,7 +364,7 @@ >> >> ORTE_PROC_MY_NAME->jobid = jobid; >> ORTE_PROC_MY_NAME->vpid = vpid; >> - ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME); >> + >> ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); >> >> OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, >> "ess:tm set name to %s", >> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); >> >> Modified: trunk/orte/mca/filem/rsh/filem_rsh_module.c >> ============================================================================== >> --- trunk/orte/mca/filem/rsh/filem_rsh_module.c (original) >> +++ trunk/orte/mca/filem/rsh/filem_rsh_module.c 2011-08-26 18:16:14 >> EDT (Fri, 26 Aug 2011) >> @@ -1097,11 +1097,11 @@ >> if( NULL != proc_set ) { >> wp_item->proc_set . source . jobid = proc_set->source.jobid; >> wp_item->proc_set . source . vpid = proc_set->source.vpid; >> - wp_item->proc_set . source . 
epoch = proc_set->source.epoch; >> + ORTE_EPOCH_SET(wp_item->proc_set . source . >> epoch,proc_set->source.epoch); >> >> wp_item->proc_set . sink . jobid = proc_set->sink.jobid; >> wp_item->proc_set . sink . vpid = proc_set->sink.vpid; >> - wp_item->proc_set . sink . epoch = proc_set->sink.epoch; >> + ORTE_EPOCH_SET(wp_item->proc_set . sink . >> epoch,proc_set->sink.epoch); >> } >> /* Copy the File Set */ >> if( NULL != file_set ) { >> @@ -1396,7 +1396,7 @@ >> wp_item = OBJ_NEW(orte_filem_rsh_work_pool_item_t); >> wp_item->proc_set . source . jobid = sender->jobid; >> wp_item->proc_set . source . vpid = sender->vpid; >> - wp_item->proc_set . source . epoch = sender->epoch; >> + ORTE_EPOCH_SET(wp_item->proc_set . source . >> epoch,sender->epoch); >> >> opal_list_append(&work_pool_waiting, &(wp_item->super)); >> } >> >> Modified: trunk/orte/mca/grpcomm/base/grpcomm_base_coll.c >> ============================================================================== >> --- trunk/orte/mca/grpcomm/base/grpcomm_base_coll.c (original) >> +++ trunk/orte/mca/grpcomm/base/grpcomm_base_coll.c 2011-08-26 18:16:14 >> EDT (Fri, 26 Aug 2011) >> @@ -168,8 +168,7 @@ >> if (vpids[0] == ORTE_PROC_MY_NAME->vpid) { >> /* I send first */ >> peer.vpid = vpids[1]; >> - peer.epoch = ORTE_EPOCH_INVALID; >> - peer.epoch = orte_ess.proc_get_epoch(&peer); >> + ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); >> >> /* setup a temp buffer so I can inform the other side as to the >> * number of entries in my buffer >> @@ -226,8 +225,7 @@ >> opal_dss.pack(&buf, &num_entries, 1, OPAL_INT32); >> opal_dss.copy_payload(&buf, sendbuf); >> peer.vpid = vpids[0]; >> - peer.epoch = ORTE_EPOCH_INVALID; >> - peer.epoch = orte_ess.proc_get_epoch(&peer); >> + ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); >> >> OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, >> "%s grpcomm:coll:two-proc sending to %s", >> @@ -320,8 +318,7 @@ >> /* first send my current contents */ >> nv = (rank - distance + np) 
% np; >> peer.vpid = vpids[nv]; >> - peer.epoch = ORTE_EPOCH_INVALID; >> - peer.epoch = orte_ess.proc_get_epoch(&peer); >> + ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); >> >> OBJ_CONSTRUCT(&buf, opal_buffer_t); >> opal_dss.pack(&buf, &total_entries, 1, OPAL_INT32); >> @@ -340,8 +337,7 @@ >> num_recvd = 0; >> nv = (rank + distance) % np; >> peer.vpid = vpids[nv]; >> - peer.epoch = ORTE_EPOCH_INVALID; >> - peer.epoch = orte_ess.proc_get_epoch(&peer); >> + ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); >> >> OBJ_CONSTRUCT(&bucket, opal_buffer_t); >> if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(&peer, >> @@ -439,8 +435,7 @@ >> /* first send my current contents */ >> nv = rank ^ distance; >> peer.vpid = vpids[nv]; >> - peer.epoch = ORTE_EPOCH_INVALID; >> - peer.epoch = orte_ess.proc_get_epoch(&peer); >> + ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); >> >> OBJ_CONSTRUCT(&buf, opal_buffer_t); >> opal_dss.pack(&buf, &total_entries, 1, OPAL_INT32); >> @@ -646,8 +641,7 @@ >> proc.jobid = jobid; >> proc.vpid = 0; >> while (proc.vpid < jobdat->num_procs && 0 < >> opal_list_get_size(&daemon_tree)) { >> - proc.epoch = ORTE_EPOCH_INVALID; >> - proc.epoch = orte_ess.proc_get_epoch(&proc); >> + ORTE_EPOCH_SET(proc.epoch,orte_ess.proc_get_epoch(&proc)); >> >> /* get the daemon that hosts this proc */ >> daemonvpid = orte_ess.proc_get_daemon(&proc); >> @@ -713,8 +707,7 @@ >> /* send it */ >> my_parent.jobid = ORTE_PROC_MY_NAME->jobid; >> my_parent.vpid = orte_routed.get_routing_tree(NULL); >> - my_parent.epoch = ORTE_EPOCH_INVALID; >> - my_parent.epoch = orte_ess.proc_get_epoch(&my_parent); >> + ORTE_EPOCH_SET(my_parent.epoch,orte_ess.proc_get_epoch(&my_parent)); >> >> OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, >> "%s grpcomm:base:daemon_coll: daemon collective >> not the HNP - sending to parent %s", >> >> Modified: trunk/orte/mca/grpcomm/hier/grpcomm_hier_module.c >> 
============================================================================== >> --- trunk/orte/mca/grpcomm/hier/grpcomm_hier_module.c (original) >> +++ trunk/orte/mca/grpcomm/hier/grpcomm_hier_module.c 2011-08-26 18:16:14 >> EDT (Fri, 26 Aug 2011) >> @@ -95,7 +95,7 @@ >> >> my_local_rank_zero_proc.jobid = ORTE_PROC_MY_NAME->jobid; >> my_local_rank_zero_proc.vpid = ORTE_VPID_INVALID; >> - my_local_rank_zero_proc.epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(my_local_rank_zero_proc.epoch,ORTE_EPOCH_MIN); >> >> if (ORTE_SUCCESS != (rc = orte_grpcomm_base_modex_init())) { >> ORTE_ERROR_LOG(rc); >> @@ -270,7 +270,7 @@ >> proc.jobid = ORTE_PROC_MY_NAME->jobid; >> for (v=0; v < orte_process_info.num_procs; v++) { >> proc.vpid = v; >> - proc.epoch = orte_util_lookup_epoch(&proc); >> + ORTE_EPOCH_SET(proc.epoch,orte_util_lookup_epoch(&proc)); >> >> /* is this proc local_rank=0 on its node? */ >> if (0 == my_local_rank && 0 == orte_ess.get_local_rank(&proc)) { >> @@ -285,7 +285,7 @@ >> nm = OBJ_NEW(orte_namelist_t); >> nm->name.jobid = proc.jobid; >> nm->name.vpid = proc.vpid; >> - nm->name.epoch = proc.epoch; >> + ORTE_EPOCH_SET(nm->name.epoch,proc.epoch); >> >> opal_list_append(&my_local_peers, &nm->item); >> /* if I am not local_rank=0, is this one? 
*/ >> @@ -293,7 +293,7 @@ >> 0 == orte_ess.get_local_rank(&proc)) { >> my_local_rank_zero_proc.jobid = proc.jobid; >> my_local_rank_zero_proc.vpid = proc.vpid; >> - my_local_rank_zero_proc.epoch = proc.epoch; >> + ORTE_EPOCH_SET(my_local_rank_zero_proc.epoch,proc.epoch); >> } >> } >> >> >> Modified: trunk/orte/mca/iof/base/base.h >> ============================================================================== >> --- trunk/orte/mca/iof/base/base.h (original) >> +++ trunk/orte/mca/iof/base/base.h 2011-08-26 18:16:14 EDT (Fri, 26 Aug >> 2011) >> @@ -135,7 +135,7 @@ >> ep = OBJ_NEW(orte_iof_sink_t); \ >> ep->name.jobid = (nm)->jobid; \ >> ep->name.vpid = (nm)->vpid; \ >> - ep->name.epoch = (nm)->epoch; \ >> + ORTE_EPOCH_SET(ep->name.epoch,(nm)->epoch); \ >> ep->tag = (tg); \ >> if (0 <= (fid)) { \ >> ep->wev->fd = (fid); \ >> @@ -169,7 +169,7 @@ >> rev = OBJ_NEW(orte_iof_read_event_t); \ >> rev->name.jobid = (nm)->jobid; \ >> rev->name.vpid = (nm)->vpid; \ >> - rev->name.epoch = (nm)->epoch; \ >> + ORTE_EPOCH_SET(rev->name.epoch,(nm)->epoch); \ >> rev->tag = (tg); \ >> rev->fd = (fid); \ >> *(rv) = rev; \ >> @@ -194,7 +194,7 @@ >> ep = OBJ_NEW(orte_iof_sink_t); \ >> ep->name.jobid = (nm)->jobid; \ >> ep->name.vpid = (nm)->vpid; \ >> - ep->name.epoch = (nm)->epoch; \ >> + ORTE_EPOCH_SET(ep->name.epoch,(nm)->epoch); \ >> ep->tag = (tg); \ >> if (0 <= (fid)) { \ >> ep->wev->fd = (fid); \ >> @@ -215,7 +215,7 @@ >> rev = OBJ_NEW(orte_iof_read_event_t); \ >> rev->name.jobid = (nm)->jobid; \ >> rev->name.vpid = (nm)->vpid; \ >> - rev->name.epoch= (nm)->epoch; \ >> + ORTE_EPOCH_SET(rev->name.epoch,(nm)->epoch); \ >> rev->tag = (tg); \ >> *(rv) = rev; \ >> opal_event_set(opal_event_base, \ >> >> Modified: trunk/orte/mca/iof/base/iof_base_open.c >> ============================================================================== >> --- trunk/orte/mca/iof/base/iof_base_open.c (original) >> +++ trunk/orte/mca/iof/base/iof_base_open.c 2011-08-26 18:16:14 EDT (Fri, >> 26 Aug 2011) >> 
@@ -91,7 +91,7 @@ >> { >> ptr->daemon.jobid = ORTE_JOBID_INVALID; >> ptr->daemon.vpid = ORTE_VPID_INVALID; >> - ptr->daemon.epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(ptr->daemon.epoch,ORTE_EPOCH_MIN); >> ptr->wev = OBJ_NEW(orte_iof_write_event_t); >> } >> static void orte_iof_base_sink_destruct(orte_iof_sink_t* ptr) >> >> Modified: trunk/orte/mca/iof/hnp/iof_hnp.c >> ============================================================================== >> --- trunk/orte/mca/iof/hnp/iof_hnp.c (original) >> +++ trunk/orte/mca/iof/hnp/iof_hnp.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug >> 2011) >> @@ -186,7 +186,7 @@ >> proct = OBJ_NEW(orte_iof_proc_t); >> proct->name.jobid = dst_name->jobid; >> proct->name.vpid = dst_name->vpid; >> - proct->name.epoch = dst_name->epoch; >> + ORTE_EPOCH_SET(proct->name.epoch,dst_name->epoch); >> opal_list_append(&mca_iof_hnp_component.procs, &proct->super); >> /* see if we are to output to a file */ >> if (NULL != orte_output_filename) { >> @@ -281,8 +281,7 @@ >> &mca_iof_hnp_component.sinks); >> sink->daemon.jobid = ORTE_PROC_MY_NAME->jobid; >> sink->daemon.vpid = proc->node->daemon->name.vpid; >> - sink->daemon.epoch = ORTE_EPOCH_INVALID; >> - sink->daemon.epoch = orte_ess.proc_get_epoch(&sink->daemon); >> + >> ORTE_EPOCH_SET(sink->daemon.epoch,orte_ess.proc_get_epoch(&sink->daemon)); >> } >> } >> >> @@ -389,7 +388,7 @@ >> &mca_iof_hnp_component.sinks); >> sink->daemon.jobid = ORTE_PROC_MY_NAME->jobid; >> sink->daemon.vpid = ORTE_PROC_MY_NAME->vpid; >> - sink->daemon.epoch = ORTE_PROC_MY_NAME->epoch; >> + ORTE_EPOCH_SET(sink->daemon.epoch,ORTE_PROC_MY_NAME->epoch); >> >> return ORTE_SUCCESS; >> } >> >> Modified: trunk/orte/mca/iof/hnp/iof_hnp_receive.c >> ============================================================================== >> --- trunk/orte/mca/iof/hnp/iof_hnp_receive.c (original) >> +++ trunk/orte/mca/iof/hnp/iof_hnp_receive.c 2011-08-26 18:16:14 EDT (Fri, >> 26 Aug 2011) >> @@ -109,21 +109,21 @@ >> NULL, 
&mca_iof_hnp_component.sinks); >> sink->daemon.jobid = mev->sender.jobid; >> sink->daemon.vpid = mev->sender.vpid; >> - sink->daemon.epoch = mev->sender.epoch; >> + ORTE_EPOCH_SET(sink->daemon.epoch,mev->sender.epoch); >> } >> if (ORTE_IOF_STDERR & stream) { >> ORTE_IOF_SINK_DEFINE(&sink, &origin, -1, ORTE_IOF_STDERR, >> NULL, &mca_iof_hnp_component.sinks); >> sink->daemon.jobid = mev->sender.jobid; >> sink->daemon.vpid = mev->sender.vpid; >> - sink->daemon.epoch = mev->sender.epoch; >> + ORTE_EPOCH_SET(sink->daemon.epoch,mev->sender.epoch); >> } >> if (ORTE_IOF_STDDIAG & stream) { >> ORTE_IOF_SINK_DEFINE(&sink, &origin, -1, ORTE_IOF_STDDIAG, >> NULL, &mca_iof_hnp_component.sinks); >> sink->daemon.jobid = mev->sender.jobid; >> sink->daemon.vpid = mev->sender.vpid; >> - sink->daemon.epoch = mev->sender.epoch; >> + ORTE_EPOCH_SET(sink->daemon.epoch,mev->sender.epoch); >> } >> goto CLEAN_RETURN; >> } >> >> Modified: trunk/orte/mca/iof/orted/iof_orted.c >> ============================================================================== >> --- trunk/orte/mca/iof/orted/iof_orted.c (original) >> +++ trunk/orte/mca/iof/orted/iof_orted.c 2011-08-26 18:16:14 EDT (Fri, >> 26 Aug 2011) >> @@ -163,7 +163,7 @@ >> proct = OBJ_NEW(orte_iof_proc_t); >> proct->name.jobid = dst_name->jobid; >> proct->name.vpid = dst_name->vpid; >> - proct->name.epoch = dst_name->epoch; >> + ORTE_EPOCH_SET(proct->name.epoch,dst_name->epoch); >> opal_list_append(&mca_iof_orted_component.procs, &proct->super); >> /* see if we are to output to a file */ >> if (NULL != orte_output_filename) { >> >> Modified: trunk/orte/mca/odls/base/odls_base_default_fns.c >> ============================================================================== >> --- trunk/orte/mca/odls/base/odls_base_default_fns.c (original) >> +++ trunk/orte/mca/odls/base/odls_base_default_fns.c 2011-08-26 18:16:14 >> EDT (Fri, 26 Aug 2011) >> @@ -734,8 +734,7 @@ >> proc.jobid = jobdat->jobid; >> for (j=0; j < jobdat->num_procs; j++) { >> 
proc.vpid = j; >> - proc.epoch = ORTE_EPOCH_INVALID; >> - proc.epoch = orte_ess.proc_get_epoch(&proc); >> + ORTE_EPOCH_SET(proc.epoch,orte_ess.proc_get_epoch(&proc)); >> /* get the vpid of the daemon that is to host this proc */ >> if (ORTE_VPID_INVALID == (host_daemon = >> orte_ess.proc_get_daemon(&proc))) { >> ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); >> @@ -1044,6 +1043,7 @@ >> free(param); >> free(value); >> >> +#if ORTE_ENABLE_EPOCH >> /* setup the epoch */ >> if (ORTE_SUCCESS != (rc = orte_util_convert_epoch_to_string(&value, >> child->name->epoch))) { >> ORTE_ERROR_LOG(rc); >> @@ -1057,6 +1057,7 @@ >> opal_setenv(param, value, true, env); >> free(param); >> free(value); >> +#endif >> >> /* setup the vpid */ >> if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&value, >> child->name->vpid))) { >> @@ -2721,7 +2722,7 @@ >> OBJ_CONSTRUCT(&proctmp, orte_proc_t); >> proctmp . name . jobid = ORTE_JOBID_WILDCARD; >> proctmp . name . vpid = ORTE_VPID_WILDCARD; >> - proctmp . name . epoch = ORTE_EPOCH_WILDCARD; >> + ORTE_EPOCH_SET(proctmp . name . 
epoch,ORTE_EPOCH_WILDCARD); >> opal_pointer_array_add(&procarray, &proctmp); >> procptr = &procarray; >> do_cleanup = true; >> >> Modified: trunk/orte/mca/odls/base/odls_base_open.c >> ============================================================================== >> --- trunk/orte/mca/odls/base/odls_base_open.c (original) >> +++ trunk/orte/mca/odls/base/odls_base_open.c 2011-08-26 18:16:14 EDT (Fri, >> 26 Aug 2011) >> @@ -187,7 +187,7 @@ >> if (-1 == rank) { >> /* wildcard */ >> nm->name.vpid = ORTE_VPID_WILDCARD; >> - nm->name.epoch = ORTE_EPOCH_WILDCARD; >> + ORTE_EPOCH_SET(nm->name.epoch,ORTE_EPOCH_WILDCARD); >> } else if (rank < 0) { >> /* error out on bozo case */ >> orte_show_help("help-odls-base.txt", >> @@ -200,8 +200,7 @@ >> * will be in the job - we'll check later >> */ >> nm->name.vpid = rank; >> - nm->name.epoch = ORTE_EPOCH_INVALID; >> - nm->name.epoch = orte_ess.proc_get_epoch(&nm->name); >> + >> ORTE_EPOCH_SET(nm->name.epoch,orte_ess.proc_get_epoch(&nm->name)); >> } >> opal_list_append(&orte_odls_globals.xterm_ranks, &nm->item); >> } >> >> Modified: trunk/orte/mca/odls/base/odls_base_state.c >> ============================================================================== >> --- trunk/orte/mca/odls/base/odls_base_state.c (original) >> +++ trunk/orte/mca/odls/base/odls_base_state.c 2011-08-26 18:16:14 >> EDT (Fri, 26 Aug 2011) >> @@ -77,17 +77,17 @@ >> /* if I am the HNP, then use me as the source */ >> p_set->source.jobid = ORTE_PROC_MY_NAME->jobid; >> p_set->source.vpid = ORTE_PROC_MY_NAME->vpid; >> - p_set->source.epoch = ORTE_PROC_MY_NAME->epoch; >> + ORTE_EPOCH_SET(p_set->source.epoch,ORTE_PROC_MY_NAME->epoch); >> } >> else { >> /* otherwise, set the HNP as the source */ >> p_set->source.jobid = ORTE_PROC_MY_HNP->jobid; >> p_set->source.vpid = ORTE_PROC_MY_HNP->vpid; >> - p_set->source.epoch = ORTE_PROC_MY_HNP->epoch; >> + ORTE_EPOCH_SET(p_set->source.epoch,ORTE_PROC_MY_HNP->epoch); >> } >> p_set->sink.jobid = ORTE_PROC_MY_NAME->jobid; >> 
p_set->sink.vpid = ORTE_PROC_MY_NAME->vpid; >> - p_set->sink.epoch = ORTE_PROC_MY_NAME->epoch; >> + ORTE_EPOCH_SET(p_set->sink.epoch,ORTE_PROC_MY_NAME->epoch); >> >> opal_list_append(&(filem_request->process_sets), &(p_set->super) ); >> >> >> Modified: trunk/orte/mca/oob/tcp/oob_tcp_msg.c >> ============================================================================== >> --- trunk/orte/mca/oob/tcp/oob_tcp_msg.c (original) >> +++ trunk/orte/mca/oob/tcp/oob_tcp_msg.c 2011-08-26 18:16:14 EDT (Fri, >> 26 Aug 2011) >> @@ -137,6 +137,7 @@ >> bool mca_oob_tcp_msg_send_handler(mca_oob_tcp_msg_t* msg, struct >> mca_oob_tcp_peer_t * peer) >> { >> int rc; >> + >> while(1) { >> rc = writev(peer->peer_sd, msg->msg_rwptr, msg->msg_rwnum); >> if(rc < 0) { >> @@ -338,6 +339,7 @@ >> orte_process_name_t src = msg->msg_hdr.msg_src; >> >> OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock); >> + >> if (orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &peer->peer_name, >> &src) != OPAL_EQUAL) { >> opal_hash_table_remove_value_uint64(&mca_oob_tcp_component.tcp_peers, >> >> orte_util_hash_name(&peer->peer_name)); >> >> Modified: trunk/orte/mca/oob/tcp/oob_tcp_peer.c >> ============================================================================== >> --- trunk/orte/mca/oob/tcp/oob_tcp_peer.c (original) >> +++ trunk/orte/mca/oob/tcp/oob_tcp_peer.c 2011-08-26 18:16:14 EDT (Fri, >> 26 Aug 2011) >> @@ -903,6 +903,11 @@ >> static void mca_oob_tcp_peer_recv_handler(int sd, short flags, void* user) >> { >> mca_oob_tcp_peer_t* peer = (mca_oob_tcp_peer_t *)user; >> + >> + if (orte_abnormal_term_ordered) { >> + return; >> + } >> + >> OPAL_THREAD_LOCK(&peer->peer_lock); >> switch(peer->peer_state) { >> case MCA_OOB_TCP_CONNECT_ACK: >> >> Modified: trunk/orte/mca/plm/base/plm_base_jobid.c >> ============================================================================== >> --- trunk/orte/mca/plm/base/plm_base_jobid.c (original) >> +++ trunk/orte/mca/plm/base/plm_base_jobid.c 2011-08-26 18:16:14 EDT (Fri, 
>> 26 Aug 2011) >> @@ -62,12 +62,12 @@ >> /* set the name */ >> ORTE_PROC_MY_NAME->jobid = 0xffff0000 & ((uint32_t)jobfam << 16); >> ORTE_PROC_MY_NAME->vpid = 0; >> - ORTE_PROC_MY_NAME->epoch= ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN); >> >> /* copy it to the HNP field */ >> ORTE_PROC_MY_HNP->jobid = ORTE_PROC_MY_NAME->jobid; >> ORTE_PROC_MY_HNP->vpid = ORTE_PROC_MY_NAME->vpid; >> - ORTE_PROC_MY_HNP->epoch = ORTE_PROC_MY_NAME->epoch; >> + ORTE_EPOCH_SET(ORTE_PROC_MY_HNP->epoch,ORTE_PROC_MY_NAME->epoch); >> >> /* done */ >> return ORTE_SUCCESS; >> >> Modified: trunk/orte/mca/plm/base/plm_base_launch_support.c >> ============================================================================== >> --- trunk/orte/mca/plm/base/plm_base_launch_support.c (original) >> +++ trunk/orte/mca/plm/base/plm_base_launch_support.c 2011-08-26 18:16:14 >> EDT (Fri, 26 Aug 2011) >> @@ -377,8 +377,7 @@ >> /* push stdin - the IOF will know what to do with the specified target */ >> name.jobid = job; >> name.vpid = jdata->stdin_target; >> - name.epoch = ORTE_EPOCH_INVALID; >> - name.epoch = orte_ess.proc_get_epoch(&name); >> + ORTE_EPOCH_SET(name.epoch,orte_ess.proc_get_epoch(&name)); >> >> if (ORTE_SUCCESS != (rc = orte_iof.push(&name, ORTE_IOF_STDIN, 0))) { >> ORTE_ERROR_LOG(rc); >> >> Modified: trunk/orte/mca/plm/base/plm_base_orted_cmds.c >> ============================================================================== >> --- trunk/orte/mca/plm/base/plm_base_orted_cmds.c (original) >> +++ trunk/orte/mca/plm/base/plm_base_orted_cmds.c 2011-08-26 18:16:14 >> EDT (Fri, 26 Aug 2011) >> @@ -163,8 +163,7 @@ >> continue; >> } >> peer.vpid = v; >> - peer.epoch = ORTE_EPOCH_INVALID; >> - peer.epoch = orte_ess.proc_get_epoch(&peer); >> + ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); >> >> /* don't worry about errors on the send here - just >> * issue it and keep going >> @@ -242,7 +241,7 @@ >> OBJ_CONSTRUCT(&proc, orte_proc_t); >> proc . name . 
jobid = jobid; >> proc . name . vpid = ORTE_VPID_WILDCARD; >> - proc . name . epoch = ORTE_EPOCH_WILDCARD; >> + ORTE_EPOCH_SET(proc . name . epoch,ORTE_EPOCH_WILDCARD); >> opal_pointer_array_add(&procs, &proc); >> if (ORTE_SUCCESS != (rc = orte_plm_base_orted_kill_local_procs(&procs))) { >> ORTE_ERROR_LOG(rc); >> @@ -340,8 +339,7 @@ >> continue; >> } >> peer.vpid = v; >> - peer.epoch = ORTE_EPOCH_INVALID; >> - peer.epoch = orte_ess.proc_get_epoch(&peer); >> + ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); >> /* check to see if this daemon is known to be "dead" */ >> if (proc->state > ORTE_PROC_STATE_UNTERMINATED) { >> /* don't try to send this */ >> >> Modified: trunk/orte/mca/plm/base/plm_base_receive.c >> ============================================================================== >> --- trunk/orte/mca/plm/base/plm_base_receive.c (original) >> +++ trunk/orte/mca/plm/base/plm_base_receive.c 2011-08-26 18:16:14 >> EDT (Fri, 26 Aug 2011) >> @@ -146,7 +146,9 @@ >> orte_job_t *jdata, *parent; >> opal_buffer_t answer; >> orte_vpid_t vpid; >> +#if ORTE_ENABLE_EPOCH >> orte_epoch_t epoch; >> +#endif >> orte_proc_t *proc; >> orte_proc_state_t state; >> orte_exit_code_t exit_code; >> @@ -394,8 +396,7 @@ >> break; >> } >> name.vpid = vpid; >> - name.epoch = ORTE_EPOCH_INVALID; >> - name.epoch = orte_ess.proc_get_epoch(&name); >> + >> ORTE_EPOCH_SET(name.epoch,orte_ess.proc_get_epoch(&name)); >> >> /* unpack the pid */ >> count = 1; >> @@ -488,9 +489,11 @@ >> } >> name.vpid = vpid; >> >> +#if ORTE_ENABLE_EPOCH >> count=1; >> opal_dss.unpack(msgpkt->buffer, &epoch, &count, ORTE_EPOCH); >> name.epoch = epoch; >> +#endif >> >> OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, >> "%s plm:base:receive Described rank %s", >> >> Modified: trunk/orte/mca/plm/base/plm_base_rsh_support.c >> ============================================================================== >> --- trunk/orte/mca/plm/base/plm_base_rsh_support.c (original) >> +++ 
trunk/orte/mca/plm/base/plm_base_rsh_support.c 2011-08-26 18:16:14 >> EDT (Fri, 26 Aug 2011) >> @@ -1527,7 +1527,9 @@ >> { >> char *param, *path, *tmp, *cmd, *basename, *dest_dir; >> int i; >> +#if ORTE_ENABLE_EPOCH >> orte_epoch_t epoch; >> +#endif >> orte_process_name_t proc; >> >> /* if a prefix is set, pass it to the bootproxy in a special way */ >> @@ -1638,6 +1640,7 @@ >> opal_setenv("OMPI_COMM_WORLD_RANK", cmd, true, argv); >> free(cmd); >> >> +#if ORTE_ENABLE_EPOCH >> /* set the epoch */ >> proc.jobid = jobid; >> proc.vpid = vpid; >> @@ -1648,6 +1651,7 @@ >> opal_setenv(param, cmd, true, argv); >> free(param); >> free(cmd); >> +#endif >> >> /* set the number of procs */ >> asprintf(&cmd, "%d", (int)num_procs); >> >> Modified: trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c >> ============================================================================== >> --- trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c (original) >> +++ trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c 2011-08-26 >> 18:16:14 EDT (Fri, 26 Aug 2011) >> @@ -33,12 +33,14 @@ >> #include "orte/mca/ess/ess.h" >> #include "opal/mca/sysinfo/sysinfo_types.h" >> >> +#include "orte/types.h" >> #include "orte/util/show_help.h" >> #include "orte/util/name_fns.h" >> #include "orte/runtime/orte_globals.h" >> #include "orte/util/hostfile/hostfile.h" >> #include "orte/util/dash_host/dash_host.h" >> #include "orte/mca/errmgr/errmgr.h" >> +#include "orte/runtime/data_type_support/orte_dt_support.h" >> >> #include "orte/mca/rmaps/base/rmaps_private.h" >> #include "orte/mca/rmaps/base/base.h" >> @@ -454,7 +456,7 @@ >> */ >> >> /* We do set the epoch here since they all start with the same value. 
>> */ >> - proc->name.epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN); >> >> proc->app_idx = app_idx; >> OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output, >> @@ -559,11 +561,12 @@ >> } >> } >> proc->name.vpid = vpid; >> - proc->name.epoch = ORTE_EPOCH_INVALID; >> - proc->name.epoch = orte_ess.proc_get_epoch(&proc->name); >> + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID); >> + >> ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name)); >> + >> /* If there is an invalid epoch here, it's because it >> doesn't exist yet. */ >> - if (ORTE_NODE_RANK_INVALID == proc->name.epoch) { >> - proc->name.epoch = ORTE_EPOCH_MIN; >> + if (0 == >> ORTE_EPOCH_CMP(ORTE_EPOCH_INVALID,proc->name.epoch)) { >> + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN); >> } >> } >> if (NULL == opal_pointer_array_get_item(jdata->procs, >> proc->name.vpid)) { >> @@ -601,8 +604,8 @@ >> } >> } >> proc->name.vpid = vpid; >> - proc->name.epoch = ORTE_EPOCH_INVALID; >> - proc->name.epoch = orte_ess.proc_get_epoch(&proc->name); >> + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID); >> + >> ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name)); >> } >> if (NULL == opal_pointer_array_get_item(jdata->procs, >> proc->name.vpid)) { >> if (ORTE_SUCCESS != (rc = >> opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { >> @@ -835,7 +838,7 @@ >> return ORTE_ERR_OUT_OF_RESOURCE; >> } >> proc->name.vpid = daemons->num_procs; /* take the next available >> vpid */ >> - proc->name.epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN); >> proc->node = node; >> proc->nodename = node->name; >> OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output, >> @@ -1014,8 +1017,8 @@ >> return ORTE_ERR_OUT_OF_RESOURCE; >> } >> proc->name.vpid = jdata->num_procs; /* take the next available vpid >> */ >> - proc->name.epoch = ORTE_EPOCH_INVALID; >> - proc->name.epoch = orte_ess.proc_get_epoch(&proc->name); >> + 
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID); >> + >> ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name)); >> proc->node = node; >> proc->nodename = node->name; >> OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output, >> >> Modified: trunk/orte/mca/rmaps/rank_file/rmaps_rank_file.c >> ============================================================================== >> --- trunk/orte/mca/rmaps/rank_file/rmaps_rank_file.c (original) >> +++ trunk/orte/mca/rmaps/rank_file/rmaps_rank_file.c 2011-08-26 18:16:14 >> EDT (Fri, 26 Aug 2011) >> @@ -502,8 +502,7 @@ >> } >> proc->name.vpid = rank; >> /* Either init or update the epoch. */ >> - proc->name.epoch = ORTE_EPOCH_INVALID; >> - proc->name.epoch = orte_ess.proc_get_epoch(&proc->name); >> + >> ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name)); >> >> proc->slot_list = strdup(rfmap->slot_list); >> /* insert the proc into the proper place */ >> >> Modified: trunk/orte/mca/rmaps/seq/rmaps_seq.c >> ============================================================================== >> --- trunk/orte/mca/rmaps/seq/rmaps_seq.c (original) >> +++ trunk/orte/mca/rmaps/seq/rmaps_seq.c 2011-08-26 18:16:14 EDT (Fri, >> 26 Aug 2011) >> @@ -235,8 +235,7 @@ >> } >> /* assign the vpid */ >> proc->name.vpid = vpid++; >> - proc->name.epoch = ORTE_EPOCH_INVALID; >> - proc->name.epoch = orte_ess.proc_get_epoch(&proc->name); >> + >> ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name)); >> >> /* add to the jdata proc array */ >> if (ORTE_SUCCESS != (rc = >> opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { >> >> Modified: trunk/orte/mca/rmcast/base/rmcast_base_open.c >> ============================================================================== >> --- trunk/orte/mca/rmcast/base/rmcast_base_open.c (original) >> +++ trunk/orte/mca/rmcast/base/rmcast_base_open.c 2011-08-26 18:16:14 >> EDT (Fri, 26 Aug 2011) >> @@ -341,7 +341,7 @@ >> { >> ptr->name.jobid = 
ORTE_JOBID_INVALID; >> ptr->name.vpid = ORTE_VPID_INVALID; >> - ptr->name.epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(ptr->name.epoch,ORTE_EPOCH_MIN); >> ptr->channel = ORTE_RMCAST_INVALID_CHANNEL; >> OBJ_CONSTRUCT(&ptr->ctl, orte_thread_ctl_t); >> ptr->seq_num = ORTE_RMCAST_SEQ_INVALID; >> @@ -430,7 +430,7 @@ >> { >> ptr->name.jobid = ORTE_JOBID_INVALID; >> ptr->name.vpid = ORTE_VPID_INVALID; >> - ptr->name.epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(ptr->name.epoch,ORTE_EPOCH_MIN); >> OBJ_CONSTRUCT(&ptr->last_msg, opal_list_t); >> } >> static void recvlog_destruct(rmcast_recv_log_t *ptr) >> @@ -439,7 +439,7 @@ >> >> ptr->name.jobid = ORTE_JOBID_INVALID; >> ptr->name.vpid = ORTE_VPID_INVALID; >> - ptr->name.epoch = ORTE_EPOCH_INVALID; >> + ORTE_EPOCH_SET(ptr->name.epoch,ORTE_EPOCH_INVALID); >> while (NULL != (item = opal_list_remove_first(&ptr->last_msg))) { >> OBJ_RELEASE(item); >> } >> >> Modified: trunk/orte/mca/rmcast/tcp/rmcast_tcp.c >> ============================================================================== >> --- trunk/orte/mca/rmcast/tcp/rmcast_tcp.c (original) >> +++ trunk/orte/mca/rmcast/tcp/rmcast_tcp.c 2011-08-26 18:16:14 EDT (Fri, >> 26 Aug 2011) >> @@ -681,7 +681,7 @@ >> /* caller requested id of sender */ >> name->jobid = recvptr->name.jobid; >> name->vpid = recvptr->name.vpid; >> - name->epoch= recvptr->name.epoch; >> + ORTE_EPOCH_SET(name->epoch,recvptr->name.epoch); >> } >> *seq_num = recvptr->seq_num; >> *msg = recvptr->iovec_array; >> @@ -776,7 +776,7 @@ >> /* caller requested id of sender */ >> name->jobid = recvptr->name.jobid; >> name->vpid = recvptr->name.vpid; >> - name->epoch= recvptr->name.epoch; >> + ORTE_EPOCH_SET(name->epoch,recvptr->name.epoch); >> } >> *seq_num = recvptr->seq_num; >> if (ORTE_SUCCESS != (ret = opal_dss.copy_payload(buf, recvptr->buf))) { >> >> Modified: trunk/orte/mca/rmcast/udp/rmcast_udp.c >> ============================================================================== >> --- 
trunk/orte/mca/rmcast/udp/rmcast_udp.c (original) >> +++ trunk/orte/mca/rmcast/udp/rmcast_udp.c 2011-08-26 18:16:14 EDT (Fri, >> 26 Aug 2011) >> @@ -460,7 +460,7 @@ >> /* caller requested id of sender */ >> name->jobid = recvptr->name.jobid; >> name->vpid = recvptr->name.vpid; >> - name->epoch= recvptr->name.epoch; >> + ORTE_EPOCH_SET(name->epoch,recvptr->name.epoch); >> } >> *seq_num = recvptr->seq_num; >> *msg = recvptr->iovec_array; >> @@ -553,7 +553,7 @@ >> /* caller requested id of sender */ >> name->jobid = recvptr->name.jobid; >> name->vpid = recvptr->name.vpid; >> - name->epoch= recvptr->name.epoch; >> + ORTE_EPOCH_SET(name->epoch,recvptr->name.epoch); >> } >> *seq_num = recvptr->seq_num; >> if (ORTE_SUCCESS != (ret = opal_dss.copy_payload(buf, recvptr->buf))) { >> >> Modified: trunk/orte/mca/rml/base/rml_base_components.c >> ============================================================================== >> --- trunk/orte/mca/rml/base/rml_base_components.c (original) >> +++ trunk/orte/mca/rml/base/rml_base_components.c 2011-08-26 18:16:14 >> EDT (Fri, 26 Aug 2011) >> @@ -20,6 +20,7 @@ >> #include "opal/util/output.h" >> >> #include "orte/mca/rml/rml.h" >> +#include "orte/util/name_fns.h" >> >> #if !ORTE_DISABLE_FULL_SUPPORT >> >> @@ -67,14 +68,14 @@ >> { >> pkt->sender.jobid = ORTE_JOBID_INVALID; >> pkt->sender.vpid = ORTE_VPID_INVALID; >> - pkt->sender.epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(pkt->sender.epoch,ORTE_EPOCH_MIN); >> pkt->buffer = NULL; >> } >> static void msg_pkt_destructor(orte_msg_packet_t *pkt) >> { >> pkt->sender.jobid = ORTE_JOBID_INVALID; >> pkt->sender.vpid = ORTE_VPID_INVALID; >> - pkt->sender.epoch = ORTE_EPOCH_INVALID; >> + ORTE_EPOCH_SET(pkt->sender.epoch,ORTE_EPOCH_INVALID); >> if (NULL != pkt->buffer) { >> OBJ_RELEASE(pkt->buffer); >> } >> >> Modified: trunk/orte/mca/rml/rml_types.h >> ============================================================================== >> --- trunk/orte/mca/rml/rml_types.h (original) >> +++ 
trunk/orte/mca/rml/rml_types.h 2011-08-26 18:16:14 EDT (Fri, 26 Aug >> 2011) >> @@ -62,7 +62,7 @@ >> pkt = OBJ_NEW(orte_msg_packet_t); \ >> pkt->sender.jobid = (sndr)->jobid; \ >> pkt->sender.vpid = (sndr)->vpid; \ >> - pkt->sender.epoch = (sndr)->epoch; \ >> + ORTE_EPOCH_SET(pkt->sender.epoch,(sndr)->epoch); \ >> if ((crt)) { \ >> pkt->buffer = OBJ_NEW(opal_buffer_t); \ >> opal_dss.copy_payload(pkt->buffer, *(buf)); \ >> @@ -85,7 +85,7 @@ >> pkt = OBJ_NEW(orte_msg_packet_t); \ >> pkt->sender.jobid = (sndr)->jobid; \ >> pkt->sender.vpid = (sndr)->vpid; \ >> - pkt->sender.epoch = (sndr)->epoch; \ >> + ORTE_EPOCH_SET(pkt->sender.epoch,(sndr)->epoch); \ >> if ((crt)) { \ >> pkt->buffer = OBJ_NEW(opal_buffer_t); \ >> opal_dss.copy_payload(pkt->buffer, *(buf)); \ >> @@ -191,8 +191,10 @@ >> >> #define ORTE_RML_TAG_SUBSCRIBE 46 >> >> +#if ORTE_ENABLE_EPOCH >> /* For Epoch Updates */ >> #define ORTE_RML_TAG_EPOCH_CHANGE 47 >> +#endif >> >> /* Notify of failed processes */ >> #define ORTE_RML_TAG_FAILURE_NOTICE 48 >> >> Modified: trunk/orte/mca/routed/base/routed_base_components.c >> ============================================================================== >> --- trunk/orte/mca/routed/base/routed_base_components.c (original) >> +++ trunk/orte/mca/routed/base/routed_base_components.c 2011-08-26 >> 18:16:14 EDT (Fri, 26 Aug 2011) >> @@ -65,7 +65,7 @@ >> { >> ptr->route.jobid = ORTE_JOBID_INVALID; >> ptr->route.vpid = ORTE_VPID_INVALID; >> - ptr->route.epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(ptr->route.epoch,ORTE_EPOCH_MIN); >> ptr->hnp_uri = NULL; >> } >> static void jfamdest(orte_routed_jobfam_t *ptr) >> @@ -117,7 +117,7 @@ >> jfam = OBJ_NEW(orte_routed_jobfam_t); >> jfam->route.jobid = ORTE_PROC_MY_HNP->jobid; >> jfam->route.vpid = ORTE_PROC_MY_HNP->vpid; >> - jfam->route.epoch = ORTE_PROC_MY_HNP->epoch; >> + ORTE_EPOCH_SET(jfam->route.epoch,ORTE_PROC_MY_HNP->epoch); >> jfam->job_family = ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid); >> if (NULL != 
orte_process_info.my_hnp_uri) { >> jfam->hnp_uri = strdup(orte_process_info.my_hnp_uri); >> @@ -252,7 +252,7 @@ >> jfam->job_family = jobfamily; >> jfam->route.jobid = name.jobid; >> jfam->route.vpid = name.vpid; >> - jfam->route.epoch = name.epoch; >> + ORTE_EPOCH_SET(jfam->route.epoch,name.epoch); >> jfam->hnp_uri = strdup(uri); >> done: >> free(uri); >> >> Modified: trunk/orte/mca/routed/base/routed_base_register_sync.c >> ============================================================================== >> --- trunk/orte/mca/routed/base/routed_base_register_sync.c (original) >> +++ trunk/orte/mca/routed/base/routed_base_register_sync.c 2011-08-26 >> 18:16:14 EDT (Fri, 26 Aug 2011) >> @@ -127,7 +127,9 @@ >> orte_std_cntr_t cnt; >> char *rml_uri; >> orte_vpid_t vpid; >> +#if ORTE_ENABLE_EPOCH >> orte_epoch_t epoch; >> +#endif >> int rc; >> >> if (ORTE_JOB_FAMILY(job) == ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) { >> @@ -146,11 +148,13 @@ >> cnt = 1; >> while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &vpid, &cnt, >> ORTE_VPID))) { >> >> +#if ORTE_ENABLE_EPOCH >> cnt = 1; >> if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &epoch, &cnt, >> ORTE_EPOCH))) { >> ORTE_ERROR_LOG(rc); >> continue; >> } >> +#endif >> >> if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &rml_uri, &cnt, >> OPAL_STRING))) { >> ORTE_ERROR_LOG(rc); >> >> Modified: trunk/orte/mca/routed/binomial/routed_binomial.c >> ============================================================================== >> --- trunk/orte/mca/routed/binomial/routed_binomial.c (original) >> +++ trunk/orte/mca/routed/binomial/routed_binomial.c 2011-08-26 18:16:14 >> EDT (Fri, 26 Aug 2011) >> @@ -33,6 +33,7 @@ >> #include "orte/runtime/orte_globals.h" >> #include "orte/runtime/orte_wait.h" >> #include "orte/runtime/runtime.h" >> +#include "orte/runtime/data_type_support/orte_dt_support.h" >> >> #include "orte/mca/rml/base/rml_contact.h" >> >> @@ -147,7 +148,7 @@ >> >> if (proc->jobid == ORTE_JOBID_INVALID || >> proc->vpid 
== ORTE_VPID_INVALID || >> - proc->epoch == ORTE_EPOCH_INVALID) { >> + 0 == ORTE_EPOCH_CMP(proc->epoch,ORTE_EPOCH_INVALID)) { >> return ORTE_ERR_BAD_PARAM; >> } >> >> @@ -216,7 +217,7 @@ >> >> if (target->jobid == ORTE_JOBID_INVALID || >> target->vpid == ORTE_VPID_INVALID || >> - target->epoch == ORTE_EPOCH_INVALID) { >> + 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { >> return ORTE_ERR_BAD_PARAM; >> } >> >> @@ -274,8 +275,7 @@ >> ORTE_NAME_PRINT(route))); >> jfam->route.jobid = route->jobid; >> jfam->route.vpid = route->vpid; >> - jfam->route.epoch = ORTE_EPOCH_INVALID; >> - jfam->route.epoch = orte_ess.proc_get_epoch(&jfam->route); >> + >> ORTE_EPOCH_SET(jfam->route.epoch,orte_ess.proc_get_epoch(&jfam->route)); >> >> return ORTE_SUCCESS; >> } >> @@ -290,8 +290,7 @@ >> jfam->job_family = jfamily; >> jfam->route.jobid = route->jobid; >> jfam->route.vpid = route->vpid; >> - jfam->route.epoch = ORTE_EPOCH_INVALID; >> - jfam->route.epoch = orte_ess.proc_get_epoch(&jfam->route); >> + >> ORTE_EPOCH_SET(jfam->route.epoch,orte_ess.proc_get_epoch(&jfam->route)); >> >> opal_pointer_array_add(&orte_routed_jobfams, jfam); >> return ORTE_SUCCESS; >> @@ -317,11 +316,21 @@ >> /* initialize */ >> daemon.jobid = ORTE_PROC_MY_DAEMON->jobid; >> daemon.vpid = ORTE_PROC_MY_DAEMON->vpid; >> - daemon.epoch = ORTE_PROC_MY_DAEMON->epoch; >> + ORTE_EPOCH_SET(daemon.epoch,ORTE_PROC_MY_DAEMON->epoch); >> >> +#if ORTE_ENABLE_EPOCH >> if (target->jobid == ORTE_JOBID_INVALID || >> target->vpid == ORTE_VPID_INVALID || >> target->epoch == ORTE_EPOCH_INVALID) { >> +#else >> + if (target->jobid == ORTE_JOBID_INVALID || >> + target->vpid == ORTE_VPID_INVALID) { >> +#endif >> + ret = ORTE_NAME_INVALID; >> + goto found; >> + } >> + >> + if (0 > ORTE_EPOCH_CMP(target->epoch, orte_ess.proc_get_epoch(target))) >> { >> ret = ORTE_NAME_INVALID; >> goto found; >> } >> @@ -443,7 +452,7 @@ >> >> /* If the daemon to which we should be routing is dead, then >> update >> * the routing tree and start 
over. */ >> - if (!orte_util_proc_is_running(&daemon)) { >> + if (!PROC_IS_RUNNING(&daemon)) { >> update_routing_tree(daemon.jobid); >> goto startover; >> } >> @@ -461,8 +470,7 @@ >> ret = &daemon; >> >> found: >> - daemon.epoch = ORTE_EPOCH_INVALID; >> - daemon.epoch = orte_ess.proc_get_epoch(&daemon); >> + ORTE_EPOCH_SET(daemon.epoch,orte_ess.proc_get_epoch(&daemon)); >> >> OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output, >> "%s routed_binomial_get(%s) --> %s", >> @@ -879,7 +887,7 @@ >> */ >> local_lifeline.jobid = proc->jobid; >> local_lifeline.vpid = proc->vpid; >> - local_lifeline.epoch = proc->epoch; >> + ORTE_EPOCH_SET(local_lifeline.epoch,proc->epoch); >> lifeline = &local_lifeline; >> >> return ORTE_SUCCESS; >> @@ -924,11 +932,11 @@ >> * that process so we can check it's state. >> */ >> proc_name.vpid = peer; >> - proc_name.epoch = orte_util_lookup_epoch(&proc_name); >> + >> ORTE_EPOCH_SET(proc_name.epoch,orte_util_lookup_epoch(&proc_name)); >> >> - if (!orte_util_proc_is_running(&proc_name) >> - && ORTE_EPOCH_MIN < proc_name.epoch >> - && ORTE_EPOCH_INVALID != proc_name.epoch) { >> + if (!PROC_IS_RUNNING(&proc_name) >> + && 0 < ORTE_EPOCH_CMP(ORTE_EPOCH_MIN,proc_name.epoch) >> + && 0 != >> ORTE_EPOCH_CMP(ORTE_EPOCH_INVALID,proc_name.epoch)) { >> OPAL_OUTPUT_VERBOSE((3, orte_routed_base_output, >> "%s routed:binomial child %s is >> dead", >> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), >> @@ -967,7 +975,7 @@ >> } >> >> /* find the children of this rank */ >> - OPAL_OUTPUT_VERBOSE((3, orte_routed_base_output, >> + OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, >> "%s routed:binomial find children of rank %d", >> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rank)); >> bitmap = opal_cube_dim(num_procs); >> @@ -977,24 +985,25 @@ >> >> for (i = hibit + 1, mask = 1 << i; i <= bitmap; ++i, mask <<= 1) { >> peer = rank | mask; >> - OPAL_OUTPUT_VERBOSE((3, orte_routed_base_output, >> + OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, >> "%s routed:binomial find children checking 
peer >> %d", >> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), peer)); >> if (peer < num_procs) { >> - OPAL_OUTPUT_VERBOSE((3, orte_routed_base_output, >> + OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, >> "%s routed:binomial find children computing >> tree", >> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); >> /* execute compute on this child */ >> if (0 <= (found = binomial_tree(peer, rank, me, num_procs, >> nchildren, childrn, relatives, mine, jobid))) { >> proc_name.vpid = found; >> >> - if (!orte_util_proc_is_running(&proc_name) && >> ORTE_EPOCH_MIN < orte_util_lookup_epoch(&proc_name)) { >> - OPAL_OUTPUT_VERBOSE((3, orte_routed_base_output, >> + if (!PROC_IS_RUNNING(&proc_name) >> + && 0 < >> ORTE_EPOCH_CMP(ORTE_EPOCH_MIN,orte_util_lookup_epoch(&proc_name))) { >> + OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, >> "%s routed:binomial find children >> proc out of date - returning parent %d", >> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), >> parent)); >> return parent; >> } >> - OPAL_OUTPUT_VERBOSE((3, orte_routed_base_output, >> + OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, >> "%s routed:binomial find children >> returning found value %d", >> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), >> found)); >> return found; >> @@ -1029,8 +1038,7 @@ >> ORTE_PROC_MY_PARENT->vpid = binomial_tree(0, 0, ORTE_PROC_MY_NAME->vpid, >> orte_process_info.max_procs, >> &num_children, &my_children, NULL, true, >> jobid); >> - ORTE_PROC_MY_PARENT->epoch = ORTE_EPOCH_INVALID; >> - ORTE_PROC_MY_PARENT->epoch = >> orte_ess.proc_get_epoch(ORTE_PROC_MY_PARENT); >> + >> ORTE_EPOCH_SET(ORTE_PROC_MY_PARENT->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_PARENT)); >> >> if (0 < opal_output_get_verbosity(orte_routed_base_output)) { >> opal_output(0, "%s: parent %d num_children %d", >> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_PROC_MY_PARENT->vpid, num_children); >> >> Modified: trunk/orte/mca/routed/cm/routed_cm.c >> ============================================================================== >> --- 
trunk/orte/mca/routed/cm/routed_cm.c (original) >> +++ trunk/orte/mca/routed/cm/routed_cm.c 2011-08-26 18:16:14 EDT (Fri, >> 26 Aug 2011) >> @@ -35,6 +35,7 @@ >> #include "orte/runtime/orte_globals.h" >> #include "orte/runtime/orte_wait.h" >> #include "orte/runtime/runtime.h" >> +#include "orte/runtime/data_type_support/orte_dt_support.h" >> >> #include "orte/mca/rml/base/rml_contact.h" >> >> @@ -139,7 +140,7 @@ >> >> if (proc->jobid == ORTE_JOBID_INVALID || >> proc->vpid == ORTE_VPID_INVALID || >> - proc->epoch == ORTE_EPOCH_INVALID) { >> + 0 == ORTE_EPOCH_CMP(proc->epoch,ORTE_EPOCH_INVALID)) { >> return ORTE_ERR_BAD_PARAM; >> } >> >> @@ -200,7 +201,7 @@ >> >> if (target->jobid == ORTE_JOBID_INVALID || >> target->vpid == ORTE_VPID_INVALID || >> - target->epoch == ORTE_EPOCH_INVALID) { >> + 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { >> return ORTE_ERR_BAD_PARAM; >> } >> >> @@ -257,8 +258,7 @@ >> ORTE_NAME_PRINT(route))); >> jfam->route.jobid = route->jobid; >> jfam->route.vpid = route->vpid; >> - jfam->route.epoch = ORTE_EPOCH_INVALID; >> - jfam->route.epoch = orte_ess.proc_get_epoch(&jfam->route); >> + >> ORTE_EPOCH_SET(jfam->route.epoch,orte_ess.proc_get_epoch(&jfam->route)); >> >> return ORTE_SUCCESS; >> } >> @@ -273,8 +273,7 @@ >> jfam->job_family = jfamily; >> jfam->route.jobid = route->jobid; >> jfam->route.vpid = route->vpid; >> - jfam->route.epoch = ORTE_EPOCH_INVALID; >> - jfam->route.epoch = orte_ess.proc_get_epoch(&jfam->route); >> + >> ORTE_EPOCH_SET(jfam->route.epoch,orte_ess.proc_get_epoch(&jfam->route)); >> >> opal_pointer_array_add(&orte_routed_jobfams, jfam); >> return ORTE_SUCCESS; >> @@ -299,7 +298,7 @@ >> >> if (target->jobid == ORTE_JOBID_INVALID || >> target->vpid == ORTE_VPID_INVALID || >> - target->epoch == ORTE_EPOCH_INVALID) { >> + 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { >> ret = ORTE_NAME_INVALID; >> goto found; >> } >> @@ -367,8 +366,7 @@ >> } >> >> /* Initialize daemon's epoch, based on its current 
vpid/jobid */ >> - daemon.epoch = ORTE_EPOCH_INVALID; >> - daemon.epoch = orte_ess.proc_get_epoch(&daemon); >> + ORTE_EPOCH_SET(daemon.epoch,orte_ess.proc_get_epoch(&daemon)); >> >> /* if the daemon is me, then send direct to the target! */ >> if (ORTE_PROC_MY_NAME->vpid == daemon.vpid) { >> @@ -814,8 +812,7 @@ >> */ >> local_lifeline.jobid = proc->jobid; >> local_lifeline.vpid = proc->vpid; >> - local_lifeline.epoch = ORTE_EPOCH_INVALID; >> - local_lifeline.epoch = orte_ess.proc_get_epoch(&local_lifeline); >> + >> ORTE_EPOCH_SET(local_lifeline.epoch,orte_ess.proc_get_epoch(&local_lifeline)); >> >> lifeline = &local_lifeline; >> >> >> Modified: trunk/orte/mca/routed/direct/routed_direct.c >> ============================================================================== >> --- trunk/orte/mca/routed/direct/routed_direct.c (original) >> +++ trunk/orte/mca/routed/direct/routed_direct.c 2011-08-26 18:16:14 >> EDT (Fri, 26 Aug 2011) >> @@ -24,6 +24,7 @@ >> #include "orte/util/name_fns.h" >> #include "orte/util/proc_info.h" >> #include "orte/runtime/orte_globals.h" >> +#include "orte/runtime/data_type_support/orte_dt_support.h" >> >> #include "orte/mca/rml/base/rml_contact.h" >> >> @@ -135,7 +136,7 @@ >> >> if (target->jobid == ORTE_JOBID_INVALID || >> target->vpid == ORTE_VPID_INVALID || >> - target->epoch == ORTE_EPOCH_INVALID) { >> + 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { >> ret = ORTE_NAME_INVALID; >> } else { >> /* all routes are direct */ >> >> Modified: trunk/orte/mca/routed/linear/routed_linear.c >> ============================================================================== >> --- trunk/orte/mca/routed/linear/routed_linear.c (original) >> +++ trunk/orte/mca/routed/linear/routed_linear.c 2011-08-26 18:16:14 >> EDT (Fri, 26 Aug 2011) >> @@ -31,6 +31,7 @@ >> #include "orte/runtime/orte_globals.h" >> #include "orte/runtime/orte_wait.h" >> #include "orte/runtime/runtime.h" >> +#include "orte/runtime/data_type_support/orte_dt_support.h" >> >> 
#include "orte/mca/rml/base/rml_contact.h" >> >> @@ -132,7 +133,7 @@ >> >> if (proc->jobid == ORTE_JOBID_INVALID || >> proc->vpid == ORTE_VPID_INVALID || >> - proc->epoch == ORTE_EPOCH_INVALID) { >> + 0 == ORTE_EPOCH_CMP(proc->epoch,ORTE_EPOCH_INVALID)) { >> return ORTE_ERR_BAD_PARAM; >> } >> >> @@ -201,7 +202,7 @@ >> >> if (target->jobid == ORTE_JOBID_INVALID || >> target->vpid == ORTE_VPID_INVALID || >> - target->epoch == ORTE_EPOCH_INVALID) { >> + 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { >> return ORTE_ERR_BAD_PARAM; >> } >> >> @@ -259,7 +260,7 @@ >> ORTE_NAME_PRINT(route))); >> jfam->route.jobid = route->jobid; >> jfam->route.vpid = route->vpid; >> - jfam->route.epoch = route->epoch; >> + ORTE_EPOCH_SET(jfam->route.epoch,route->epoch); >> return ORTE_SUCCESS; >> } >> } >> @@ -273,7 +274,7 @@ >> jfam->job_family = jfamily; >> jfam->route.jobid = route->jobid; >> jfam->route.vpid = route->vpid; >> - jfam->route.epoch = route->epoch; >> + ORTE_EPOCH_SET(jfam->route.epoch,route->epoch); >> opal_pointer_array_add(&orte_routed_jobfams, jfam); >> return ORTE_SUCCESS; >> } >> @@ -373,8 +374,7 @@ >> } >> >> /* Initialize daemon's epoch, based on its current vpid/jobid */ >> - daemon.epoch = ORTE_EPOCH_INVALID; >> - daemon.epoch = orte_ess.proc_get_epoch(&daemon); >> + ORTE_EPOCH_SET(daemon.epoch,orte_ess.proc_get_epoch(&daemon)); >> >> /* if the daemon is me, then send direct to the target! 
*/ >> if (ORTE_PROC_MY_NAME->vpid == daemon.vpid) { >> @@ -395,8 +395,7 @@ >> /* we are at end of chain - wrap around */ >> daemon.vpid = 0; >> } >> - daemon.epoch = ORTE_EPOCH_INVALID; >> - daemon.epoch = orte_ess.proc_get_epoch(&daemon); >> + ORTE_EPOCH_SET(daemon.epoch,orte_ess.proc_get_epoch(&daemon)); >> ret = &daemon; >> } >> } >> @@ -741,7 +740,7 @@ >> */ >> local_lifeline.jobid = proc->jobid; >> local_lifeline.vpid = proc->vpid; >> - local_lifeline.epoch = proc->epoch; >> + ORTE_EPOCH_SET(local_lifeline.epoch,proc->epoch); >> lifeline = &local_lifeline; >> >> return ORTE_SUCCESS; >> >> Modified: trunk/orte/mca/routed/radix/routed_radix.c >> ============================================================================== >> --- trunk/orte/mca/routed/radix/routed_radix.c (original) >> +++ trunk/orte/mca/routed/radix/routed_radix.c 2011-08-26 18:16:14 >> EDT (Fri, 26 Aug 2011) >> @@ -31,6 +31,7 @@ >> #include "orte/runtime/orte_globals.h" >> #include "orte/runtime/orte_wait.h" >> #include "orte/runtime/runtime.h" >> +#include "orte/runtime/data_type_support/orte_dt_support.h" >> >> #include "orte/mca/rml/base/rml_contact.h" >> >> @@ -145,7 +146,7 @@ >> >> if (proc->jobid == ORTE_JOBID_INVALID || >> proc->vpid == ORTE_VPID_INVALID || >> - proc->epoch == ORTE_EPOCH_INVALID) { >> + 0 == ORTE_EPOCH_CMP(proc->epoch,ORTE_EPOCH_INVALID)) { >> return ORTE_ERR_BAD_PARAM; >> } >> >> @@ -214,7 +215,7 @@ >> >> if (target->jobid == ORTE_JOBID_INVALID || >> target->vpid == ORTE_VPID_INVALID || >> - target->epoch == ORTE_EPOCH_INVALID) { >> + 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { >> return ORTE_ERR_BAD_PARAM; >> } >> >> @@ -272,7 +273,7 @@ >> ORTE_NAME_PRINT(route))); >> jfam->route.jobid = route->jobid; >> jfam->route.vpid = route->vpid; >> - jfam->route.epoch = route->epoch; >> + ORTE_EPOCH_SET(jfam->route.epoch,route->epoch); >> return ORTE_SUCCESS; >> } >> } >> @@ -286,7 +287,7 @@ >> jfam->job_family = jfamily; >> jfam->route.jobid = route->jobid; >> 
jfam->route.vpid = route->vpid; >> - jfam->route.epoch = route->epoch; >> + ORTE_EPOCH_SET(jfam->route.epoch,route->epoch); >> opal_pointer_array_add(&orte_routed_jobfams, jfam); >> return ORTE_SUCCESS; >> } >> @@ -310,7 +311,7 @@ >> >> if (target->jobid == ORTE_JOBID_INVALID || >> target->vpid == ORTE_VPID_INVALID || >> - target->epoch == ORTE_EPOCH_INVALID) { >> + 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { >> ret = ORTE_NAME_INVALID; >> goto found; >> } >> @@ -413,8 +414,7 @@ >> if (opal_bitmap_is_set_bit(&child->relatives, daemon.vpid)) { >> /* yep - we need to step through this child */ >> daemon.vpid = child->vpid; >> - daemon.epoch = ORTE_EPOCH_INVALID; >> - daemon.epoch = orte_ess.proc_get_epoch(&daemon); >> + >> ORTE_EPOCH_SET(daemon.epoch,orte_ess.proc_get_epoch(&daemon)); >> ret = &daemon; >> goto found; >> } >> @@ -425,8 +425,7 @@ >> * any of our children, so we have to step up through our parent >> */ >> daemon.vpid = ORTE_PROC_MY_PARENT->vpid; >> - daemon.epoch = ORTE_EPOCH_INVALID; >> - daemon.epoch = orte_ess.proc_get_epoch(&daemon); >> + ORTE_EPOCH_SET(daemon.epoch,orte_ess.proc_get_epoch(&daemon)); >> >> ret = &daemon; >> >> @@ -788,7 +787,7 @@ >> */ >> local_lifeline.jobid = proc->jobid; >> local_lifeline.vpid = proc->vpid; >> - local_lifeline.epoch = proc->epoch; >> + ORTE_EPOCH_SET(local_lifeline.epoch,proc->epoch); >> lifeline = &local_lifeline; >> >> return ORTE_SUCCESS; >> @@ -881,8 +880,7 @@ >> ORTE_PROC_MY_PARENT->vpid = (Ii-Sum) % NInPrevLevel; >> ORTE_PROC_MY_PARENT->vpid += (Sum - NInPrevLevel); >> } >> - ORTE_PROC_MY_PARENT->epoch = ORTE_EPOCH_INVALID; >> - ORTE_PROC_MY_PARENT->epoch = >> orte_ess.proc_get_epoch(ORTE_PROC_MY_PARENT); >> + >> ORTE_EPOCH_SET(ORTE_PROC_MY_PARENT->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_PARENT)); >> >> /* compute my direct children and the bitmap that shows which vpids >> * lie underneath their branch >> >> Modified: trunk/orte/mca/routed/slave/routed_slave.c >> 
============================================================================== >> --- trunk/orte/mca/routed/slave/routed_slave.c (original) >> +++ trunk/orte/mca/routed/slave/routed_slave.c 2011-08-26 18:16:14 >> EDT (Fri, 26 Aug 2011) >> @@ -26,6 +26,7 @@ >> #include "orte/runtime/orte_globals.h" >> #include "orte/runtime/orte_wait.h" >> #include "orte/runtime/runtime.h" >> +#include "orte/runtime/data_type_support/orte_dt_support.h" >> >> #include "orte/mca/rml/base/rml_contact.h" >> >> @@ -134,7 +135,7 @@ >> >> if (target->jobid == ORTE_JOBID_INVALID || >> target->vpid == ORTE_VPID_INVALID || >> - target->epoch == ORTE_EPOCH_INVALID) { >> + 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { >> ret = ORTE_NAME_INVALID; >> } else { >> /* a slave must always route via its parent daemon */ >> @@ -275,8 +276,7 @@ >> */ >> local_lifeline.jobid = proc->jobid; >> local_lifeline.vpid = proc->vpid; >> - local_lifeline.epoch = ORTE_EPOCH_INVALID; >> - local_lifeline.epoch = orte_ess.proc_get_epoch(&local_lifeline); >> + >> ORTE_EPOCH_SET(local_lifeline.epoch,orte_ess.proc_get_epoch(&local_lifeline)); >> >> lifeline = &local_lifeline; >> >> >> Modified: trunk/orte/mca/sensor/file/sensor_file.c >> ============================================================================== >> --- trunk/orte/mca/sensor/file/sensor_file.c (original) >> +++ trunk/orte/mca/sensor/file/sensor_file.c 2011-08-26 18:16:14 EDT (Fri, >> 26 Aug 2011) >> @@ -70,7 +70,9 @@ >> opal_list_item_t super; >> orte_jobid_t jobid; >> orte_vpid_t vpid; >> +#if ORTE_ENABLE_EPOCH >> orte_epoch_t epoch; >> +#endif >> char *file; >> int tick; >> bool check_size; >> >> Modified: trunk/orte/mca/snapc/base/snapc_base_fns.c >> ============================================================================== >> --- trunk/orte/mca/snapc/base/snapc_base_fns.c (original) >> +++ trunk/orte/mca/snapc/base/snapc_base_fns.c 2011-08-26 18:16:14 >> EDT (Fri, 26 Aug 2011) >> @@ -81,7 +81,7 @@ >> { >> 
snapshot->process_name.jobid = 0; >> snapshot->process_name.vpid = 0; >> - snapshot->process_name.epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(snapshot->process_name.epoch,ORTE_EPOCH_MIN); >> >> snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE; >> >> @@ -92,7 +92,7 @@ >> { >> snapshot->process_name.jobid = 0; >> snapshot->process_name.vpid = 0; >> - snapshot->process_name.epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(snapshot->process_name.epoch,ORTE_EPOCH_MIN); >> >> snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE; >> >> >> Modified: trunk/orte/mca/snapc/full/snapc_full_global.c >> ============================================================================== >> --- trunk/orte/mca/snapc/full/snapc_full_global.c (original) >> +++ trunk/orte/mca/snapc/full/snapc_full_global.c 2011-08-26 18:16:14 >> EDT (Fri, 26 Aug 2011) >> @@ -427,7 +427,7 @@ >> new_proc = OBJ_NEW(orte_proc_t); >> new_proc->name.jobid = proc->name.jobid; >> new_proc->name.vpid = proc->name.vpid; >> - new_proc->name.epoch = proc->name.epoch; >> + ORTE_EPOCH_SET(new_proc->name.epoch,proc->name.epoch); >> new_proc->node = OBJ_NEW(orte_node_t); >> new_proc->node->name = proc->node->name; >> opal_list_append(migrating_procs, &new_proc->super); >> @@ -618,7 +618,7 @@ >> >> orted_snapshot->process_name.jobid = cur_node->daemon->name.jobid; >> orted_snapshot->process_name.vpid = cur_node->daemon->name.vpid; >> - orted_snapshot->process_name.epoch = cur_node->daemon->name.epoch; >> + >> ORTE_EPOCH_SET(orted_snapshot->process_name.epoch,cur_node->daemon->name.epoch); >> >> mask = ORTE_NS_CMP_JOBID; >> >> @@ -636,7 +636,7 @@ >> >> app_snapshot->process_name.jobid = procs[p]->name.jobid; >> app_snapshot->process_name.vpid = procs[p]->name.vpid; >> - app_snapshot->process_name.epoch = procs[p]->name.epoch; >> + >> ORTE_EPOCH_SET(app_snapshot->process_name.epoch,procs[p]->name.epoch); >> >> opal_list_append(&(orted_snapshot->super.local_snapshots), >> &(app_snapshot->super)); >> } >> @@ -800,7 +800,7 @@ >> >> 
app_snapshot->process_name.jobid = procs[p]->name.jobid; >> app_snapshot->process_name.vpid = procs[p]->name.vpid; >> - app_snapshot->process_name.epoch = procs[p]->name.epoch; >> + >> ORTE_EPOCH_SET(app_snapshot->process_name.epoch,procs[p]->name.epoch); >> >> opal_list_append(&(orted_snapshot->super.local_snapshots), >> &(app_snapshot->super)); >> } >> @@ -816,7 +816,7 @@ >> >> orted_snapshot->process_name.jobid = cur_node->daemon->name.jobid; >> orted_snapshot->process_name.vpid = cur_node->daemon->name.vpid; >> - orted_snapshot->process_name.epoch = cur_node->daemon->name.epoch; >> + >> ORTE_EPOCH_SET(orted_snapshot->process_name.epoch,cur_node->daemon->name.epoch); >> >> mask = ORTE_NS_CMP_ALL; >> >> @@ -837,7 +837,7 @@ >> >> app_snapshot->process_name.jobid = procs[p]->name.jobid; >> app_snapshot->process_name.vpid = procs[p]->name.vpid; >> - app_snapshot->process_name.epoch = procs[p]->name.epoch; >> + >> ORTE_EPOCH_SET(app_snapshot->process_name.epoch,procs[p]->name.epoch); >> >> opal_list_append(&(orted_snapshot->super.local_snapshots), >> &(app_snapshot->super)); >> } >> >> Modified: trunk/orte/mca/snapc/full/snapc_full_local.c >> ============================================================================== >> --- trunk/orte/mca/snapc/full/snapc_full_local.c (original) >> +++ trunk/orte/mca/snapc/full/snapc_full_local.c 2011-08-26 18:16:14 >> EDT (Fri, 26 Aug 2011) >> @@ -2033,7 +2033,7 @@ >> vpid_snapshot->process_pid = child->pid; >> vpid_snapshot->super . process_name . jobid = child->name->jobid; >> vpid_snapshot->super . process_name . vpid = child->name->vpid; >> - vpid_snapshot->super . process_name . epoch = >> child->name->epoch; >> + ORTE_EPOCH_SET(vpid_snapshot->super . process_name . >> epoch,child->name->epoch); >> } >> } >> >> @@ -2095,7 +2095,7 @@ >> vpid_snapshot->process_pid = child->pid; >> vpid_snapshot->super . process_name . jobid = child->name->jobid; >> vpid_snapshot->super . process_name . 
vpid = child->name->vpid; >> - vpid_snapshot->super . process_name . epoch = >> child->name->epoch; >> + ORTE_EPOCH_SET(vpid_snapshot->super . process_name . >> epoch,child->name->epoch); >> /*vpid_snapshot->migrating = true;*/ >> >> opal_list_append(&(local_global_snapshot.local_snapshots), >> &(vpid_snapshot->super.super)); >> @@ -2111,7 +2111,7 @@ >> vpid_snapshot->process_pid = child->pid; >> vpid_snapshot->super . process_name . jobid = child->name->jobid; >> vpid_snapshot->super . process_name . vpid = child->name->vpid; >> - vpid_snapshot->super . process_name . epoch = >> child->name->epoch; >> + ORTE_EPOCH_SET(vpid_snapshot->super . process_name . >> epoch,child->name->epoch); >> } >> } >> >> >> Modified: trunk/orte/mca/snapc/full/snapc_full_module.c >> ============================================================================== >> --- trunk/orte/mca/snapc/full/snapc_full_module.c (original) >> +++ trunk/orte/mca/snapc/full/snapc_full_module.c 2011-08-26 18:16:14 >> EDT (Fri, 26 Aug 2011) >> @@ -83,7 +83,7 @@ >> void orte_snapc_full_orted_construct(orte_snapc_full_orted_snapshot_t >> *snapshot) { >> snapshot->process_name.jobid = 0; >> snapshot->process_name.vpid = 0; >> - snapshot->process_name.epoch = 0; >> + ORTE_EPOCH_SET(snapshot->process_name.epoch,0); >> >> snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE; >> } >> @@ -91,7 +91,7 @@ >> void orte_snapc_full_orted_destruct( orte_snapc_full_orted_snapshot_t >> *snapshot) { >> snapshot->process_name.jobid = 0; >> snapshot->process_name.vpid = 0; >> - snapshot->process_name.epoch = 0; >> + ORTE_EPOCH_SET(snapshot->process_name.epoch,0); >> >> snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE; >> } >> >> Modified: trunk/orte/mca/sstore/base/sstore_base_fns.c >> ============================================================================== >> --- trunk/orte/mca/sstore/base/sstore_base_fns.c (original) >> +++ trunk/orte/mca/sstore/base/sstore_base_fns.c 2011-08-26 18:16:14 >> EDT (Fri, 26 Aug 2011) >> @@ -62,7 
+62,7 @@ >> { >> snapshot->process_name.jobid = 0; >> snapshot->process_name.vpid = 0; >> - snapshot->process_name.epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(snapshot->process_name.epoch,ORTE_EPOCH_MIN); >> >> snapshot->crs_comp = NULL; >> snapshot->compress_comp = NULL; >> @@ -76,7 +76,7 @@ >> { >> snapshot->process_name.jobid = 0; >> snapshot->process_name.vpid = 0; >> - snapshot->process_name.epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(snapshot->process_name.epoch,ORTE_EPOCH_MIN); >> >> if( NULL != snapshot->crs_comp ) { >> free(snapshot->crs_comp); >> @@ -637,7 +637,7 @@ >> >> vpid_snapshot->process_name.jobid = proc.jobid; >> vpid_snapshot->process_name.vpid = proc.vpid; >> - vpid_snapshot->process_name.epoch = proc.epoch; >> + ORTE_EPOCH_SET(vpid_snapshot->process_name.epoch,proc.epoch); >> } >> else if(0 == strncmp(token, SSTORE_METADATA_LOCAL_CRS_COMP_STR, >> strlen(SSTORE_METADATA_LOCAL_CRS_COMP_STR))) { >> vpid_snapshot->crs_comp = strdup(value); >> >> Modified: trunk/orte/mca/sstore/central/sstore_central_global.c >> ============================================================================== >> --- trunk/orte/mca/sstore/central/sstore_central_global.c (original) >> +++ trunk/orte/mca/sstore/central/sstore_central_global.c 2011-08-26 >> 18:16:14 EDT (Fri, 26 Aug 2011) >> @@ -1216,8 +1216,7 @@ >> >> vpid_snapshot->process_name.jobid = handle_info->jobid; >> vpid_snapshot->process_name.vpid = i; >> - vpid_snapshot->process_name.epoch = ORTE_EPOCH_INVALID; >> - vpid_snapshot->process_name.epoch = >> orte_ess.proc_get_epoch(&vpid_snapshot->process_name); >> + >> ORTE_EPOCH_SET(vpid_snapshot->process_name.epoch,orte_ess.proc_get_epoch(&vpid_snapshot->process_name)); >> >> vpid_snapshot->crs_comp = NULL; >> global_snapshot->start_time = NULL; >> >> Modified: trunk/orte/mca/sstore/central/sstore_central_local.c >> ============================================================================== >> --- trunk/orte/mca/sstore/central/sstore_central_local.c 
(original) >> +++ trunk/orte/mca/sstore/central/sstore_central_local.c 2011-08-26 >> 18:16:14 EDT (Fri, 26 Aug 2011) >> @@ -210,7 +210,7 @@ >> { >> info->name.jobid = ORTE_JOBID_INVALID; >> info->name.vpid = ORTE_VPID_INVALID; >> - info->name.epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(info->name.epoch,ORTE_EPOCH_MIN); >> >> info->local_location = NULL; >> info->metadata_filename = NULL; >> @@ -222,7 +222,7 @@ >> { >> info->name.jobid = ORTE_JOBID_INVALID; >> info->name.vpid = ORTE_VPID_INVALID; >> - info->name.epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(info->name.epoch,ORTE_EPOCH_MIN); >> >> if( NULL != info->local_location ) { >> free(info->local_location); >> @@ -535,7 +535,7 @@ >> >> app_info->name.jobid = name->jobid; >> app_info->name.vpid = name->vpid; >> - app_info->name.epoch = name->epoch; >> + ORTE_EPOCH_SET(app_info->name.epoch,name->epoch); >> >> opal_list_append(handle_info->app_info_handle, &(app_info->super)); >> >> >> Modified: trunk/orte/mca/sstore/stage/sstore_stage_global.c >> ============================================================================== >> --- trunk/orte/mca/sstore/stage/sstore_stage_global.c (original) >> +++ trunk/orte/mca/sstore/stage/sstore_stage_global.c 2011-08-26 18:16:14 >> EDT (Fri, 26 Aug 2011) >> @@ -1218,10 +1218,10 @@ >> p_set = OBJ_NEW(orte_filem_base_process_set_t); >> p_set->source.jobid = peer->jobid; >> p_set->source.vpid = peer->vpid; >> - p_set->source.epoch = peer->epoch; >> + ORTE_EPOCH_SET(p_set->source.epoch,peer->epoch); >> p_set->sink.jobid = ORTE_PROC_MY_NAME->jobid; >> p_set->sink.vpid = ORTE_PROC_MY_NAME->vpid; >> - p_set->sink.epoch = ORTE_PROC_MY_NAME->epoch; >> + ORTE_EPOCH_SET(p_set->sink.epoch,ORTE_PROC_MY_NAME->epoch); >> opal_list_append(&(filem_request->process_sets), &(p_set->super) ); >> } >> >> @@ -1706,8 +1706,7 @@ >> >> vpid_snapshot->process_name.jobid = handle_info->jobid; >> vpid_snapshot->process_name.vpid = i; >> - vpid_snapshot->process_name.epoch = ORTE_EPOCH_INVALID; >> - 
vpid_snapshot->process_name.epoch = >> orte_ess.proc_get_epoch(&vpid_snapshot->process_name); >> + >> ORTE_EPOCH_SET(vpid_snapshot->process_name.epoch,orte_ess.proc_get_epoch(&vpid_snapshot->process_name)); >> >> /* JJH: Currently we do not have this information since we do not save >> * individual vpid info in the Global SStore. It is in the metadata >> >> Modified: trunk/orte/mca/sstore/stage/sstore_stage_local.c >> ============================================================================== >> --- trunk/orte/mca/sstore/stage/sstore_stage_local.c (original) >> +++ trunk/orte/mca/sstore/stage/sstore_stage_local.c 2011-08-26 18:16:14 >> EDT (Fri, 26 Aug 2011) >> @@ -287,7 +287,7 @@ >> { >> info->name.jobid = ORTE_JOBID_INVALID; >> info->name.vpid = ORTE_VPID_INVALID; >> - info->name.epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(info->name.epoch,ORTE_EPOCH_MIN); >> >> info->local_location = NULL; >> info->compressed_local_location = NULL; >> @@ -302,7 +302,7 @@ >> { >> info->name.jobid = ORTE_JOBID_INVALID; >> info->name.vpid = ORTE_VPID_INVALID; >> - info->name.epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(info->name.epoch,ORTE_EPOCH_MIN); >> >> if( NULL != info->local_location ) { >> free(info->local_location); >> @@ -1014,7 +1014,7 @@ >> >> app_info->name.jobid = name->jobid; >> app_info->name.vpid = name->vpid; >> - app_info->name.epoch = name->epoch; >> + ORTE_EPOCH_SET(app_info->name.epoch,name->epoch); >> >> opal_list_append(handle_info->app_info_handle, &(app_info->super)); >> >> @@ -2057,17 +2057,17 @@ >> /* if I am the HNP, then use me as the source */ >> p_set->source.jobid = ORTE_PROC_MY_NAME->jobid; >> p_set->source.vpid = ORTE_PROC_MY_NAME->vpid; >> - p_set->source.epoch = ORTE_PROC_MY_NAME->epoch; >> + ORTE_EPOCH_SET(p_set->source.epoch,ORTE_PROC_MY_NAME->epoch); >> } >> else { >> /* otherwise, set the HNP as the source */ >> p_set->source.jobid = ORTE_PROC_MY_HNP->jobid; >> p_set->source.vpid = ORTE_PROC_MY_HNP->vpid; >> - p_set->source.epoch = 
ORTE_PROC_MY_HNP->epoch; >> + ORTE_EPOCH_SET(p_set->source.epoch,ORTE_PROC_MY_HNP->epoch); >> } >> p_set->sink.jobid = ORTE_PROC_MY_NAME->jobid; >> p_set->sink.vpid = ORTE_PROC_MY_NAME->vpid; >> - p_set->sink.epoch = ORTE_PROC_MY_NAME->epoch; >> + ORTE_EPOCH_SET(p_set->sink.epoch,ORTE_PROC_MY_NAME->epoch); >> opal_list_append(&(filem_request->process_sets), &(p_set->super) ); >> >> /* Define the file set */ >> >> Modified: trunk/orte/orted/orted_comm.c >> ============================================================================== >> --- trunk/orte/orted/orted_comm.c (original) >> +++ trunk/orte/orted/orted_comm.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug >> 2011) >> @@ -123,18 +123,13 @@ >> nm = (orte_routed_tree_t*)item; >> >> target.vpid = nm->vpid; >> - target.epoch = orte_util_lookup_epoch(&target); >> + ORTE_EPOCH_SET(target.epoch,orte_ess.proc_get_epoch(&target)); >> >> - if (!orte_util_proc_is_running(&target)) { >> + if (!PROC_IS_RUNNING(&target)) { >> continue; >> } >> >> - target.epoch = ORTE_EPOCH_INVALID; >> - if (ORTE_NODE_RANK_INVALID == (target.epoch = >> orte_ess.proc_get_epoch(&target))) { >> - /* If we are trying to send to a previously failed process it's >> - * better to fail silently. 
*/ >> - continue; >> - } >> + ORTE_EPOCH_SET(target.epoch,orte_ess.proc_get_epoch(&target)); >> >> OPAL_OUTPUT_VERBOSE((1, orte_debug_output, >> "%s orte:daemon:send_relay sending relay msg to >> %s", >> @@ -422,7 +417,8 @@ >> proct = OBJ_NEW(orte_proc_t); >> proct->name.jobid = proc.jobid; >> proct->name.vpid = proc.vpid; >> - proct->name.epoch = proc.epoch; >> + ORTE_EPOCH_SET(proct->name.epoch,proc.epoch); >> + >> opal_pointer_array_add(&procarray, proct); >> num_replies++; >> } >> @@ -1059,7 +1055,9 @@ >> orte_job_t *jdata; >> orte_proc_t *proc; >> orte_vpid_t vpid; >> +#if ORTE_ENABLE_EPOCH >> orte_epoch_t epoch; >> +#endif >> int32_t i, num_procs; >> >> /* setup the answer */ >> @@ -1086,12 +1084,14 @@ >> goto CLEANUP; >> } >> >> +#if ORTE_ENABLE_EPOCH >> /* unpack the epoch */ >> n = 1; >> if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &epoch, &n, >> ORTE_EPOCH))) { >> ORTE_ERROR_LOG(ret); >> goto CLEANUP; >> } >> +#endif >> >> /* if they asked for a specific proc, then just get that info */ >> if (ORTE_VPID_WILDCARD != vpid) { >> @@ -1201,7 +1201,7 @@ >> /* loop across all daemons */ >> proc2.jobid = ORTE_PROC_MY_NAME->jobid; >> for (proc2.vpid=1; proc2.vpid < >> orte_process_info.num_procs; proc2.vpid++) { >> - proc2.epoch = orte_util_lookup_epoch(&proc2); >> + >> ORTE_EPOCH_SET(proc2.epoch,orte_util_lookup_epoch(&proc2)); >> >> /* setup the cmd */ >> relay_msg = OBJ_NEW(opal_buffer_t); >> >> Modified: trunk/orte/orted/orted_main.c >> ============================================================================== >> --- trunk/orte/orted/orted_main.c (original) >> +++ trunk/orte/orted/orted_main.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug >> 2011) >> @@ -388,14 +388,14 @@ >> orte_process_info.my_daemon_uri = orte_rml.get_contact_info(); >> ORTE_PROC_MY_DAEMON->jobid = ORTE_PROC_MY_NAME->jobid; >> ORTE_PROC_MY_DAEMON->vpid = ORTE_PROC_MY_NAME->vpid; >> - ORTE_PROC_MY_DAEMON->epoch = ORTE_EPOCH_MIN; >> + 
ORTE_EPOCH_SET(ORTE_PROC_MY_DAEMON->epoch,ORTE_EPOCH_MIN); >> >> /* if I am also the hnp, then update that contact info field too */ >> if (ORTE_PROC_IS_HNP) { >> orte_process_info.my_hnp_uri = orte_rml.get_contact_info(); >> ORTE_PROC_MY_HNP->jobid = ORTE_PROC_MY_NAME->jobid; >> ORTE_PROC_MY_HNP->vpid = ORTE_PROC_MY_NAME->vpid; >> - ORTE_PROC_MY_HNP->epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(ORTE_PROC_MY_HNP->epoch,ORTE_EPOCH_MIN); >> } >> >> /* setup the primary daemon command receive function */ >> @@ -495,7 +495,8 @@ >> proc = OBJ_NEW(orte_proc_t); >> proc->name.jobid = jdata->jobid; >> proc->name.vpid = 0; >> - proc->name.epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN); >> + >> proc->state = ORTE_PROC_STATE_RUNNING; >> proc->app_idx = 0; >> proc->node = nodes[0]; /* hnp node must be there */ >> >> Modified: trunk/orte/runtime/data_type_support/orte_dt_compare_fns.c >> ============================================================================== >> --- trunk/orte/runtime/data_type_support/orte_dt_compare_fns.c >> (original) >> +++ trunk/orte/runtime/data_type_support/orte_dt_compare_fns.c >> 2011-08-26 18:16:14 EDT (Fri, 26 Aug 2011) >> @@ -76,6 +76,7 @@ >> } >> } >> >> +#if ORTE_ENABLE_EPOCH >> /** check the epochs - if one of them is WILDCARD, then ignore >> * this field since anything is okay >> */ >> @@ -87,6 +88,7 @@ >> return OPAL_VALUE1_GREATER; >> } >> } >> +#endif >> >> /** only way to get here is if all fields are equal or WILDCARD */ >> return OPAL_EQUAL; >> @@ -122,6 +124,7 @@ >> return OPAL_EQUAL; >> } >> >> +#if ORTE_ENABLE_EPOCH >> int orte_dt_compare_epoch(orte_epoch_t *value1, >> orte_epoch_t *value2, >> opal_data_type_t type) >> @@ -136,6 +139,7 @@ >> >> return OPAL_EQUAL; >> } >> +#endif >> >> #if !ORTE_DISABLE_FULL_SUPPORT >> /** >> >> Modified: trunk/orte/runtime/data_type_support/orte_dt_copy_fns.c >> ============================================================================== >> --- 
trunk/orte/runtime/data_type_support/orte_dt_copy_fns.c (original) >> +++ trunk/orte/runtime/data_type_support/orte_dt_copy_fns.c 2011-08-26 >> 18:16:14 EDT (Fri, 26 Aug 2011) >> @@ -61,7 +61,7 @@ >> >> val->jobid = src->jobid; >> val->vpid = src->vpid; >> - val->epoch = src->epoch; >> + ORTE_EPOCH_SET(val->epoch,src->epoch); >> >> *dest = val; >> return ORTE_SUCCESS; >> @@ -105,6 +105,7 @@ >> return ORTE_SUCCESS; >> } >> >> +#if ORTE_ENABLE_EPOCH >> /* >> * EPOCH >> */ >> @@ -123,6 +124,7 @@ >> >> return ORTE_SUCCESS; >> } >> +#endif >> >> #if !ORTE_DISABLE_FULL_SUPPORT >> >> >> Modified: trunk/orte/runtime/data_type_support/orte_dt_packing_fns.c >> ============================================================================== >> --- trunk/orte/runtime/data_type_support/orte_dt_packing_fns.c >> (original) >> +++ trunk/orte/runtime/data_type_support/orte_dt_packing_fns.c >> 2011-08-26 18:16:14 EDT (Fri, 26 Aug 2011) >> @@ -58,7 +58,9 @@ >> orte_process_name_t* proc; >> orte_jobid_t *jobid; >> orte_vpid_t *vpid; >> +#if ORTE_ENABLE_EPOCH >> orte_epoch_t *epoch; >> +#endif >> >> /* collect all the jobids in a contiguous array */ >> jobid = (orte_jobid_t*)malloc(num_vals * sizeof(orte_jobid_t)); >> @@ -100,6 +102,7 @@ >> } >> free(vpid); >> >> +#if ORTE_ENABLE_EPOCH >> /* Collect all the epochs in a contiguous array */ >> epoch = (orte_epoch_t *) malloc(num_vals * sizeof(orte_epoch_t)); >> if (NULL == epoch) { >> @@ -118,6 +121,7 @@ >> return rc; >> } >> free(epoch); >> +#endif >> >> return ORTE_SUCCESS; >> } >> @@ -156,6 +160,7 @@ >> return ret; >> } >> >> +#if ORTE_ENABLE_EPOCH >> /* >> * EPOCH >> */ >> @@ -171,6 +176,7 @@ >> >> return ret; >> } >> +#endif >> >> #if !ORTE_DISABLE_FULL_SUPPORT >> /* >> >> Modified: trunk/orte/runtime/data_type_support/orte_dt_print_fns.c >> ============================================================================== >> --- trunk/orte/runtime/data_type_support/orte_dt_print_fns.c (original) >> +++ 
trunk/orte/runtime/data_type_support/orte_dt_print_fns.c 2011-08-26 >> 18:16:14 EDT (Fri, 26 Aug 2011) >> @@ -125,8 +125,10 @@ >> orte_dt_quick_print(output, "ORTE_STD_CNTR", prefix, src, >> ORTE_STD_CNTR_T); >> break; >> >> +#if ORTE_ENABLE_EPOCH >> case ORTE_EPOCH: >> orte_dt_quick_print(output, "ORTE_EPOCH", prefix, src, >> ORTE_EPOCH_T); >> +#endif >> >> case ORTE_VPID: >> orte_dt_quick_print(output, "ORTE_VPID", prefix, src, >> ORTE_VPID_T); >> @@ -478,11 +480,21 @@ >> if (orte_xml_output) { >> /* need to create the output in XML format */ >> if (0 == src->pid) { >> +#if ORTE_ENABLE_EPOCH >> asprintf(output, "%s<process rank=\"%s\" status=\"%s\" >> epoch=\"%s\"/>\n", pfx2, >> ORTE_VPID_PRINT(src->name.vpid), >> orte_proc_state_to_str(src->state), ORTE_EPOCH_PRINT(src->name.epoch)); >> +#else >> + asprintf(output, "%s<process rank=\"%s\" status=\"%s\"/>\n", >> pfx2, >> + ORTE_VPID_PRINT(src->name.vpid), >> orte_proc_state_to_str(src->state)); >> +#endif >> } else { >> +#if ORTE_ENABLE_EPOCH >> asprintf(output, "%s<process rank=\"%s\" pid=\"%d\" status=\"%s\" >> epoch=\"%s\"/>\n", pfx2, >> ORTE_VPID_PRINT(src->name.vpid), (int)src->pid, >> orte_proc_state_to_str(src->state), ORTE_EPOCH_PRINT(src->name.epoch)); >> +#else >> + asprintf(output, "%s<process rank=\"%s\" pid=\"%d\" >> status=\"%s\"/>\n", pfx2, >> + ORTE_VPID_PRINT(src->name.vpid), (int)src->pid, >> orte_proc_state_to_str(src->state)); >> +#endif >> } >> free(pfx2); >> return ORTE_SUCCESS; >> @@ -490,10 +502,17 @@ >> >> if (!orte_devel_level_output) { >> /* just print a very simple output for users */ >> +#if ORTE_ENABLE_EPOCH >> asprintf(&tmp, "\n%sProcess OMPI jobid: %s Process rank: %s Epoch: >> %s", pfx2, >> ORTE_JOBID_PRINT(src->name.jobid), >> ORTE_VPID_PRINT(src->name.vpid), >> ORTE_EPOCH_PRINT(src->name.epoch)); >> +#else >> + asprintf(&tmp, "\n%sProcess OMPI jobid: %s Process rank: %s Epoch: >> %s", pfx2, >> + ORTE_JOBID_PRINT(src->name.jobid), >> + ORTE_VPID_PRINT(src->name.vpid)); >> +#endif 
>> + >> /* set the return */ >> *output = tmp; >> free(pfx2); >> >> Modified: trunk/orte/runtime/data_type_support/orte_dt_size_fns.c >> ============================================================================== >> --- trunk/orte/runtime/data_type_support/orte_dt_size_fns.c (original) >> +++ trunk/orte/runtime/data_type_support/orte_dt_size_fns.c 2011-08-26 >> 18:16:14 EDT (Fri, 26 Aug 2011) >> @@ -45,9 +45,11 @@ >> *size = sizeof(orte_std_cntr_t); >> break; >> >> +#if ORTE_ENABLE_EPOCH >> case ORTE_EPOCH: >> *size = sizeof(orte_epoch_t); >> break; >> +#endif >> >> case ORTE_VPID: >> *size = sizeof(orte_vpid_t); >> >> Modified: trunk/orte/runtime/data_type_support/orte_dt_support.h >> ============================================================================== >> --- trunk/orte/runtime/data_type_support/orte_dt_support.h (original) >> +++ trunk/orte/runtime/data_type_support/orte_dt_support.h 2011-08-26 >> 18:16:14 EDT (Fri, 26 Aug 2011) >> @@ -52,9 +52,14 @@ >> int orte_dt_compare_vpid(orte_vpid_t *value1, >> orte_vpid_t *value2, >> opal_data_type_t type); >> +#if ORTE_ENABLE_EPOCH >> int orte_dt_compare_epoch(orte_epoch_t *value1, >> orte_epoch_t *value2, >> opal_data_type_t type); >> +#define ORTE_EPOCH_CMP(n,m) ( (m) - (n) ) >> +#else >> +#define ORTE_EPOCH_CMP(n,m) ( 0 ) >> +#endif >> #if !ORTE_DISABLE_FULL_SUPPORT >> int orte_dt_compare_job(orte_job_t *value1, orte_job_t *value2, >> opal_data_type_t type); >> int orte_dt_compare_node(orte_node_t *value1, orte_node_t *value2, >> opal_data_type_t type); >> @@ -86,7 +91,9 @@ >> int orte_dt_copy_name(orte_process_name_t **dest, orte_process_name_t *src, >> opal_data_type_t type); >> int orte_dt_copy_jobid(orte_jobid_t **dest, orte_jobid_t *src, >> opal_data_type_t type); >> int orte_dt_copy_vpid(orte_vpid_t **dest, orte_vpid_t *src, opal_data_type_t >> type); >> +#if ORTE_ENABLE_EPOCH >> int orte_dt_copy_epoch(orte_epoch_t **dest, orte_epoch_t *src, >> opal_data_type_t type); >> +#endif >> #if 
!ORTE_DISABLE_FULL_SUPPORT >> int orte_dt_copy_job(orte_job_t **dest, orte_job_t *src, opal_data_type_t >> type); >> int orte_dt_copy_node(orte_node_t **dest, orte_node_t *src, opal_data_type_t >> type); >> @@ -116,8 +123,10 @@ >> int32_t num_vals, opal_data_type_t type); >> int orte_dt_pack_vpid(opal_buffer_t *buffer, const void *src, >> int32_t num_vals, opal_data_type_t type); >> +#if ORTE_ENABLE_EPOCH >> int orte_dt_pack_epoch(opal_buffer_t *buffer, const void *src, >> int32_t num_vals, opal_data_type_t type); >> +#endif >> #if !ORTE_DISABLE_FULL_SUPPORT >> int orte_dt_pack_job(opal_buffer_t *buffer, const void *src, >> int32_t num_vals, opal_data_type_t type); >> @@ -185,8 +194,10 @@ >> int32_t *num_vals, opal_data_type_t type); >> int orte_dt_unpack_vpid(opal_buffer_t *buffer, void *dest, >> int32_t *num_vals, opal_data_type_t type); >> +#if ORTE_ENABLE_EPOCH >> int orte_dt_unpack_epoch(opal_buffer_t *buffer, void *dest, >> int32_t *num_vals, opal_data_type_t type); >> +#endif >> #if !ORTE_DISABLE_FULL_SUPPORT >> int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest, >> int32_t *num_vals, opal_data_type_t type); >> >> Modified: trunk/orte/runtime/data_type_support/orte_dt_unpacking_fns.c >> ============================================================================== >> --- trunk/orte/runtime/data_type_support/orte_dt_unpacking_fns.c >> (original) >> +++ trunk/orte/runtime/data_type_support/orte_dt_unpacking_fns.c >> 2011-08-26 18:16:14 EDT (Fri, 26 Aug 2011) >> @@ -54,7 +54,9 @@ >> orte_process_name_t* proc; >> orte_jobid_t *jobid; >> orte_vpid_t *vpid; >> +#if ORTE_ENABLE_EPOCH >> orte_epoch_t *epoch; >> +#endif >> >> num = *num_vals; >> >> @@ -92,6 +94,7 @@ >> return rc; >> } >> >> +#if ORTE_ENABLE_EPOCH >> /* collect all the epochs in a contiguous array */ >> epoch= (orte_epoch_t*)malloc(num * sizeof(orte_epoch_t)); >> if (NULL == epoch) { >> @@ -109,18 +112,21 @@ >> free(jobid); >> return rc; >> } >> +#endif >> >> /* build the names from the 
jobid/vpid/epoch arrays */ >> proc = (orte_process_name_t*)dest; >> for (i=0; i < num; i++) { >> proc->jobid = jobid[i]; >> proc->vpid = vpid[i]; >> - proc->epoch = epoch[i]; >> + ORTE_EPOCH_SET(proc->epoch,epoch[i]); >> proc++; >> } >> >> /* cleanup */ >> +#if ORTE_ENABLE_EPOCH >> free(epoch); >> +#endif >> free(vpid); >> free(jobid); >> >> @@ -159,6 +165,7 @@ >> return ret; >> } >> >> +#if ORTE_ENABLE_EPOCH >> /* >> * EPOCH >> */ >> @@ -174,6 +181,7 @@ >> >> return ret; >> } >> +#endif >> >> #if !ORTE_DISABLE_FULL_SUPPORT >> /* >> >> Modified: trunk/orte/runtime/orte_data_server.c >> ============================================================================== >> --- trunk/orte/runtime/orte_data_server.c (original) >> +++ trunk/orte/runtime/orte_data_server.c 2011-08-26 18:16:14 EDT (Fri, >> 26 Aug 2011) >> @@ -220,7 +220,7 @@ >> data->port = port_name; >> data->owner.jobid = sender->jobid; >> data->owner.vpid = sender->vpid; >> - data->owner.epoch = sender->epoch; >> + ORTE_EPOCH_SET(data->owner.epoch,sender->epoch); >> >> /* store the data */ >> data->index = opal_pointer_array_add(orte_data_server_store, >> data); >> >> Modified: trunk/orte/runtime/orte_globals.c >> ============================================================================== >> --- trunk/orte/runtime/orte_globals.c (original) >> +++ trunk/orte/runtime/orte_globals.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug >> 2011) >> @@ -277,6 +277,7 @@ >> return rc; >> } >> >> +#if ORTE_ENABLE_EPOCH >> tmp = ORTE_EPOCH; >> if (ORTE_SUCCESS != (rc = opal_dss.register_type(orte_dt_pack_epoch, >> orte_dt_unpack_epoch, >> @@ -290,6 +291,7 @@ >> ORTE_ERROR_LOG(rc); >> return rc; >> } >> +#endif >> >> #if !ORTE_DISABLE_FULL_SUPPORT >> tmp = ORTE_JOB; >> @@ -933,7 +935,7 @@ >> proc->beat = 0; >> OBJ_CONSTRUCT(&proc->stats, opal_ring_buffer_t); >> opal_ring_buffer_init(&proc->stats, orte_stat_history_size); >> - proc->name.epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN); >> #if 
OPAL_ENABLE_FT_CR == 1 >> proc->ckpt_state = 0; >> proc->ckpt_snapshot_ref = NULL; >> >> Modified: trunk/orte/runtime/orte_init.c >> ============================================================================== >> --- trunk/orte/runtime/orte_init.c (original) >> +++ trunk/orte/runtime/orte_init.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug >> 2011) >> @@ -57,8 +57,17 @@ >> char *orte_prohibited_session_dirs = NULL; >> bool orte_create_session_dirs = true; >> >> +#if ORTE_ENABLE_EPOCH >> +orte_process_name_t orte_name_wildcard = {ORTE_JOBID_WILDCARD, >> ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD}; >> +#else >> orte_process_name_t orte_name_wildcard = {ORTE_JOBID_WILDCARD, >> ORTE_VPID_WILDCARD}; >> +#endif >> + >> +#if ORTE_ENABLE_EPOCH >> +orte_process_name_t orte_name_invalid = {ORTE_JOBID_INVALID, >> ORTE_VPID_INVALID, ORTE_EPOCH_INVALID}; >> +#else >> orte_process_name_t orte_name_invalid = {ORTE_JOBID_INVALID, >> ORTE_VPID_INVALID}; >> +#endif >> >> >> #if OPAL_CC_USE_PRAGMA_IDENT >> >> Modified: trunk/orte/runtime/orte_wait.h >> ============================================================================== >> --- trunk/orte/runtime/orte_wait.h (original) >> +++ trunk/orte/runtime/orte_wait.h 2011-08-26 18:16:14 EDT (Fri, 26 Aug >> 2011) >> @@ -204,7 +204,7 @@ >> mev = OBJ_NEW(orte_message_event_t); \ >> mev->sender.jobid = (sndr)->jobid; \ >> mev->sender.vpid = (sndr)->vpid; \ >> - mev->sender.epoch = (sndr)->epoch; \ >> + ORTE_EPOCH_SET(mev->sender.epoch,(sndr)->epoch); \ >> opal_dss.copy_payload(mev->buffer, (buf)); \ >> mev->tag = (tg); \ >> mev->file = strdup((buf)->parent.cls_init_file_name); \ >> @@ -228,7 +228,7 @@ >> mev = OBJ_NEW(orte_message_event_t); \ >> mev->sender.jobid = (sndr)->jobid; \ >> mev->sender.vpid = (sndr)->vpid; \ >> - mev->sender.epoch = (sndr)->epoch; \ >> + ORTE_EPOCH_SET(mev->sender.epoch,(sndr)->epoch); \ >> opal_dss.copy_payload(mev->buffer, (buf)); \ >> mev->tag = (tg); \ >> opal_event_evtimer_set(opal_event_base, \ >> @@ -258,7 +258,7 
@@ >> tmp = OBJ_NEW(orte_notify_event_t); \ >> tmp->proc.jobid = (data)->jobid; \ >> tmp->proc.vpid = (data)->vpid; \ >> - tmp->proc.epoch = (data)->epoch; \ >> + ORTE_EPOCH_SET(tmp->proc.epoch,(data)->epoch); \ >> opal_event.evtimer_set(opal_event_base, \ >> tmp->ev, (cbfunc), tmp); \ >> now.tv_sec = 0; \ >> >> Modified: trunk/orte/test/system/oob_stress.c >> ============================================================================== >> --- trunk/orte/test/system/oob_stress.c (original) >> +++ trunk/orte/test/system/oob_stress.c 2011-08-26 18:16:14 EDT (Fri, >> 26 Aug 2011) >> @@ -74,8 +74,7 @@ >> >> for (j=1; j < count+1; j++) { >> peer.vpid = (ORTE_PROC_MY_NAME->vpid + j) % >> orte_process_info.num_procs; >> - peer.epoch = ORTE_EPOCH_INVALID; >> - peer.epoch = orte_ess.proc_get_epoch(&peer); >> + ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); >> >> /* rank0 starts ring */ >> if (ORTE_PROC_MY_NAME->vpid == 0) { >> >> Modified: trunk/orte/test/system/orte_ring.c >> ============================================================================== >> --- trunk/orte/test/system/orte_ring.c (original) >> +++ trunk/orte/test/system/orte_ring.c 2011-08-26 18:16:14 EDT (Fri, >> 26 Aug 2011) >> @@ -41,16 +41,14 @@ >> if( right_peer_orte_name.vpid >= num_peers ) { >> right_peer_orte_name.vpid = 0; >> } >> - right_peer_orte_name.epoch = ORTE_EPOCH_INVALID; >> - right_peer_orte_name.epoch = >> orte_ess.proc_get_epoch(&right_peer_orte_name); >> + >> ORTE_EPOCH_SET(right_peer_orte_name.epoch,orte_ess.proc_get_epoch(&right_peer_orte_name)); >> >> left_peer_orte_name.jobid = ORTE_PROC_MY_NAME->jobid; >> left_peer_orte_name.vpid = ORTE_PROC_MY_NAME->vpid - 1; >> if( ORTE_PROC_MY_NAME->vpid == 0 ) { >> left_peer_orte_name.vpid = num_peers - 1; >> } >> - left_peer_orte_name.epoch = ORTE_EPOCH_INVALID; >> - left_peer_orte_name.epoch = >> orte_ess.proc_get_epoch(&left_peer_orte_name); >> + >> 
ORTE_EPOCH_SET(left_peer_orte_name.epoch,orte_ess.proc_get_epoch(&left_peer_orte_name)); >> >> printf("My name is: %s -- PID %d\tMy Left Peer is %s\tMy Right Peer is >> %s\n", >> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), getpid(), >> >> Modified: trunk/orte/test/system/orte_spawn.c >> ============================================================================== >> --- trunk/orte/test/system/orte_spawn.c (original) >> +++ trunk/orte/test/system/orte_spawn.c 2011-08-26 18:16:14 EDT (Fri, >> 26 Aug 2011) >> @@ -74,8 +74,8 @@ >> for (i=0; i < app->num_procs; i++) { >> name.vpid = i; >> >> - name.epoch = ORTE_EPOCH_INVALID; >> - name.epoch = orte_ess.proc_get_epoch(&name); >> + ORTE_EPOCH_SET(name.epoch,orte_ess.proc_get_epoch(&name)); >> + >> fprintf(stderr, "Parent: sending message to child %s\n", >> ORTE_NAME_PRINT(&name)); >> if (0 > (rc = orte_rml.send(&name, &msg, 1, MY_TAG, 0))) { >> ORTE_ERROR_LOG(rc); >> >> Modified: trunk/orte/tools/orte-ps/orte-ps.c >> ============================================================================== >> --- trunk/orte/tools/orte-ps/orte-ps.c (original) >> +++ trunk/orte/tools/orte-ps/orte-ps.c 2011-08-26 18:16:14 EDT (Fri, >> 26 Aug 2011) >> @@ -869,8 +869,14 @@ >> } >> >> /* query the HNP for info on the procs in this job */ >> - if (ORTE_SUCCESS != (ret = >> orte_util_comm_query_proc_info(&(hnpinfo->hnp->name), job->jobid, >> - >> ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD, &cnt, &procs))) { >> + if (ORTE_SUCCESS != (ret = >> orte_util_comm_query_proc_info(&(hnpinfo->hnp->name), >> + >> job->jobid, >> + >> ORTE_VPID_WILDCARD, >> +#if ORTE_ENABLE_EPOCH >> + >> ORTE_EPOCH_WILDCARD, >> +#endif >> + &cnt, >> + &procs))) >> { >> ORTE_ERROR_LOG(ret); >> } >> job->procs->addr = (void**)procs; >> >> Modified: trunk/orte/tools/orte-top/orte-top.c >> ============================================================================== >> --- trunk/orte/tools/orte-top/orte-top.c (original) >> +++ trunk/orte/tools/orte-top/orte-top.c 2011-08-26 18:16:14 
EDT (Fri, >> 26 Aug 2011) >> @@ -471,7 +471,7 @@ >> if (NULL == ranks) { >> /* take all ranks */ >> proc.vpid = ORTE_VPID_WILDCARD; >> - proc.epoch = ORTE_EPOCH_WILDCARD; >> + ORTE_EPOCH_SET(proc.epoch,ORTE_EPOCH_WILDCARD); >> if (ORTE_SUCCESS != (ret = opal_dss.pack(&cmdbuf, &proc, 1, >> ORTE_NAME))) { >> ORTE_ERROR_LOG(ret); >> goto cleanup; >> >> Modified: trunk/orte/util/comm/comm.c >> ============================================================================== >> --- trunk/orte/util/comm/comm.c (original) >> +++ trunk/orte/util/comm/comm.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug >> 2011) >> @@ -433,8 +433,13 @@ >> return ORTE_SUCCESS; >> } >> >> +#if ORTE_ENABLE_EPOCH >> int orte_util_comm_query_proc_info(const orte_process_name_t *hnp, >> orte_jobid_t job, orte_vpid_t vpid, >> orte_epoch_t epoch, int *num_procs, >> orte_proc_t ***proc_info_array) >> +#else >> +int orte_util_comm_query_proc_info(const orte_process_name_t *hnp, >> orte_jobid_t job, orte_vpid_t vpid, >> + int *num_procs, orte_proc_t >> ***proc_info_array) >> +#endif >> { >> int ret; >> int32_t cnt, cnt_procs, n; >> @@ -463,11 +468,13 @@ >> OBJ_RELEASE(cmd); >> return ret; >> } >> +#if ORTE_ENABLE_EPOCH >> if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &epoch, 1, ORTE_EPOCH))) { >> ORTE_ERROR_LOG(ret); >> OBJ_RELEASE(cmd); >> return ret; >> } >> +#endif >> /* define a max time to wait for send to complete */ >> timer_fired = false; >> error_exit = ORTE_SUCCESS; >> >> Modified: trunk/orte/util/comm/comm.h >> ============================================================================== >> --- trunk/orte/util/comm/comm.h (original) >> +++ trunk/orte/util/comm/comm.h 2011-08-26 18:16:14 EDT (Fri, 26 Aug >> 2011) >> @@ -52,7 +52,10 @@ >> int *num_nodes, orte_node_t >> ***node_info_array); >> >> ORTE_DECLSPEC int orte_util_comm_query_proc_info(const orte_process_name_t >> *hnp, orte_jobid_t job, orte_vpid_t vpid, >> - orte_epoch_t epoch, int >> *num_procs, orte_proc_t ***proc_info_array); >> +#if 
ORTE_ENABLE_EPOCH >> + orte_epoch_t epoch, >> +#endif >> + int *num_procs, >> orte_proc_t ***proc_info_array); >> >> ORTE_DECLSPEC int orte_util_comm_spawn_job(const orte_process_name_t *hnp, >> orte_job_t *jdata); >> >> >> Modified: trunk/orte/util/hnp_contact.c >> ============================================================================== >> --- trunk/orte/util/hnp_contact.c (original) >> +++ trunk/orte/util/hnp_contact.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug >> 2011) >> @@ -55,7 +55,8 @@ >> { >> ptr->name.jobid = ORTE_JOBID_INVALID; >> ptr->name.vpid = ORTE_VPID_INVALID; >> - ptr->name.epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(ptr->name.epoch,ORTE_EPOCH_MIN); >> + >> ptr->rml_uri = NULL; >> } >> static void orte_hnp_contact_destruct(orte_hnp_contact_t *ptr) >> >> Modified: trunk/orte/util/name_fns.c >> ============================================================================== >> --- trunk/orte/util/name_fns.c (original) >> +++ trunk/orte/util/name_fns.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug >> 2011) >> @@ -46,7 +46,7 @@ >> { >> list->name.jobid = ORTE_JOBID_INVALID; >> list->name.vpid = ORTE_VPID_INVALID; >> - list->name.epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(list->name.epoch,ORTE_EPOCH_MIN); >> } >> >> /* destructor - used to free any resources held by instance */ >> @@ -116,7 +116,10 @@ >> char* orte_util_print_name_args(const orte_process_name_t *name) >> { >> orte_print_args_buffers_t *ptr; >> - char *job, *vpid, *epoch; >> + char *job, *vpid; >> +#if ORTE_ENABLE_EPOCH >> + char *epoch; >> +#endif >> >> /* protect against NULL names */ >> if (NULL == name) { >> @@ -141,7 +144,7 @@ >> */ >> job = orte_util_print_jobids(name->jobid); >> vpid = orte_util_print_vpids(name->vpid); >> - epoch = orte_util_print_epoch(name->epoch); >> + ORTE_EPOCH_SET(epoch,orte_util_print_epoch(name->epoch)); >> >> /* get the next buffer */ >> ptr = get_print_name_buffer(); >> @@ -156,9 +159,15 @@ >> ptr->cntr = 0; >> } >> >> +#if ORTE_ENABLE_EPOCH >> 
snprintf(ptr->buffers[ptr->cntr++], >> ORTE_PRINT_NAME_ARGS_MAX_SIZE, >> "[%s,%s,%s]", job, vpid, epoch); >> +#else >> + snprintf(ptr->buffers[ptr->cntr++], >> + ORTE_PRINT_NAME_ARGS_MAX_SIZE, >> + "[%s,%s]", job, vpid); >> +#endif >> >> return ptr->buffers[ptr->cntr-1]; >> } >> @@ -282,6 +291,7 @@ >> return ptr->buffers[ptr->cntr-1]; >> } >> >> +#if ORTE_ENABLE_EPOCH >> char* orte_util_print_epoch(const orte_epoch_t epoch) >> { >> orte_print_args_buffers_t *ptr; >> @@ -309,6 +319,7 @@ >> } >> return ptr->buffers[ptr->cntr-1]; >> } >> +#endif >> >> >> >> @@ -403,6 +414,7 @@ >> return ORTE_SUCCESS; >> } >> >> +#if ORTE_ENABLE_EPOCH >> int orte_util_convert_epoch_to_string(char **epoch_string, const >> orte_epoch_t epoch) >> { >> /* check for wildcard value - handle appropriately */ >> @@ -425,7 +437,6 @@ >> return ORTE_SUCCESS; >> } >> >> - >> int orte_util_convert_string_to_epoch(orte_epoch_t *epoch, const char* >> epoch_string) >> { >> if (NULL == epoch_string) { /* got an error */ >> @@ -450,6 +461,7 @@ >> >> return ORTE_SUCCESS; >> } >> +#endif >> >> int orte_util_convert_string_to_process_name(orte_process_name_t *name, >> const char* name_string) >> @@ -457,13 +469,15 @@ >> char *temp, *token; >> orte_jobid_t job; >> orte_vpid_t vpid; >> +#if ORTE_ENABLE_EPOCH >> orte_epoch_t epoch; >> +#endif >> int return_code=ORTE_SUCCESS; >> - >> + >> /* set default */ >> name->jobid = ORTE_JOBID_INVALID; >> name->vpid = ORTE_VPID_INVALID; >> - name->epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(name->epoch,ORTE_EPOCH_MIN); >> >> /* check for NULL string - error */ >> if (NULL == name_string) { >> @@ -510,6 +524,7 @@ >> vpid = strtoul(token, NULL, 10); >> } >> >> +#if ORTE_ENABLE_EPOCH >> token = strtok(NULL, ORTE_SCHEMA_DELIMITER_STRING); /** get next field >> -> epoch*/ >> >> /* check for error */ >> @@ -528,10 +543,11 @@ >> } else { >> epoch = strtoul(token, NULL, 10); >> } >> +#endif >> >> name->jobid = job; >> name->vpid = vpid; >> - name->epoch = epoch; >> + 
ORTE_EPOCH_SET(name->epoch,epoch); >> >> free(temp); >> >> @@ -568,6 +584,7 @@ >> asprintf(&tmp2, "%s%c%lu", tmp, ORTE_SCHEMA_DELIMITER_CHAR, (unsigned >> long)name->vpid); >> } >> >> +#if ORTE_ENABLE_EPOCH >> if (ORTE_EPOCH_WILDCARD == name->epoch) { >> asprintf(name_string, "%s%c%s", tmp2, ORTE_SCHEMA_DELIMITER_CHAR, >> ORTE_SCHEMA_WILDCARD_STRING); >> } else if (ORTE_EPOCH_INVALID == name->epoch) { >> @@ -575,6 +592,10 @@ >> } else { >> asprintf(name_string, "%s%c%lu", tmp2, ORTE_SCHEMA_DELIMITER_CHAR, >> (unsigned long)name->epoch); >> } >> +#else >> + asprintf(name_string, "%s", tmp2); >> +#endif >> + >> free(tmp); >> free(tmp2); >> >> @@ -585,8 +606,11 @@ >> /**** CREATE PROCESS NAME ****/ >> int orte_util_create_process_name(orte_process_name_t **name, >> orte_jobid_t job, >> - orte_vpid_t vpid, >> - orte_epoch_t epoch) >> + orte_vpid_t vpid >> +#if ORTE_ENABLE_EPOCH >> + ,orte_epoch_t epoch >> +#endif >> + ) >> { >> *name = NULL; >> >> @@ -598,7 +622,8 @@ >> >> (*name)->jobid = job; >> (*name)->vpid = vpid; >> - (*name)->epoch = epoch; >> + ORTE_EPOCH_SET((*name)->epoch,epoch); >> + >> return ORTE_SUCCESS; >> } >> >> @@ -655,6 +680,7 @@ >> } >> } >> >> +#if ORTE_ENABLE_EPOCH >> /* Get here if jobid's and vpid's are equal, or not being checked. >> * Now check epoch. 
>> */ >> @@ -666,6 +692,7 @@ >> return OPAL_VALUE1_GREATER; >> } >> } >> +#endif >> >> /* only way to get here is if all fields are being checked and are equal, >> * or jobid not checked, but vpid equal, >> >> Modified: trunk/orte/util/name_fns.h >> ============================================================================== >> --- trunk/orte/util/name_fns.h (original) >> +++ trunk/orte/util/name_fns.h 2011-08-26 18:16:14 EDT (Fri, 26 Aug >> 2011) >> @@ -61,9 +61,13 @@ >> #define ORTE_VPID_PRINT(n) \ >> orte_util_print_vpids(n) >> >> +#if ORTE_ENABLE_EPOCH >> ORTE_DECLSPEC char* orte_util_print_epoch(const orte_epoch_t epoch); >> #define ORTE_EPOCH_PRINT(n) \ >> orte_util_print_epoch(n) >> +#else >> +#define ORTE_EPOCH_PRINT(n) >> +#endif >> >> ORTE_DECLSPEC char* orte_util_print_job_family(const orte_jobid_t job); >> #define ORTE_JOB_FAMILY_PRINT(n) \ >> @@ -104,6 +108,24 @@ >> #define ORTE_JOBID_IS_DAEMON(n) \ >> !((n) & 0x0000ffff) >> >> +/* Macro for getting the epoch out of the process name */ >> +#if ORTE_ENABLE_EPOCH >> +#define ORTE_EPOCH_GET(n) \ >> + ((n)->epoch) >> +#else >> +#define ORTE_EPOCH_GET(n) >> +#endif >> + >> +/* Macro for setting the epoch in the process name */ >> +#if ORTE_ENABLE_EPOCH >> +#define ORTE_EPOCH_SET(n,m) \ >> + ( (n) = (m) ) >> +#else >> +#define ORTE_EPOCH_SET(n,m) \ >> + do { \ >> + } while(0); >> +#endif >> + >> /* List of names for general use */ >> struct orte_namelist_t { >> opal_list_item_t item; /**< Allows this item to be placed on a list >> */ >> @@ -117,16 +139,24 @@ >> ORTE_DECLSPEC int orte_util_convert_string_to_jobid(orte_jobid_t *jobid, >> const char* jobidstring); >> ORTE_DECLSPEC int orte_util_convert_vpid_to_string(char **vpid_string, const >> orte_vpid_t vpid); >> ORTE_DECLSPEC int orte_util_convert_string_to_vpid(orte_vpid_t *vpid, const >> char* vpidstring); >> +#if ORTE_ENABLE_EPOCH >> ORTE_DECLSPEC int orte_util_convert_epoch_to_string(char **epoch_string, >> const orte_epoch_t epoch); >> ORTE_DECLSPEC 
int orte_util_convert_string_to_epoch(orte_vpid_t *epoch, >> const char* epochstring); >> +#endif >> ORTE_DECLSPEC int >> orte_util_convert_string_to_process_name(orte_process_name_t *name, >> const char* name_string); >> ORTE_DECLSPEC int orte_util_convert_process_name_to_string(char** >> name_string, >> const orte_process_name_t *name); >> +#if ORTE_ENABLE_EPOCH >> ORTE_DECLSPEC int orte_util_create_process_name(orte_process_name_t **name, >> orte_jobid_t job, >> orte_vpid_t vpid, >> orte_epoch_t epoch); >> +#else >> +ORTE_DECLSPEC int orte_util_create_process_name(orte_process_name_t **name, >> + orte_jobid_t job, >> + orte_vpid_t vpid); >> +#endif >> ORTE_DECLSPEC int orte_util_compare_name_fields(orte_ns_cmp_bitmask_t fields, >> const orte_process_name_t* name1, >> const orte_process_name_t* name2); >> >> Modified: trunk/orte/util/nidmap.c >> ============================================================================== >> --- trunk/orte/util/nidmap.c (original) >> +++ trunk/orte/util/nidmap.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug 2011) >> @@ -249,7 +249,7 @@ >> */ >> /* construct the URI */ >> proc.vpid = node->daemon; >> - proc.epoch = ORTE_EPOCH_MIN; >> + ORTE_EPOCH_SET(proc.epoch,ORTE_EPOCH_MIN); >> >> orte_util_convert_process_name_to_string(&proc_name, &proc); >> asprintf(&uri, "%s;tcp://%s:%d", proc_name, addr, >> (int)orte_process_info.my_port); >> @@ -1001,6 +1001,7 @@ >> } >> #endif >> >> +#if ORTE_ENABLE_EPOCH >> /* Look up the current epoch value that we have stored locally. 
>> * >> * Note that this will not ping the HNP to get the most up to date epoch >> stored >> @@ -1023,7 +1024,9 @@ >> /*print_orte_job_data();*/ >> return e; >> } >> +#endif >> >> +#if ORTE_RESIL_ORTE >> bool orte_util_proc_is_running(orte_process_name_t *proc) { >> int i; >> unsigned int j; >> @@ -1078,7 +1081,9 @@ >> >> return ORTE_ERROR; >> } >> +#endif >> >> +#if ORTE_ENABLE_EPOCH >> /* >> * This function performs both the get and set operations on the epoch for a >> * specific process name. If the epoch passed into the function is >> @@ -1091,6 +1096,11 @@ >> orte_job_t *jdata; >> orte_proc_t *pdata; >> >> + if (ORTE_JOBID_INVALID == proc->jobid || >> + ORTE_VPID_INVALID == proc->vpid) { >> + return ORTE_EPOCH_INVALID; >> + } >> + >> /* Sanity check just to make sure we don't overwrite our existing >> * orte_job_data. >> */ >> @@ -1165,4 +1175,5 @@ >> return ORTE_EPOCH_MIN; >> } >> } >> +#endif >> >> >> Modified: trunk/orte/util/nidmap.h >> ============================================================================== >> --- trunk/orte/util/nidmap.h (original) >> +++ trunk/orte/util/nidmap.h 2011-08-26 18:16:14 EDT (Fri, 26 Aug 2011) >> @@ -48,11 +48,19 @@ >> ORTE_DECLSPEC orte_pmap_t* orte_util_lookup_pmap(orte_process_name_t *proc); >> ORTE_DECLSPEC orte_nid_t* orte_util_lookup_nid(orte_process_name_t *proc); >> >> +#if ORTE_ENABLE_EPOCH >> ORTE_DECLSPEC orte_epoch_t orte_util_lookup_epoch(orte_process_name_t *proc); >> ORTE_DECLSPEC orte_epoch_t orte_util_set_epoch(orte_process_name_t *proc, >> orte_epoch_t epoch); >> +#endif >> >> ORTE_DECLSPEC int orte_util_set_proc_state(orte_process_name_t *proc, >> orte_proc_state_t state); >> + >> +#if ORTE_RESIL_ORTE >> +#define PROC_IS_RUNNING(n) orte_util_proc_is_running(n) >> ORTE_DECLSPEC bool orte_util_proc_is_running(orte_process_name_t *proc); >> +#else >> +#define PROC_IS_RUNNING(n) ( true ) >> +#endif >> >> ORTE_DECLSPEC int orte_util_encode_nodemap(opal_byte_object_t *boptr); >> ORTE_DECLSPEC int 
orte_util_decode_nodemap(opal_byte_object_t *boptr); >> @@ -72,5 +80,8 @@ >> END_C_DECLS >> >> /* Local functions */ >> +#if ORTE_ENABLE_EPOCH >> orte_epoch_t get_epoch_from_orte_job_data(orte_process_name_t *proc, >> orte_epoch_t epoch); >> #endif >> + >> +#endif >> >> Modified: trunk/orte/util/proc_info.c >> ============================================================================== >> --- trunk/orte/util/proc_info.c (original) >> +++ trunk/orte/util/proc_info.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug >> 2011) >> @@ -36,13 +36,19 @@ >> >> #include "orte/util/proc_info.h" >> >> +#if ORTE_ENABLE_EPOCH >> +#define ORTE_NAME_INVALID {ORTE_JOBID_INVALID, ORTE_VPID_INVALID, >> ORTE_EPOCH_MIN} >> +#else >> +#define ORTE_NAME_INVALID {ORTE_JOBID_INVALID, ORTE_VPID_INVALID} >> +#endif >> + >> ORTE_DECLSPEC orte_proc_info_t orte_process_info = { >> - /* .my_name = */ {ORTE_JOBID_INVALID, >> ORTE_VPID_INVALID, ORTE_EPOCH_MIN}, >> - /* .my_daemon = */ {ORTE_JOBID_INVALID, >> ORTE_VPID_INVALID, ORTE_EPOCH_MIN}, >> + /* .my_name = */ ORTE_NAME_INVALID, >> + /* .my_daemon = */ ORTE_NAME_INVALID, >> /* .my_daemon_uri = */ NULL, >> - /* .my_hnp = */ {ORTE_JOBID_INVALID, >> ORTE_VPID_INVALID, ORTE_EPOCH_MIN}, >> + /* .my_hnp = */ ORTE_NAME_INVALID, >> /* .my_hnp_uri = */ NULL, >> - /* .my_parent = */ {ORTE_JOBID_INVALID, >> ORTE_VPID_INVALID, ORTE_EPOCH_MIN}, >> + /* .my_parent = */ ORTE_NAME_INVALID, >> /* .hnp_pid = */ 0, >> /* .app_num = */ 0, >> /* .num_procs = */ 1, >> >> Modified: trunk/test/util/orte_session_dir.c >> ============================================================================== >> --- trunk/test/util/orte_session_dir.c (original) >> +++ trunk/test/util/orte_session_dir.c 2011-08-26 18:16:14 EDT (Fri, >> 26 Aug 2011) >> @@ -57,7 +57,7 @@ >> orte_process_info.my_name->cellid = 0; >> orte_process_info.my_name->jobid = 0; >> orte_process_info.my_name->vpid = 0; >> - orte_process_info.my_name->epoch = ORTE_EPOCH_MIN; >> + 
ORTE_EPOCH_SET(orte_process_info.my_name->epoch,ORTE_EPOCH_MIN); >> >> test_init("orte_session_dir_t"); >> test_out = fopen( "test_session_dir_out", "w+" ); >> _______________________________________________ >> svn-full mailing list >> svn-f...@open-mpi.org >> hxxp://www.open-mpi.org/mailman/listinfo.cgi/svn-full > > > _______________________________________________ > devel mailing list > de...@open-mpi.org > hxxp://www.open-mpi.org/mailman/listinfo.cgi/devel >