You're right. Sorry about the typo. It was just corrected. On Aug 30, 2011, at 2:27 PM, Shamis, Pavel wrote:
> Hi all, > I'm not sure if it is relevant to this specific commit, but it is relevant > for some of the epoch changes. > I was not able to compile the latest trunk version on our Cray system; the > failure was in the ess/alps component, and to me it seems like a simple typo. I did > not have a chance to check my fix on our system, because I have been fighting > with Open MPI - VT component compilation on Cray. Please let me know if the > patch is ok. > > Please see the patch below: > > Index: orte/mca/ess/alps/ess_alps_module.c > =================================================================== > --- orte/mca/ess/alps/ess_alps_module.c (revision 25108) > +++ orte/mca/ess/alps/ess_alps_module.c (working copy) > @@ -363,8 +363,7 @@ > > ORTE_PROC_MY_NAME->jobid = jobid; > ORTE_PROC_MY_NAME->vpid = (orte_vpid_t) cnos_get_rank() + starting_vpid; > - ORTE_EPOCH_PRINT(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_INVALID); > - > ORTE_EPOCH_PRINT(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); > + > ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); > > OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, > "ess:alps set name to %s", > ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); > > > Pavel (Pasha) Shamis > --- > Application Performance Tools Group > Computer Science and Math Division > Oak Ridge National Laboratory > > > > > > > On Aug 26, 2011, at 6:18 PM, Wesley Bland wrote: > >> The epoch and resilient orte code is now macro'd away. To enable, use >> >> --enable-resilient-orte >> >> which defines: >> >> ORTE_ENABLE_EPOCH >> ORTE_RESIL_ORTE >> >> -- >> >> Wesley >> >> On Aug 26, 2011, at 6:16 PM, wbl...@osl.iu.edu wrote: >> >>> Author: wbland >>> Date: 2011-08-26 18:16:14 EDT (Fri, 26 Aug 2011) >>> New Revision: 25093 >>> URL: hxxps://svn.open-mpi.org/trac/ompi/changeset/25093 >>> >>> Log: >>> By popular demand the epoch code is now disabled by default. 
>>> >>> To enable the epochs and the resilient orte code, use the configure flag: >>> >>> --enable-resilient-orte >>> >>> This will define both: >>> >>> ORTE_ENABLE_EPOCH >>> ORTE_RESIL_ORTE >>> >>> Text files modified: >>> trunk/ompi/mca/btl/openib/connect/btl_openib_connect_xoob.c | 12 ++++ >>> trunk/ompi/mca/coll/sm2/coll_sm2_module.c | 3 >>> trunk/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c | 49 >>> ++++++++---------- >>> trunk/ompi/mca/dpm/orte/dpm_orte.c | 2 >>> trunk/ompi/mca/pml/bfo/pml_bfo_failover.c | 10 +-- >>> trunk/ompi/mca/pml/bfo/pml_bfo_hdr.h | 6 -- >>> trunk/ompi/proc/proc.c | 6 +- >>> trunk/opal/config/opal_configure_options.m4 | 8 +++ >>> trunk/orte/include/orte/types.h | 24 >>> +++++++++ >>> trunk/orte/mca/db/daemon/db_daemon.c | 2 >>> trunk/orte/mca/errmgr/app/errmgr_app.c | 19 ++++++- >>> trunk/orte/mca/errmgr/base/errmgr_base_fns.c | 12 ++-- >>> trunk/orte/mca/errmgr/base/errmgr_base_tool.c | 6 +- >>> trunk/orte/mca/errmgr/hnp/errmgr_hnp.c | 99 >>> +++++++++++++++++++++++++++------------ >>> trunk/orte/mca/errmgr/hnp/errmgr_hnp_autor.c | 6 +- >>> trunk/orte/mca/errmgr/hnp/errmgr_hnp_crmig.c | 6 +- >>> trunk/orte/mca/errmgr/orted/errmgr_orted.c | 71 >>> +++++++++++++++++++++------- >>> trunk/orte/mca/ess/alps/ess_alps_module.c | 4 >>> trunk/orte/mca/ess/base/base.h | 4 + >>> trunk/orte/mca/ess/base/ess_base_select.c | 14 ++--- >>> trunk/orte/mca/ess/env/ess_env_module.c | 3 >>> trunk/orte/mca/ess/ess.h | 4 + >>> trunk/orte/mca/ess/generic/ess_generic_module.c | 6 +- >>> trunk/orte/mca/ess/hnp/ess_hnp_module.c | 2 >>> trunk/orte/mca/ess/lsf/ess_lsf_module.c | 3 >>> trunk/orte/mca/ess/singleton/ess_singleton_module.c | 2 >>> trunk/orte/mca/ess/slave/ess_slave_module.c | 3 >>> trunk/orte/mca/ess/slurm/ess_slurm_module.c | 3 >>> trunk/orte/mca/ess/slurmd/ess_slurmd_module.c | 4 >>> trunk/orte/mca/ess/tm/ess_tm_module.c | 2 >>> trunk/orte/mca/filem/rsh/filem_rsh_module.c | 6 +- >>> trunk/orte/mca/grpcomm/base/grpcomm_base_coll.c | 21 ++----- >>> 
trunk/orte/mca/grpcomm/hier/grpcomm_hier_module.c | 8 +- >>> trunk/orte/mca/iof/base/base.h | 8 +- >>> trunk/orte/mca/iof/base/iof_base_open.c | 2 >>> trunk/orte/mca/iof/hnp/iof_hnp.c | 7 +- >>> trunk/orte/mca/iof/hnp/iof_hnp_receive.c | 6 +- >>> trunk/orte/mca/iof/orted/iof_orted.c | 2 >>> trunk/orte/mca/odls/base/odls_base_default_fns.c | 7 +- >>> trunk/orte/mca/odls/base/odls_base_open.c | 5 - >>> trunk/orte/mca/odls/base/odls_base_state.c | 6 +- >>> trunk/orte/mca/oob/tcp/oob_tcp_msg.c | 2 >>> trunk/orte/mca/oob/tcp/oob_tcp_peer.c | 5 ++ >>> trunk/orte/mca/plm/base/plm_base_jobid.c | 4 >>> trunk/orte/mca/plm/base/plm_base_launch_support.c | 3 >>> trunk/orte/mca/plm/base/plm_base_orted_cmds.c | 8 +-- >>> trunk/orte/mca/plm/base/plm_base_receive.c | 7 ++ >>> trunk/orte/mca/plm/base/plm_base_rsh_support.c | 4 + >>> trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c | 23 >>> +++++---- >>> trunk/orte/mca/rmaps/rank_file/rmaps_rank_file.c | 3 >>> trunk/orte/mca/rmaps/seq/rmaps_seq.c | 3 >>> trunk/orte/mca/rmcast/base/rmcast_base_open.c | 6 +- >>> trunk/orte/mca/rmcast/tcp/rmcast_tcp.c | 4 >>> trunk/orte/mca/rmcast/udp/rmcast_udp.c | 4 >>> trunk/orte/mca/rml/base/rml_base_components.c | 5 + >>> trunk/orte/mca/rml/rml_types.h | 6 + >>> trunk/orte/mca/routed/base/routed_base_components.c | 6 +- >>> trunk/orte/mca/routed/base/routed_base_register_sync.c | 4 + >>> trunk/orte/mca/routed/binomial/routed_binomial.c | 54 >>> ++++++++++++--------- >>> trunk/orte/mca/routed/cm/routed_cm.c | 19 +++---- >>> trunk/orte/mca/routed/direct/routed_direct.c | 3 >>> trunk/orte/mca/routed/linear/routed_linear.c | 17 +++--- >>> trunk/orte/mca/routed/radix/routed_radix.c | 22 >>> ++++---- >>> trunk/orte/mca/routed/slave/routed_slave.c | 6 +- >>> trunk/orte/mca/sensor/file/sensor_file.c | 2 >>> trunk/orte/mca/snapc/base/snapc_base_fns.c | 4 >>> trunk/orte/mca/snapc/full/snapc_full_global.c | 12 ++-- >>> trunk/orte/mca/snapc/full/snapc_full_local.c | 6 +- >>> 
trunk/orte/mca/snapc/full/snapc_full_module.c | 4 >>> trunk/orte/mca/sstore/base/sstore_base_fns.c | 6 +- >>> trunk/orte/mca/sstore/central/sstore_central_global.c | 3 >>> trunk/orte/mca/sstore/central/sstore_central_local.c | 6 +- >>> trunk/orte/mca/sstore/stage/sstore_stage_global.c | 7 +- >>> trunk/orte/mca/sstore/stage/sstore_stage_local.c | 12 ++-- >>> trunk/orte/orted/orted_comm.c | 20 >>> ++++---- >>> trunk/orte/orted/orted_main.c | 7 +- >>> trunk/orte/runtime/data_type_support/orte_dt_compare_fns.c | 4 + >>> trunk/orte/runtime/data_type_support/orte_dt_copy_fns.c | 4 + >>> trunk/orte/runtime/data_type_support/orte_dt_packing_fns.c | 6 ++ >>> trunk/orte/runtime/data_type_support/orte_dt_print_fns.c | 19 +++++++ >>> trunk/orte/runtime/data_type_support/orte_dt_size_fns.c | 2 >>> trunk/orte/runtime/data_type_support/orte_dt_support.h | 11 ++++ >>> trunk/orte/runtime/data_type_support/orte_dt_unpacking_fns.c | 10 +++ >>> trunk/orte/runtime/orte_data_server.c | 2 >>> trunk/orte/runtime/orte_globals.c | 4 + >>> trunk/orte/runtime/orte_init.c | 9 +++ >>> trunk/orte/runtime/orte_wait.h | 6 +- >>> trunk/orte/test/system/oob_stress.c | 3 >>> trunk/orte/test/system/orte_ring.c | 6 - >>> trunk/orte/test/system/orte_spawn.c | 4 >>> trunk/orte/tools/orte-ps/orte-ps.c | 10 +++ >>> trunk/orte/tools/orte-top/orte-top.c | 2 >>> trunk/orte/util/comm/comm.c | 7 ++ >>> trunk/orte/util/comm/comm.h | 5 + >>> trunk/orte/util/hnp_contact.c | 3 >>> trunk/orte/util/name_fns.c | 47 >>> ++++++++++++++---- >>> trunk/orte/util/name_fns.h | 30 >>> ++++++++++++ >>> trunk/orte/util/nidmap.c | 13 ++++ >>> trunk/orte/util/nidmap.h | 11 ++++ >>> trunk/orte/util/proc_info.c | 14 ++++- >>> trunk/test/util/orte_session_dir.c | 2 >>> 101 files changed, 652 insertions(+), 362 deletions(-) >>> >>> Modified: trunk/ompi/mca/btl/openib/connect/btl_openib_connect_xoob.c >>> ============================================================================== >>> --- 
trunk/ompi/mca/btl/openib/connect/btl_openib_connect_xoob.c >>> (original) >>> +++ trunk/ompi/mca/btl/openib/connect/btl_openib_connect_xoob.c >>> 2011-08-26 18:16:14 EDT (Fri, 26 Aug 2011) >>> @@ -693,8 +693,16 @@ >>> bool found = false; >>> >>> BTL_VERBOSE(("Searching for ep and proc with follow parameters:" >>> - "jobid %d, vpid %d, epoch %d, sid %" PRIx64 ", lid %d", >>> - process_name->jobid, process_name->vpid, >>> process_name->epoch, subnet_id, lid)); >>> + "jobid %d, vpid %d, " >>> +#if ORTE_ENABLE_EPOCH >>> + "epoch %d, " >>> +#endif >>> + "sid %" PRIx64 ", lid %d", >>> + process_name->jobid, process_name->vpid, >>> +#if ORTE_ENABLE_EPOCH >>> + process_name->epoch, >>> +#endif >>> + subnet_id, lid)); >>> /* find ibproc */ >>> OPAL_THREAD_LOCK(&mca_btl_openib_component.ib_lock); >>> for (ib_proc = (mca_btl_openib_proc_t*) >>> >>> Modified: trunk/ompi/mca/coll/sm2/coll_sm2_module.c >>> ============================================================================== >>> --- trunk/ompi/mca/coll/sm2/coll_sm2_module.c (original) >>> +++ trunk/ompi/mca/coll/sm2/coll_sm2_module.c 2011-08-26 18:16:14 EDT (Fri, >>> 26 Aug 2011) >>> @@ -1208,7 +1208,8 @@ >>> peer = OBJ_NEW(orte_namelist_t); >>> peer->name.jobid = >>> comm->c_local_group->grp_proc_pointers[i]->proc_name.jobid; >>> peer->name.vpid = >>> comm->c_local_group->grp_proc_pointers[i]->proc_name.vpid; >>> - peer->name.epoch = >>> comm->c_local_group->grp_proc_pointers[i]->proc_name.epoch; >>> + >>> ORTE_EPOCH_SET(peer->name.epoch,comm->c_local_group->grp_proc_pointers[i]->proc_name.epoch); >>> + >>> opal_list_append(&peers, &peer->item); >>> } >>> /* prepare send data */ >>> >>> Modified: trunk/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c >>> ============================================================================== >>> --- trunk/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c (original) >>> +++ trunk/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c 2011-08-26 18:16:14 >>> EDT (Fri, 26 Aug 2011) >>> @@ -702,7 +702,7 @@ >>> void 
ompi_crcp_bkmrk_pml_peer_ref_construct(ompi_crcp_bkmrk_pml_peer_ref_t >>> *peer_ref) { >>> peer_ref->proc_name.jobid = ORTE_JOBID_INVALID; >>> peer_ref->proc_name.vpid = ORTE_VPID_INVALID; >>> - peer_ref->proc_name.epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(peer_ref->proc_name.epoch,ORTE_EPOCH_MIN); >>> >>> OBJ_CONSTRUCT(&peer_ref->send_list, opal_list_t); >>> OBJ_CONSTRUCT(&peer_ref->isend_list, opal_list_t); >>> @@ -730,7 +730,7 @@ >>> >>> peer_ref->proc_name.jobid = ORTE_JOBID_INVALID; >>> peer_ref->proc_name.vpid = ORTE_VPID_INVALID; >>> - peer_ref->proc_name.epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(peer_ref->proc_name.epoch,ORTE_EPOCH_MIN); >>> >>> while( NULL != (item = opal_list_remove_first(&peer_ref->send_list)) ) { >>> HOKE_TRAFFIC_MSG_REF_RETURN(item); >>> @@ -840,7 +840,7 @@ >>> >>> msg_ref->proc_name.jobid = ORTE_JOBID_INVALID; >>> msg_ref->proc_name.vpid = ORTE_VPID_INVALID; >>> - msg_ref->proc_name.epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(msg_ref->proc_name.epoch,ORTE_EPOCH_MIN); >>> >>> msg_ref->matched = INVALID_INT; >>> msg_ref->done = INVALID_INT; >>> @@ -868,7 +868,7 @@ >>> >>> msg_ref->proc_name.jobid = ORTE_JOBID_INVALID; >>> msg_ref->proc_name.vpid = ORTE_VPID_INVALID; >>> - msg_ref->proc_name.epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(msg_ref->proc_name.epoch,ORTE_EPOCH_MIN); >>> >>> msg_ref->matched = INVALID_INT; >>> msg_ref->done = INVALID_INT; >>> @@ -902,7 +902,7 @@ >>> >>> msg_ref->proc_name.jobid = ORTE_JOBID_INVALID; >>> msg_ref->proc_name.vpid = ORTE_VPID_INVALID; >>> - msg_ref->proc_name.epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(msg_ref->proc_name.epoch,ORTE_EPOCH_MIN); >>> >>> msg_ref->done = INVALID_INT; >>> msg_ref->active = INVALID_INT; >>> @@ -934,7 +934,7 @@ >>> >>> msg_ref->proc_name.jobid = ORTE_JOBID_INVALID; >>> msg_ref->proc_name.vpid = ORTE_VPID_INVALID; >>> - msg_ref->proc_name.epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(msg_ref->proc_name.epoch,ORTE_EPOCH_MIN); >>> >>> msg_ref->done = INVALID_INT; >>> 
msg_ref->active = INVALID_INT; >>> @@ -954,7 +954,7 @@ >>> >>> msg_ack_ref->peer.jobid = ORTE_JOBID_INVALID; >>> msg_ack_ref->peer.vpid = ORTE_VPID_INVALID; >>> - msg_ack_ref->peer.epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(msg_ack_ref->peer.epoch,ORTE_EPOCH_MIN); >>> } >>> >>> void ompi_crcp_bkmrk_pml_drain_message_ack_ref_destruct( >>> ompi_crcp_bkmrk_pml_drain_message_ack_ref_t *msg_ack_ref) { >>> @@ -962,7 +962,7 @@ >>> >>> msg_ack_ref->peer.jobid = ORTE_JOBID_INVALID; >>> msg_ack_ref->peer.vpid = ORTE_VPID_INVALID; >>> - msg_ack_ref->peer.epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(msg_ack_ref->peer.epoch,ORTE_EPOCH_MIN); >>> } >>> >>> >>> @@ -1015,7 +1015,7 @@ >>> } >>> >>> >>> -#define CREATE_NEW_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, >>> v_rank, v_comm, p_jobid, p_vpid, p_epoch) \ >>> +#define CREATE_NEW_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, >>> v_rank, v_comm, p_jobid, p_vpid) \ >>> { \ >>> HOKE_TRAFFIC_MSG_REF_ALLOC(msg_ref, ret); \ >>> \ >>> @@ -1034,7 +1034,7 @@ >>> \ >>> msg_ref->proc_name.jobid = p_jobid; \ >>> msg_ref->proc_name.vpid = p_vpid; \ >>> - msg_ref->proc_name.epoch = p_epoch; \ >>> + >>> ORTE_EPOCH_SET(msg_ref->proc_name.epoch,orte_ess.proc_get_epoch(&(msg_ref->proc_name))); >>> \ >>> \ >>> msg_ref->matched = 0; \ >>> msg_ref->done = 0; \ >>> @@ -1043,7 +1043,7 @@ >>> msg_ref->active_drain = 0; \ >>> } >>> >>> -#define CREATE_NEW_DRAIN_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, >>> v_rank, v_comm, p_jobid, p_vpid, p_epoch) \ >>> +#define CREATE_NEW_DRAIN_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, >>> v_rank, v_comm, p_jobid, p_vpid) \ >>> { \ >>> HOKE_DRAIN_MSG_REF_ALLOC(msg_ref, ret); \ >>> \ >>> @@ -1063,7 +1063,7 @@ >>> \ >>> msg_ref->proc_name.jobid = p_jobid; \ >>> msg_ref->proc_name.vpid = p_vpid; \ >>> - msg_ref->proc_name.epoch = p_epoch; \ >>> + >>> ORTE_EPOCH_SET(msg_ref->proc_name.epoch,orte_ess.proc_get_epoch(&(msg_ref->proc_name))); >>> \ >>> } >>> >>> >>> @@ -1466,7 +1466,7 @@ >>> >>> 
new_peer_ref->proc_name.jobid = procs[i]->proc_name.jobid; >>> new_peer_ref->proc_name.vpid = procs[i]->proc_name.vpid; >>> - new_peer_ref->proc_name.epoch = procs[i]->proc_name.epoch; >>> + >>> ORTE_EPOCH_SET(new_peer_ref->proc_name.epoch,procs[i]->proc_name.epoch); >>> >>> opal_list_append(&ompi_crcp_bkmrk_pml_peer_refs, >>> &(new_peer_ref->super)); >>> } >>> @@ -3237,13 +3237,11 @@ >>> CREATE_NEW_MSG((*msg_ref), msg_type, >>> count, ddt_size, tag, dest, comm, >>> peer_ref->proc_name.jobid, >>> - peer_ref->proc_name.vpid, >>> - peer_ref->proc_name.epoch); >>> + peer_ref->proc_name.vpid); >>> } else { >>> CREATE_NEW_MSG((*msg_ref), msg_type, >>> count, ddt_size, tag, dest, comm, >>> - ORTE_JOBID_INVALID, ORTE_VPID_INVALID, >>> - ORTE_EPOCH_INVALID); >>> + ORTE_JOBID_INVALID, ORTE_VPID_INVALID); >>> } >>> >>> if( msg_type == COORD_MSG_TYPE_P_SEND || >>> @@ -3377,7 +3375,7 @@ >>> if( NULL == from_peer_ref && NULL != to_peer_ref ) { >>> (*new_msg_ref)->proc_name.jobid = to_peer_ref->proc_name.jobid; >>> (*new_msg_ref)->proc_name.vpid = to_peer_ref->proc_name.vpid; >>> - (*new_msg_ref)->proc_name.epoch = to_peer_ref->proc_name.epoch; >>> + >>> ORTE_EPOCH_SET((*new_msg_ref)->proc_name.epoch,to_peer_ref->proc_name.epoch); >>> } >>> >>> return exit_status; >>> @@ -3808,8 +3806,7 @@ >>> CREATE_NEW_DRAIN_MSG((*msg_ref), msg_type, >>> count, NULL, tag, dest, comm, >>> peer_ref->proc_name.jobid, >>> - peer_ref->proc_name.vpid, >>> - peer_ref->proc_name.epoch); >>> + peer_ref->proc_name.vpid); >>> >>> (*msg_ref)->done = 0; >>> (*msg_ref)->active = 0; >>> @@ -5284,8 +5281,7 @@ >>> */ >>> peer_name.jobid = ORTE_PROC_MY_NAME->jobid; >>> peer_name.vpid = peer_idx; >>> - peer_name.epoch = ORTE_EPOCH_INVALID; >>> - peer_name.epoch = orte_ess.proc_get_epoch(&peer_name); >>> + ORTE_EPOCH_SET(peer_name.epoch,orte_ess.proc_get_epoch(&peer_name)); >>> >>> if( NULL == (peer_ref = find_peer(peer_name))) { >>> opal_output(mca_crcp_bkmrk_component . super . 
output_handle, >>> @@ -5346,8 +5342,7 @@ >>> >>> peer_name.jobid = ORTE_PROC_MY_NAME->jobid; >>> peer_name.vpid = peer_idx; >>> - peer_name.epoch = ORTE_EPOCH_INVALID; >>> - peer_name.epoch = orte_ess.proc_get_epoch(&peer_name); >>> + ORTE_EPOCH_SET(peer_name.epoch,orte_ess.proc_get_epoch(&peer_name)); >>> >>> if ( 0 > (ret = orte_rml.recv_buffer_nb(&peer_name, >>> OMPI_CRCP_COORD_BOOKMARK_TAG, >>> @@ -5529,7 +5524,8 @@ >>> HOKE_DRAIN_ACK_MSG_REF_ALLOC(d_msg_ack, ret); >>> d_msg_ack->peer.jobid = peer_ref->proc_name.jobid; >>> d_msg_ack->peer.vpid = peer_ref->proc_name.vpid; >>> - d_msg_ack->peer.epoch = peer_ref->proc_name.epoch; >>> + ORTE_EPOCH_SET(d_msg_ack->peer.epoch,peer_ref->proc_name.epoch); >>> + >>> d_msg_ack->complete = false; >>> opal_list_append(&drained_msg_ack_list, &(d_msg_ack->super)); >>> OPAL_OUTPUT_VERBOSE((10, mca_crcp_bkmrk_component . super . output_handle, >>> @@ -6169,8 +6165,7 @@ >>> count, datatype_size, tag, rank, >>> ompi_comm_lookup(comm_id), >>> peer_ref->proc_name.jobid, >>> - peer_ref->proc_name.vpid, >>> - peer_ref->proc_name.epoch); >>> + peer_ref->proc_name.vpid); >>> >>> traffic_message_create_drain_message(true, num_left_unresolved, >>> peer_ref, >>> >>> Modified: trunk/ompi/mca/dpm/orte/dpm_orte.c >>> ============================================================================== >>> --- trunk/ompi/mca/dpm/orte/dpm_orte.c (original) >>> +++ trunk/ompi/mca/dpm/orte/dpm_orte.c 2011-08-26 18:16:14 EDT (Fri, >>> 26 Aug 2011) >>> @@ -1130,7 +1130,7 @@ >>> /* flag the identity of the remote proc */ >>> carport.jobid = mev->sender.jobid; >>> carport.vpid = mev->sender.vpid; >>> - carport.epoch = mev->sender.epoch; >>> + ORTE_EPOCH_SET(carport.epoch,mev->sender.epoch); >>> >>> /* release the event */ >>> OBJ_RELEASE(mev); >>> >>> Modified: trunk/ompi/mca/pml/bfo/pml_bfo_failover.c >>> ============================================================================== >>> --- trunk/ompi/mca/pml/bfo/pml_bfo_failover.c (original) >>> +++ 
trunk/ompi/mca/pml/bfo/pml_bfo_failover.c 2011-08-26 18:16:14 EDT (Fri, >>> 26 Aug 2011) >>> @@ -1,8 +1,5 @@ >>> /* >>> * Copyright (c) 2010 Oracle and/or its affiliates. All rights >>> reserved. >>> - * Copyright (c) 2004-2011 The University of Tennessee and The University >>> - * of Tennessee Research Foundation. All rights >>> - * reserved. >>> * $COPYRIGHT$ >>> * >>> * Additional copyrights may follow >>> @@ -398,13 +395,13 @@ >>> (hdr->hdr_match.hdr_seq != (uint16_t)recvreq->req_msgseq)) { >>> orte_proc.jobid = hdr->hdr_restart.hdr_jobid; >>> orte_proc.vpid = hdr->hdr_restart.hdr_vpid; >>> - orte_proc.epoch = hdr->hdr_restart.hdr_epoch; >>> + >>> ompi_proc = ompi_proc_find(&orte_proc); >>> opal_output_verbose(20, mca_pml_bfo_output, >>> "RNDVRESTARTNOTIFY: received: does not match >>> request, sending NACK back " >>> "PML:req=%d,hdr=%d CTX:req=%d,hdr=%d >>> SRC:req=%d,hdr=%d " >>> "RQS:req=%d,hdr=%d src_req=%p, dst_req=%p, >>> peer=%d, hdr->hdr_jobid=%d, " >>> - "hdr->hdr_vpid=%d, hdr->hdr_epoch=%d, >>> ompi_proc->proc_hostname=%s", >>> + "hdr->hdr_vpid=%d, >>> ompi_proc->proc_hostname=%s", >>> (uint16_t)recvreq->req_msgseq, >>> hdr->hdr_match.hdr_seq, >>> recvreq->req_recv . req_base . >>> req_comm->c_contextid, hdr->hdr_match.hdr_ctx, >>> recvreq->req_recv . req_base . req_ompi . >>> req_status . MPI_SOURCE, >>> @@ -413,7 +410,7 @@ >>> recvreq->remote_req_send.pval, (void *)recvreq, >>> recvreq->req_recv . req_base . req_ompi . >>> req_status . MPI_SOURCE, >>> hdr->hdr_restart.hdr_jobid, >>> hdr->hdr_restart.hdr_vpid, >>> - hdr->hdr_restart.hdr_epoch, >>> ompi_proc->proc_hostname); >>> + ompi_proc->proc_hostname); >>> mca_pml_bfo_recv_request_rndvrestartnack(des, ompi_proc, false); >>> return; >>> } >>> @@ -715,7 +712,6 @@ >>> restart->hdr_dst_rank = sendreq->req_send . req_base . 
req_peer; /* >>> Needed for NACKs */ >>> restart->hdr_jobid = ORTE_PROC_MY_NAME->jobid; >>> restart->hdr_vpid = ORTE_PROC_MY_NAME->vpid; >>> - restart->hdr_epoch = ORTE_PROC_MY_NAME->epoch; >>> >>> bfo_hdr_hton(restart, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY, proc); >>> >>> >>> Modified: trunk/ompi/mca/pml/bfo/pml_bfo_hdr.h >>> ============================================================================== >>> --- trunk/ompi/mca/pml/bfo/pml_bfo_hdr.h (original) >>> +++ trunk/ompi/mca/pml/bfo/pml_bfo_hdr.h 2011-08-26 18:16:14 EDT (Fri, >>> 26 Aug 2011) >>> @@ -2,9 +2,6 @@ >>> * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana >>> * University Research and Technology >>> * Corporation. All rights reserved. >>> - * Copyright (c) 2004-2011 The University of Tennessee and The University >>> - * of Tennessee Research Foundation. All rights >>> - * reserved. >>> * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, >>> * University of Stuttgart. All rights reserved. >>> * Copyright (c) 2004-2005 The Regents of the University of California. 
>>> @@ -415,7 +412,6 @@ >>> int32_t hdr_dst_rank; /**< needed to send NACK */ >>> uint32_t hdr_jobid; /**< needed to send NACK */ >>> uint32_t hdr_vpid; /**< needed to send NACK */ >>> - uint32_t hdr_epoch; /**< needed to send NACK */ >>> }; >>> typedef struct mca_pml_bfo_restart_hdr_t mca_pml_bfo_restart_hdr_t; >>> >>> @@ -428,7 +424,6 @@ >>> (h).hdr_dst_rank = ntohl((h).hdr_dst_rank); \ >>> (h).hdr_jobid = ntohl((h).hdr_jobid); \ >>> (h).hdr_vpid = ntohl((h).hdr_vpid); \ >>> - (h).hdr_epoch = ntohl((h).hdr_epoch); \ >>> } while (0) >>> >>> #define MCA_PML_BFO_RESTART_HDR_HTON(h) \ >>> @@ -437,7 +432,6 @@ >>> (h).hdr_dst_rank = htonl((h).hdr_dst_rank); \ >>> (h).hdr_jobid = htonl((h).hdr_jobid); \ >>> (h).hdr_vpid = htonl((h).hdr_vpid); \ >>> - (h).hdr_epoch = htonl((h).hdr_epoch); \ >>> } while (0) >>> >>> #endif /* PML_BFO */ >>> >>> Modified: trunk/ompi/proc/proc.c >>> ============================================================================== >>> --- trunk/ompi/proc/proc.c (original) >>> +++ trunk/ompi/proc/proc.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug 2011) >>> @@ -108,7 +108,8 @@ >>> >>> proc->proc_name.jobid = ORTE_PROC_MY_NAME->jobid; >>> proc->proc_name.vpid = i; >>> - proc->proc_name.epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(proc->proc_name.epoch,ORTE_EPOCH_MIN); >>> + >>> if (i == ORTE_PROC_MY_NAME->vpid) { >>> ompi_proc_local_proc = proc; >>> proc->proc_flags = OPAL_PROC_ALL_LOCAL; >>> @@ -362,8 +363,7 @@ >>> >>> /* Does not change: proc->proc_name.vpid */ >>> proc->proc_name.jobid = ORTE_PROC_MY_NAME->jobid; >>> - proc->proc_name.epoch = ORTE_EPOCH_INVALID; >>> - proc->proc_name.epoch = orte_ess.proc_get_epoch(&proc->proc_name); >>> + >>> ORTE_EPOCH_SET(proc->proc_name.epoch,orte_ess.proc_get_epoch(&proc->proc_name)); >>> >>> /* Make sure to clear the local flag before we set it below */ >>> proc->proc_flags = 0; >>> >>> Modified: trunk/opal/config/opal_configure_options.m4 >>> 
============================================================================== >>> --- trunk/opal/config/opal_configure_options.m4 (original) >>> +++ trunk/opal/config/opal_configure_options.m4 2011-08-26 18:16:14 >>> EDT (Fri, 26 Aug 2011) >>> @@ -416,6 +416,14 @@ >>> AM_CONDITIONAL(WANT_FT_CR, test "$opal_want_ft_cr" = "1") >>> >>> # >>> +# Compile in resilient runtime code >>> +# >>> +AC_ARG_ENABLE(resilient-orte, >>> + [AC_HELP_STRING([--enable-resilient-orte], [Enable the resilient >>> runtime code.])]) >>> +AM_CONDITIONAL(ORTE_RESIL_ORTE, [test "$enable_resilient_orte" = "yes"]) >>> +AM_CONDITIONAL(ORTE_ENABLE_EPOCH, [test "$enable_resilient_orte" = "yes"]) >>> + >>> +# >>> # Do we want to install binaries? >>> # >>> AC_ARG_ENABLE([binaries], >>> >>> Modified: trunk/orte/include/orte/types.h >>> ============================================================================== >>> --- trunk/orte/include/orte/types.h (original) >>> +++ trunk/orte/include/orte/types.h 2011-08-26 18:16:14 EDT (Fri, 26 Aug >>> 2011) >>> @@ -81,24 +81,43 @@ >>> #define ORTE_VPID_T OPAL_UINT32 >>> #define ORTE_VPID_MAX UINT32_MAX-2 >>> #define ORTE_VPID_MIN 0 >>> + >>> +#if ORTE_ENABLE_EPOCH >>> typedef uint32_t orte_epoch_t; >>> #define ORTE_EPOCH_T OPAL_UINT32 >>> #define ORTE_EPOCH_MAX UINT32_MAX-2 >>> #define ORTE_EPOCH_MIN 0 >>> +#endif >>> >>> +#if ORTE_ENABLE_EPOCH >>> #define ORTE_PROCESS_NAME_HTON(n) \ >>> do { \ >>> n.jobid = htonl(n.jobid); \ >>> n.vpid = htonl(n.vpid); \ >>> n.epoch = htonl(n.epoch); \ >>> } while (0) >>> +#else >>> +#define ORTE_PROCESS_NAME_HTON(n) \ >>> +do { \ >>> + n.jobid = htonl(n.jobid); \ >>> + n.vpid = htonl(n.vpid); \ >>> +} while (0) >>> +#endif >>> >>> +#if ORTE_ENABLE_EPOCH >>> #define ORTE_PROCESS_NAME_NTOH(n) \ >>> do { \ >>> n.jobid = ntohl(n.jobid); \ >>> n.vpid = ntohl(n.vpid); \ >>> n.epoch = ntohl(n.epoch); \ >>> } while (0) >>> +#else >>> +#define ORTE_PROCESS_NAME_NTOH(n) \ >>> +do { \ >>> + n.jobid = ntohl(n.jobid); \ >>> + n.vpid = 
ntohl(n.vpid); \ >>> +} while (0) >>> +#endif >>> >>> #define ORTE_NAME_ARGS(n) \ >>> (unsigned long) ((NULL == n) ? (unsigned long)ORTE_JOBID_INVALID : >>> (unsigned long)(n)->jobid), \ >>> @@ -127,6 +146,7 @@ >>> struct orte_process_name_t { >>> orte_jobid_t jobid; /**< Job number */ >>> orte_vpid_t vpid; /**< Process id - equivalent to rank */ >>> +#if ORTE_ENABLE_EPOCH >>> orte_epoch_t epoch; /**< Epoch - used to measure the generation of a >>> recovered process. >>> * The epoch will start at ORTE_EPOCH_MIN and >>> * increment every time the process is detected >>> as >>> @@ -135,6 +155,7 @@ >>> * processes that did not directly detect the >>> * failure to increment their epochs. >>> */ >>> +#endif >>> }; >>> typedef struct orte_process_name_t orte_process_name_t; >>> >>> @@ -157,7 +178,10 @@ >>> #define ORTE_NAME (OPAL_DSS_ID_DYNAMIC + 2) /**< an >>> orte_process_name_t */ >>> #define ORTE_VPID (OPAL_DSS_ID_DYNAMIC + 3) /**< a vpid >>> */ >>> #define ORTE_JOBID (OPAL_DSS_ID_DYNAMIC + 4) /**< a jobid >>> */ >>> + >>> +#if ORTE_ENABLE_EPOCH >>> #define ORTE_EPOCH (OPAL_DSS_ID_DYNAMIC + 5) /**< an >>> epoch */ >>> +#endif >>> >>> #if !ORTE_DISABLE_FULL_SUPPORT >>> /* State-related types */ >>> >>> Modified: trunk/orte/mca/db/daemon/db_daemon.c >>> ============================================================================== >>> --- trunk/orte/mca/db/daemon/db_daemon.c (original) >>> +++ trunk/orte/mca/db/daemon/db_daemon.c 2011-08-26 18:16:14 EDT (Fri, >>> 26 Aug 2011) >>> @@ -386,7 +386,7 @@ >>> dat = OBJ_NEW(orte_db_data_t); >>> dat->name.jobid = sender->jobid; >>> dat->name.vpid = sender->vpid; >>> - dat->name.epoch= sender->epoch; >>> + ORTE_EPOCH_SET(dat->name.epoch,sender->epoch); >>> dat->key = key; >>> count=1; >>> opal_dss.unpack(buf, &dat->size, &count, OPAL_INT32); >>> >>> Modified: trunk/orte/mca/errmgr/app/errmgr_app.c >>> ============================================================================== >>> --- trunk/orte/mca/errmgr/app/errmgr_app.c 
(original) >>> +++ trunk/orte/mca/errmgr/app/errmgr_app.c 2011-08-26 18:16:14 EDT (Fri, >>> 26 Aug 2011) >>> @@ -82,8 +82,10 @@ >>> NULL, >>> NULL, >>> NULL, >>> - orte_errmgr_base_register_migration_warning, >>> - orte_errmgr_base_set_fault_callback >>> + orte_errmgr_base_register_migration_warning >>> +#if ORTE_RESIL_ORTE >>> + ,orte_errmgr_base_set_fault_callback >>> +#endif >>> }; >>> >>> /************************ >>> @@ -93,18 +95,23 @@ >>> { >>> int ret = ORTE_SUCCESS; >>> >>> +#if ORTE_RESIL_ORTE >>> ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, >>> ORTE_RML_TAG_EPOCH_CHANGE, >>> ORTE_RML_PERSISTENT, >>> epoch_change_recv, >>> NULL); >>> +#endif >>> + >>> return ret; >>> } >>> >>> static int finalize(void) >>> { >>> +#if ORTE_RESIL_ORTE >>> orte_rml.recv_cancel(ORTE_NAME_WILDCARD, >>> ORTE_RML_TAG_EPOCH_CHANGE); >>> +#endif >>> >>> return ORTE_SUCCESS; >>> } >>> @@ -151,6 +158,7 @@ >>> return ORTE_SUCCESS; >>> } >>> >>> +#if ORTE_RESIL_ORTE >>> void epoch_change_recv(int status, >>> orte_process_name_t *sender, >>> opal_buffer_t *buffer, >>> @@ -209,15 +217,20 @@ >>> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); >>> >>> (*fault_cbfunc)(procs); >>> + } else if (NULL == fault_cbfunc) { >>> + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, >>> + "%s errmgr:app Calling fault callback failed (NULL >>> pointer)!", >>> + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); >>> } else { >>> OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, >>> - "%s errmgr:app Calling fault callback failed!", >>> + "%s errmgr:app Calling fault callback failed (num_dead >>> <= 0)!", >>> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); >>> } >>> >>> free(proc); >>> OBJ_RELEASE(procs); >>> } >>> +#endif >>> >>> static int orte_errmgr_app_abort_peers(orte_process_name_t *procs, >>> orte_std_cntr_t num_procs) >>> { >>> >>> Modified: trunk/orte/mca/errmgr/base/errmgr_base_fns.c >>> ============================================================================== >>> --- trunk/orte/mca/errmgr/base/errmgr_base_fns.c 
(original) >>> +++ trunk/orte/mca/errmgr/base/errmgr_base_fns.c 2011-08-26 18:16:14 >>> EDT (Fri, 26 Aug 2011) >>> @@ -97,13 +97,13 @@ >>> { >>> item->proc_name.vpid = ORTE_VPID_INVALID; >>> item->proc_name.jobid = ORTE_JOBID_INVALID; >>> - item->proc_name.epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(item->proc_name.epoch,ORTE_EPOCH_MIN); >>> } >>> >>> void orte_errmgr_predicted_proc_destruct( orte_errmgr_predicted_proc_t >>> *item) >>> { >>> item->proc_name.vpid = ORTE_VPID_INVALID; >>> - item->proc_name.epoch = ORTE_EPOCH_INVALID; >>> + ORTE_EPOCH_SET(item->proc_name.epoch,ORTE_EPOCH_INVALID); >>> item->proc_name.jobid = ORTE_JOBID_INVALID; >>> } >>> >>> @@ -139,13 +139,13 @@ >>> void orte_errmgr_predicted_map_construct(orte_errmgr_predicted_map_t *item) >>> { >>> item->proc_name.vpid = ORTE_VPID_INVALID; >>> - item->proc_name.epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(item->proc_name.epoch,ORTE_EPOCH_MIN); >>> item->proc_name.jobid = ORTE_JOBID_INVALID; >>> >>> item->node_name = NULL; >>> >>> item->map_proc_name.vpid = ORTE_VPID_INVALID; >>> - item->map_proc_name.epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(item->map_proc_name.epoch,ORTE_EPOCH_MIN); >>> item->map_proc_name.jobid = ORTE_JOBID_INVALID; >>> >>> item->map_node_name = NULL; >>> @@ -156,7 +156,7 @@ >>> void orte_errmgr_predicted_map_destruct( orte_errmgr_predicted_map_t *item) >>> { >>> item->proc_name.vpid = ORTE_VPID_INVALID; >>> - item->proc_name.epoch = ORTE_EPOCH_INVALID; >>> + ORTE_EPOCH_SET(item->proc_name.epoch,ORTE_EPOCH_INVALID); >>> item->proc_name.jobid = ORTE_JOBID_INVALID; >>> >>> if( NULL != item->node_name ) { >>> @@ -165,7 +165,7 @@ >>> } >>> >>> item->map_proc_name.vpid = ORTE_VPID_INVALID; >>> - item->map_proc_name.epoch = ORTE_EPOCH_INVALID; >>> + ORTE_EPOCH_SET(item->map_proc_name.epoch,ORTE_EPOCH_INVALID); >>> item->map_proc_name.jobid = ORTE_JOBID_INVALID; >>> >>> if( NULL != item->map_node_name ) { >>> >>> Modified: trunk/orte/mca/errmgr/base/errmgr_base_tool.c >>> 
============================================================================== >>> --- trunk/orte/mca/errmgr/base/errmgr_base_tool.c (original) >>> +++ trunk/orte/mca/errmgr/base/errmgr_base_tool.c 2011-08-26 18:16:14 >>> EDT (Fri, 26 Aug 2011) >>> @@ -267,7 +267,7 @@ >>> */ >>> errmgr_cmdline_sender.jobid = ORTE_JOBID_INVALID; >>> errmgr_cmdline_sender.vpid = ORTE_VPID_INVALID; >>> - errmgr_cmdline_sender.epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(errmgr_cmdline_sender.epoch,ORTE_EPOCH_MIN); >>> if (ORTE_SUCCESS != (ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, >>> ORTE_RML_TAG_MIGRATE, >>> 0, >>> @@ -379,14 +379,14 @@ >>> if( OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, >>> ORTE_NAME_INVALID, &errmgr_cmdline_sender) ) { >>> swap_dest.jobid = errmgr_cmdline_sender.jobid; >>> swap_dest.vpid = errmgr_cmdline_sender.vpid; >>> - swap_dest.epoch = errmgr_cmdline_sender.epoch; >>> + ORTE_EPOCH_SET(swap_dest.epoch,errmgr_cmdline_sender.epoch); >>> >>> errmgr_cmdline_sender = *sender; >>> >>> orte_errmgr_base_migrate_update(ORTE_ERRMGR_MIGRATE_STATE_ERR_INPROGRESS); >>> >>> errmgr_cmdline_sender.jobid = swap_dest.jobid; >>> errmgr_cmdline_sender.vpid = swap_dest.vpid; >>> - errmgr_cmdline_sender.epoch = swap_dest.epoch; >>> + ORTE_EPOCH_SET(errmgr_cmdline_sender.epoch,swap_dest.epoch); >>> >>> goto cleanup; >>> } >>> >>> Modified: trunk/orte/mca/errmgr/hnp/errmgr_hnp.c >>> ============================================================================== >>> --- trunk/orte/mca/errmgr/hnp/errmgr_hnp.c (original) >>> +++ trunk/orte/mca/errmgr/hnp/errmgr_hnp.c 2011-08-26 18:16:14 EDT (Fri, >>> 26 Aug 2011) >>> @@ -53,6 +53,7 @@ >>> #include "orte/runtime/orte_globals.h" >>> #include "orte/runtime/orte_locks.h" >>> #include "orte/runtime/orte_quit.h" >>> +#include "orte/runtime/data_type_support/orte_dt_support.h" >>> >>> #include "orte/mca/errmgr/errmgr.h" >>> #include "orte/mca/errmgr/base/base.h" >>> @@ -83,9 +84,11 @@ >>> 
orte_errmgr_hnp_global_suggest_map_targets, >>> /* FT Event hook */ >>> orte_errmgr_hnp_global_ft_event, >>> - orte_errmgr_base_register_migration_warning, >>> + orte_errmgr_base_register_migration_warning >>> +#if ORTE_RESIL_ORTE >>> /* Set the callback */ >>> - orte_errmgr_base_set_fault_callback >>> + ,orte_errmgr_base_set_fault_callback >>> +#endif >>> }; >>> >>> >>> @@ -97,14 +100,16 @@ >>> static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t >>> jobstate, >>> orte_proc_state_t state, >>> orte_exit_code_t exit_code); >>> static void check_job_complete(orte_job_t *jdata); >>> -static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t >>> epoch); >>> +static void killprocs(orte_jobid_t job, orte_vpid_t vpid); >>> static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc, >>> orte_proc_state_t state, orte_exit_code_t exit_code); >>> static orte_odls_child_t* proc_is_local(orte_process_name_t *proc); >>> +#if ORTE_RESIL_ORTE >>> static int send_to_local_applications(opal_pointer_array_t *dead_names); >>> static void failure_notification(int status, orte_process_name_t* sender, >>> opal_buffer_t *buffer, orte_rml_tag_t tag, >>> void* cbdata); >>> +#endif >>> >>> /************************ >>> * API Definitions >>> @@ -380,16 +385,21 @@ >>> **********************/ >>> int orte_errmgr_hnp_base_global_init(void) >>> { >>> - int ret; >>> + int ret = ORTE_SUCCESS; >>> >>> +#if ORTE_RESIL_ORTE >>> ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, >>> ORTE_RML_TAG_FAILURE_NOTICE, >>> ORTE_RML_PERSISTENT, failure_notification, >>> NULL); >>> +#endif >>> + >>> return ret; >>> } >>> >>> int orte_errmgr_hnp_base_global_finalize(void) >>> { >>> +#if ORTE_RESIL_ORTE >>> orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_FAILURE_NOTICE); >>> +#endif >>> >>> return ORTE_SUCCESS; >>> } >>> @@ -406,6 +416,7 @@ >>> orte_odls_child_t *child; >>> int rc; >>> orte_app_context_t *app; >>> + orte_proc_t *pdat; >>> >>> OPAL_OUTPUT_VERBOSE((1, 
orte_errmgr_base.output, >>> "%s errmgr:hnp: job %s reported state %s" >>> @@ -538,7 +549,7 @@ >>> ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED, >>> exit_code); >>> /* order all local procs for this job to be killed */ >>> - killprocs(jdata->jobid, ORTE_VPID_WILDCARD, >>> ORTE_EPOCH_WILDCARD); >>> + killprocs(jdata->jobid, ORTE_VPID_WILDCARD); >>> check_job_complete(jdata); /* set the local proc states */ >>> /* the job object for this job will have been NULL'd >>> * in the array if the job was solely local. If it isn't >>> @@ -550,7 +561,7 @@ >>> break; >>> case ORTE_JOB_STATE_COMM_FAILED: >>> /* order all local procs for this job to be killed */ >>> - killprocs(jdata->jobid, ORTE_VPID_WILDCARD, >>> ORTE_EPOCH_WILDCARD); >>> + killprocs(jdata->jobid, ORTE_VPID_WILDCARD); >>> check_job_complete(jdata); /* set the local proc states */ >>> /* the job object for this job will have been NULL'd >>> * in the array if the job was solely local. If it isn't >>> @@ -562,7 +573,7 @@ >>> break; >>> case ORTE_JOB_STATE_HEARTBEAT_FAILED: >>> /* order all local procs for this job to be killed */ >>> - killprocs(jdata->jobid, ORTE_VPID_WILDCARD, >>> ORTE_EPOCH_WILDCARD); >>> + killprocs(jdata->jobid, ORTE_VPID_WILDCARD); >>> check_job_complete(jdata); /* set the local proc states */ >>> /* the job object for this job will have been NULL'd >>> * in the array if the job was solely local. 
If it isn't >>> @@ -632,10 +643,6 @@ >>> } >>> } >>> >>> - if (ORTE_PROC_STATE_ABORTED_BY_SIG == state) { >>> - exit_code = 0; >>> - } >>> - >>> orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); >>> check_job_complete(jdata); /* need to set the job state */ >>> /* the job object for this job will have been NULL'd >>> @@ -679,7 +686,7 @@ >>> >>> case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: >>> if (jdata->enable_recovery) { >>> - killprocs(proc->jobid, proc->vpid, proc->epoch); >>> + killprocs(proc->jobid, proc->vpid); >>> /* is this a local proc */ >>> if (NULL != (child = proc_is_local(proc))) { >>> /* local proc - see if it has reached its restart limit */ >>> @@ -778,18 +785,37 @@ >>> opal_output(0, "%s UNABLE TO RELOCATE PROCS FROM >>> FAILED DAEMON %s", >>> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), >>> ORTE_NAME_PRINT(proc)); >>> /* kill all local procs */ >>> - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, >>> ORTE_EPOCH_WILDCARD); >>> + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); >>> /* kill all jobs */ >>> hnp_abort(ORTE_JOBID_WILDCARD, exit_code); >>> /* check if all is complete so we can terminate */ >>> check_job_complete(jdata); >>> } >>> } else { >>> +#if !ORTE_RESIL_ORTE >>> + if (NULL == (pdat = >>> (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) { >>> + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); >>> + orte_show_help("help-orte-errmgr-hnp.txt", >>> "errmgr-hnp:daemon-died", true, >>> + ORTE_VPID_PRINT(proc->vpid), >>> "Unknown"); >>> + } else { >>> + orte_show_help("help-orte-errmgr-hnp.txt", >>> "errmgr-hnp:daemon-died", true, >>> + ORTE_VPID_PRINT(proc->vpid), >>> + (NULL == pdat->node) ? "Unknown" : >>> + ((NULL == pdat->node->name) ? >>> "Unknown" : pdat->node->name)); >>> + } >>> +#endif >>> if (ORTE_SUCCESS != >>> orte_errmgr_hnp_record_dead_process(proc)) { >>> /* The process is already dead so don't keep trying >>> to do >>> * this stuff. 
*/ >>> return ORTE_SUCCESS; >>> } >>> + >>> +#if !ORTE_RESIL_ORTE >>> + /* kill all local procs */ >>> + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); >>> + /* kill all jobs */ >>> + hnp_abort(ORTE_JOBID_WILDCARD, exit_code); >>> +#endif >>> /* We'll check if the job was complete when we get the >>> * message back from the HNP notifying us of the dead >>> * process */ >>> @@ -805,7 +831,7 @@ >>> } else { >>> orte_errmgr_hnp_record_dead_process(proc); >>> /* kill all local procs */ >>> - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, >>> ORTE_EPOCH_WILDCARD); >>> + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); >>> /* kill all jobs */ >>> hnp_abort(ORTE_JOBID_WILDCARD, exit_code); >>> return ORTE_ERR_UNRECOVERABLE; >>> @@ -824,6 +850,7 @@ >>> return ORTE_SUCCESS; >>> } >>> >>> +#if ORTE_RESIL_ORTE >>> static void failure_notification(int status, orte_process_name_t* sender, >>> opal_buffer_t *buffer, orte_rml_tag_t tag, >>> void* cbdata) >>> @@ -984,6 +1011,7 @@ >>> >>> OBJ_RELEASE(dead_names); >>> } >>> +#endif >>> >>> /***************** >>> * Local Functions >>> @@ -1354,7 +1382,6 @@ >>> ORTE_UPDATE_EXIT_STATUS(proc->exit_code); >>> } >>> break; >>> -#if 0 >>> case ORTE_PROC_STATE_ABORTED_BY_SIG: >>> OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, >>> "%s errmgr:hnp:check_job_completed proc %s >>> aborted by signal", >>> @@ -1370,7 +1397,6 @@ >>> ORTE_UPDATE_EXIT_STATUS(proc->exit_code); >>> } >>> break; >>> -#endif >>> case ORTE_PROC_STATE_TERM_WO_SYNC: >>> OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, >>> "%s errmgr:hnp:check_job_completed proc %s >>> terminated without sync", >>> @@ -1393,7 +1419,6 @@ >>> } >>> break; >>> case ORTE_PROC_STATE_COMM_FAILED: >>> -#if 1 >>> if (!jdata->abort) { >>> jdata->state = ORTE_JOB_STATE_COMM_FAILED; >>> /* point to the lowest rank to cause the problem */ >>> @@ -1403,7 +1428,6 @@ >>> jdata->abort = true; >>> ORTE_UPDATE_EXIT_STATUS(proc->exit_code); >>> } >>> -#endif >>> break; >>> case 
ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: >>> if (!jdata->abort) { >>> @@ -1530,9 +1554,6 @@ >>> */ >>> CHECK_DAEMONS: >>> if (jdata == NULL || jdata->jobid == ORTE_PROC_MY_NAME->jobid) { >>> -#if 0 >>> - if ((jdata->num_procs - 1) <= jdata->num_terminated) { /* Subtract >>> one for the HNP */ >>> -#endif >>> if (0 == orte_routed.num_routes()) { >>> /* orteds are done! */ >>> OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, >>> @@ -1696,7 +1717,7 @@ >>> } >>> } >>> >>> -static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t >>> epoch) >>> +static void killprocs(orte_jobid_t job, orte_vpid_t vpid) >>> { >>> opal_pointer_array_t cmd; >>> orte_proc_t proc; >>> @@ -1707,7 +1728,9 @@ >>> orte_sensor.stop(job); >>> } >>> >>> - if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid && >>> ORTE_EPOCH_WILDCARD == epoch) { >>> + if (ORTE_JOBID_WILDCARD == job >>> + && ORTE_VPID_WILDCARD == vpid >>> + && ORTE_EPOCH_CMP(ORTE_EPOCH_WILDCARD,epoch)) { >>> if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) { >>> ORTE_ERROR_LOG(rc); >>> } >>> @@ -1718,7 +1741,7 @@ >>> OBJ_CONSTRUCT(&proc, orte_proc_t); >>> proc . name . jobid = job; >>> proc . name . vpid = vpid; >>> - proc . name . epoch = epoch; >>> + ORTE_EPOCH_SET(proc . name . epoch,epoch); >>> opal_pointer_array_add(&cmd, &proc); >>> if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) { >>> ORTE_ERROR_LOG(rc); >>> @@ -1913,13 +1936,15 @@ >>> } >>> >>> if (NULL != (pdat = >>> (orte_proc_t*)opal_pointer_array_get_item(jdat->procs, proc->vpid)) && >>> - ORTE_PROC_STATE_TERMINATED < pdat->state) { >>> + ORTE_PROC_STATE_TERMINATED > pdat->state) { >>> >>> +#if ORTE_ENABLE_EPOCH >>> /* Make sure that the epochs match. */ >>> if (proc->epoch != pdat->name.epoch) { >>> opal_output(1, "The epoch does not match the current epoch. 
>>> Throwing the request out."); >>> return ORTE_SUCCESS; >>> } >>> +#endif >>> >>> dead_names = OBJ_NEW(opal_pointer_array_t); >>> >>> @@ -1935,6 +1960,7 @@ >>> } >>> } >>> >>> +#if ORTE_RESIL_ORTE >>> if (!mca_errmgr_hnp_component.term_in_progress) { >>> /* >>> * Send a message to the other daemons so they know that a daemon >>> has >>> @@ -1949,7 +1975,7 @@ >>> OBJ_RELEASE(buffer); >>> } else { >>> >>> - /* Iterate of the list of dead procs and send them along >>> with >>> + /* Iterate over the list of dead procs and send them along >>> with >>> * the rest. The HNP needs this info so it can tell the other >>> * ORTEDs and they can inform the appropriate applications. >>> */ >>> @@ -1973,6 +1999,9 @@ >>> } else { >>> orte_errmgr_hnp_global_mark_processes_as_dead(dead_names); >>> } >>> +#else >>> + orte_errmgr_hnp_global_mark_processes_as_dead(dead_names); >>> +#endif >>> } >>> >>> return ORTE_SUCCESS; >>> @@ -2011,6 +2040,7 @@ >>> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), >>> ORTE_NAME_PRINT(&pdat->name))); >>> >>> +#if ORTE_RESIL_ORTE >>> /* Make sure the epochs match, if not it probably means that we >>> * already reported this failure. */ >>> if (name_item->epoch != pdat->name.epoch) { >>> @@ -2018,6 +2048,7 @@ >>> } >>> >>> orte_util_set_epoch(name_item, name_item->epoch + 1); >>> +#endif >>> >>> /* Remove it from the job array */ >>> opal_pointer_array_set_item(jdat->procs, name_item->vpid, NULL); >>> @@ -2034,6 +2065,7 @@ >>> >>> OBJ_RELEASE(pdat); >>> >>> +#if ORTE_RESIL_ORTE >>> /* Create a new proc object that will keep track of the epoch >>> * information */ >>> pdat = OBJ_NEW(orte_proc_t); >>> @@ -2041,14 +2073,15 @@ >>> pdat->name.vpid = name_item->vpid; >>> pdat->name.epoch = name_item->epoch + 1; >>> >>> - /* Set the state as terminated so we'll know the process isn't >>> - * actually there. 
*/ >>> - pdat->state = ORTE_PROC_STATE_TERMINATED; >>> - >>> opal_pointer_array_set_item(jdat->procs, name_item->vpid, pdat); >>> jdat->num_procs++; >>> jdat->num_terminated++; >>> +#endif >>> + /* Set the state as terminated so we'll know the process isn't >>> + * actually there. */ >>> + pdat->state = ORTE_PROC_STATE_TERMINATED; >>> } else { >>> +#if ORTE_RESIL_ORTE >>> opal_output(0, "Proc data not found for %s", >>> ORTE_NAME_PRINT(name_item)); >>> /* Create a new proc object that will keep track of the epoch >>> * information */ >>> @@ -2064,11 +2097,13 @@ >>> opal_pointer_array_set_item(jdat->procs, name_item->vpid, pdat); >>> jdat->num_procs++; >>> jdat->num_terminated++; >>> +#endif >>> } >>> >>> check_job_complete(jdat); >>> } >>> >>> +#if ORTE_RESIL_ORTE >>> if (!orte_orteds_term_ordered) { >>> /* Need to update the orted routing module. */ >>> orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid); >>> @@ -2077,10 +2112,12 @@ >>> (*fault_cbfunc)(dead_procs); >>> } >>> } >>> +#endif >>> >>> return ORTE_SUCCESS; >>> } >>> >>> +#if ORTE_RESIL_ORTE >>> int send_to_local_applications(opal_pointer_array_t *dead_names) { >>> opal_buffer_t *buf; >>> int ret = ORTE_SUCCESS; >>> @@ -2121,3 +2158,5 @@ >>> >>> return ret; >>> } >>> +#endif >>> + >>> >>> Modified: trunk/orte/mca/errmgr/hnp/errmgr_hnp_autor.c >>> ============================================================================== >>> --- trunk/orte/mca/errmgr/hnp/errmgr_hnp_autor.c (original) >>> +++ trunk/orte/mca/errmgr/hnp/errmgr_hnp_autor.c 2011-08-26 18:16:14 >>> EDT (Fri, 26 Aug 2011) >>> @@ -522,7 +522,7 @@ >>> wp_item = OBJ_NEW(errmgr_autor_wp_item_t); >>> wp_item->name.jobid = proc->jobid; >>> wp_item->name.vpid = proc->vpid; >>> - wp_item->name.epoch = proc->epoch; >>> + ORTE_EPOCH_SET(wp_item->name.epoch,proc->epoch); >>> wp_item->state = state; >>> >>> opal_list_append(procs_pending_recovery, &(wp_item->super)); >>> @@ -626,7 +626,7 @@ >>> { >>> wp->name.jobid = ORTE_JOBID_INVALID; >>> 
wp->name.vpid = ORTE_VPID_INVALID; >>> - wp->name.epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(wp->name.epoch,ORTE_EPOCH_MIN); >>> >>> wp->state = 0; >>> } >>> @@ -635,7 +635,7 @@ >>> { >>> wp->name.jobid = ORTE_JOBID_INVALID; >>> wp->name.vpid = ORTE_VPID_INVALID; >>> - wp->name.epoch = ORTE_EPOCH_INVALID; >>> + ORTE_EPOCH_SET(wp->name.epoch,ORTE_EPOCH_INVALID); >>> >>> wp->state = 0; >>> } >>> >>> Modified: trunk/orte/mca/errmgr/hnp/errmgr_hnp_crmig.c >>> ============================================================================== >>> --- trunk/orte/mca/errmgr/hnp/errmgr_hnp_crmig.c (original) >>> +++ trunk/orte/mca/errmgr/hnp/errmgr_hnp_crmig.c 2011-08-26 18:16:14 >>> EDT (Fri, 26 Aug 2011) >>> @@ -750,7 +750,7 @@ >>> close_iof_stdin = true; >>> iof_name.jobid = proc->name.jobid; >>> iof_name.vpid = proc->name.vpid; >>> - iof_name.epoch = proc->name.epoch; >>> + ORTE_EPOCH_SET(iof_name.epoch,proc->name.epoch); >>> } >>> } >>> } >>> @@ -807,7 +807,7 @@ >>> close_iof_stdin = true; >>> iof_name.jobid = proc->name.jobid; >>> iof_name.vpid = proc->name.vpid; >>> - iof_name.epoch = proc->name.epoch; >>> + ORTE_EPOCH_SET(iof_name.epoch,proc->name.epoch); >>> } >>> } >>> } >>> @@ -855,7 +855,7 @@ >>> close_iof_stdin = true; >>> iof_name.jobid = proc->name.jobid; >>> iof_name.vpid = proc->name.vpid; >>> - iof_name.epoch = proc->name.epoch; >>> + ORTE_EPOCH_SET(iof_name.epoch,proc->name.epoch); >>> } >>> } >>> } >>> >>> Modified: trunk/orte/mca/errmgr/orted/errmgr_orted.c >>> ============================================================================== >>> --- trunk/orte/mca/errmgr/orted/errmgr_orted.c (original) >>> +++ trunk/orte/mca/errmgr/orted/errmgr_orted.c 2011-08-26 18:16:14 >>> EDT (Fri, 26 Aug 2011) >>> @@ -34,6 +34,7 @@ >>> #include "orte/util/show_help.h" >>> #include "orte/util/nidmap.h" >>> #include "orte/runtime/orte_globals.h" >>> +#include "orte/runtime/data_type_support/orte_dt_support.h" >>> #include "orte/mca/rml/rml.h" >>> #include 
"orte/mca/odls/odls.h" >>> #include "orte/mca/odls/base/base.h" >>> @@ -41,7 +42,9 @@ >>> #include "orte/mca/plm/plm_types.h" >>> #include "orte/mca/routed/routed.h" >>> #include "orte/mca/sensor/sensor.h" >>> +#include "orte/mca/ess/ess.h" >>> #include "orte/runtime/orte_quit.h" >>> +#include "orte/runtime/orte_globals.h" >>> >>> #include "orte/mca/errmgr/errmgr.h" >>> #include "orte/mca/errmgr/base/base.h" >>> @@ -59,13 +62,15 @@ >>> static void update_local_children(orte_odls_job_t *jobdat, >>> orte_job_state_t jobstate, >>> orte_proc_state_t state); >>> -static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t >>> epoch); >>> +static void killprocs(orte_jobid_t job, orte_vpid_t vpid); >>> static int record_dead_process(orte_process_name_t *proc); >>> -static int send_to_local_applications(opal_pointer_array_t *dead_names); >>> static int mark_processes_as_dead(opal_pointer_array_t *dead_procs); >>> +#if ORTE_RESIL_ORTE >>> +static int send_to_local_applications(opal_pointer_array_t *dead_names); >>> static void failure_notification(int status, orte_process_name_t* sender, >>> opal_buffer_t *buffer, orte_rml_tag_t tag, >>> void* cbdata); >>> +#endif >>> >>> /* >>> * Module functions: Global >>> @@ -104,8 +109,10 @@ >>> predicted_fault, >>> suggest_map_targets, >>> ft_event, >>> - orte_errmgr_base_register_migration_warning, >>> - orte_errmgr_base_set_fault_callback /* Set callback function */ >>> + orte_errmgr_base_register_migration_warning >>> +#if ORTE_RESIL_ORTE >>> + ,orte_errmgr_base_set_fault_callback /* Set callback function */ >>> +#endif >>> }; >>> >>> /************************ >>> @@ -113,16 +120,22 @@ >>> ************************/ >>> static int init(void) >>> { >>> - int ret; >>> + int ret = ORTE_SUCCESS; >>> >>> +#if ORTE_RESIL_ORTE >>> ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, >>> ORTE_RML_TAG_FAILURE_NOTICE, >>> ORTE_RML_PERSISTENT, failure_notification, >>> NULL); >>> +#endif >>> + >>> return ret; >>> } >>> >>> static int 
finalize(void) >>> { >>> +#if ORTE_RESIL_ORTE >>> orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_FAILURE_NOTICE); >>> +#endif >>> + >>> return ORTE_SUCCESS; >>> } >>> >>> @@ -228,10 +241,10 @@ >>> /* update all procs in job */ >>> update_local_children(jobdat, jobstate, >>> ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED); >>> /* order all local procs for this job to be killed */ >>> - killprocs(jobdat->jobid, ORTE_VPID_WILDCARD, >>> ORTE_EPOCH_WILDCARD); >>> + killprocs(jobdat->jobid, ORTE_VPID_WILDCARD); >>> case ORTE_JOB_STATE_COMM_FAILED: >>> /* kill all local procs */ >>> - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, >>> ORTE_EPOCH_WILDCARD); >>> + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); >>> /* tell the caller we can't recover */ >>> return ORTE_ERR_UNRECOVERABLE; >>> break; >>> @@ -276,7 +289,7 @@ >>> /* see if this was a lifeline */ >>> if (ORTE_SUCCESS != orte_routed.route_lost(proc)) { >>> /* kill our children */ >>> - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, >>> ORTE_EPOCH_WILDCARD); >>> + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); >>> /* terminate - our routed children will see >>> * us leave and automatically die >>> */ >>> @@ -290,10 +303,18 @@ >>> if (0 == orte_routed.num_routes() && >>> 0 == opal_list_get_size(&orte_local_children)) { >>> orte_quit(); >>> + } else { >>> + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, >>> + "%s errmgr:orted not exiting, num_routes() == >>> %d, num children == %d", >>> + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), >>> + orte_routed.num_routes(), >>> + opal_list_get_size(&orte_local_children))); >>> } >>> } >>> >>> +#if ORTE_RESIL_ORTE >>> record_dead_process(proc); >>> +#endif >>> >>> /* if not, then indicate we can continue */ >>> return ORTE_SUCCESS; >>> @@ -344,7 +365,7 @@ >>> /* Decrement the number of local procs */ >>> jobdat->num_local_procs--; >>> /* kill this proc */ >>> - killprocs(proc->jobid, proc->vpid, proc->epoch); >>> + killprocs(proc->jobid, proc->vpid); >>> } >>> app = 
>>> (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, >>> child->app_idx); >>> if( jobdat->enable_recovery && child->restarts < >>> app->max_restarts ) { >>> @@ -526,10 +547,12 @@ >>> ORTE_ERROR_LOG(rc); >>> goto FINAL_CLEANUP; >>> } >>> +#if ORTE_ENABLE_EPOCH >>> if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, >>> &child->name->epoch, 1, ORTE_EPOCH))) { >>> ORTE_ERROR_LOG(rc); >>> goto FINAL_CLEANUP; >>> } >>> +#endif >>> } >>> } >>> /* pack an invalid marker */ >>> @@ -660,7 +683,7 @@ >>> continue; >>> } >>> >>> - if (name_item->epoch < orte_util_lookup_epoch(name_item)) { >>> + if (0 < >>> ORTE_EPOCH_CMP(name_item->epoch,orte_ess.proc_get_epoch(name_item))) { >>> continue; >>> } >>> >>> @@ -669,9 +692,11 @@ >>> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), >>> ORTE_NAME_PRINT(name_item))); >>> >>> +#if ORTE_ENABLE_EPOCH >>> /* Increment the epoch */ >>> orte_util_set_proc_state(name_item, ORTE_PROC_STATE_TERMINATED); >>> orte_util_set_epoch(name_item, name_item->epoch + 1); >>> +#endif >>> >>> OPAL_THREAD_LOCK(&orte_odls_globals.mutex); >>> >>> @@ -706,6 +731,7 @@ >>> return ORTE_SUCCESS; >>> } >>> >>> +#if ORTE_RESIL_ORTE >>> static void failure_notification(int status, orte_process_name_t* sender, >>> opal_buffer_t *buffer, orte_rml_tag_t tag, >>> void* cbdata) >>> @@ -714,7 +740,7 @@ >>> orte_std_cntr_t n; >>> int ret = ORTE_SUCCESS, num_failed; >>> int32_t i; >>> - orte_process_name_t *name_item, proc; >>> + orte_process_name_t *name_item; >>> >>> dead_names = OBJ_NEW(opal_pointer_array_t); >>> >>> @@ -746,7 +772,7 @@ >>> /* There shouldn't be an issue of receiving this message multiple >>> * times but it doesn't hurt to double check. 
>>> */ >>> - if (proc.epoch < orte_util_lookup_epoch(name_item)) { >>> + if (0 < >>> ORTE_EPOCH_CMP(name_item->epoch,orte_ess.proc_get_epoch(name_item))) { >>> opal_output(1, "Received from proc %s local epoch %d", >>> ORTE_NAME_PRINT(name_item), orte_util_lookup_epoch(name_item)); >>> continue; >>> } >>> @@ -767,6 +793,7 @@ >>> free(name_item); >>> } >>> } >>> +#endif >>> >>> /***************** >>> * Local Functions >>> @@ -948,11 +975,13 @@ >>> ORTE_ERROR_LOG(rc); >>> return rc; >>> } >>> +#if ORTE_ENABLE_EPOCH >>> /* Pack the child's epoch. */ >>> if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, >>> &(child->name->epoch), 1, ORTE_EPOCH))) { >>> ORTE_ERROR_LOG(rc); >>> return rc; >>> } >>> +#endif >>> /* pack the contact info */ >>> if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &child->rml_uri, 1, >>> OPAL_STRING))) { >>> ORTE_ERROR_LOG(rc); >>> @@ -1015,7 +1044,7 @@ >>> } >>> } >>> >>> -static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t >>> epoch) >>> +static void killprocs(orte_jobid_t job, orte_vpid_t vpid) >>> { >>> opal_pointer_array_t cmd; >>> orte_proc_t proc; >>> @@ -1026,7 +1055,9 @@ >>> orte_sensor.stop(job); >>> } >>> >>> - if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid && >>> ORTE_EPOCH_WILDCARD == epoch) { >>> + if (ORTE_JOBID_WILDCARD == job >>> + && ORTE_VPID_WILDCARD == vpid >>> + && 0 == ORTE_EPOCH_CMP(ORTE_EPOCH_WILDCARD,epoch)) { >>> if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) { >>> ORTE_ERROR_LOG(rc); >>> } >>> @@ -1037,7 +1068,7 @@ >>> OBJ_CONSTRUCT(&proc, orte_proc_t); >>> proc . name . jobid = job; >>> proc . name . vpid = vpid; >>> - proc . name . epoch = epoch; >>> + ORTE_EPOCH_SET(proc . name . 
epoch,epoch); >>> opal_pointer_array_add(&cmd, &proc); >>> if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) { >>> ORTE_ERROR_LOG(rc); >>> @@ -1082,20 +1113,21 @@ >>> return rc; >>> } >>> >>> +#if ORTE_RESIL_ORTE >>> int send_to_local_applications(opal_pointer_array_t *dead_names) { >>> opal_buffer_t *buf; >>> int ret; >>> orte_process_name_t *name_item; >>> int size, i; >>> >>> - OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, >>> - "%s Sending failure to local applications.", >>> - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); >>> - >>> buf = OBJ_NEW(opal_buffer_t); >>> >>> size = opal_pointer_array_get_size(dead_names); >>> >>> + OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, >>> + "%s Sending %d failure(s) to local applications.", >>> + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), size)); >>> + >>> if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, &size, 1, ORTE_VPID))) { >>> ORTE_ERROR_LOG(ret); >>> OBJ_RELEASE(buf); >>> @@ -1122,4 +1154,5 @@ >>> >>> return ORTE_SUCCESS; >>> } >>> +#endif >>> >>> >>> Modified: trunk/orte/mca/ess/alps/ess_alps_module.c >>> ============================================================================== >>> --- trunk/orte/mca/ess/alps/ess_alps_module.c (original) >>> +++ trunk/orte/mca/ess/alps/ess_alps_module.c 2011-08-26 18:16:14 EDT (Fri, >>> 26 Aug 2011) >>> @@ -363,8 +363,8 @@ >>> >>> ORTE_PROC_MY_NAME->jobid = jobid; >>> ORTE_PROC_MY_NAME->vpid = (orte_vpid_t) cnos_get_rank() + starting_vpid; >>> - ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_INVALID; >>> - ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME); >>> + ORTE_EPOCH_PRINT(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_INVALID); >>> + >>> ORTE_EPOCH_PRINT(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); >>> >>> OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, >>> "ess:alps set name to %s", >>> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); >>> >>> Modified: trunk/orte/mca/ess/base/base.h >>> 
============================================================================== >>> --- trunk/orte/mca/ess/base/base.h (original) >>> +++ trunk/orte/mca/ess/base/base.h 2011-08-26 18:16:14 EDT (Fri, 26 Aug >>> 2011) >>> @@ -57,7 +57,11 @@ >>> >>> ORTE_DECLSPEC extern opal_list_t orte_ess_base_components_available; >>> >>> +#if ORTE_ENABLE_EPOCH >>> ORTE_DECLSPEC orte_epoch_t orte_ess_base_proc_get_epoch(orte_process_name_t >>> *proc); >>> +#else >>> +ORTE_DECLSPEC int orte_ess_base_proc_get_epoch(orte_process_name_t *proc); >>> +#endif >>> >>> #if !ORTE_DISABLE_FULL_SUPPORT >>> >>> >>> Modified: trunk/orte/mca/ess/base/ess_base_select.c >>> ============================================================================== >>> --- trunk/orte/mca/ess/base/ess_base_select.c (original) >>> +++ trunk/orte/mca/ess/base/ess_base_select.c 2011-08-26 18:16:14 EDT (Fri, >>> 26 Aug 2011) >>> @@ -36,21 +36,19 @@ >>> * Generic function to retrieve the epoch of a specific process >>> * from the job data. >>> */ >>> +#if !ORTE_ENABLE_EPOCH >>> +int orte_ess_base_proc_get_epoch(orte_process_name_t *proc) { >>> + return 0; >>> +} >>> +#else >>> orte_epoch_t orte_ess_base_proc_get_epoch(orte_process_name_t *proc) { >>> orte_epoch_t epoch = ORTE_EPOCH_INVALID; >>> >>> -#if !ORTE_DISABLE_FULL_SUPPORT >>> epoch = orte_util_lookup_epoch(proc); >>> -#endif >>> - >>> - OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output, >>> - "%s ess:generic: proc %s has epoch %d", >>> - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), >>> - ORTE_NAME_PRINT(proc), >>> - epoch)); >>> >>> return epoch; >>> } >>> +#endif >>> >>> int >>> orte_ess_base_select(void) >>> >>> Modified: trunk/orte/mca/ess/env/ess_env_module.c >>> ============================================================================== >>> --- trunk/orte/mca/ess/env/ess_env_module.c (original) >>> +++ trunk/orte/mca/ess/env/ess_env_module.c 2011-08-26 18:16:14 EDT (Fri, >>> 26 Aug 2011) >>> @@ -392,8 +392,7 @@ >>> >>> ORTE_PROC_MY_NAME->jobid = jobid; >>> 
ORTE_PROC_MY_NAME->vpid = vpid; >>> - ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_INVALID; >>> - ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME); >>> + >>> ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); >>> >>> OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, >>> "ess:env set name to %s", >>> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); >>> >>> Modified: trunk/orte/mca/ess/ess.h >>> ============================================================================== >>> --- trunk/orte/mca/ess/ess.h (original) >>> +++ trunk/orte/mca/ess/ess.h 2011-08-26 18:16:14 EDT (Fri, 26 Aug 2011) >>> @@ -111,7 +111,11 @@ >>> * will get the most up to date version stored within the orte_proc_t struct. >>> * Obviously the epoch of the proc that is passed in will be ignored. >>> */ >>> +#if ORTE_ENABLE_EPOCH >>> typedef orte_epoch_t >>> (*orte_ess_base_module_proc_get_epoch_fn_t)(orte_process_name_t *proc); >>> +#else >>> +typedef int >>> (*orte_ess_base_module_proc_get_epoch_fn_t)(orte_process_name_t *proc); >>> +#endif >>> >>> /** >>> * Update the pidmap >>> >>> Modified: trunk/orte/mca/ess/generic/ess_generic_module.c >>> ============================================================================== >>> --- trunk/orte/mca/ess/generic/ess_generic_module.c (original) >>> +++ trunk/orte/mca/ess/generic/ess_generic_module.c 2011-08-26 18:16:14 >>> EDT (Fri, 26 Aug 2011) >>> @@ -155,7 +155,7 @@ >>> goto error; >>> } >>> ORTE_PROC_MY_NAME->vpid = strtol(envar, NULL, 10); >>> - ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN); >>> >>> OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, >>> "%s completed name definition", >>> @@ -273,7 +273,7 @@ >>> if (vpid == ORTE_PROC_MY_NAME->vpid) { >>> ORTE_PROC_MY_DAEMON->jobid = 0; >>> ORTE_PROC_MY_DAEMON->vpid = i; >>> - ORTE_PROC_MY_DAEMON->epoch = ORTE_PROC_MY_NAME->epoch; >>> + >>> ORTE_EPOCH_SET(ORTE_PROC_MY_DAEMON->epoch,ORTE_PROC_MY_NAME->epoch); >>> 
} >>> OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, >>> "%s node %d name %s rank %s", >>> @@ -304,7 +304,7 @@ >>> if (vpid == ORTE_PROC_MY_NAME->vpid) { >>> ORTE_PROC_MY_DAEMON->jobid = 0; >>> ORTE_PROC_MY_DAEMON->vpid = i; >>> - ORTE_PROC_MY_DAEMON->epoch = >>> ORTE_PROC_MY_NAME->epoch; >>> + >>> ORTE_EPOCH_SET(ORTE_PROC_MY_DAEMON->epoch,ORTE_PROC_MY_NAME->epoch); >>> } >>> OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, >>> "%s node %d name %s rank %d", >>> >>> Modified: trunk/orte/mca/ess/hnp/ess_hnp_module.c >>> ============================================================================== >>> --- trunk/orte/mca/ess/hnp/ess_hnp_module.c (original) >>> +++ trunk/orte/mca/ess/hnp/ess_hnp_module.c 2011-08-26 18:16:14 EDT (Fri, >>> 26 Aug 2011) >>> @@ -494,7 +494,7 @@ >>> proc = OBJ_NEW(orte_proc_t); >>> proc->name.jobid = ORTE_PROC_MY_NAME->jobid; >>> proc->name.vpid = ORTE_PROC_MY_NAME->vpid; >>> - proc->name.epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN); >>> >>> proc->pid = orte_process_info.pid; >>> proc->rml_uri = orte_rml.get_contact_info(); >>> >>> Modified: trunk/orte/mca/ess/lsf/ess_lsf_module.c >>> ============================================================================== >>> --- trunk/orte/mca/ess/lsf/ess_lsf_module.c (original) >>> +++ trunk/orte/mca/ess/lsf/ess_lsf_module.c 2011-08-26 18:16:14 EDT (Fri, >>> 26 Aug 2011) >>> @@ -357,8 +357,7 @@ >>> >>> ORTE_PROC_MY_NAME->jobid = jobid; >>> ORTE_PROC_MY_NAME->vpid = vpid; >>> - ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_INVALID; >>> - ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME); >>> + >>> ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); >>> >>> /* fix up the base name and make it the "real" name */ >>> lsf_nodeid = atoi(getenv("LSF_PM_TASKID")); >>> >>> Modified: trunk/orte/mca/ess/singleton/ess_singleton_module.c >>> ============================================================================== >>> --- 
trunk/orte/mca/ess/singleton/ess_singleton_module.c (original) >>> +++ trunk/orte/mca/ess/singleton/ess_singleton_module.c 2011-08-26 >>> 18:16:14 EDT (Fri, 26 Aug 2011) >>> @@ -188,7 +188,7 @@ >>> /* set the name */ >>> ORTE_PROC_MY_NAME->jobid = 0xffff0000 & ((uint32_t)jobfam << 16); >>> ORTE_PROC_MY_NAME->vpid = 0; >>> - ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN); >>> >>> } else { >>> /* >>> >>> Modified: trunk/orte/mca/ess/slave/ess_slave_module.c >>> ============================================================================== >>> --- trunk/orte/mca/ess/slave/ess_slave_module.c (original) >>> +++ trunk/orte/mca/ess/slave/ess_slave_module.c 2011-08-26 18:16:14 >>> EDT (Fri, 26 Aug 2011) >>> @@ -280,8 +280,7 @@ >>> >>> ORTE_PROC_MY_NAME->jobid = jobid; >>> ORTE_PROC_MY_NAME->vpid = vpid; >>> - ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_INVALID; >>> - ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME); >>> + >>> ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); >>> >>> OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, >>> "ess:slave set name to %s", >>> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); >>> >>> Modified: trunk/orte/mca/ess/slurm/ess_slurm_module.c >>> ============================================================================== >>> --- trunk/orte/mca/ess/slurm/ess_slurm_module.c (original) >>> +++ trunk/orte/mca/ess/slurm/ess_slurm_module.c 2011-08-26 18:16:14 >>> EDT (Fri, 26 Aug 2011) >>> @@ -368,8 +368,7 @@ >>> /* fix up the vpid and make it the "real" vpid */ >>> slurm_nodeid = atoi(getenv("SLURM_NODEID")); >>> ORTE_PROC_MY_NAME->vpid = vpid + slurm_nodeid; >>> - ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_INVALID; >>> - ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME); >>> + >>> ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); >>> >>> OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, >>> "ess:slurm 
set name to %s", >>> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); >>> >>> Modified: trunk/orte/mca/ess/slurmd/ess_slurmd_module.c >>> ============================================================================== >>> --- trunk/orte/mca/ess/slurmd/ess_slurmd_module.c (original) >>> +++ trunk/orte/mca/ess/slurmd/ess_slurmd_module.c 2011-08-26 18:16:14 >>> EDT (Fri, 26 Aug 2011) >>> @@ -195,7 +195,7 @@ >>> } >>> ORTE_PROC_MY_NAME->vpid = strtol(envar, NULL, 10); >>> #endif >>> - ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN); >>> /* get our local rank */ >>> if (NULL == (envar = getenv("SLURM_LOCALID"))) { >>> error = "could not get SLURM_LOCALID"; >>> @@ -260,7 +260,7 @@ >>> nodeid = strtol(envar, NULL, 10); >>> ORTE_PROC_MY_DAEMON->jobid = 0; >>> ORTE_PROC_MY_DAEMON->vpid = nodeid; >>> - ORTE_PROC_MY_DAEMON->epoch = ORTE_PROC_MY_NAME->epoch; >>> + ORTE_EPOCH_SET(ORTE_PROC_MY_DAEMON->epoch,ORTE_PROC_MY_NAME->epoch); >>> >>> /* get the number of ppn */ >>> if (NULL == (tasks_per_node = getenv("SLURM_STEP_TASKS_PER_NODE"))) { >>> >>> Modified: trunk/orte/mca/ess/tm/ess_tm_module.c >>> ============================================================================== >>> --- trunk/orte/mca/ess/tm/ess_tm_module.c (original) >>> +++ trunk/orte/mca/ess/tm/ess_tm_module.c 2011-08-26 18:16:14 EDT (Fri, >>> 26 Aug 2011) >>> @@ -364,7 +364,7 @@ >>> >>> ORTE_PROC_MY_NAME->jobid = jobid; >>> ORTE_PROC_MY_NAME->vpid = vpid; >>> - ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME); >>> + >>> ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); >>> >>> OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, >>> "ess:tm set name to %s", >>> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); >>> >>> Modified: trunk/orte/mca/filem/rsh/filem_rsh_module.c >>> ============================================================================== >>> --- trunk/orte/mca/filem/rsh/filem_rsh_module.c (original) >>> +++ 
trunk/orte/mca/filem/rsh/filem_rsh_module.c 2011-08-26 18:16:14 >>> EDT (Fri, 26 Aug 2011) >>> @@ -1097,11 +1097,11 @@ >>> if( NULL != proc_set ) { >>> wp_item->proc_set . source . jobid = proc_set->source.jobid; >>> wp_item->proc_set . source . vpid = proc_set->source.vpid; >>> - wp_item->proc_set . source . epoch = proc_set->source.epoch; >>> + ORTE_EPOCH_SET(wp_item->proc_set . source . >>> epoch,proc_set->source.epoch); >>> >>> wp_item->proc_set . sink . jobid = proc_set->sink.jobid; >>> wp_item->proc_set . sink . vpid = proc_set->sink.vpid; >>> - wp_item->proc_set . sink . epoch = proc_set->sink.epoch; >>> + ORTE_EPOCH_SET(wp_item->proc_set . sink . >>> epoch,proc_set->sink.epoch); >>> } >>> /* Copy the File Set */ >>> if( NULL != file_set ) { >>> @@ -1396,7 +1396,7 @@ >>> wp_item = OBJ_NEW(orte_filem_rsh_work_pool_item_t); >>> wp_item->proc_set . source . jobid = sender->jobid; >>> wp_item->proc_set . source . vpid = sender->vpid; >>> - wp_item->proc_set . source . epoch = sender->epoch; >>> + ORTE_EPOCH_SET(wp_item->proc_set . source . 
>>> epoch,sender->epoch); >>> >>> opal_list_append(&work_pool_waiting, &(wp_item->super)); >>> } >>> >>> Modified: trunk/orte/mca/grpcomm/base/grpcomm_base_coll.c >>> ============================================================================== >>> --- trunk/orte/mca/grpcomm/base/grpcomm_base_coll.c (original) >>> +++ trunk/orte/mca/grpcomm/base/grpcomm_base_coll.c 2011-08-26 18:16:14 >>> EDT (Fri, 26 Aug 2011) >>> @@ -168,8 +168,7 @@ >>> if (vpids[0] == ORTE_PROC_MY_NAME->vpid) { >>> /* I send first */ >>> peer.vpid = vpids[1]; >>> - peer.epoch = ORTE_EPOCH_INVALID; >>> - peer.epoch = orte_ess.proc_get_epoch(&peer); >>> + ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); >>> >>> /* setup a temp buffer so I can inform the other side as to the >>> * number of entries in my buffer >>> @@ -226,8 +225,7 @@ >>> opal_dss.pack(&buf, &num_entries, 1, OPAL_INT32); >>> opal_dss.copy_payload(&buf, sendbuf); >>> peer.vpid = vpids[0]; >>> - peer.epoch = ORTE_EPOCH_INVALID; >>> - peer.epoch = orte_ess.proc_get_epoch(&peer); >>> + ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); >>> >>> OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, >>> "%s grpcomm:coll:two-proc sending to %s", >>> @@ -320,8 +318,7 @@ >>> /* first send my current contents */ >>> nv = (rank - distance + np) % np; >>> peer.vpid = vpids[nv]; >>> - peer.epoch = ORTE_EPOCH_INVALID; >>> - peer.epoch = orte_ess.proc_get_epoch(&peer); >>> + ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); >>> >>> OBJ_CONSTRUCT(&buf, opal_buffer_t); >>> opal_dss.pack(&buf, &total_entries, 1, OPAL_INT32); >>> @@ -340,8 +337,7 @@ >>> num_recvd = 0; >>> nv = (rank + distance) % np; >>> peer.vpid = vpids[nv]; >>> - peer.epoch = ORTE_EPOCH_INVALID; >>> - peer.epoch = orte_ess.proc_get_epoch(&peer); >>> + ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); >>> >>> OBJ_CONSTRUCT(&bucket, opal_buffer_t); >>> if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(&peer, >>> @@ -439,8 +435,7 @@ >>> /* first send 
my current contents */ >>> nv = rank ^ distance; >>> peer.vpid = vpids[nv]; >>> - peer.epoch = ORTE_EPOCH_INVALID; >>> - peer.epoch = orte_ess.proc_get_epoch(&peer); >>> + ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); >>> >>> OBJ_CONSTRUCT(&buf, opal_buffer_t); >>> opal_dss.pack(&buf, &total_entries, 1, OPAL_INT32); >>> @@ -646,8 +641,7 @@ >>> proc.jobid = jobid; >>> proc.vpid = 0; >>> while (proc.vpid < jobdat->num_procs && 0 < >>> opal_list_get_size(&daemon_tree)) { >>> - proc.epoch = ORTE_EPOCH_INVALID; >>> - proc.epoch = orte_ess.proc_get_epoch(&proc); >>> + ORTE_EPOCH_SET(proc.epoch,orte_ess.proc_get_epoch(&proc)); >>> >>> /* get the daemon that hosts this proc */ >>> daemonvpid = orte_ess.proc_get_daemon(&proc); >>> @@ -713,8 +707,7 @@ >>> /* send it */ >>> my_parent.jobid = ORTE_PROC_MY_NAME->jobid; >>> my_parent.vpid = orte_routed.get_routing_tree(NULL); >>> - my_parent.epoch = ORTE_EPOCH_INVALID; >>> - my_parent.epoch = orte_ess.proc_get_epoch(&my_parent); >>> + >>> ORTE_EPOCH_SET(my_parent.epoch,orte_ess.proc_get_epoch(&my_parent)); >>> >>> OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, >>> "%s grpcomm:base:daemon_coll: daemon collective >>> not the HNP - sending to parent %s", >>> >>> Modified: trunk/orte/mca/grpcomm/hier/grpcomm_hier_module.c >>> ============================================================================== >>> --- trunk/orte/mca/grpcomm/hier/grpcomm_hier_module.c (original) >>> +++ trunk/orte/mca/grpcomm/hier/grpcomm_hier_module.c 2011-08-26 18:16:14 >>> EDT (Fri, 26 Aug 2011) >>> @@ -95,7 +95,7 @@ >>> >>> my_local_rank_zero_proc.jobid = ORTE_PROC_MY_NAME->jobid; >>> my_local_rank_zero_proc.vpid = ORTE_VPID_INVALID; >>> - my_local_rank_zero_proc.epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(my_local_rank_zero_proc.epoch,ORTE_EPOCH_MIN); >>> >>> if (ORTE_SUCCESS != (rc = orte_grpcomm_base_modex_init())) { >>> ORTE_ERROR_LOG(rc); >>> @@ -270,7 +270,7 @@ >>> proc.jobid = ORTE_PROC_MY_NAME->jobid; >>> for (v=0; v < 
orte_process_info.num_procs; v++) { >>> proc.vpid = v; >>> - proc.epoch = orte_util_lookup_epoch(&proc); >>> + ORTE_EPOCH_SET(proc.epoch,orte_util_lookup_epoch(&proc)); >>> >>> /* is this proc local_rank=0 on its node? */ >>> if (0 == my_local_rank && 0 == orte_ess.get_local_rank(&proc)) { >>> @@ -285,7 +285,7 @@ >>> nm = OBJ_NEW(orte_namelist_t); >>> nm->name.jobid = proc.jobid; >>> nm->name.vpid = proc.vpid; >>> - nm->name.epoch = proc.epoch; >>> + ORTE_EPOCH_SET(nm->name.epoch,proc.epoch); >>> >>> opal_list_append(&my_local_peers, &nm->item); >>> /* if I am not local_rank=0, is this one? */ >>> @@ -293,7 +293,7 @@ >>> 0 == orte_ess.get_local_rank(&proc)) { >>> my_local_rank_zero_proc.jobid = proc.jobid; >>> my_local_rank_zero_proc.vpid = proc.vpid; >>> - my_local_rank_zero_proc.epoch = proc.epoch; >>> + ORTE_EPOCH_SET(my_local_rank_zero_proc.epoch,proc.epoch); >>> } >>> } >>> >>> >>> Modified: trunk/orte/mca/iof/base/base.h >>> ============================================================================== >>> --- trunk/orte/mca/iof/base/base.h (original) >>> +++ trunk/orte/mca/iof/base/base.h 2011-08-26 18:16:14 EDT (Fri, 26 Aug >>> 2011) >>> @@ -135,7 +135,7 @@ >>> ep = OBJ_NEW(orte_iof_sink_t); \ >>> ep->name.jobid = (nm)->jobid; \ >>> ep->name.vpid = (nm)->vpid; \ >>> - ep->name.epoch = (nm)->epoch; \ >>> + ORTE_EPOCH_SET(ep->name.epoch,(nm)->epoch); \ >>> ep->tag = (tg); \ >>> if (0 <= (fid)) { \ >>> ep->wev->fd = (fid); \ >>> @@ -169,7 +169,7 @@ >>> rev = OBJ_NEW(orte_iof_read_event_t); \ >>> rev->name.jobid = (nm)->jobid; \ >>> rev->name.vpid = (nm)->vpid; \ >>> - rev->name.epoch = (nm)->epoch; \ >>> + ORTE_EPOCH_SET(rev->name.epoch,(nm)->epoch); \ >>> rev->tag = (tg); \ >>> rev->fd = (fid); \ >>> *(rv) = rev; \ >>> @@ -194,7 +194,7 @@ >>> ep = OBJ_NEW(orte_iof_sink_t); \ >>> ep->name.jobid = (nm)->jobid; \ >>> ep->name.vpid = (nm)->vpid; \ >>> - ep->name.epoch = (nm)->epoch; \ >>> + ORTE_EPOCH_SET(ep->name.epoch,(nm)->epoch); \ >>> ep->tag = (tg); \ >>> 
if (0 <= (fid)) { \ >>> ep->wev->fd = (fid); \ >>> @@ -215,7 +215,7 @@ >>> rev = OBJ_NEW(orte_iof_read_event_t); \ >>> rev->name.jobid = (nm)->jobid; \ >>> rev->name.vpid = (nm)->vpid; \ >>> - rev->name.epoch= (nm)->epoch; \ >>> + ORTE_EPOCH_SET(rev->name.epoch,(nm)->epoch); \ >>> rev->tag = (tg); \ >>> *(rv) = rev; \ >>> opal_event_set(opal_event_base, \ >>> >>> Modified: trunk/orte/mca/iof/base/iof_base_open.c >>> ============================================================================== >>> --- trunk/orte/mca/iof/base/iof_base_open.c (original) >>> +++ trunk/orte/mca/iof/base/iof_base_open.c 2011-08-26 18:16:14 EDT (Fri, >>> 26 Aug 2011) >>> @@ -91,7 +91,7 @@ >>> { >>> ptr->daemon.jobid = ORTE_JOBID_INVALID; >>> ptr->daemon.vpid = ORTE_VPID_INVALID; >>> - ptr->daemon.epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(ptr->daemon.epoch,ORTE_EPOCH_MIN); >>> ptr->wev = OBJ_NEW(orte_iof_write_event_t); >>> } >>> static void orte_iof_base_sink_destruct(orte_iof_sink_t* ptr) >>> >>> Modified: trunk/orte/mca/iof/hnp/iof_hnp.c >>> ============================================================================== >>> --- trunk/orte/mca/iof/hnp/iof_hnp.c (original) >>> +++ trunk/orte/mca/iof/hnp/iof_hnp.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug >>> 2011) >>> @@ -186,7 +186,7 @@ >>> proct = OBJ_NEW(orte_iof_proc_t); >>> proct->name.jobid = dst_name->jobid; >>> proct->name.vpid = dst_name->vpid; >>> - proct->name.epoch = dst_name->epoch; >>> + ORTE_EPOCH_SET(proct->name.epoch,dst_name->epoch); >>> opal_list_append(&mca_iof_hnp_component.procs, &proct->super); >>> /* see if we are to output to a file */ >>> if (NULL != orte_output_filename) { >>> @@ -281,8 +281,7 @@ >>> &mca_iof_hnp_component.sinks); >>> sink->daemon.jobid = ORTE_PROC_MY_NAME->jobid; >>> sink->daemon.vpid = proc->node->daemon->name.vpid; >>> - sink->daemon.epoch = ORTE_EPOCH_INVALID; >>> - sink->daemon.epoch = orte_ess.proc_get_epoch(&sink->daemon); >>> + >>> 
ORTE_EPOCH_SET(sink->daemon.epoch,orte_ess.proc_get_epoch(&sink->daemon)); >>> } >>> } >>> >>> @@ -389,7 +388,7 @@ >>> &mca_iof_hnp_component.sinks); >>> sink->daemon.jobid = ORTE_PROC_MY_NAME->jobid; >>> sink->daemon.vpid = ORTE_PROC_MY_NAME->vpid; >>> - sink->daemon.epoch = ORTE_PROC_MY_NAME->epoch; >>> + ORTE_EPOCH_SET(sink->daemon.epoch,ORTE_PROC_MY_NAME->epoch); >>> >>> return ORTE_SUCCESS; >>> } >>> >>> Modified: trunk/orte/mca/iof/hnp/iof_hnp_receive.c >>> ============================================================================== >>> --- trunk/orte/mca/iof/hnp/iof_hnp_receive.c (original) >>> +++ trunk/orte/mca/iof/hnp/iof_hnp_receive.c 2011-08-26 18:16:14 EDT (Fri, >>> 26 Aug 2011) >>> @@ -109,21 +109,21 @@ >>> NULL, &mca_iof_hnp_component.sinks); >>> sink->daemon.jobid = mev->sender.jobid; >>> sink->daemon.vpid = mev->sender.vpid; >>> - sink->daemon.epoch = mev->sender.epoch; >>> + ORTE_EPOCH_SET(sink->daemon.epoch,mev->sender.epoch); >>> } >>> if (ORTE_IOF_STDERR & stream) { >>> ORTE_IOF_SINK_DEFINE(&sink, &origin, -1, ORTE_IOF_STDERR, >>> NULL, &mca_iof_hnp_component.sinks); >>> sink->daemon.jobid = mev->sender.jobid; >>> sink->daemon.vpid = mev->sender.vpid; >>> - sink->daemon.epoch = mev->sender.epoch; >>> + ORTE_EPOCH_SET(sink->daemon.epoch,mev->sender.epoch); >>> } >>> if (ORTE_IOF_STDDIAG & stream) { >>> ORTE_IOF_SINK_DEFINE(&sink, &origin, -1, ORTE_IOF_STDDIAG, >>> NULL, &mca_iof_hnp_component.sinks); >>> sink->daemon.jobid = mev->sender.jobid; >>> sink->daemon.vpid = mev->sender.vpid; >>> - sink->daemon.epoch = mev->sender.epoch; >>> + ORTE_EPOCH_SET(sink->daemon.epoch,mev->sender.epoch); >>> } >>> goto CLEAN_RETURN; >>> } >>> >>> Modified: trunk/orte/mca/iof/orted/iof_orted.c >>> ============================================================================== >>> --- trunk/orte/mca/iof/orted/iof_orted.c (original) >>> +++ trunk/orte/mca/iof/orted/iof_orted.c 2011-08-26 18:16:14 EDT (Fri, >>> 26 Aug 2011) >>> @@ -163,7 +163,7 @@ >>> proct = 
OBJ_NEW(orte_iof_proc_t); >>> proct->name.jobid = dst_name->jobid; >>> proct->name.vpid = dst_name->vpid; >>> - proct->name.epoch = dst_name->epoch; >>> + ORTE_EPOCH_SET(proct->name.epoch,dst_name->epoch); >>> opal_list_append(&mca_iof_orted_component.procs, &proct->super); >>> /* see if we are to output to a file */ >>> if (NULL != orte_output_filename) { >>> >>> Modified: trunk/orte/mca/odls/base/odls_base_default_fns.c >>> ============================================================================== >>> --- trunk/orte/mca/odls/base/odls_base_default_fns.c (original) >>> +++ trunk/orte/mca/odls/base/odls_base_default_fns.c 2011-08-26 18:16:14 >>> EDT (Fri, 26 Aug 2011) >>> @@ -734,8 +734,7 @@ >>> proc.jobid = jobdat->jobid; >>> for (j=0; j < jobdat->num_procs; j++) { >>> proc.vpid = j; >>> - proc.epoch = ORTE_EPOCH_INVALID; >>> - proc.epoch = orte_ess.proc_get_epoch(&proc); >>> + ORTE_EPOCH_SET(proc.epoch,orte_ess.proc_get_epoch(&proc)); >>> /* get the vpid of the daemon that is to host this proc */ >>> if (ORTE_VPID_INVALID == (host_daemon = >>> orte_ess.proc_get_daemon(&proc))) { >>> ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); >>> @@ -1044,6 +1043,7 @@ >>> free(param); >>> free(value); >>> >>> +#if ORTE_ENABLE_EPOCH >>> /* setup the epoch */ >>> if (ORTE_SUCCESS != (rc = orte_util_convert_epoch_to_string(&value, >>> child->name->epoch))) { >>> ORTE_ERROR_LOG(rc); >>> @@ -1057,6 +1057,7 @@ >>> opal_setenv(param, value, true, env); >>> free(param); >>> free(value); >>> +#endif >>> >>> /* setup the vpid */ >>> if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&value, >>> child->name->vpid))) { >>> @@ -2721,7 +2722,7 @@ >>> OBJ_CONSTRUCT(&proctmp, orte_proc_t); >>> proctmp . name . jobid = ORTE_JOBID_WILDCARD; >>> proctmp . name . vpid = ORTE_VPID_WILDCARD; >>> - proctmp . name . epoch = ORTE_EPOCH_WILDCARD; >>> + ORTE_EPOCH_SET(proctmp . name . 
epoch,ORTE_EPOCH_WILDCARD); >>> opal_pointer_array_add(&procarray, &proctmp); >>> procptr = &procarray; >>> do_cleanup = true; >>> >>> Modified: trunk/orte/mca/odls/base/odls_base_open.c >>> ============================================================================== >>> --- trunk/orte/mca/odls/base/odls_base_open.c (original) >>> +++ trunk/orte/mca/odls/base/odls_base_open.c 2011-08-26 18:16:14 EDT (Fri, >>> 26 Aug 2011) >>> @@ -187,7 +187,7 @@ >>> if (-1 == rank) { >>> /* wildcard */ >>> nm->name.vpid = ORTE_VPID_WILDCARD; >>> - nm->name.epoch = ORTE_EPOCH_WILDCARD; >>> + ORTE_EPOCH_SET(nm->name.epoch,ORTE_EPOCH_WILDCARD); >>> } else if (rank < 0) { >>> /* error out on bozo case */ >>> orte_show_help("help-odls-base.txt", >>> @@ -200,8 +200,7 @@ >>> * will be in the job - we'll check later >>> */ >>> nm->name.vpid = rank; >>> - nm->name.epoch = ORTE_EPOCH_INVALID; >>> - nm->name.epoch = orte_ess.proc_get_epoch(&nm->name); >>> + >>> ORTE_EPOCH_SET(nm->name.epoch,orte_ess.proc_get_epoch(&nm->name)); >>> } >>> opal_list_append(&orte_odls_globals.xterm_ranks, &nm->item); >>> } >>> >>> Modified: trunk/orte/mca/odls/base/odls_base_state.c >>> ============================================================================== >>> --- trunk/orte/mca/odls/base/odls_base_state.c (original) >>> +++ trunk/orte/mca/odls/base/odls_base_state.c 2011-08-26 18:16:14 >>> EDT (Fri, 26 Aug 2011) >>> @@ -77,17 +77,17 @@ >>> /* if I am the HNP, then use me as the source */ >>> p_set->source.jobid = ORTE_PROC_MY_NAME->jobid; >>> p_set->source.vpid = ORTE_PROC_MY_NAME->vpid; >>> - p_set->source.epoch = ORTE_PROC_MY_NAME->epoch; >>> + ORTE_EPOCH_SET(p_set->source.epoch,ORTE_PROC_MY_NAME->epoch); >>> } >>> else { >>> /* otherwise, set the HNP as the source */ >>> p_set->source.jobid = ORTE_PROC_MY_HNP->jobid; >>> p_set->source.vpid = ORTE_PROC_MY_HNP->vpid; >>> - p_set->source.epoch = ORTE_PROC_MY_HNP->epoch; >>> + ORTE_EPOCH_SET(p_set->source.epoch,ORTE_PROC_MY_HNP->epoch); >>> } >>> 
p_set->sink.jobid = ORTE_PROC_MY_NAME->jobid; >>> p_set->sink.vpid = ORTE_PROC_MY_NAME->vpid; >>> - p_set->sink.epoch = ORTE_PROC_MY_NAME->epoch; >>> + ORTE_EPOCH_SET(p_set->sink.epoch,ORTE_PROC_MY_NAME->epoch); >>> >>> opal_list_append(&(filem_request->process_sets), &(p_set->super) ); >>> >>> >>> Modified: trunk/orte/mca/oob/tcp/oob_tcp_msg.c >>> ============================================================================== >>> --- trunk/orte/mca/oob/tcp/oob_tcp_msg.c (original) >>> +++ trunk/orte/mca/oob/tcp/oob_tcp_msg.c 2011-08-26 18:16:14 EDT (Fri, >>> 26 Aug 2011) >>> @@ -137,6 +137,7 @@ >>> bool mca_oob_tcp_msg_send_handler(mca_oob_tcp_msg_t* msg, struct >>> mca_oob_tcp_peer_t * peer) >>> { >>> int rc; >>> + >>> while(1) { >>> rc = writev(peer->peer_sd, msg->msg_rwptr, msg->msg_rwnum); >>> if(rc < 0) { >>> @@ -338,6 +339,7 @@ >>> orte_process_name_t src = msg->msg_hdr.msg_src; >>> >>> OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock); >>> + >>> if (orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &peer->peer_name, >>> &src) != OPAL_EQUAL) { >>> opal_hash_table_remove_value_uint64(&mca_oob_tcp_component.tcp_peers, >>> >>> orte_util_hash_name(&peer->peer_name)); >>> >>> Modified: trunk/orte/mca/oob/tcp/oob_tcp_peer.c >>> ============================================================================== >>> --- trunk/orte/mca/oob/tcp/oob_tcp_peer.c (original) >>> +++ trunk/orte/mca/oob/tcp/oob_tcp_peer.c 2011-08-26 18:16:14 EDT (Fri, >>> 26 Aug 2011) >>> @@ -903,6 +903,11 @@ >>> static void mca_oob_tcp_peer_recv_handler(int sd, short flags, void* user) >>> { >>> mca_oob_tcp_peer_t* peer = (mca_oob_tcp_peer_t *)user; >>> + >>> + if (orte_abnormal_term_ordered) { >>> + return; >>> + } >>> + >>> OPAL_THREAD_LOCK(&peer->peer_lock); >>> switch(peer->peer_state) { >>> case MCA_OOB_TCP_CONNECT_ACK: >>> >>> Modified: trunk/orte/mca/plm/base/plm_base_jobid.c >>> ============================================================================== >>> --- 
trunk/orte/mca/plm/base/plm_base_jobid.c (original) >>> +++ trunk/orte/mca/plm/base/plm_base_jobid.c 2011-08-26 18:16:14 EDT (Fri, >>> 26 Aug 2011) >>> @@ -62,12 +62,12 @@ >>> /* set the name */ >>> ORTE_PROC_MY_NAME->jobid = 0xffff0000 & ((uint32_t)jobfam << 16); >>> ORTE_PROC_MY_NAME->vpid = 0; >>> - ORTE_PROC_MY_NAME->epoch= ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN); >>> >>> /* copy it to the HNP field */ >>> ORTE_PROC_MY_HNP->jobid = ORTE_PROC_MY_NAME->jobid; >>> ORTE_PROC_MY_HNP->vpid = ORTE_PROC_MY_NAME->vpid; >>> - ORTE_PROC_MY_HNP->epoch = ORTE_PROC_MY_NAME->epoch; >>> + ORTE_EPOCH_SET(ORTE_PROC_MY_HNP->epoch,ORTE_PROC_MY_NAME->epoch); >>> >>> /* done */ >>> return ORTE_SUCCESS; >>> >>> Modified: trunk/orte/mca/plm/base/plm_base_launch_support.c >>> ============================================================================== >>> --- trunk/orte/mca/plm/base/plm_base_launch_support.c (original) >>> +++ trunk/orte/mca/plm/base/plm_base_launch_support.c 2011-08-26 18:16:14 >>> EDT (Fri, 26 Aug 2011) >>> @@ -377,8 +377,7 @@ >>> /* push stdin - the IOF will know what to do with the specified target */ >>> name.jobid = job; >>> name.vpid = jdata->stdin_target; >>> - name.epoch = ORTE_EPOCH_INVALID; >>> - name.epoch = orte_ess.proc_get_epoch(&name); >>> + ORTE_EPOCH_SET(name.epoch,orte_ess.proc_get_epoch(&name)); >>> >>> if (ORTE_SUCCESS != (rc = orte_iof.push(&name, ORTE_IOF_STDIN, 0))) { >>> ORTE_ERROR_LOG(rc); >>> >>> Modified: trunk/orte/mca/plm/base/plm_base_orted_cmds.c >>> ============================================================================== >>> --- trunk/orte/mca/plm/base/plm_base_orted_cmds.c (original) >>> +++ trunk/orte/mca/plm/base/plm_base_orted_cmds.c 2011-08-26 18:16:14 >>> EDT (Fri, 26 Aug 2011) >>> @@ -163,8 +163,7 @@ >>> continue; >>> } >>> peer.vpid = v; >>> - peer.epoch = ORTE_EPOCH_INVALID; >>> - peer.epoch = orte_ess.proc_get_epoch(&peer); >>> + 
ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); >>> >>> /* don't worry about errors on the send here - just >>> * issue it and keep going >>> @@ -242,7 +241,7 @@ >>> OBJ_CONSTRUCT(&proc, orte_proc_t); >>> proc . name . jobid = jobid; >>> proc . name . vpid = ORTE_VPID_WILDCARD; >>> - proc . name . epoch = ORTE_EPOCH_WILDCARD; >>> + ORTE_EPOCH_SET(proc . name . epoch,ORTE_EPOCH_WILDCARD); >>> opal_pointer_array_add(&procs, &proc); >>> if (ORTE_SUCCESS != (rc = orte_plm_base_orted_kill_local_procs(&procs))) { >>> ORTE_ERROR_LOG(rc); >>> @@ -340,8 +339,7 @@ >>> continue; >>> } >>> peer.vpid = v; >>> - peer.epoch = ORTE_EPOCH_INVALID; >>> - peer.epoch = orte_ess.proc_get_epoch(&peer); >>> + ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); >>> /* check to see if this daemon is known to be "dead" */ >>> if (proc->state > ORTE_PROC_STATE_UNTERMINATED) { >>> /* don't try to send this */ >>> >>> Modified: trunk/orte/mca/plm/base/plm_base_receive.c >>> ============================================================================== >>> --- trunk/orte/mca/plm/base/plm_base_receive.c (original) >>> +++ trunk/orte/mca/plm/base/plm_base_receive.c 2011-08-26 18:16:14 >>> EDT (Fri, 26 Aug 2011) >>> @@ -146,7 +146,9 @@ >>> orte_job_t *jdata, *parent; >>> opal_buffer_t answer; >>> orte_vpid_t vpid; >>> +#if ORTE_ENABLE_EPOCH >>> orte_epoch_t epoch; >>> +#endif >>> orte_proc_t *proc; >>> orte_proc_state_t state; >>> orte_exit_code_t exit_code; >>> @@ -394,8 +396,7 @@ >>> break; >>> } >>> name.vpid = vpid; >>> - name.epoch = ORTE_EPOCH_INVALID; >>> - name.epoch = orte_ess.proc_get_epoch(&name); >>> + >>> ORTE_EPOCH_SET(name.epoch,orte_ess.proc_get_epoch(&name)); >>> >>> /* unpack the pid */ >>> count = 1; >>> @@ -488,9 +489,11 @@ >>> } >>> name.vpid = vpid; >>> >>> +#if ORTE_ENABLE_EPOCH >>> count=1; >>> opal_dss.unpack(msgpkt->buffer, &epoch, &count, ORTE_EPOCH); >>> name.epoch = epoch; >>> +#endif >>> >>> OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, >>> "%s 
plm:base:receive Described rank %s", >>> >>> Modified: trunk/orte/mca/plm/base/plm_base_rsh_support.c >>> ============================================================================== >>> --- trunk/orte/mca/plm/base/plm_base_rsh_support.c (original) >>> +++ trunk/orte/mca/plm/base/plm_base_rsh_support.c 2011-08-26 18:16:14 >>> EDT (Fri, 26 Aug 2011) >>> @@ -1527,7 +1527,9 @@ >>> { >>> char *param, *path, *tmp, *cmd, *basename, *dest_dir; >>> int i; >>> +#if ORTE_ENABLE_EPOCH >>> orte_epoch_t epoch; >>> +#endif >>> orte_process_name_t proc; >>> >>> /* if a prefix is set, pass it to the bootproxy in a special way */ >>> @@ -1638,6 +1640,7 @@ >>> opal_setenv("OMPI_COMM_WORLD_RANK", cmd, true, argv); >>> free(cmd); >>> >>> +#if ORTE_ENABLE_EPOCH >>> /* set the epoch */ >>> proc.jobid = jobid; >>> proc.vpid = vpid; >>> @@ -1648,6 +1651,7 @@ >>> opal_setenv(param, cmd, true, argv); >>> free(param); >>> free(cmd); >>> +#endif >>> >>> /* set the number of procs */ >>> asprintf(&cmd, "%d", (int)num_procs); >>> >>> Modified: trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c >>> ============================================================================== >>> --- trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c (original) >>> +++ trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c 2011-08-26 >>> 18:16:14 EDT (Fri, 26 Aug 2011) >>> @@ -33,12 +33,14 @@ >>> #include "orte/mca/ess/ess.h" >>> #include "opal/mca/sysinfo/sysinfo_types.h" >>> >>> +#include "orte/types.h" >>> #include "orte/util/show_help.h" >>> #include "orte/util/name_fns.h" >>> #include "orte/runtime/orte_globals.h" >>> #include "orte/util/hostfile/hostfile.h" >>> #include "orte/util/dash_host/dash_host.h" >>> #include "orte/mca/errmgr/errmgr.h" >>> +#include "orte/runtime/data_type_support/orte_dt_support.h" >>> >>> #include "orte/mca/rmaps/base/rmaps_private.h" >>> #include "orte/mca/rmaps/base/base.h" >>> @@ -454,7 +456,7 @@ >>> */ >>> >>> /* We do set the epoch here since they all start with the same value. 
>>> */ >>> - proc->name.epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN); >>> >>> proc->app_idx = app_idx; >>> OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output, >>> @@ -559,11 +561,12 @@ >>> } >>> } >>> proc->name.vpid = vpid; >>> - proc->name.epoch = ORTE_EPOCH_INVALID; >>> - proc->name.epoch = >>> orte_ess.proc_get_epoch(&proc->name); >>> + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID); >>> + >>> ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name)); >>> + >>> /* If there is an invalid epoch here, it's because it >>> doesn't exist yet. */ >>> - if (ORTE_NODE_RANK_INVALID == proc->name.epoch) { >>> - proc->name.epoch = ORTE_EPOCH_MIN; >>> + if (0 == >>> ORTE_EPOCH_CMP(ORTE_EPOCH_INVALID,proc->name.epoch)) { >>> + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN); >>> } >>> } >>> if (NULL == opal_pointer_array_get_item(jdata->procs, >>> proc->name.vpid)) { >>> @@ -601,8 +604,8 @@ >>> } >>> } >>> proc->name.vpid = vpid; >>> - proc->name.epoch = ORTE_EPOCH_INVALID; >>> - proc->name.epoch = >>> orte_ess.proc_get_epoch(&proc->name); >>> + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID); >>> + >>> ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name)); >>> } >>> if (NULL == opal_pointer_array_get_item(jdata->procs, >>> proc->name.vpid)) { >>> if (ORTE_SUCCESS != (rc = >>> opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { >>> @@ -835,7 +838,7 @@ >>> return ORTE_ERR_OUT_OF_RESOURCE; >>> } >>> proc->name.vpid = daemons->num_procs; /* take the next available >>> vpid */ >>> - proc->name.epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN); >>> proc->node = node; >>> proc->nodename = node->name; >>> OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output, >>> @@ -1014,8 +1017,8 @@ >>> return ORTE_ERR_OUT_OF_RESOURCE; >>> } >>> proc->name.vpid = jdata->num_procs; /* take the next available vpid >>> */ >>> - proc->name.epoch = ORTE_EPOCH_INVALID; >>> - 
proc->name.epoch = orte_ess.proc_get_epoch(&proc->name); >>> + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID); >>> + >>> ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name)); >>> proc->node = node; >>> proc->nodename = node->name; >>> OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output, >>> >>> Modified: trunk/orte/mca/rmaps/rank_file/rmaps_rank_file.c >>> ============================================================================== >>> --- trunk/orte/mca/rmaps/rank_file/rmaps_rank_file.c (original) >>> +++ trunk/orte/mca/rmaps/rank_file/rmaps_rank_file.c 2011-08-26 18:16:14 >>> EDT (Fri, 26 Aug 2011) >>> @@ -502,8 +502,7 @@ >>> } >>> proc->name.vpid = rank; >>> /* Either init or update the epoch. */ >>> - proc->name.epoch = ORTE_EPOCH_INVALID; >>> - proc->name.epoch = orte_ess.proc_get_epoch(&proc->name); >>> + >>> ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name)); >>> >>> proc->slot_list = strdup(rfmap->slot_list); >>> /* insert the proc into the proper place */ >>> >>> Modified: trunk/orte/mca/rmaps/seq/rmaps_seq.c >>> ============================================================================== >>> --- trunk/orte/mca/rmaps/seq/rmaps_seq.c (original) >>> +++ trunk/orte/mca/rmaps/seq/rmaps_seq.c 2011-08-26 18:16:14 EDT (Fri, >>> 26 Aug 2011) >>> @@ -235,8 +235,7 @@ >>> } >>> /* assign the vpid */ >>> proc->name.vpid = vpid++; >>> - proc->name.epoch = ORTE_EPOCH_INVALID; >>> - proc->name.epoch = orte_ess.proc_get_epoch(&proc->name); >>> + >>> ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name)); >>> >>> /* add to the jdata proc array */ >>> if (ORTE_SUCCESS != (rc = >>> opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { >>> >>> Modified: trunk/orte/mca/rmcast/base/rmcast_base_open.c >>> ============================================================================== >>> --- trunk/orte/mca/rmcast/base/rmcast_base_open.c (original) >>> +++ trunk/orte/mca/rmcast/base/rmcast_base_open.c 
2011-08-26 18:16:14 >>> EDT (Fri, 26 Aug 2011) >>> @@ -341,7 +341,7 @@ >>> { >>> ptr->name.jobid = ORTE_JOBID_INVALID; >>> ptr->name.vpid = ORTE_VPID_INVALID; >>> - ptr->name.epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(ptr->name.epoch,ORTE_EPOCH_MIN); >>> ptr->channel = ORTE_RMCAST_INVALID_CHANNEL; >>> OBJ_CONSTRUCT(&ptr->ctl, orte_thread_ctl_t); >>> ptr->seq_num = ORTE_RMCAST_SEQ_INVALID; >>> @@ -430,7 +430,7 @@ >>> { >>> ptr->name.jobid = ORTE_JOBID_INVALID; >>> ptr->name.vpid = ORTE_VPID_INVALID; >>> - ptr->name.epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(ptr->name.epoch,ORTE_EPOCH_MIN); >>> OBJ_CONSTRUCT(&ptr->last_msg, opal_list_t); >>> } >>> static void recvlog_destruct(rmcast_recv_log_t *ptr) >>> @@ -439,7 +439,7 @@ >>> >>> ptr->name.jobid = ORTE_JOBID_INVALID; >>> ptr->name.vpid = ORTE_VPID_INVALID; >>> - ptr->name.epoch = ORTE_EPOCH_INVALID; >>> + ORTE_EPOCH_SET(ptr->name.epoch,ORTE_EPOCH_INVALID); >>> while (NULL != (item = opal_list_remove_first(&ptr->last_msg))) { >>> OBJ_RELEASE(item); >>> } >>> >>> Modified: trunk/orte/mca/rmcast/tcp/rmcast_tcp.c >>> ============================================================================== >>> --- trunk/orte/mca/rmcast/tcp/rmcast_tcp.c (original) >>> +++ trunk/orte/mca/rmcast/tcp/rmcast_tcp.c 2011-08-26 18:16:14 EDT (Fri, >>> 26 Aug 2011) >>> @@ -681,7 +681,7 @@ >>> /* caller requested id of sender */ >>> name->jobid = recvptr->name.jobid; >>> name->vpid = recvptr->name.vpid; >>> - name->epoch= recvptr->name.epoch; >>> + ORTE_EPOCH_SET(name->epoch,recvptr->name.epoch); >>> } >>> *seq_num = recvptr->seq_num; >>> *msg = recvptr->iovec_array; >>> @@ -776,7 +776,7 @@ >>> /* caller requested id of sender */ >>> name->jobid = recvptr->name.jobid; >>> name->vpid = recvptr->name.vpid; >>> - name->epoch= recvptr->name.epoch; >>> + ORTE_EPOCH_SET(name->epoch,recvptr->name.epoch); >>> } >>> *seq_num = recvptr->seq_num; >>> if (ORTE_SUCCESS != (ret = opal_dss.copy_payload(buf, recvptr->buf))) { >>> >>> Modified: 
trunk/orte/mca/rmcast/udp/rmcast_udp.c >>> ============================================================================== >>> --- trunk/orte/mca/rmcast/udp/rmcast_udp.c (original) >>> +++ trunk/orte/mca/rmcast/udp/rmcast_udp.c 2011-08-26 18:16:14 EDT (Fri, >>> 26 Aug 2011) >>> @@ -460,7 +460,7 @@ >>> /* caller requested id of sender */ >>> name->jobid = recvptr->name.jobid; >>> name->vpid = recvptr->name.vpid; >>> - name->epoch= recvptr->name.epoch; >>> + ORTE_EPOCH_SET(name->epoch,recvptr->name.epoch); >>> } >>> *seq_num = recvptr->seq_num; >>> *msg = recvptr->iovec_array; >>> @@ -553,7 +553,7 @@ >>> /* caller requested id of sender */ >>> name->jobid = recvptr->name.jobid; >>> name->vpid = recvptr->name.vpid; >>> - name->epoch= recvptr->name.epoch; >>> + ORTE_EPOCH_SET(name->epoch,recvptr->name.epoch); >>> } >>> *seq_num = recvptr->seq_num; >>> if (ORTE_SUCCESS != (ret = opal_dss.copy_payload(buf, recvptr->buf))) { >>> >>> Modified: trunk/orte/mca/rml/base/rml_base_components.c >>> ============================================================================== >>> --- trunk/orte/mca/rml/base/rml_base_components.c (original) >>> +++ trunk/orte/mca/rml/base/rml_base_components.c 2011-08-26 18:16:14 >>> EDT (Fri, 26 Aug 2011) >>> @@ -20,6 +20,7 @@ >>> #include "opal/util/output.h" >>> >>> #include "orte/mca/rml/rml.h" >>> +#include "orte/util/name_fns.h" >>> >>> #if !ORTE_DISABLE_FULL_SUPPORT >>> >>> @@ -67,14 +68,14 @@ >>> { >>> pkt->sender.jobid = ORTE_JOBID_INVALID; >>> pkt->sender.vpid = ORTE_VPID_INVALID; >>> - pkt->sender.epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(pkt->sender.epoch,ORTE_EPOCH_MIN); >>> pkt->buffer = NULL; >>> } >>> static void msg_pkt_destructor(orte_msg_packet_t *pkt) >>> { >>> pkt->sender.jobid = ORTE_JOBID_INVALID; >>> pkt->sender.vpid = ORTE_VPID_INVALID; >>> - pkt->sender.epoch = ORTE_EPOCH_INVALID; >>> + ORTE_EPOCH_SET(pkt->sender.epoch,ORTE_EPOCH_INVALID); >>> if (NULL != pkt->buffer) { >>> OBJ_RELEASE(pkt->buffer); >>> } >>> >>> 
Modified: trunk/orte/mca/rml/rml_types.h >>> ============================================================================== >>> --- trunk/orte/mca/rml/rml_types.h (original) >>> +++ trunk/orte/mca/rml/rml_types.h 2011-08-26 18:16:14 EDT (Fri, 26 Aug >>> 2011) >>> @@ -62,7 +62,7 @@ >>> pkt = OBJ_NEW(orte_msg_packet_t); \ >>> pkt->sender.jobid = (sndr)->jobid; \ >>> pkt->sender.vpid = (sndr)->vpid; \ >>> - pkt->sender.epoch = (sndr)->epoch; \ >>> + ORTE_EPOCH_SET(pkt->sender.epoch,(sndr)->epoch); \ >>> if ((crt)) { \ >>> pkt->buffer = OBJ_NEW(opal_buffer_t); \ >>> opal_dss.copy_payload(pkt->buffer, *(buf)); \ >>> @@ -85,7 +85,7 @@ >>> pkt = OBJ_NEW(orte_msg_packet_t); \ >>> pkt->sender.jobid = (sndr)->jobid; \ >>> pkt->sender.vpid = (sndr)->vpid; \ >>> - pkt->sender.epoch = (sndr)->epoch; \ >>> + ORTE_EPOCH_SET(pkt->sender.epoch,(sndr)->epoch); \ >>> if ((crt)) { \ >>> pkt->buffer = OBJ_NEW(opal_buffer_t); \ >>> opal_dss.copy_payload(pkt->buffer, *(buf)); \ >>> @@ -191,8 +191,10 @@ >>> >>> #define ORTE_RML_TAG_SUBSCRIBE 46 >>> >>> +#if ORTE_ENABLE_EPOCH >>> /* For Epoch Updates */ >>> #define ORTE_RML_TAG_EPOCH_CHANGE 47 >>> +#endif >>> >>> /* Notify of failed processes */ >>> #define ORTE_RML_TAG_FAILURE_NOTICE 48 >>> >>> Modified: trunk/orte/mca/routed/base/routed_base_components.c >>> ============================================================================== >>> --- trunk/orte/mca/routed/base/routed_base_components.c (original) >>> +++ trunk/orte/mca/routed/base/routed_base_components.c 2011-08-26 >>> 18:16:14 EDT (Fri, 26 Aug 2011) >>> @@ -65,7 +65,7 @@ >>> { >>> ptr->route.jobid = ORTE_JOBID_INVALID; >>> ptr->route.vpid = ORTE_VPID_INVALID; >>> - ptr->route.epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(ptr->route.epoch,ORTE_EPOCH_MIN); >>> ptr->hnp_uri = NULL; >>> } >>> static void jfamdest(orte_routed_jobfam_t *ptr) >>> @@ -117,7 +117,7 @@ >>> jfam = OBJ_NEW(orte_routed_jobfam_t); >>> jfam->route.jobid = ORTE_PROC_MY_HNP->jobid; >>> jfam->route.vpid = 
ORTE_PROC_MY_HNP->vpid; >>> - jfam->route.epoch = ORTE_PROC_MY_HNP->epoch; >>> + ORTE_EPOCH_SET(jfam->route.epoch,ORTE_PROC_MY_HNP->epoch); >>> jfam->job_family = ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid); >>> if (NULL != orte_process_info.my_hnp_uri) { >>> jfam->hnp_uri = strdup(orte_process_info.my_hnp_uri); >>> @@ -252,7 +252,7 @@ >>> jfam->job_family = jobfamily; >>> jfam->route.jobid = name.jobid; >>> jfam->route.vpid = name.vpid; >>> - jfam->route.epoch = name.epoch; >>> + ORTE_EPOCH_SET(jfam->route.epoch,name.epoch); >>> jfam->hnp_uri = strdup(uri); >>> done: >>> free(uri); >>> >>> Modified: trunk/orte/mca/routed/base/routed_base_register_sync.c >>> ============================================================================== >>> --- trunk/orte/mca/routed/base/routed_base_register_sync.c (original) >>> +++ trunk/orte/mca/routed/base/routed_base_register_sync.c 2011-08-26 >>> 18:16:14 EDT (Fri, 26 Aug 2011) >>> @@ -127,7 +127,9 @@ >>> orte_std_cntr_t cnt; >>> char *rml_uri; >>> orte_vpid_t vpid; >>> +#if ORTE_ENABLE_EPOCH >>> orte_epoch_t epoch; >>> +#endif >>> int rc; >>> >>> if (ORTE_JOB_FAMILY(job) == ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) { >>> @@ -146,11 +148,13 @@ >>> cnt = 1; >>> while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &vpid, &cnt, >>> ORTE_VPID))) { >>> >>> +#if ORTE_ENABLE_EPOCH >>> cnt = 1; >>> if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &epoch, &cnt, >>> ORTE_EPOCH))) { >>> ORTE_ERROR_LOG(rc); >>> continue; >>> } >>> +#endif >>> >>> if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &rml_uri, &cnt, >>> OPAL_STRING))) { >>> ORTE_ERROR_LOG(rc); >>> >>> Modified: trunk/orte/mca/routed/binomial/routed_binomial.c >>> ============================================================================== >>> --- trunk/orte/mca/routed/binomial/routed_binomial.c (original) >>> +++ trunk/orte/mca/routed/binomial/routed_binomial.c 2011-08-26 18:16:14 >>> EDT (Fri, 26 Aug 2011) >>> @@ -33,6 +33,7 @@ >>> #include "orte/runtime/orte_globals.h" >>> 
#include "orte/runtime/orte_wait.h" >>> #include "orte/runtime/runtime.h" >>> +#include "orte/runtime/data_type_support/orte_dt_support.h" >>> >>> #include "orte/mca/rml/base/rml_contact.h" >>> >>> @@ -147,7 +148,7 @@ >>> >>> if (proc->jobid == ORTE_JOBID_INVALID || >>> proc->vpid == ORTE_VPID_INVALID || >>> - proc->epoch == ORTE_EPOCH_INVALID) { >>> + 0 == ORTE_EPOCH_CMP(proc->epoch,ORTE_EPOCH_INVALID)) { >>> return ORTE_ERR_BAD_PARAM; >>> } >>> >>> @@ -216,7 +217,7 @@ >>> >>> if (target->jobid == ORTE_JOBID_INVALID || >>> target->vpid == ORTE_VPID_INVALID || >>> - target->epoch == ORTE_EPOCH_INVALID) { >>> + 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { >>> return ORTE_ERR_BAD_PARAM; >>> } >>> >>> @@ -274,8 +275,7 @@ >>> ORTE_NAME_PRINT(route))); >>> jfam->route.jobid = route->jobid; >>> jfam->route.vpid = route->vpid; >>> - jfam->route.epoch = ORTE_EPOCH_INVALID; >>> - jfam->route.epoch = orte_ess.proc_get_epoch(&jfam->route); >>> + >>> ORTE_EPOCH_SET(jfam->route.epoch,orte_ess.proc_get_epoch(&jfam->route)); >>> >>> return ORTE_SUCCESS; >>> } >>> @@ -290,8 +290,7 @@ >>> jfam->job_family = jfamily; >>> jfam->route.jobid = route->jobid; >>> jfam->route.vpid = route->vpid; >>> - jfam->route.epoch = ORTE_EPOCH_INVALID; >>> - jfam->route.epoch = orte_ess.proc_get_epoch(&jfam->route); >>> + >>> ORTE_EPOCH_SET(jfam->route.epoch,orte_ess.proc_get_epoch(&jfam->route)); >>> >>> opal_pointer_array_add(&orte_routed_jobfams, jfam); >>> return ORTE_SUCCESS; >>> @@ -317,11 +316,21 @@ >>> /* initialize */ >>> daemon.jobid = ORTE_PROC_MY_DAEMON->jobid; >>> daemon.vpid = ORTE_PROC_MY_DAEMON->vpid; >>> - daemon.epoch = ORTE_PROC_MY_DAEMON->epoch; >>> + ORTE_EPOCH_SET(daemon.epoch,ORTE_PROC_MY_DAEMON->epoch); >>> >>> +#if ORTE_ENABLE_EPOCH >>> if (target->jobid == ORTE_JOBID_INVALID || >>> target->vpid == ORTE_VPID_INVALID || >>> target->epoch == ORTE_EPOCH_INVALID) { >>> +#else >>> + if (target->jobid == ORTE_JOBID_INVALID || >>> + target->vpid == ORTE_VPID_INVALID) { 
>>> +#endif >>> + ret = ORTE_NAME_INVALID; >>> + goto found; >>> + } >>> + >>> + if (0 > ORTE_EPOCH_CMP(target->epoch, >>> orte_ess.proc_get_epoch(target))) { >>> ret = ORTE_NAME_INVALID; >>> goto found; >>> } >>> @@ -443,7 +452,7 @@ >>> >>> /* If the daemon to which we should be routing is dead, then >>> update >>> * the routing tree and start over. */ >>> - if (!orte_util_proc_is_running(&daemon)) { >>> + if (!PROC_IS_RUNNING(&daemon)) { >>> update_routing_tree(daemon.jobid); >>> goto startover; >>> } >>> @@ -461,8 +470,7 @@ >>> ret = &daemon; >>> >>> found: >>> - daemon.epoch = ORTE_EPOCH_INVALID; >>> - daemon.epoch = orte_ess.proc_get_epoch(&daemon); >>> + ORTE_EPOCH_SET(daemon.epoch,orte_ess.proc_get_epoch(&daemon)); >>> >>> OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output, >>> "%s routed_binomial_get(%s) --> %s", >>> @@ -879,7 +887,7 @@ >>> */ >>> local_lifeline.jobid = proc->jobid; >>> local_lifeline.vpid = proc->vpid; >>> - local_lifeline.epoch = proc->epoch; >>> + ORTE_EPOCH_SET(local_lifeline.epoch,proc->epoch); >>> lifeline = &local_lifeline; >>> >>> return ORTE_SUCCESS; >>> @@ -924,11 +932,11 @@ >>> * that process so we can check it's state. 
>>> */ >>> proc_name.vpid = peer; >>> - proc_name.epoch = orte_util_lookup_epoch(&proc_name); >>> + >>> ORTE_EPOCH_SET(proc_name.epoch,orte_util_lookup_epoch(&proc_name)); >>> >>> - if (!orte_util_proc_is_running(&proc_name) >>> - && ORTE_EPOCH_MIN < proc_name.epoch >>> - && ORTE_EPOCH_INVALID != proc_name.epoch) { >>> + if (!PROC_IS_RUNNING(&proc_name) >>> + && 0 < ORTE_EPOCH_CMP(ORTE_EPOCH_MIN,proc_name.epoch) >>> + && 0 != >>> ORTE_EPOCH_CMP(ORTE_EPOCH_INVALID,proc_name.epoch)) { >>> OPAL_OUTPUT_VERBOSE((3, orte_routed_base_output, >>> "%s routed:binomial child %s is >>> dead", >>> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), >>> @@ -967,7 +975,7 @@ >>> } >>> >>> /* find the children of this rank */ >>> - OPAL_OUTPUT_VERBOSE((3, orte_routed_base_output, >>> + OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, >>> "%s routed:binomial find children of rank %d", >>> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rank)); >>> bitmap = opal_cube_dim(num_procs); >>> @@ -977,24 +985,25 @@ >>> >>> for (i = hibit + 1, mask = 1 << i; i <= bitmap; ++i, mask <<= 1) { >>> peer = rank | mask; >>> - OPAL_OUTPUT_VERBOSE((3, orte_routed_base_output, >>> + OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, >>> "%s routed:binomial find children checking peer >>> %d", >>> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), peer)); >>> if (peer < num_procs) { >>> - OPAL_OUTPUT_VERBOSE((3, orte_routed_base_output, >>> + OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, >>> "%s routed:binomial find children computing >>> tree", >>> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); >>> /* execute compute on this child */ >>> if (0 <= (found = binomial_tree(peer, rank, me, num_procs, >>> nchildren, childrn, relatives, mine, jobid))) { >>> proc_name.vpid = found; >>> >>> - if (!orte_util_proc_is_running(&proc_name) && >>> ORTE_EPOCH_MIN < orte_util_lookup_epoch(&proc_name)) { >>> - OPAL_OUTPUT_VERBOSE((3, orte_routed_base_output, >>> + if (!PROC_IS_RUNNING(&proc_name) >>> + && 0 < >>> 
ORTE_EPOCH_CMP(ORTE_EPOCH_MIN,orte_util_lookup_epoch(&proc_name))) { >>> + OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, >>> "%s routed:binomial find children >>> proc out of date - returning parent %d", >>> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), >>> parent)); >>> return parent; >>> } >>> - OPAL_OUTPUT_VERBOSE((3, orte_routed_base_output, >>> + OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, >>> "%s routed:binomial find children >>> returning found value %d", >>> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), >>> found)); >>> return found; >>> @@ -1029,8 +1038,7 @@ >>> ORTE_PROC_MY_PARENT->vpid = binomial_tree(0, 0, ORTE_PROC_MY_NAME->vpid, >>> orte_process_info.max_procs, >>> &num_children, &my_children, NULL, true, >>> jobid); >>> - ORTE_PROC_MY_PARENT->epoch = ORTE_EPOCH_INVALID; >>> - ORTE_PROC_MY_PARENT->epoch = >>> orte_ess.proc_get_epoch(ORTE_PROC_MY_PARENT); >>> + >>> ORTE_EPOCH_SET(ORTE_PROC_MY_PARENT->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_PARENT)); >>> >>> if (0 < opal_output_get_verbosity(orte_routed_base_output)) { >>> opal_output(0, "%s: parent %d num_children %d", >>> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_PROC_MY_PARENT->vpid, >>> num_children); >>> >>> Modified: trunk/orte/mca/routed/cm/routed_cm.c >>> ============================================================================== >>> --- trunk/orte/mca/routed/cm/routed_cm.c (original) >>> +++ trunk/orte/mca/routed/cm/routed_cm.c 2011-08-26 18:16:14 EDT (Fri, >>> 26 Aug 2011) >>> @@ -35,6 +35,7 @@ >>> #include "orte/runtime/orte_globals.h" >>> #include "orte/runtime/orte_wait.h" >>> #include "orte/runtime/runtime.h" >>> +#include "orte/runtime/data_type_support/orte_dt_support.h" >>> >>> #include "orte/mca/rml/base/rml_contact.h" >>> >>> @@ -139,7 +140,7 @@ >>> >>> if (proc->jobid == ORTE_JOBID_INVALID || >>> proc->vpid == ORTE_VPID_INVALID || >>> - proc->epoch == ORTE_EPOCH_INVALID) { >>> + 0 == ORTE_EPOCH_CMP(proc->epoch,ORTE_EPOCH_INVALID)) { >>> return ORTE_ERR_BAD_PARAM; >>> } >>> >>> @@ -200,7 
+201,7 @@ >>> >>> if (target->jobid == ORTE_JOBID_INVALID || >>> target->vpid == ORTE_VPID_INVALID || >>> - target->epoch == ORTE_EPOCH_INVALID) { >>> + 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { >>> return ORTE_ERR_BAD_PARAM; >>> } >>> >>> @@ -257,8 +258,7 @@ >>> ORTE_NAME_PRINT(route))); >>> jfam->route.jobid = route->jobid; >>> jfam->route.vpid = route->vpid; >>> - jfam->route.epoch = ORTE_EPOCH_INVALID; >>> - jfam->route.epoch = orte_ess.proc_get_epoch(&jfam->route); >>> + >>> ORTE_EPOCH_SET(jfam->route.epoch,orte_ess.proc_get_epoch(&jfam->route)); >>> >>> return ORTE_SUCCESS; >>> } >>> @@ -273,8 +273,7 @@ >>> jfam->job_family = jfamily; >>> jfam->route.jobid = route->jobid; >>> jfam->route.vpid = route->vpid; >>> - jfam->route.epoch = ORTE_EPOCH_INVALID; >>> - jfam->route.epoch = orte_ess.proc_get_epoch(&jfam->route); >>> + >>> ORTE_EPOCH_SET(jfam->route.epoch,orte_ess.proc_get_epoch(&jfam->route)); >>> >>> opal_pointer_array_add(&orte_routed_jobfams, jfam); >>> return ORTE_SUCCESS; >>> @@ -299,7 +298,7 @@ >>> >>> if (target->jobid == ORTE_JOBID_INVALID || >>> target->vpid == ORTE_VPID_INVALID || >>> - target->epoch == ORTE_EPOCH_INVALID) { >>> + 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { >>> ret = ORTE_NAME_INVALID; >>> goto found; >>> } >>> @@ -367,8 +366,7 @@ >>> } >>> >>> /* Initialize daemon's epoch, based on its current vpid/jobid */ >>> - daemon.epoch = ORTE_EPOCH_INVALID; >>> - daemon.epoch = orte_ess.proc_get_epoch(&daemon); >>> + ORTE_EPOCH_SET(daemon.epoch,orte_ess.proc_get_epoch(&daemon)); >>> >>> /* if the daemon is me, then send direct to the target! 
*/ >>> if (ORTE_PROC_MY_NAME->vpid == daemon.vpid) { >>> @@ -814,8 +812,7 @@ >>> */ >>> local_lifeline.jobid = proc->jobid; >>> local_lifeline.vpid = proc->vpid; >>> - local_lifeline.epoch = ORTE_EPOCH_INVALID; >>> - local_lifeline.epoch = orte_ess.proc_get_epoch(&local_lifeline); >>> + >>> ORTE_EPOCH_SET(local_lifeline.epoch,orte_ess.proc_get_epoch(&local_lifeline)); >>> >>> lifeline = &local_lifeline; >>> >>> >>> Modified: trunk/orte/mca/routed/direct/routed_direct.c >>> ============================================================================== >>> --- trunk/orte/mca/routed/direct/routed_direct.c (original) >>> +++ trunk/orte/mca/routed/direct/routed_direct.c 2011-08-26 18:16:14 >>> EDT (Fri, 26 Aug 2011) >>> @@ -24,6 +24,7 @@ >>> #include "orte/util/name_fns.h" >>> #include "orte/util/proc_info.h" >>> #include "orte/runtime/orte_globals.h" >>> +#include "orte/runtime/data_type_support/orte_dt_support.h" >>> >>> #include "orte/mca/rml/base/rml_contact.h" >>> >>> @@ -135,7 +136,7 @@ >>> >>> if (target->jobid == ORTE_JOBID_INVALID || >>> target->vpid == ORTE_VPID_INVALID || >>> - target->epoch == ORTE_EPOCH_INVALID) { >>> + 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { >>> ret = ORTE_NAME_INVALID; >>> } else { >>> /* all routes are direct */ >>> >>> Modified: trunk/orte/mca/routed/linear/routed_linear.c >>> ============================================================================== >>> --- trunk/orte/mca/routed/linear/routed_linear.c (original) >>> +++ trunk/orte/mca/routed/linear/routed_linear.c 2011-08-26 18:16:14 >>> EDT (Fri, 26 Aug 2011) >>> @@ -31,6 +31,7 @@ >>> #include "orte/runtime/orte_globals.h" >>> #include "orte/runtime/orte_wait.h" >>> #include "orte/runtime/runtime.h" >>> +#include "orte/runtime/data_type_support/orte_dt_support.h" >>> >>> #include "orte/mca/rml/base/rml_contact.h" >>> >>> @@ -132,7 +133,7 @@ >>> >>> if (proc->jobid == ORTE_JOBID_INVALID || >>> proc->vpid == ORTE_VPID_INVALID || >>> - proc->epoch == 
ORTE_EPOCH_INVALID) { >>> + 0 == ORTE_EPOCH_CMP(proc->epoch,ORTE_EPOCH_INVALID)) { >>> return ORTE_ERR_BAD_PARAM; >>> } >>> >>> @@ -201,7 +202,7 @@ >>> >>> if (target->jobid == ORTE_JOBID_INVALID || >>> target->vpid == ORTE_VPID_INVALID || >>> - target->epoch == ORTE_EPOCH_INVALID) { >>> + 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { >>> return ORTE_ERR_BAD_PARAM; >>> } >>> >>> @@ -259,7 +260,7 @@ >>> ORTE_NAME_PRINT(route))); >>> jfam->route.jobid = route->jobid; >>> jfam->route.vpid = route->vpid; >>> - jfam->route.epoch = route->epoch; >>> + ORTE_EPOCH_SET(jfam->route.epoch,route->epoch); >>> return ORTE_SUCCESS; >>> } >>> } >>> @@ -273,7 +274,7 @@ >>> jfam->job_family = jfamily; >>> jfam->route.jobid = route->jobid; >>> jfam->route.vpid = route->vpid; >>> - jfam->route.epoch = route->epoch; >>> + ORTE_EPOCH_SET(jfam->route.epoch,route->epoch); >>> opal_pointer_array_add(&orte_routed_jobfams, jfam); >>> return ORTE_SUCCESS; >>> } >>> @@ -373,8 +374,7 @@ >>> } >>> >>> /* Initialize daemon's epoch, based on its current vpid/jobid */ >>> - daemon.epoch = ORTE_EPOCH_INVALID; >>> - daemon.epoch = orte_ess.proc_get_epoch(&daemon); >>> + ORTE_EPOCH_SET(daemon.epoch,orte_ess.proc_get_epoch(&daemon)); >>> >>> /* if the daemon is me, then send direct to the target! 
*/ >>> if (ORTE_PROC_MY_NAME->vpid == daemon.vpid) { >>> @@ -395,8 +395,7 @@ >>> /* we are at end of chain - wrap around */ >>> daemon.vpid = 0; >>> } >>> - daemon.epoch = ORTE_EPOCH_INVALID; >>> - daemon.epoch = orte_ess.proc_get_epoch(&daemon); >>> + ORTE_EPOCH_SET(daemon.epoch,orte_ess.proc_get_epoch(&daemon)); >>> ret = &daemon; >>> } >>> } >>> @@ -741,7 +740,7 @@ >>> */ >>> local_lifeline.jobid = proc->jobid; >>> local_lifeline.vpid = proc->vpid; >>> - local_lifeline.epoch = proc->epoch; >>> + ORTE_EPOCH_SET(local_lifeline.epoch,proc->epoch); >>> lifeline = &local_lifeline; >>> >>> return ORTE_SUCCESS; >>> >>> Modified: trunk/orte/mca/routed/radix/routed_radix.c >>> ============================================================================== >>> --- trunk/orte/mca/routed/radix/routed_radix.c (original) >>> +++ trunk/orte/mca/routed/radix/routed_radix.c 2011-08-26 18:16:14 >>> EDT (Fri, 26 Aug 2011) >>> @@ -31,6 +31,7 @@ >>> #include "orte/runtime/orte_globals.h" >>> #include "orte/runtime/orte_wait.h" >>> #include "orte/runtime/runtime.h" >>> +#include "orte/runtime/data_type_support/orte_dt_support.h" >>> >>> #include "orte/mca/rml/base/rml_contact.h" >>> >>> @@ -145,7 +146,7 @@ >>> >>> if (proc->jobid == ORTE_JOBID_INVALID || >>> proc->vpid == ORTE_VPID_INVALID || >>> - proc->epoch == ORTE_EPOCH_INVALID) { >>> + 0 == ORTE_EPOCH_CMP(proc->epoch,ORTE_EPOCH_INVALID)) { >>> return ORTE_ERR_BAD_PARAM; >>> } >>> >>> @@ -214,7 +215,7 @@ >>> >>> if (target->jobid == ORTE_JOBID_INVALID || >>> target->vpid == ORTE_VPID_INVALID || >>> - target->epoch == ORTE_EPOCH_INVALID) { >>> + 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { >>> return ORTE_ERR_BAD_PARAM; >>> } >>> >>> @@ -272,7 +273,7 @@ >>> ORTE_NAME_PRINT(route))); >>> jfam->route.jobid = route->jobid; >>> jfam->route.vpid = route->vpid; >>> - jfam->route.epoch = route->epoch; >>> + ORTE_EPOCH_SET(jfam->route.epoch,route->epoch); >>> return ORTE_SUCCESS; >>> } >>> } >>> @@ -286,7 +287,7 @@ >>> 
jfam->job_family = jfamily; >>> jfam->route.jobid = route->jobid; >>> jfam->route.vpid = route->vpid; >>> - jfam->route.epoch = route->epoch; >>> + ORTE_EPOCH_SET(jfam->route.epoch,route->epoch); >>> opal_pointer_array_add(&orte_routed_jobfams, jfam); >>> return ORTE_SUCCESS; >>> } >>> @@ -310,7 +311,7 @@ >>> >>> if (target->jobid == ORTE_JOBID_INVALID || >>> target->vpid == ORTE_VPID_INVALID || >>> - target->epoch == ORTE_EPOCH_INVALID) { >>> + 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { >>> ret = ORTE_NAME_INVALID; >>> goto found; >>> } >>> @@ -413,8 +414,7 @@ >>> if (opal_bitmap_is_set_bit(&child->relatives, daemon.vpid)) { >>> /* yep - we need to step through this child */ >>> daemon.vpid = child->vpid; >>> - daemon.epoch = ORTE_EPOCH_INVALID; >>> - daemon.epoch = orte_ess.proc_get_epoch(&daemon); >>> + >>> ORTE_EPOCH_SET(daemon.epoch,orte_ess.proc_get_epoch(&daemon)); >>> ret = &daemon; >>> goto found; >>> } >>> @@ -425,8 +425,7 @@ >>> * any of our children, so we have to step up through our parent >>> */ >>> daemon.vpid = ORTE_PROC_MY_PARENT->vpid; >>> - daemon.epoch = ORTE_EPOCH_INVALID; >>> - daemon.epoch = orte_ess.proc_get_epoch(&daemon); >>> + ORTE_EPOCH_SET(daemon.epoch,orte_ess.proc_get_epoch(&daemon)); >>> >>> ret = &daemon; >>> >>> @@ -788,7 +787,7 @@ >>> */ >>> local_lifeline.jobid = proc->jobid; >>> local_lifeline.vpid = proc->vpid; >>> - local_lifeline.epoch = proc->epoch; >>> + ORTE_EPOCH_SET(local_lifeline.epoch,proc->epoch); >>> lifeline = &local_lifeline; >>> >>> return ORTE_SUCCESS; >>> @@ -881,8 +880,7 @@ >>> ORTE_PROC_MY_PARENT->vpid = (Ii-Sum) % NInPrevLevel; >>> ORTE_PROC_MY_PARENT->vpid += (Sum - NInPrevLevel); >>> } >>> - ORTE_PROC_MY_PARENT->epoch = ORTE_EPOCH_INVALID; >>> - ORTE_PROC_MY_PARENT->epoch = >>> orte_ess.proc_get_epoch(ORTE_PROC_MY_PARENT); >>> + >>> ORTE_EPOCH_SET(ORTE_PROC_MY_PARENT->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_PARENT)); >>> >>> /* compute my direct children and the bitmap that shows which 
vpids >>> * lie underneath their branch >>> >>> Modified: trunk/orte/mca/routed/slave/routed_slave.c >>> ============================================================================== >>> --- trunk/orte/mca/routed/slave/routed_slave.c (original) >>> +++ trunk/orte/mca/routed/slave/routed_slave.c 2011-08-26 18:16:14 >>> EDT (Fri, 26 Aug 2011) >>> @@ -26,6 +26,7 @@ >>> #include "orte/runtime/orte_globals.h" >>> #include "orte/runtime/orte_wait.h" >>> #include "orte/runtime/runtime.h" >>> +#include "orte/runtime/data_type_support/orte_dt_support.h" >>> >>> #include "orte/mca/rml/base/rml_contact.h" >>> >>> @@ -134,7 +135,7 @@ >>> >>> if (target->jobid == ORTE_JOBID_INVALID || >>> target->vpid == ORTE_VPID_INVALID || >>> - target->epoch == ORTE_EPOCH_INVALID) { >>> + 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { >>> ret = ORTE_NAME_INVALID; >>> } else { >>> /* a slave must always route via its parent daemon */ >>> @@ -275,8 +276,7 @@ >>> */ >>> local_lifeline.jobid = proc->jobid; >>> local_lifeline.vpid = proc->vpid; >>> - local_lifeline.epoch = ORTE_EPOCH_INVALID; >>> - local_lifeline.epoch = orte_ess.proc_get_epoch(&local_lifeline); >>> + >>> ORTE_EPOCH_SET(local_lifeline.epoch,orte_ess.proc_get_epoch(&local_lifeline)); >>> >>> lifeline = &local_lifeline; >>> >>> >>> Modified: trunk/orte/mca/sensor/file/sensor_file.c >>> ============================================================================== >>> --- trunk/orte/mca/sensor/file/sensor_file.c (original) >>> +++ trunk/orte/mca/sensor/file/sensor_file.c 2011-08-26 18:16:14 EDT (Fri, >>> 26 Aug 2011) >>> @@ -70,7 +70,9 @@ >>> opal_list_item_t super; >>> orte_jobid_t jobid; >>> orte_vpid_t vpid; >>> +#if ORTE_ENABLE_EPOCH >>> orte_epoch_t epoch; >>> +#endif >>> char *file; >>> int tick; >>> bool check_size; >>> >>> Modified: trunk/orte/mca/snapc/base/snapc_base_fns.c >>> ============================================================================== >>> --- trunk/orte/mca/snapc/base/snapc_base_fns.c 
(original) >>> +++ trunk/orte/mca/snapc/base/snapc_base_fns.c 2011-08-26 18:16:14 >>> EDT (Fri, 26 Aug 2011) >>> @@ -81,7 +81,7 @@ >>> { >>> snapshot->process_name.jobid = 0; >>> snapshot->process_name.vpid = 0; >>> - snapshot->process_name.epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(snapshot->process_name.epoch,ORTE_EPOCH_MIN); >>> >>> snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE; >>> >>> @@ -92,7 +92,7 @@ >>> { >>> snapshot->process_name.jobid = 0; >>> snapshot->process_name.vpid = 0; >>> - snapshot->process_name.epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(snapshot->process_name.epoch,ORTE_EPOCH_MIN); >>> >>> snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE; >>> >>> >>> Modified: trunk/orte/mca/snapc/full/snapc_full_global.c >>> ============================================================================== >>> --- trunk/orte/mca/snapc/full/snapc_full_global.c (original) >>> +++ trunk/orte/mca/snapc/full/snapc_full_global.c 2011-08-26 18:16:14 >>> EDT (Fri, 26 Aug 2011) >>> @@ -427,7 +427,7 @@ >>> new_proc = OBJ_NEW(orte_proc_t); >>> new_proc->name.jobid = proc->name.jobid; >>> new_proc->name.vpid = proc->name.vpid; >>> - new_proc->name.epoch = proc->name.epoch; >>> + ORTE_EPOCH_SET(new_proc->name.epoch,proc->name.epoch); >>> new_proc->node = OBJ_NEW(orte_node_t); >>> new_proc->node->name = proc->node->name; >>> opal_list_append(migrating_procs, &new_proc->super); >>> @@ -618,7 +618,7 @@ >>> >>> orted_snapshot->process_name.jobid = cur_node->daemon->name.jobid; >>> orted_snapshot->process_name.vpid = cur_node->daemon->name.vpid; >>> - orted_snapshot->process_name.epoch = cur_node->daemon->name.epoch; >>> + >>> ORTE_EPOCH_SET(orted_snapshot->process_name.epoch,cur_node->daemon->name.epoch); >>> >>> mask = ORTE_NS_CMP_JOBID; >>> >>> @@ -636,7 +636,7 @@ >>> >>> app_snapshot->process_name.jobid = procs[p]->name.jobid; >>> app_snapshot->process_name.vpid = procs[p]->name.vpid; >>> - app_snapshot->process_name.epoch = procs[p]->name.epoch; >>> + >>> 
ORTE_EPOCH_SET(app_snapshot->process_name.epoch,procs[p]->name.epoch); >>> >>> opal_list_append(&(orted_snapshot->super.local_snapshots), >>> &(app_snapshot->super)); >>> } >>> @@ -800,7 +800,7 @@ >>> >>> app_snapshot->process_name.jobid = procs[p]->name.jobid; >>> app_snapshot->process_name.vpid = procs[p]->name.vpid; >>> - app_snapshot->process_name.epoch = procs[p]->name.epoch; >>> + >>> ORTE_EPOCH_SET(app_snapshot->process_name.epoch,procs[p]->name.epoch); >>> >>> opal_list_append(&(orted_snapshot->super.local_snapshots), >>> &(app_snapshot->super)); >>> } >>> @@ -816,7 +816,7 @@ >>> >>> orted_snapshot->process_name.jobid = cur_node->daemon->name.jobid; >>> orted_snapshot->process_name.vpid = cur_node->daemon->name.vpid; >>> - orted_snapshot->process_name.epoch = cur_node->daemon->name.epoch; >>> + >>> ORTE_EPOCH_SET(orted_snapshot->process_name.epoch,cur_node->daemon->name.epoch); >>> >>> mask = ORTE_NS_CMP_ALL; >>> >>> @@ -837,7 +837,7 @@ >>> >>> app_snapshot->process_name.jobid = procs[p]->name.jobid; >>> app_snapshot->process_name.vpid = procs[p]->name.vpid; >>> - app_snapshot->process_name.epoch = procs[p]->name.epoch; >>> + >>> ORTE_EPOCH_SET(app_snapshot->process_name.epoch,procs[p]->name.epoch); >>> >>> opal_list_append(&(orted_snapshot->super.local_snapshots), >>> &(app_snapshot->super)); >>> } >>> >>> Modified: trunk/orte/mca/snapc/full/snapc_full_local.c >>> ============================================================================== >>> --- trunk/orte/mca/snapc/full/snapc_full_local.c (original) >>> +++ trunk/orte/mca/snapc/full/snapc_full_local.c 2011-08-26 18:16:14 >>> EDT (Fri, 26 Aug 2011) >>> @@ -2033,7 +2033,7 @@ >>> vpid_snapshot->process_pid = child->pid; >>> vpid_snapshot->super . process_name . jobid = child->name->jobid; >>> vpid_snapshot->super . process_name . vpid = child->name->vpid; >>> - vpid_snapshot->super . process_name . epoch = >>> child->name->epoch; >>> + ORTE_EPOCH_SET(vpid_snapshot->super . process_name . 
>>> epoch,child->name->epoch); >>> } >>> } >>> >>> @@ -2095,7 +2095,7 @@ >>> vpid_snapshot->process_pid = child->pid; >>> vpid_snapshot->super . process_name . jobid = child->name->jobid; >>> vpid_snapshot->super . process_name . vpid = child->name->vpid; >>> - vpid_snapshot->super . process_name . epoch = >>> child->name->epoch; >>> + ORTE_EPOCH_SET(vpid_snapshot->super . process_name . >>> epoch,child->name->epoch); >>> /*vpid_snapshot->migrating = true;*/ >>> >>> opal_list_append(&(local_global_snapshot.local_snapshots), >>> &(vpid_snapshot->super.super)); >>> @@ -2111,7 +2111,7 @@ >>> vpid_snapshot->process_pid = child->pid; >>> vpid_snapshot->super . process_name . jobid = child->name->jobid; >>> vpid_snapshot->super . process_name . vpid = child->name->vpid; >>> - vpid_snapshot->super . process_name . epoch = >>> child->name->epoch; >>> + ORTE_EPOCH_SET(vpid_snapshot->super . process_name . >>> epoch,child->name->epoch); >>> } >>> } >>> >>> >>> Modified: trunk/orte/mca/snapc/full/snapc_full_module.c >>> ============================================================================== >>> --- trunk/orte/mca/snapc/full/snapc_full_module.c (original) >>> +++ trunk/orte/mca/snapc/full/snapc_full_module.c 2011-08-26 18:16:14 >>> EDT (Fri, 26 Aug 2011) >>> @@ -83,7 +83,7 @@ >>> void orte_snapc_full_orted_construct(orte_snapc_full_orted_snapshot_t >>> *snapshot) { >>> snapshot->process_name.jobid = 0; >>> snapshot->process_name.vpid = 0; >>> - snapshot->process_name.epoch = 0; >>> + ORTE_EPOCH_SET(snapshot->process_name.epoch,0); >>> >>> snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE; >>> } >>> @@ -91,7 +91,7 @@ >>> void orte_snapc_full_orted_destruct( orte_snapc_full_orted_snapshot_t >>> *snapshot) { >>> snapshot->process_name.jobid = 0; >>> snapshot->process_name.vpid = 0; >>> - snapshot->process_name.epoch = 0; >>> + ORTE_EPOCH_SET(snapshot->process_name.epoch,0); >>> >>> snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE; >>> } >>> >>> Modified: 
trunk/orte/mca/sstore/base/sstore_base_fns.c >>> ============================================================================== >>> --- trunk/orte/mca/sstore/base/sstore_base_fns.c (original) >>> +++ trunk/orte/mca/sstore/base/sstore_base_fns.c 2011-08-26 18:16:14 >>> EDT (Fri, 26 Aug 2011) >>> @@ -62,7 +62,7 @@ >>> { >>> snapshot->process_name.jobid = 0; >>> snapshot->process_name.vpid = 0; >>> - snapshot->process_name.epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(snapshot->process_name.epoch,ORTE_EPOCH_MIN); >>> >>> snapshot->crs_comp = NULL; >>> snapshot->compress_comp = NULL; >>> @@ -76,7 +76,7 @@ >>> { >>> snapshot->process_name.jobid = 0; >>> snapshot->process_name.vpid = 0; >>> - snapshot->process_name.epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(snapshot->process_name.epoch,ORTE_EPOCH_MIN); >>> >>> if( NULL != snapshot->crs_comp ) { >>> free(snapshot->crs_comp); >>> @@ -637,7 +637,7 @@ >>> >>> vpid_snapshot->process_name.jobid = proc.jobid; >>> vpid_snapshot->process_name.vpid = proc.vpid; >>> - vpid_snapshot->process_name.epoch = proc.epoch; >>> + ORTE_EPOCH_SET(vpid_snapshot->process_name.epoch,proc.epoch); >>> } >>> else if(0 == strncmp(token, SSTORE_METADATA_LOCAL_CRS_COMP_STR, >>> strlen(SSTORE_METADATA_LOCAL_CRS_COMP_STR))) { >>> vpid_snapshot->crs_comp = strdup(value); >>> >>> Modified: trunk/orte/mca/sstore/central/sstore_central_global.c >>> ============================================================================== >>> --- trunk/orte/mca/sstore/central/sstore_central_global.c (original) >>> +++ trunk/orte/mca/sstore/central/sstore_central_global.c 2011-08-26 >>> 18:16:14 EDT (Fri, 26 Aug 2011) >>> @@ -1216,8 +1216,7 @@ >>> >>> vpid_snapshot->process_name.jobid = handle_info->jobid; >>> vpid_snapshot->process_name.vpid = i; >>> - vpid_snapshot->process_name.epoch = ORTE_EPOCH_INVALID; >>> - vpid_snapshot->process_name.epoch = >>> orte_ess.proc_get_epoch(&vpid_snapshot->process_name); >>> + >>> 
ORTE_EPOCH_SET(vpid_snapshot->process_name.epoch,orte_ess.proc_get_epoch(&vpid_snapshot->process_name)); >>> >>> vpid_snapshot->crs_comp = NULL; >>> global_snapshot->start_time = NULL; >>> >>> Modified: trunk/orte/mca/sstore/central/sstore_central_local.c >>> ============================================================================== >>> --- trunk/orte/mca/sstore/central/sstore_central_local.c (original) >>> +++ trunk/orte/mca/sstore/central/sstore_central_local.c 2011-08-26 >>> 18:16:14 EDT (Fri, 26 Aug 2011) >>> @@ -210,7 +210,7 @@ >>> { >>> info->name.jobid = ORTE_JOBID_INVALID; >>> info->name.vpid = ORTE_VPID_INVALID; >>> - info->name.epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(info->name.epoch,ORTE_EPOCH_MIN); >>> >>> info->local_location = NULL; >>> info->metadata_filename = NULL; >>> @@ -222,7 +222,7 @@ >>> { >>> info->name.jobid = ORTE_JOBID_INVALID; >>> info->name.vpid = ORTE_VPID_INVALID; >>> - info->name.epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(info->name.epoch,ORTE_EPOCH_MIN); >>> >>> if( NULL != info->local_location ) { >>> free(info->local_location); >>> @@ -535,7 +535,7 @@ >>> >>> app_info->name.jobid = name->jobid; >>> app_info->name.vpid = name->vpid; >>> - app_info->name.epoch = name->epoch; >>> + ORTE_EPOCH_SET(app_info->name.epoch,name->epoch); >>> >>> opal_list_append(handle_info->app_info_handle, &(app_info->super)); >>> >>> >>> Modified: trunk/orte/mca/sstore/stage/sstore_stage_global.c >>> ============================================================================== >>> --- trunk/orte/mca/sstore/stage/sstore_stage_global.c (original) >>> +++ trunk/orte/mca/sstore/stage/sstore_stage_global.c 2011-08-26 18:16:14 >>> EDT (Fri, 26 Aug 2011) >>> @@ -1218,10 +1218,10 @@ >>> p_set = OBJ_NEW(orte_filem_base_process_set_t); >>> p_set->source.jobid = peer->jobid; >>> p_set->source.vpid = peer->vpid; >>> - p_set->source.epoch = peer->epoch; >>> + ORTE_EPOCH_SET(p_set->source.epoch,peer->epoch); >>> p_set->sink.jobid = ORTE_PROC_MY_NAME->jobid; 
>>> p_set->sink.vpid = ORTE_PROC_MY_NAME->vpid; >>> - p_set->sink.epoch = ORTE_PROC_MY_NAME->epoch; >>> + ORTE_EPOCH_SET(p_set->sink.epoch,ORTE_PROC_MY_NAME->epoch); >>> opal_list_append(&(filem_request->process_sets), &(p_set->super) ); >>> } >>> >>> @@ -1706,8 +1706,7 @@ >>> >>> vpid_snapshot->process_name.jobid = handle_info->jobid; >>> vpid_snapshot->process_name.vpid = i; >>> - vpid_snapshot->process_name.epoch = ORTE_EPOCH_INVALID; >>> - vpid_snapshot->process_name.epoch = >>> orte_ess.proc_get_epoch(&vpid_snapshot->process_name); >>> + >>> ORTE_EPOCH_SET(vpid_snapshot->process_name.epoch,orte_ess.proc_get_epoch(&vpid_snapshot->process_name)); >>> >>> /* JJH: Currently we do not have this information since we do not save >>> * individual vpid info in the Global SStore. It is in the metadata >>> >>> Modified: trunk/orte/mca/sstore/stage/sstore_stage_local.c >>> ============================================================================== >>> --- trunk/orte/mca/sstore/stage/sstore_stage_local.c (original) >>> +++ trunk/orte/mca/sstore/stage/sstore_stage_local.c 2011-08-26 18:16:14 >>> EDT (Fri, 26 Aug 2011) >>> @@ -287,7 +287,7 @@ >>> { >>> info->name.jobid = ORTE_JOBID_INVALID; >>> info->name.vpid = ORTE_VPID_INVALID; >>> - info->name.epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(info->name.epoch,ORTE_EPOCH_MIN); >>> >>> info->local_location = NULL; >>> info->compressed_local_location = NULL; >>> @@ -302,7 +302,7 @@ >>> { >>> info->name.jobid = ORTE_JOBID_INVALID; >>> info->name.vpid = ORTE_VPID_INVALID; >>> - info->name.epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(info->name.epoch,ORTE_EPOCH_MIN); >>> >>> if( NULL != info->local_location ) { >>> free(info->local_location); >>> @@ -1014,7 +1014,7 @@ >>> >>> app_info->name.jobid = name->jobid; >>> app_info->name.vpid = name->vpid; >>> - app_info->name.epoch = name->epoch; >>> + ORTE_EPOCH_SET(app_info->name.epoch,name->epoch); >>> >>> opal_list_append(handle_info->app_info_handle, &(app_info->super)); >>> >>> 
@@ -2057,17 +2057,17 @@ >>> /* if I am the HNP, then use me as the source */ >>> p_set->source.jobid = ORTE_PROC_MY_NAME->jobid; >>> p_set->source.vpid = ORTE_PROC_MY_NAME->vpid; >>> - p_set->source.epoch = ORTE_PROC_MY_NAME->epoch; >>> + ORTE_EPOCH_SET(p_set->source.epoch,ORTE_PROC_MY_NAME->epoch); >>> } >>> else { >>> /* otherwise, set the HNP as the source */ >>> p_set->source.jobid = ORTE_PROC_MY_HNP->jobid; >>> p_set->source.vpid = ORTE_PROC_MY_HNP->vpid; >>> - p_set->source.epoch = ORTE_PROC_MY_HNP->epoch; >>> + ORTE_EPOCH_SET(p_set->source.epoch,ORTE_PROC_MY_HNP->epoch); >>> } >>> p_set->sink.jobid = ORTE_PROC_MY_NAME->jobid; >>> p_set->sink.vpid = ORTE_PROC_MY_NAME->vpid; >>> - p_set->sink.epoch = ORTE_PROC_MY_NAME->epoch; >>> + ORTE_EPOCH_SET(p_set->sink.epoch,ORTE_PROC_MY_NAME->epoch); >>> opal_list_append(&(filem_request->process_sets), &(p_set->super) ); >>> >>> /* Define the file set */ >>> >>> Modified: trunk/orte/orted/orted_comm.c >>> ============================================================================== >>> --- trunk/orte/orted/orted_comm.c (original) >>> +++ trunk/orte/orted/orted_comm.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug >>> 2011) >>> @@ -123,18 +123,13 @@ >>> nm = (orte_routed_tree_t*)item; >>> >>> target.vpid = nm->vpid; >>> - target.epoch = orte_util_lookup_epoch(&target); >>> + ORTE_EPOCH_SET(target.epoch,orte_ess.proc_get_epoch(&target)); >>> >>> - if (!orte_util_proc_is_running(&target)) { >>> + if (!PROC_IS_RUNNING(&target)) { >>> continue; >>> } >>> >>> - target.epoch = ORTE_EPOCH_INVALID; >>> - if (ORTE_NODE_RANK_INVALID == (target.epoch = >>> orte_ess.proc_get_epoch(&target))) { >>> - /* If we are trying to send to a previously failed process it's >>> - * better to fail silently. 
*/ >>> - continue; >>> - } >>> + ORTE_EPOCH_SET(target.epoch,orte_ess.proc_get_epoch(&target)); >>> >>> OPAL_OUTPUT_VERBOSE((1, orte_debug_output, >>> "%s orte:daemon:send_relay sending relay msg to >>> %s", >>> @@ -422,7 +417,8 @@ >>> proct = OBJ_NEW(orte_proc_t); >>> proct->name.jobid = proc.jobid; >>> proct->name.vpid = proc.vpid; >>> - proct->name.epoch = proc.epoch; >>> + ORTE_EPOCH_SET(proct->name.epoch,proc.epoch); >>> + >>> opal_pointer_array_add(&procarray, proct); >>> num_replies++; >>> } >>> @@ -1059,7 +1055,9 @@ >>> orte_job_t *jdata; >>> orte_proc_t *proc; >>> orte_vpid_t vpid; >>> +#if ORTE_ENABLE_EPOCH >>> orte_epoch_t epoch; >>> +#endif >>> int32_t i, num_procs; >>> >>> /* setup the answer */ >>> @@ -1086,12 +1084,14 @@ >>> goto CLEANUP; >>> } >>> >>> +#if ORTE_ENABLE_EPOCH >>> /* unpack the epoch */ >>> n = 1; >>> if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &epoch, &n, >>> ORTE_EPOCH))) { >>> ORTE_ERROR_LOG(ret); >>> goto CLEANUP; >>> } >>> +#endif >>> >>> /* if they asked for a specific proc, then just get that info */ >>> if (ORTE_VPID_WILDCARD != vpid) { >>> @@ -1201,7 +1201,7 @@ >>> /* loop across all daemons */ >>> proc2.jobid = ORTE_PROC_MY_NAME->jobid; >>> for (proc2.vpid=1; proc2.vpid < >>> orte_process_info.num_procs; proc2.vpid++) { >>> - proc2.epoch = orte_util_lookup_epoch(&proc2); >>> + >>> ORTE_EPOCH_SET(proc2.epoch,orte_util_lookup_epoch(&proc2)); >>> >>> /* setup the cmd */ >>> relay_msg = OBJ_NEW(opal_buffer_t); >>> >>> Modified: trunk/orte/orted/orted_main.c >>> ============================================================================== >>> --- trunk/orte/orted/orted_main.c (original) >>> +++ trunk/orte/orted/orted_main.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug >>> 2011) >>> @@ -388,14 +388,14 @@ >>> orte_process_info.my_daemon_uri = orte_rml.get_contact_info(); >>> ORTE_PROC_MY_DAEMON->jobid = ORTE_PROC_MY_NAME->jobid; >>> ORTE_PROC_MY_DAEMON->vpid = ORTE_PROC_MY_NAME->vpid; >>> - ORTE_PROC_MY_DAEMON->epoch = 
ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(ORTE_PROC_MY_DAEMON->epoch,ORTE_EPOCH_MIN); >>> >>> /* if I am also the hnp, then update that contact info field too */ >>> if (ORTE_PROC_IS_HNP) { >>> orte_process_info.my_hnp_uri = orte_rml.get_contact_info(); >>> ORTE_PROC_MY_HNP->jobid = ORTE_PROC_MY_NAME->jobid; >>> ORTE_PROC_MY_HNP->vpid = ORTE_PROC_MY_NAME->vpid; >>> - ORTE_PROC_MY_HNP->epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(ORTE_PROC_MY_HNP->epoch,ORTE_EPOCH_MIN); >>> } >>> >>> /* setup the primary daemon command receive function */ >>> @@ -495,7 +495,8 @@ >>> proc = OBJ_NEW(orte_proc_t); >>> proc->name.jobid = jdata->jobid; >>> proc->name.vpid = 0; >>> - proc->name.epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN); >>> + >>> proc->state = ORTE_PROC_STATE_RUNNING; >>> proc->app_idx = 0; >>> proc->node = nodes[0]; /* hnp node must be there */ >>> >>> Modified: trunk/orte/runtime/data_type_support/orte_dt_compare_fns.c >>> ============================================================================== >>> --- trunk/orte/runtime/data_type_support/orte_dt_compare_fns.c >>> (original) >>> +++ trunk/orte/runtime/data_type_support/orte_dt_compare_fns.c >>> 2011-08-26 18:16:14 EDT (Fri, 26 Aug 2011) >>> @@ -76,6 +76,7 @@ >>> } >>> } >>> >>> +#if ORTE_ENABLE_EPOCH >>> /** check the epochs - if one of them is WILDCARD, then ignore >>> * this field since anything is okay >>> */ >>> @@ -87,6 +88,7 @@ >>> return OPAL_VALUE1_GREATER; >>> } >>> } >>> +#endif >>> >>> /** only way to get here is if all fields are equal or WILDCARD */ >>> return OPAL_EQUAL; >>> @@ -122,6 +124,7 @@ >>> return OPAL_EQUAL; >>> } >>> >>> +#if ORTE_ENABLE_EPOCH >>> int orte_dt_compare_epoch(orte_epoch_t *value1, >>> orte_epoch_t *value2, >>> opal_data_type_t type) >>> @@ -136,6 +139,7 @@ >>> >>> return OPAL_EQUAL; >>> } >>> +#endif >>> >>> #if !ORTE_DISABLE_FULL_SUPPORT >>> /** >>> >>> Modified: trunk/orte/runtime/data_type_support/orte_dt_copy_fns.c >>> 
============================================================================== >>> --- trunk/orte/runtime/data_type_support/orte_dt_copy_fns.c (original) >>> +++ trunk/orte/runtime/data_type_support/orte_dt_copy_fns.c 2011-08-26 >>> 18:16:14 EDT (Fri, 26 Aug 2011) >>> @@ -61,7 +61,7 @@ >>> >>> val->jobid = src->jobid; >>> val->vpid = src->vpid; >>> - val->epoch = src->epoch; >>> + ORTE_EPOCH_SET(val->epoch,src->epoch); >>> >>> *dest = val; >>> return ORTE_SUCCESS; >>> @@ -105,6 +105,7 @@ >>> return ORTE_SUCCESS; >>> } >>> >>> +#if ORTE_ENABLE_EPOCH >>> /* >>> * EPOCH >>> */ >>> @@ -123,6 +124,7 @@ >>> >>> return ORTE_SUCCESS; >>> } >>> +#endif >>> >>> #if !ORTE_DISABLE_FULL_SUPPORT >>> >>> >>> Modified: trunk/orte/runtime/data_type_support/orte_dt_packing_fns.c >>> ============================================================================== >>> --- trunk/orte/runtime/data_type_support/orte_dt_packing_fns.c >>> (original) >>> +++ trunk/orte/runtime/data_type_support/orte_dt_packing_fns.c >>> 2011-08-26 18:16:14 EDT (Fri, 26 Aug 2011) >>> @@ -58,7 +58,9 @@ >>> orte_process_name_t* proc; >>> orte_jobid_t *jobid; >>> orte_vpid_t *vpid; >>> +#if ORTE_ENABLE_EPOCH >>> orte_epoch_t *epoch; >>> +#endif >>> >>> /* collect all the jobids in a contiguous array */ >>> jobid = (orte_jobid_t*)malloc(num_vals * sizeof(orte_jobid_t)); >>> @@ -100,6 +102,7 @@ >>> } >>> free(vpid); >>> >>> +#if ORTE_ENABLE_EPOCH >>> /* Collect all the epochs in a contiguous array */ >>> epoch = (orte_epoch_t *) malloc(num_vals * sizeof(orte_epoch_t)); >>> if (NULL == epoch) { >>> @@ -118,6 +121,7 @@ >>> return rc; >>> } >>> free(epoch); >>> +#endif >>> >>> return ORTE_SUCCESS; >>> } >>> @@ -156,6 +160,7 @@ >>> return ret; >>> } >>> >>> +#if ORTE_ENABLE_EPOCH >>> /* >>> * EPOCH >>> */ >>> @@ -171,6 +176,7 @@ >>> >>> return ret; >>> } >>> +#endif >>> >>> #if !ORTE_DISABLE_FULL_SUPPORT >>> /* >>> >>> Modified: trunk/orte/runtime/data_type_support/orte_dt_print_fns.c >>> 
============================================================================== >>> --- trunk/orte/runtime/data_type_support/orte_dt_print_fns.c (original) >>> +++ trunk/orte/runtime/data_type_support/orte_dt_print_fns.c 2011-08-26 >>> 18:16:14 EDT (Fri, 26 Aug 2011) >>> @@ -125,8 +125,10 @@ >>> orte_dt_quick_print(output, "ORTE_STD_CNTR", prefix, src, >>> ORTE_STD_CNTR_T); >>> break; >>> >>> +#if ORTE_ENABLE_EPOCH >>> case ORTE_EPOCH: >>> orte_dt_quick_print(output, "ORTE_EPOCH", prefix, src, >>> ORTE_EPOCH_T); >>> +#endif >>> >>> case ORTE_VPID: >>> orte_dt_quick_print(output, "ORTE_VPID", prefix, src, >>> ORTE_VPID_T); >>> @@ -478,11 +480,21 @@ >>> if (orte_xml_output) { >>> /* need to create the output in XML format */ >>> if (0 == src->pid) { >>> +#if ORTE_ENABLE_EPOCH >>> asprintf(output, "%s<process rank=\"%s\" status=\"%s\" >>> epoch=\"%s\"/>\n", pfx2, >>> ORTE_VPID_PRINT(src->name.vpid), >>> orte_proc_state_to_str(src->state), ORTE_EPOCH_PRINT(src->name.epoch)); >>> +#else >>> + asprintf(output, "%s<process rank=\"%s\" status=\"%s\"/>\n", >>> pfx2, >>> + ORTE_VPID_PRINT(src->name.vpid), >>> orte_proc_state_to_str(src->state)); >>> +#endif >>> } else { >>> +#if ORTE_ENABLE_EPOCH >>> asprintf(output, "%s<process rank=\"%s\" pid=\"%d\" status=\"%s\" >>> epoch=\"%s\"/>\n", pfx2, >>> ORTE_VPID_PRINT(src->name.vpid), (int)src->pid, >>> orte_proc_state_to_str(src->state), ORTE_EPOCH_PRINT(src->name.epoch)); >>> +#else >>> + asprintf(output, "%s<process rank=\"%s\" pid=\"%d\" >>> status=\"%s\"/>\n", pfx2, >>> + ORTE_VPID_PRINT(src->name.vpid), (int)src->pid, >>> orte_proc_state_to_str(src->state)); >>> +#endif >>> } >>> free(pfx2); >>> return ORTE_SUCCESS; >>> @@ -490,10 +502,17 @@ >>> >>> if (!orte_devel_level_output) { >>> /* just print a very simple output for users */ >>> +#if ORTE_ENABLE_EPOCH >>> asprintf(&tmp, "\n%sProcess OMPI jobid: %s Process rank: %s Epoch: >>> %s", pfx2, >>> ORTE_JOBID_PRINT(src->name.jobid), >>> ORTE_VPID_PRINT(src->name.vpid), >>> 
ORTE_EPOCH_PRINT(src->name.epoch)); >>> +#else >>> + asprintf(&tmp, "\n%sProcess OMPI jobid: %s Process rank: %s Epoch: >>> %s", pfx2, >>> + ORTE_JOBID_PRINT(src->name.jobid), >>> + ORTE_VPID_PRINT(src->name.vpid)); >>> +#endif >>> + >>> /* set the return */ >>> *output = tmp; >>> free(pfx2); >>> >>> Modified: trunk/orte/runtime/data_type_support/orte_dt_size_fns.c >>> ============================================================================== >>> --- trunk/orte/runtime/data_type_support/orte_dt_size_fns.c (original) >>> +++ trunk/orte/runtime/data_type_support/orte_dt_size_fns.c 2011-08-26 >>> 18:16:14 EDT (Fri, 26 Aug 2011) >>> @@ -45,9 +45,11 @@ >>> *size = sizeof(orte_std_cntr_t); >>> break; >>> >>> +#if ORTE_ENABLE_EPOCH >>> case ORTE_EPOCH: >>> *size = sizeof(orte_epoch_t); >>> break; >>> +#endif >>> >>> case ORTE_VPID: >>> *size = sizeof(orte_vpid_t); >>> >>> Modified: trunk/orte/runtime/data_type_support/orte_dt_support.h >>> ============================================================================== >>> --- trunk/orte/runtime/data_type_support/orte_dt_support.h (original) >>> +++ trunk/orte/runtime/data_type_support/orte_dt_support.h 2011-08-26 >>> 18:16:14 EDT (Fri, 26 Aug 2011) >>> @@ -52,9 +52,14 @@ >>> int orte_dt_compare_vpid(orte_vpid_t *value1, >>> orte_vpid_t *value2, >>> opal_data_type_t type); >>> +#if ORTE_ENABLE_EPOCH >>> int orte_dt_compare_epoch(orte_epoch_t *value1, >>> orte_epoch_t *value2, >>> opal_data_type_t type); >>> +#define ORTE_EPOCH_CMP(n,m) ( (m) - (n) ) >>> +#else >>> +#define ORTE_EPOCH_CMP(n,m) ( 0 ) >>> +#endif >>> #if !ORTE_DISABLE_FULL_SUPPORT >>> int orte_dt_compare_job(orte_job_t *value1, orte_job_t *value2, >>> opal_data_type_t type); >>> int orte_dt_compare_node(orte_node_t *value1, orte_node_t *value2, >>> opal_data_type_t type); >>> @@ -86,7 +91,9 @@ >>> int orte_dt_copy_name(orte_process_name_t **dest, orte_process_name_t *src, >>> opal_data_type_t type); >>> int orte_dt_copy_jobid(orte_jobid_t **dest, 
orte_jobid_t *src, >>> opal_data_type_t type); >>> int orte_dt_copy_vpid(orte_vpid_t **dest, orte_vpid_t *src, >>> opal_data_type_t type); >>> +#if ORTE_ENABLE_EPOCH >>> int orte_dt_copy_epoch(orte_epoch_t **dest, orte_epoch_t *src, >>> opal_data_type_t type); >>> +#endif >>> #if !ORTE_DISABLE_FULL_SUPPORT >>> int orte_dt_copy_job(orte_job_t **dest, orte_job_t *src, opal_data_type_t >>> type); >>> int orte_dt_copy_node(orte_node_t **dest, orte_node_t *src, >>> opal_data_type_t type); >>> @@ -116,8 +123,10 @@ >>> int32_t num_vals, opal_data_type_t type); >>> int orte_dt_pack_vpid(opal_buffer_t *buffer, const void *src, >>> int32_t num_vals, opal_data_type_t type); >>> +#if ORTE_ENABLE_EPOCH >>> int orte_dt_pack_epoch(opal_buffer_t *buffer, const void *src, >>> int32_t num_vals, opal_data_type_t type); >>> +#endif >>> #if !ORTE_DISABLE_FULL_SUPPORT >>> int orte_dt_pack_job(opal_buffer_t *buffer, const void *src, >>> int32_t num_vals, opal_data_type_t type); >>> @@ -185,8 +194,10 @@ >>> int32_t *num_vals, opal_data_type_t type); >>> int orte_dt_unpack_vpid(opal_buffer_t *buffer, void *dest, >>> int32_t *num_vals, opal_data_type_t type); >>> +#if ORTE_ENABLE_EPOCH >>> int orte_dt_unpack_epoch(opal_buffer_t *buffer, void *dest, >>> int32_t *num_vals, opal_data_type_t type); >>> +#endif >>> #if !ORTE_DISABLE_FULL_SUPPORT >>> int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest, >>> int32_t *num_vals, opal_data_type_t type); >>> >>> Modified: trunk/orte/runtime/data_type_support/orte_dt_unpacking_fns.c >>> ============================================================================== >>> --- trunk/orte/runtime/data_type_support/orte_dt_unpacking_fns.c >>> (original) >>> +++ trunk/orte/runtime/data_type_support/orte_dt_unpacking_fns.c >>> 2011-08-26 18:16:14 EDT (Fri, 26 Aug 2011) >>> @@ -54,7 +54,9 @@ >>> orte_process_name_t* proc; >>> orte_jobid_t *jobid; >>> orte_vpid_t *vpid; >>> +#if ORTE_ENABLE_EPOCH >>> orte_epoch_t *epoch; >>> +#endif >>> >>> num = *num_vals; 
>>> >>> @@ -92,6 +94,7 @@ >>> return rc; >>> } >>> >>> +#if ORTE_ENABLE_EPOCH >>> /* collect all the epochs in a contiguous array */ >>> epoch= (orte_epoch_t*)malloc(num * sizeof(orte_epoch_t)); >>> if (NULL == epoch) { >>> @@ -109,18 +112,21 @@ >>> free(jobid); >>> return rc; >>> } >>> +#endif >>> >>> /* build the names from the jobid/vpid/epoch arrays */ >>> proc = (orte_process_name_t*)dest; >>> for (i=0; i < num; i++) { >>> proc->jobid = jobid[i]; >>> proc->vpid = vpid[i]; >>> - proc->epoch = epoch[i]; >>> + ORTE_EPOCH_SET(proc->epoch,epoch[i]); >>> proc++; >>> } >>> >>> /* cleanup */ >>> +#if ORTE_ENABLE_EPOCH >>> free(epoch); >>> +#endif >>> free(vpid); >>> free(jobid); >>> >>> @@ -159,6 +165,7 @@ >>> return ret; >>> } >>> >>> +#if ORTE_ENABLE_EPOCH >>> /* >>> * EPOCH >>> */ >>> @@ -174,6 +181,7 @@ >>> >>> return ret; >>> } >>> +#endif >>> >>> #if !ORTE_DISABLE_FULL_SUPPORT >>> /* >>> >>> Modified: trunk/orte/runtime/orte_data_server.c >>> ============================================================================== >>> --- trunk/orte/runtime/orte_data_server.c (original) >>> +++ trunk/orte/runtime/orte_data_server.c 2011-08-26 18:16:14 EDT (Fri, >>> 26 Aug 2011) >>> @@ -220,7 +220,7 @@ >>> data->port = port_name; >>> data->owner.jobid = sender->jobid; >>> data->owner.vpid = sender->vpid; >>> - data->owner.epoch = sender->epoch; >>> + ORTE_EPOCH_SET(data->owner.epoch,sender->epoch); >>> >>> /* store the data */ >>> data->index = opal_pointer_array_add(orte_data_server_store, >>> data); >>> >>> Modified: trunk/orte/runtime/orte_globals.c >>> ============================================================================== >>> --- trunk/orte/runtime/orte_globals.c (original) >>> +++ trunk/orte/runtime/orte_globals.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug >>> 2011) >>> @@ -277,6 +277,7 @@ >>> return rc; >>> } >>> >>> +#if ORTE_ENABLE_EPOCH >>> tmp = ORTE_EPOCH; >>> if (ORTE_SUCCESS != (rc = opal_dss.register_type(orte_dt_pack_epoch, >>> orte_dt_unpack_epoch, >>> @@ 
-290,6 +291,7 @@ >>> ORTE_ERROR_LOG(rc); >>> return rc; >>> } >>> +#endif >>> >>> #if !ORTE_DISABLE_FULL_SUPPORT >>> tmp = ORTE_JOB; >>> @@ -933,7 +935,7 @@ >>> proc->beat = 0; >>> OBJ_CONSTRUCT(&proc->stats, opal_ring_buffer_t); >>> opal_ring_buffer_init(&proc->stats, orte_stat_history_size); >>> - proc->name.epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN); >>> #if OPAL_ENABLE_FT_CR == 1 >>> proc->ckpt_state = 0; >>> proc->ckpt_snapshot_ref = NULL; >>> >>> Modified: trunk/orte/runtime/orte_init.c >>> ============================================================================== >>> --- trunk/orte/runtime/orte_init.c (original) >>> +++ trunk/orte/runtime/orte_init.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug >>> 2011) >>> @@ -57,8 +57,17 @@ >>> char *orte_prohibited_session_dirs = NULL; >>> bool orte_create_session_dirs = true; >>> >>> +#if ORTE_ENABLE_EPOCH >>> +orte_process_name_t orte_name_wildcard = {ORTE_JOBID_WILDCARD, >>> ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD}; >>> +#else >>> orte_process_name_t orte_name_wildcard = {ORTE_JOBID_WILDCARD, >>> ORTE_VPID_WILDCARD}; >>> +#endif >>> + >>> +#if ORTE_ENABLE_EPOCH >>> +orte_process_name_t orte_name_invalid = {ORTE_JOBID_INVALID, >>> ORTE_VPID_INVALID, ORTE_EPOCH_INVALID}; >>> +#else >>> orte_process_name_t orte_name_invalid = {ORTE_JOBID_INVALID, >>> ORTE_VPID_INVALID}; >>> +#endif >>> >>> >>> #if OPAL_CC_USE_PRAGMA_IDENT >>> >>> Modified: trunk/orte/runtime/orte_wait.h >>> ============================================================================== >>> --- trunk/orte/runtime/orte_wait.h (original) >>> +++ trunk/orte/runtime/orte_wait.h 2011-08-26 18:16:14 EDT (Fri, 26 Aug >>> 2011) >>> @@ -204,7 +204,7 @@ >>> mev = OBJ_NEW(orte_message_event_t); \ >>> mev->sender.jobid = (sndr)->jobid; \ >>> mev->sender.vpid = (sndr)->vpid; \ >>> - mev->sender.epoch = (sndr)->epoch; \ >>> + ORTE_EPOCH_SET(mev->sender.epoch,(sndr)->epoch); \ >>> opal_dss.copy_payload(mev->buffer, (buf)); \ >>> mev->tag = 
(tg); \ >>> mev->file = strdup((buf)->parent.cls_init_file_name); \ >>> @@ -228,7 +228,7 @@ >>> mev = OBJ_NEW(orte_message_event_t); \ >>> mev->sender.jobid = (sndr)->jobid; \ >>> mev->sender.vpid = (sndr)->vpid; \ >>> - mev->sender.epoch = (sndr)->epoch; \ >>> + ORTE_EPOCH_SET(mev->sender.epoch,(sndr)->epoch); \ >>> opal_dss.copy_payload(mev->buffer, (buf)); \ >>> mev->tag = (tg); \ >>> opal_event_evtimer_set(opal_event_base, \ >>> @@ -258,7 +258,7 @@ >>> tmp = OBJ_NEW(orte_notify_event_t); \ >>> tmp->proc.jobid = (data)->jobid; \ >>> tmp->proc.vpid = (data)->vpid; \ >>> - tmp->proc.epoch = (data)->epoch; \ >>> + ORTE_EPOCH_SET(tmp->proc.epoch,(data)->epoch); \ >>> opal_event.evtimer_set(opal_event_base, \ >>> tmp->ev, (cbfunc), tmp); \ >>> now.tv_sec = 0; \ >>> >>> Modified: trunk/orte/test/system/oob_stress.c >>> ============================================================================== >>> --- trunk/orte/test/system/oob_stress.c (original) >>> +++ trunk/orte/test/system/oob_stress.c 2011-08-26 18:16:14 EDT (Fri, >>> 26 Aug 2011) >>> @@ -74,8 +74,7 @@ >>> >>> for (j=1; j < count+1; j++) { >>> peer.vpid = (ORTE_PROC_MY_NAME->vpid + j) % >>> orte_process_info.num_procs; >>> - peer.epoch = ORTE_EPOCH_INVALID; >>> - peer.epoch = orte_ess.proc_get_epoch(&peer); >>> + ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); >>> >>> /* rank0 starts ring */ >>> if (ORTE_PROC_MY_NAME->vpid == 0) { >>> >>> Modified: trunk/orte/test/system/orte_ring.c >>> ============================================================================== >>> --- trunk/orte/test/system/orte_ring.c (original) >>> +++ trunk/orte/test/system/orte_ring.c 2011-08-26 18:16:14 EDT (Fri, >>> 26 Aug 2011) >>> @@ -41,16 +41,14 @@ >>> if( right_peer_orte_name.vpid >= num_peers ) { >>> right_peer_orte_name.vpid = 0; >>> } >>> - right_peer_orte_name.epoch = ORTE_EPOCH_INVALID; >>> - right_peer_orte_name.epoch = >>> orte_ess.proc_get_epoch(&right_peer_orte_name); >>> + >>> 
ORTE_EPOCH_SET(right_peer_orte_name.epoch,orte_ess.proc_get_epoch(&right_peer_orte_name)); >>> >>> left_peer_orte_name.jobid = ORTE_PROC_MY_NAME->jobid; >>> left_peer_orte_name.vpid = ORTE_PROC_MY_NAME->vpid - 1; >>> if( ORTE_PROC_MY_NAME->vpid == 0 ) { >>> left_peer_orte_name.vpid = num_peers - 1; >>> } >>> - left_peer_orte_name.epoch = ORTE_EPOCH_INVALID; >>> - left_peer_orte_name.epoch = >>> orte_ess.proc_get_epoch(&left_peer_orte_name); >>> + >>> ORTE_EPOCH_SET(left_peer_orte_name.epoch,orte_ess.proc_get_epoch(&left_peer_orte_name)); >>> >>> printf("My name is: %s -- PID %d\tMy Left Peer is %s\tMy Right Peer is >>> %s\n", >>> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), getpid(), >>> >>> Modified: trunk/orte/test/system/orte_spawn.c >>> ============================================================================== >>> --- trunk/orte/test/system/orte_spawn.c (original) >>> +++ trunk/orte/test/system/orte_spawn.c 2011-08-26 18:16:14 EDT (Fri, >>> 26 Aug 2011) >>> @@ -74,8 +74,8 @@ >>> for (i=0; i < app->num_procs; i++) { >>> name.vpid = i; >>> >>> - name.epoch = ORTE_EPOCH_INVALID; >>> - name.epoch = orte_ess.proc_get_epoch(&name); >>> + ORTE_EPOCH_SET(name.epoch,orte_ess.proc_get_epoch(&name)); >>> + >>> fprintf(stderr, "Parent: sending message to child %s\n", >>> ORTE_NAME_PRINT(&name)); >>> if (0 > (rc = orte_rml.send(&name, &msg, 1, MY_TAG, 0))) { >>> ORTE_ERROR_LOG(rc); >>> >>> Modified: trunk/orte/tools/orte-ps/orte-ps.c >>> ============================================================================== >>> --- trunk/orte/tools/orte-ps/orte-ps.c (original) >>> +++ trunk/orte/tools/orte-ps/orte-ps.c 2011-08-26 18:16:14 EDT (Fri, >>> 26 Aug 2011) >>> @@ -869,8 +869,14 @@ >>> } >>> >>> /* query the HNP for info on the procs in this job */ >>> - if (ORTE_SUCCESS != (ret = >>> orte_util_comm_query_proc_info(&(hnpinfo->hnp->name), job->jobid, >>> - >>> ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD, &cnt, &procs))) { >>> + if (ORTE_SUCCESS != (ret = >>> 
orte_util_comm_query_proc_info(&(hnpinfo->hnp->name), >>> + >>> job->jobid, >>> + >>> ORTE_VPID_WILDCARD, >>> +#if ORTE_ENABLE_EPOCH >>> + >>> ORTE_EPOCH_WILDCARD, >>> +#endif >>> + &cnt, >>> + >>> &procs))) { >>> ORTE_ERROR_LOG(ret); >>> } >>> job->procs->addr = (void**)procs; >>> >>> Modified: trunk/orte/tools/orte-top/orte-top.c >>> ============================================================================== >>> --- trunk/orte/tools/orte-top/orte-top.c (original) >>> +++ trunk/orte/tools/orte-top/orte-top.c 2011-08-26 18:16:14 EDT (Fri, >>> 26 Aug 2011) >>> @@ -471,7 +471,7 @@ >>> if (NULL == ranks) { >>> /* take all ranks */ >>> proc.vpid = ORTE_VPID_WILDCARD; >>> - proc.epoch = ORTE_EPOCH_WILDCARD; >>> + ORTE_EPOCH_SET(proc.epoch,ORTE_EPOCH_WILDCARD); >>> if (ORTE_SUCCESS != (ret = opal_dss.pack(&cmdbuf, &proc, 1, >>> ORTE_NAME))) { >>> ORTE_ERROR_LOG(ret); >>> goto cleanup; >>> >>> Modified: trunk/orte/util/comm/comm.c >>> ============================================================================== >>> --- trunk/orte/util/comm/comm.c (original) >>> +++ trunk/orte/util/comm/comm.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug >>> 2011) >>> @@ -433,8 +433,13 @@ >>> return ORTE_SUCCESS; >>> } >>> >>> +#if ORTE_ENABLE_EPOCH >>> int orte_util_comm_query_proc_info(const orte_process_name_t *hnp, >>> orte_jobid_t job, orte_vpid_t vpid, >>> orte_epoch_t epoch, int *num_procs, >>> orte_proc_t ***proc_info_array) >>> +#else >>> +int orte_util_comm_query_proc_info(const orte_process_name_t *hnp, >>> orte_jobid_t job, orte_vpid_t vpid, >>> + int *num_procs, orte_proc_t >>> ***proc_info_array) >>> +#endif >>> { >>> int ret; >>> int32_t cnt, cnt_procs, n; >>> @@ -463,11 +468,13 @@ >>> OBJ_RELEASE(cmd); >>> return ret; >>> } >>> +#if ORTE_ENABLE_EPOCH >>> if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &epoch, 1, ORTE_EPOCH))) { >>> ORTE_ERROR_LOG(ret); >>> OBJ_RELEASE(cmd); >>> return ret; >>> } >>> +#endif >>> /* define a max time to wait for send to complete */ >>> timer_fired 
= false; >>> error_exit = ORTE_SUCCESS; >>> >>> Modified: trunk/orte/util/comm/comm.h >>> ============================================================================== >>> --- trunk/orte/util/comm/comm.h (original) >>> +++ trunk/orte/util/comm/comm.h 2011-08-26 18:16:14 EDT (Fri, 26 Aug >>> 2011) >>> @@ -52,7 +52,10 @@ >>> int *num_nodes, orte_node_t >>> ***node_info_array); >>> >>> ORTE_DECLSPEC int orte_util_comm_query_proc_info(const orte_process_name_t >>> *hnp, orte_jobid_t job, orte_vpid_t vpid, >>> - orte_epoch_t epoch, int >>> *num_procs, orte_proc_t ***proc_info_array); >>> +#if ORTE_ENABLE_EPOCH >>> + orte_epoch_t epoch, >>> +#endif >>> + int *num_procs, >>> orte_proc_t ***proc_info_array); >>> >>> ORTE_DECLSPEC int orte_util_comm_spawn_job(const orte_process_name_t *hnp, >>> orte_job_t *jdata); >>> >>> >>> Modified: trunk/orte/util/hnp_contact.c >>> ============================================================================== >>> --- trunk/orte/util/hnp_contact.c (original) >>> +++ trunk/orte/util/hnp_contact.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug >>> 2011) >>> @@ -55,7 +55,8 @@ >>> { >>> ptr->name.jobid = ORTE_JOBID_INVALID; >>> ptr->name.vpid = ORTE_VPID_INVALID; >>> - ptr->name.epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(ptr->name.epoch,ORTE_EPOCH_MIN); >>> + >>> ptr->rml_uri = NULL; >>> } >>> static void orte_hnp_contact_destruct(orte_hnp_contact_t *ptr) >>> >>> Modified: trunk/orte/util/name_fns.c >>> ============================================================================== >>> --- trunk/orte/util/name_fns.c (original) >>> +++ trunk/orte/util/name_fns.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug >>> 2011) >>> @@ -46,7 +46,7 @@ >>> { >>> list->name.jobid = ORTE_JOBID_INVALID; >>> list->name.vpid = ORTE_VPID_INVALID; >>> - list->name.epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(list->name.epoch,ORTE_EPOCH_MIN); >>> } >>> >>> /* destructor - used to free any resources held by instance */ >>> @@ -116,7 +116,10 @@ >>> char* 
orte_util_print_name_args(const orte_process_name_t *name) >>> { >>> orte_print_args_buffers_t *ptr; >>> - char *job, *vpid, *epoch; >>> + char *job, *vpid; >>> +#if ORTE_ENABLE_EPOCH >>> + char *epoch; >>> +#endif >>> >>> /* protect against NULL names */ >>> if (NULL == name) { >>> @@ -141,7 +144,7 @@ >>> */ >>> job = orte_util_print_jobids(name->jobid); >>> vpid = orte_util_print_vpids(name->vpid); >>> - epoch = orte_util_print_epoch(name->epoch); >>> + ORTE_EPOCH_SET(epoch,orte_util_print_epoch(name->epoch)); >>> >>> /* get the next buffer */ >>> ptr = get_print_name_buffer(); >>> @@ -156,9 +159,15 @@ >>> ptr->cntr = 0; >>> } >>> >>> +#if ORTE_ENABLE_EPOCH >>> snprintf(ptr->buffers[ptr->cntr++], >>> ORTE_PRINT_NAME_ARGS_MAX_SIZE, >>> "[%s,%s,%s]", job, vpid, epoch); >>> +#else >>> + snprintf(ptr->buffers[ptr->cntr++], >>> + ORTE_PRINT_NAME_ARGS_MAX_SIZE, >>> + "[%s,%s]", job, vpid); >>> +#endif >>> >>> return ptr->buffers[ptr->cntr-1]; >>> } >>> @@ -282,6 +291,7 @@ >>> return ptr->buffers[ptr->cntr-1]; >>> } >>> >>> +#if ORTE_ENABLE_EPOCH >>> char* orte_util_print_epoch(const orte_epoch_t epoch) >>> { >>> orte_print_args_buffers_t *ptr; >>> @@ -309,6 +319,7 @@ >>> } >>> return ptr->buffers[ptr->cntr-1]; >>> } >>> +#endif >>> >>> >>> >>> @@ -403,6 +414,7 @@ >>> return ORTE_SUCCESS; >>> } >>> >>> +#if ORTE_ENABLE_EPOCH >>> int orte_util_convert_epoch_to_string(char **epoch_string, const >>> orte_epoch_t epoch) >>> { >>> /* check for wildcard value - handle appropriately */ >>> @@ -425,7 +437,6 @@ >>> return ORTE_SUCCESS; >>> } >>> >>> - >>> int orte_util_convert_string_to_epoch(orte_epoch_t *epoch, const char* >>> epoch_string) >>> { >>> if (NULL == epoch_string) { /* got an error */ >>> @@ -450,6 +461,7 @@ >>> >>> return ORTE_SUCCESS; >>> } >>> +#endif >>> >>> int orte_util_convert_string_to_process_name(orte_process_name_t *name, >>> const char* name_string) >>> @@ -457,13 +469,15 @@ >>> char *temp, *token; >>> orte_jobid_t job; >>> orte_vpid_t vpid; >>> +#if 
ORTE_ENABLE_EPOCH >>> orte_epoch_t epoch; >>> +#endif >>> int return_code=ORTE_SUCCESS; >>> - >>> + >>> /* set default */ >>> name->jobid = ORTE_JOBID_INVALID; >>> name->vpid = ORTE_VPID_INVALID; >>> - name->epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(name->epoch,ORTE_EPOCH_MIN); >>> >>> /* check for NULL string - error */ >>> if (NULL == name_string) { >>> @@ -510,6 +524,7 @@ >>> vpid = strtoul(token, NULL, 10); >>> } >>> >>> +#if ORTE_ENABLE_EPOCH >>> token = strtok(NULL, ORTE_SCHEMA_DELIMITER_STRING); /** get next field >>> -> epoch*/ >>> >>> /* check for error */ >>> @@ -528,10 +543,11 @@ >>> } else { >>> epoch = strtoul(token, NULL, 10); >>> } >>> +#endif >>> >>> name->jobid = job; >>> name->vpid = vpid; >>> - name->epoch = epoch; >>> + ORTE_EPOCH_SET(name->epoch,epoch); >>> >>> free(temp); >>> >>> @@ -568,6 +584,7 @@ >>> asprintf(&tmp2, "%s%c%lu", tmp, ORTE_SCHEMA_DELIMITER_CHAR, (unsigned >>> long)name->vpid); >>> } >>> >>> +#if ORTE_ENABLE_EPOCH >>> if (ORTE_EPOCH_WILDCARD == name->epoch) { >>> asprintf(name_string, "%s%c%s", tmp2, ORTE_SCHEMA_DELIMITER_CHAR, >>> ORTE_SCHEMA_WILDCARD_STRING); >>> } else if (ORTE_EPOCH_INVALID == name->epoch) { >>> @@ -575,6 +592,10 @@ >>> } else { >>> asprintf(name_string, "%s%c%lu", tmp2, ORTE_SCHEMA_DELIMITER_CHAR, >>> (unsigned long)name->epoch); >>> } >>> +#else >>> + asprintf(name_string, "%s", tmp2); >>> +#endif >>> + >>> free(tmp); >>> free(tmp2); >>> >>> @@ -585,8 +606,11 @@ >>> /**** CREATE PROCESS NAME ****/ >>> int orte_util_create_process_name(orte_process_name_t **name, >>> orte_jobid_t job, >>> - orte_vpid_t vpid, >>> - orte_epoch_t epoch) >>> + orte_vpid_t vpid >>> +#if ORTE_ENABLE_EPOCH >>> + ,orte_epoch_t epoch >>> +#endif >>> + ) >>> { >>> *name = NULL; >>> >>> @@ -598,7 +622,8 @@ >>> >>> (*name)->jobid = job; >>> (*name)->vpid = vpid; >>> - (*name)->epoch = epoch; >>> + ORTE_EPOCH_SET((*name)->epoch,epoch); >>> + >>> return ORTE_SUCCESS; >>> } >>> >>> @@ -655,6 +680,7 @@ >>> } >>> } >>> >>> +#if 
ORTE_ENABLE_EPOCH >>> /* Get here if jobid's and vpid's are equal, or not being checked. >>> * Now check epoch. >>> */ >>> @@ -666,6 +692,7 @@ >>> return OPAL_VALUE1_GREATER; >>> } >>> } >>> +#endif >>> >>> /* only way to get here is if all fields are being checked and are equal, >>> * or jobid not checked, but vpid equal, >>> >>> Modified: trunk/orte/util/name_fns.h >>> ============================================================================== >>> --- trunk/orte/util/name_fns.h (original) >>> +++ trunk/orte/util/name_fns.h 2011-08-26 18:16:14 EDT (Fri, 26 Aug >>> 2011) >>> @@ -61,9 +61,13 @@ >>> #define ORTE_VPID_PRINT(n) \ >>> orte_util_print_vpids(n) >>> >>> +#if ORTE_ENABLE_EPOCH >>> ORTE_DECLSPEC char* orte_util_print_epoch(const orte_epoch_t epoch); >>> #define ORTE_EPOCH_PRINT(n) \ >>> orte_util_print_epoch(n) >>> +#else >>> +#define ORTE_EPOCH_PRINT(n) >>> +#endif >>> >>> ORTE_DECLSPEC char* orte_util_print_job_family(const orte_jobid_t job); >>> #define ORTE_JOB_FAMILY_PRINT(n) \ >>> @@ -104,6 +108,24 @@ >>> #define ORTE_JOBID_IS_DAEMON(n) \ >>> !((n) & 0x0000ffff) >>> >>> +/* Macro for getting the epoch out of the process name */ >>> +#if ORTE_ENABLE_EPOCH >>> +#define ORTE_EPOCH_GET(n) \ >>> + ((n)->epoch) >>> +#else >>> +#define ORTE_EPOCH_GET(n) >>> +#endif >>> + >>> +/* Macro for setting the epoch in the process name */ >>> +#if ORTE_ENABLE_EPOCH >>> +#define ORTE_EPOCH_SET(n,m) \ >>> + ( (n) = (m) ) >>> +#else >>> +#define ORTE_EPOCH_SET(n,m) \ >>> + do { \ >>> + } while(0); >>> +#endif >>> + >>> /* List of names for general use */ >>> struct orte_namelist_t { >>> opal_list_item_t item; /**< Allows this item to be placed on a list >>> */ >>> @@ -117,16 +139,24 @@ >>> ORTE_DECLSPEC int orte_util_convert_string_to_jobid(orte_jobid_t *jobid, >>> const char* jobidstring); >>> ORTE_DECLSPEC int orte_util_convert_vpid_to_string(char **vpid_string, >>> const orte_vpid_t vpid); >>> ORTE_DECLSPEC int orte_util_convert_string_to_vpid(orte_vpid_t *vpid, 
const >>> char* vpidstring); >>> +#if ORTE_ENABLE_EPOCH >>> ORTE_DECLSPEC int orte_util_convert_epoch_to_string(char **epoch_string, >>> const orte_epoch_t epoch); >>> ORTE_DECLSPEC int orte_util_convert_string_to_epoch(orte_vpid_t *epoch, >>> const char* epochstring); >>> +#endif >>> ORTE_DECLSPEC int >>> orte_util_convert_string_to_process_name(orte_process_name_t *name, >>> const char* name_string); >>> ORTE_DECLSPEC int orte_util_convert_process_name_to_string(char** >>> name_string, >>> const orte_process_name_t *name); >>> +#if ORTE_ENABLE_EPOCH >>> ORTE_DECLSPEC int orte_util_create_process_name(orte_process_name_t **name, >>> orte_jobid_t job, >>> orte_vpid_t vpid, >>> orte_epoch_t epoch); >>> +#else >>> +ORTE_DECLSPEC int orte_util_create_process_name(orte_process_name_t **name, >>> + orte_jobid_t job, >>> + orte_vpid_t vpid); >>> +#endif >>> ORTE_DECLSPEC int orte_util_compare_name_fields(orte_ns_cmp_bitmask_t >>> fields, >>> const orte_process_name_t* name1, >>> const orte_process_name_t* name2); >>> >>> Modified: trunk/orte/util/nidmap.c >>> ============================================================================== >>> --- trunk/orte/util/nidmap.c (original) >>> +++ trunk/orte/util/nidmap.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug 2011) >>> @@ -249,7 +249,7 @@ >>> */ >>> /* construct the URI */ >>> proc.vpid = node->daemon; >>> - proc.epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(proc.epoch,ORTE_EPOCH_MIN); >>> >>> orte_util_convert_process_name_to_string(&proc_name, &proc); >>> asprintf(&uri, "%s;tcp://%s:%d", proc_name, addr, >>> (int)orte_process_info.my_port); >>> @@ -1001,6 +1001,7 @@ >>> } >>> #endif >>> >>> +#if ORTE_ENABLE_EPOCH >>> /* Look up the current epoch value that we have stored locally. 
>>> * >>> * Note that this will not ping the HNP to get the most up to date epoch >>> stored >>> @@ -1023,7 +1024,9 @@ >>> /*print_orte_job_data();*/ >>> return e; >>> } >>> +#endif >>> >>> +#if ORTE_RESIL_ORTE >>> bool orte_util_proc_is_running(orte_process_name_t *proc) { >>> int i; >>> unsigned int j; >>> @@ -1078,7 +1081,9 @@ >>> >>> return ORTE_ERROR; >>> } >>> +#endif >>> >>> +#if ORTE_ENABLE_EPOCH >>> /* >>> * This function performs both the get and set operations on the epoch for a >>> * sepcific process name. If the epoch passed into the function is >>> @@ -1091,6 +1096,11 @@ >>> orte_job_t *jdata; >>> orte_proc_t *pdata; >>> >>> + if (ORTE_JOBID_INVALID == proc->jobid || >>> + ORTE_VPID_INVALID == proc->vpid) { >>> + return ORTE_EPOCH_INVALID; >>> + } >>> + >>> /* Sanity check just to make sure we don't overwrite our existing >>> * orte_job_data. >>> */ >>> @@ -1165,4 +1175,5 @@ >>> return ORTE_EPOCH_MIN; >>> } >>> } >>> +#endif >>> >>> >>> Modified: trunk/orte/util/nidmap.h >>> ============================================================================== >>> --- trunk/orte/util/nidmap.h (original) >>> +++ trunk/orte/util/nidmap.h 2011-08-26 18:16:14 EDT (Fri, 26 Aug 2011) >>> @@ -48,11 +48,19 @@ >>> ORTE_DECLSPEC orte_pmap_t* orte_util_lookup_pmap(orte_process_name_t *proc); >>> ORTE_DECLSPEC orte_nid_t* orte_util_lookup_nid(orte_process_name_t *proc); >>> >>> +#if ORTE_ENABLE_EPOCH >>> ORTE_DECLSPEC orte_epoch_t orte_util_lookup_epoch(orte_process_name_t >>> *proc); >>> ORTE_DECLSPEC orte_epoch_t orte_util_set_epoch(orte_process_name_t *proc, >>> orte_epoch_t epoch); >>> +#endif >>> >>> ORTE_DECLSPEC int orte_util_set_proc_state(orte_process_name_t *proc, >>> orte_proc_state_t state); >>> + >>> +#if ORTE_RESIL_ORTE >>> +#define PROC_IS_RUNNING(n) orte_util_proc_is_running(n) >>> ORTE_DECLSPEC bool orte_util_proc_is_running(orte_process_name_t *proc); >>> +#else >>> +#define PROC_IS_RUNNING(n) ( true ) >>> +#endif >>> >>> ORTE_DECLSPEC int 
orte_util_encode_nodemap(opal_byte_object_t *boptr); >>> ORTE_DECLSPEC int orte_util_decode_nodemap(opal_byte_object_t *boptr); >>> @@ -72,5 +80,8 @@ >>> END_C_DECLS >>> >>> /* Local functions */ >>> +#if ORTE_ENABLE_EPOCH >>> orte_epoch_t get_epoch_from_orte_job_data(orte_process_name_t *proc, >>> orte_epoch_t epoch); >>> #endif >>> + >>> +#endif >>> >>> Modified: trunk/orte/util/proc_info.c >>> ============================================================================== >>> --- trunk/orte/util/proc_info.c (original) >>> +++ trunk/orte/util/proc_info.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug >>> 2011) >>> @@ -36,13 +36,19 @@ >>> >>> #include "orte/util/proc_info.h" >>> >>> +#if ORTE_ENABLE_EPOCH >>> +#define ORTE_NAME_INVALID {ORTE_JOBID_INVALID, ORTE_VPID_INVALID, >>> ORTE_EPOCH_MIN} >>> +#else >>> +#define ORTE_NAME_INVALID {ORTE_JOBID_INVALID, ORTE_VPID_INVALID} >>> +#endif >>> + >>> ORTE_DECLSPEC orte_proc_info_t orte_process_info = { >>> - /* .my_name = */ {ORTE_JOBID_INVALID, >>> ORTE_VPID_INVALID, ORTE_EPOCH_MIN}, >>> - /* .my_daemon = */ {ORTE_JOBID_INVALID, >>> ORTE_VPID_INVALID, ORTE_EPOCH_MIN}, >>> + /* .my_name = */ ORTE_NAME_INVALID, >>> + /* .my_daemon = */ ORTE_NAME_INVALID, >>> /* .my_daemon_uri = */ NULL, >>> - /* .my_hnp = */ {ORTE_JOBID_INVALID, >>> ORTE_VPID_INVALID, ORTE_EPOCH_MIN}, >>> + /* .my_hnp = */ ORTE_NAME_INVALID, >>> /* .my_hnp_uri = */ NULL, >>> - /* .my_parent = */ {ORTE_JOBID_INVALID, >>> ORTE_VPID_INVALID, ORTE_EPOCH_MIN}, >>> + /* .my_parent = */ ORTE_NAME_INVALID, >>> /* .hnp_pid = */ 0, >>> /* .app_num = */ 0, >>> /* .num_procs = */ 1, >>> >>> Modified: trunk/test/util/orte_session_dir.c >>> ============================================================================== >>> --- trunk/test/util/orte_session_dir.c (original) >>> +++ trunk/test/util/orte_session_dir.c 2011-08-26 18:16:14 EDT (Fri, >>> 26 Aug 2011) >>> @@ -57,7 +57,7 @@ >>> orte_process_info.my_name->cellid = 0; >>> orte_process_info.my_name->jobid = 0; >>> 
orte_process_info.my_name->vpid = 0; >>> - orte_process_info.my_name->epoch = ORTE_EPOCH_MIN; >>> + ORTE_EPOCH_SET(orte_process_info.my_name->epoch,ORTE_EPOCH_MIN); >>> >>> test_init("orte_session_dir_t"); >>> test_out = fopen( "test_session_dir_out", "w+" ); >>> _______________________________________________ >>> svn-full mailing list >>> svn-f...@open-mpi.org >>> hxxp://www.open-mpi.org/mailman/listinfo.cgi/svn-full >> >> >> _______________________________________________ >> devel mailing list >> de...@open-mpi.org >> hxxp://www.open-mpi.org/mailman/listinfo.cgi/devel >> > > > _______________________________________________ > devel mailing list > de...@open-mpi.org > http://www.open-mpi.org/mailman/listinfo.cgi/devel