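This has nothing to do with the Mac OS X version, George - the problem is that ORTE_ENABLE_EPOCH is never passed through AC_DEFINE. It is defined solely via AM_CONDITIONAL, which only creates a Makefile conditional and not a C preprocessor symbol, but it is then tested in .h files - which is simply wrong.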
Please fix it. On Aug 26, 2011, at 5:41 PM, George Bosilca wrote: > We can't reproduce this. It compiles and runs without troubles on our macs. > However, it might depend on the Mac OS X version, we recently moved to Lion. > > Thanks, > george. > > On Aug 26, 2011, at 19:19 , Ralph Castain wrote: > >> Hate to say this, but the trunk is broken - won't build on Mac with that >> disabled. I'll try to dig into it later :-( >> >> >> On Aug 26, 2011, at 4:18 PM, Wesley Bland wrote: >> >>> The epoch and resilient rote code is now macro'd away. To enable use >>> >>> --enable-resilient-orte >>> >>> which defines: >>> >>> ORTE_ENABLE_EPOCH >>> ORTE_RESIL_ORTE >>> >>> -- >>> >>> Wesley >>> >>> On Aug 26, 2011, at 6:16 PM, wbl...@osl.iu.edu wrote: >>> >>>> Author: wbland >>>> Date: 2011-08-26 18:16:14 EDT (Fri, 26 Aug 2011) >>>> New Revision: 25093 >>>> URL: https://svn.open-mpi.org/trac/ompi/changeset/25093 >>>> >>>> Log: >>>> By popular demand the epoch code is now disabled by default. >>>> >>>> To enable the epochs and the resilient orte code, use the configure flag: >>>> >>>> --enable-resilient-orte >>>> >>>> This will define both: >>>> >>>> ORTE_ENABLE_EPOCH >>>> ORTE_RESIL_ORTE >>>> >>>> Text files modified: >>>> trunk/ompi/mca/btl/openib/connect/btl_openib_connect_xoob.c | 12 ++++ >>>> >>>> trunk/ompi/mca/coll/sm2/coll_sm2_module.c | 3 >>>> >>>> trunk/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c | 49 >>>> ++++++++---------- >>>> trunk/ompi/mca/dpm/orte/dpm_orte.c | 2 >>>> >>>> trunk/ompi/mca/pml/bfo/pml_bfo_failover.c | 10 +-- >>>> >>>> trunk/ompi/mca/pml/bfo/pml_bfo_hdr.h | 6 -- >>>> >>>> trunk/ompi/proc/proc.c | 6 +- >>>> >>>> trunk/opal/config/opal_configure_options.m4 | 8 +++ >>>> >>>> trunk/orte/include/orte/types.h | 24 >>>> +++++++++ >>>> trunk/orte/mca/db/daemon/db_daemon.c | 2 >>>> >>>> trunk/orte/mca/errmgr/app/errmgr_app.c | 19 >>>> ++++++- >>>> trunk/orte/mca/errmgr/base/errmgr_base_fns.c | 12 ++-- >>>> >>>> trunk/orte/mca/errmgr/base/errmgr_base_tool.c | 6 +- >>>> 
>>>> trunk/orte/mca/errmgr/hnp/errmgr_hnp.c | 99 >>>> +++++++++++++++++++++++++++------------ >>>> trunk/orte/mca/errmgr/hnp/errmgr_hnp_autor.c | 6 +- >>>> >>>> trunk/orte/mca/errmgr/hnp/errmgr_hnp_crmig.c | 6 +- >>>> >>>> trunk/orte/mca/errmgr/orted/errmgr_orted.c | 71 >>>> +++++++++++++++++++++------- >>>> trunk/orte/mca/ess/alps/ess_alps_module.c | 4 >>>> >>>> trunk/orte/mca/ess/base/base.h | 4 + >>>> >>>> trunk/orte/mca/ess/base/ess_base_select.c | 14 ++--- >>>> >>>> trunk/orte/mca/ess/env/ess_env_module.c | 3 >>>> >>>> trunk/orte/mca/ess/ess.h | 4 + >>>> >>>> trunk/orte/mca/ess/generic/ess_generic_module.c | 6 +- >>>> >>>> trunk/orte/mca/ess/hnp/ess_hnp_module.c | 2 >>>> >>>> trunk/orte/mca/ess/lsf/ess_lsf_module.c | 3 >>>> >>>> trunk/orte/mca/ess/singleton/ess_singleton_module.c | 2 >>>> >>>> trunk/orte/mca/ess/slave/ess_slave_module.c | 3 >>>> >>>> trunk/orte/mca/ess/slurm/ess_slurm_module.c | 3 >>>> >>>> trunk/orte/mca/ess/slurmd/ess_slurmd_module.c | 4 >>>> >>>> trunk/orte/mca/ess/tm/ess_tm_module.c | 2 >>>> >>>> trunk/orte/mca/filem/rsh/filem_rsh_module.c | 6 +- >>>> >>>> trunk/orte/mca/grpcomm/base/grpcomm_base_coll.c | 21 >>>> ++----- >>>> trunk/orte/mca/grpcomm/hier/grpcomm_hier_module.c | 8 +- >>>> >>>> trunk/orte/mca/iof/base/base.h | 8 +- >>>> >>>> trunk/orte/mca/iof/base/iof_base_open.c | 2 >>>> >>>> trunk/orte/mca/iof/hnp/iof_hnp.c | 7 +- >>>> >>>> trunk/orte/mca/iof/hnp/iof_hnp_receive.c | 6 +- >>>> >>>> trunk/orte/mca/iof/orted/iof_orted.c | 2 >>>> >>>> trunk/orte/mca/odls/base/odls_base_default_fns.c | 7 +- >>>> >>>> trunk/orte/mca/odls/base/odls_base_open.c | 5 - >>>> >>>> trunk/orte/mca/odls/base/odls_base_state.c | 6 +- >>>> >>>> trunk/orte/mca/oob/tcp/oob_tcp_msg.c | 2 >>>> >>>> trunk/orte/mca/oob/tcp/oob_tcp_peer.c | 5 ++ >>>> >>>> trunk/orte/mca/plm/base/plm_base_jobid.c | 4 >>>> >>>> trunk/orte/mca/plm/base/plm_base_launch_support.c | 3 >>>> >>>> trunk/orte/mca/plm/base/plm_base_orted_cmds.c | 8 +-- >>>> >>>> 
trunk/orte/mca/plm/base/plm_base_receive.c | 7 ++ >>>> >>>> trunk/orte/mca/plm/base/plm_base_rsh_support.c | 4 + >>>> >>>> trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c | 23 >>>> +++++---- >>>> trunk/orte/mca/rmaps/rank_file/rmaps_rank_file.c | 3 >>>> >>>> trunk/orte/mca/rmaps/seq/rmaps_seq.c | 3 >>>> >>>> trunk/orte/mca/rmcast/base/rmcast_base_open.c | 6 +- >>>> >>>> trunk/orte/mca/rmcast/tcp/rmcast_tcp.c | 4 >>>> >>>> trunk/orte/mca/rmcast/udp/rmcast_udp.c | 4 >>>> >>>> trunk/orte/mca/rml/base/rml_base_components.c | 5 + >>>> >>>> trunk/orte/mca/rml/rml_types.h | 6 + >>>> >>>> trunk/orte/mca/routed/base/routed_base_components.c | 6 +- >>>> >>>> trunk/orte/mca/routed/base/routed_base_register_sync.c | 4 + >>>> >>>> trunk/orte/mca/routed/binomial/routed_binomial.c | 54 >>>> ++++++++++++--------- >>>> trunk/orte/mca/routed/cm/routed_cm.c | 19 >>>> +++---- >>>> trunk/orte/mca/routed/direct/routed_direct.c | 3 >>>> >>>> trunk/orte/mca/routed/linear/routed_linear.c | 17 >>>> +++--- >>>> trunk/orte/mca/routed/radix/routed_radix.c | 22 >>>> ++++---- >>>> trunk/orte/mca/routed/slave/routed_slave.c | 6 +- >>>> >>>> trunk/orte/mca/sensor/file/sensor_file.c | 2 >>>> >>>> trunk/orte/mca/snapc/base/snapc_base_fns.c | 4 >>>> >>>> trunk/orte/mca/snapc/full/snapc_full_global.c | 12 ++-- >>>> >>>> trunk/orte/mca/snapc/full/snapc_full_local.c | 6 +- >>>> >>>> trunk/orte/mca/snapc/full/snapc_full_module.c | 4 >>>> >>>> trunk/orte/mca/sstore/base/sstore_base_fns.c | 6 +- >>>> >>>> trunk/orte/mca/sstore/central/sstore_central_global.c | 3 >>>> >>>> trunk/orte/mca/sstore/central/sstore_central_local.c | 6 +- >>>> >>>> trunk/orte/mca/sstore/stage/sstore_stage_global.c | 7 +- >>>> >>>> trunk/orte/mca/sstore/stage/sstore_stage_local.c | 12 ++-- >>>> >>>> trunk/orte/orted/orted_comm.c | 20 >>>> ++++---- >>>> trunk/orte/orted/orted_main.c | 7 +- >>>> >>>> trunk/orte/runtime/data_type_support/orte_dt_compare_fns.c | 4 + >>>> >>>> trunk/orte/runtime/data_type_support/orte_dt_copy_fns.c | 
4 + >>>> >>>> trunk/orte/runtime/data_type_support/orte_dt_packing_fns.c | 6 ++ >>>> >>>> trunk/orte/runtime/data_type_support/orte_dt_print_fns.c | 19 >>>> +++++++ >>>> trunk/orte/runtime/data_type_support/orte_dt_size_fns.c | 2 >>>> >>>> trunk/orte/runtime/data_type_support/orte_dt_support.h | 11 ++++ >>>> >>>> trunk/orte/runtime/data_type_support/orte_dt_unpacking_fns.c | 10 +++ >>>> >>>> trunk/orte/runtime/orte_data_server.c | 2 >>>> >>>> trunk/orte/runtime/orte_globals.c | 4 + >>>> >>>> trunk/orte/runtime/orte_init.c | 9 +++ >>>> >>>> trunk/orte/runtime/orte_wait.h | 6 +- >>>> >>>> trunk/orte/test/system/oob_stress.c | 3 >>>> >>>> trunk/orte/test/system/orte_ring.c | 6 - >>>> >>>> trunk/orte/test/system/orte_spawn.c | 4 >>>> >>>> trunk/orte/tools/orte-ps/orte-ps.c | 10 +++ >>>> >>>> trunk/orte/tools/orte-top/orte-top.c | 2 >>>> >>>> trunk/orte/util/comm/comm.c | 7 ++ >>>> >>>> trunk/orte/util/comm/comm.h | 5 + >>>> >>>> trunk/orte/util/hnp_contact.c | 3 >>>> >>>> trunk/orte/util/name_fns.c | 47 >>>> ++++++++++++++---- >>>> trunk/orte/util/name_fns.h | 30 >>>> ++++++++++++ >>>> trunk/orte/util/nidmap.c | 13 ++++ >>>> >>>> trunk/orte/util/nidmap.h | 11 ++++ >>>> >>>> trunk/orte/util/proc_info.c | 14 ++++- >>>> >>>> trunk/test/util/orte_session_dir.c | 2 >>>> >>>> 101 files changed, 652 insertions(+), 362 deletions(-) >>>> >>>> Modified: trunk/ompi/mca/btl/openib/connect/btl_openib_connect_xoob.c >>>> ============================================================================== >>>> --- trunk/ompi/mca/btl/openib/connect/btl_openib_connect_xoob.c >>>> (original) >>>> +++ trunk/ompi/mca/btl/openib/connect/btl_openib_connect_xoob.c >>>> 2011-08-26 18:16:14 EDT (Fri, 26 Aug 2011) >>>> @@ -693,8 +693,16 @@ >>>> bool found = false; >>>> >>>> BTL_VERBOSE(("Searching for ep and proc with follow parameters:" >>>> - "jobid %d, vpid %d, epoch %d, sid %" PRIx64 ", lid %d", >>>> - process_name->jobid, process_name->vpid, >>>> process_name->epoch, subnet_id, lid)); >>>> + 
"jobid %d, vpid %d, " >>>> +#if ORTE_ENABLE_EPOCH >>>> + "epoch %d, " >>>> +#endif >>>> + "sid %" PRIx64 ", lid %d", >>>> + process_name->jobid, process_name->vpid, >>>> +#if ORTE_ENABLE_EPOCH >>>> + process_name->epoch, >>>> +#endif >>>> + subnet_id, lid)); >>>> /* find ibproc */ >>>> OPAL_THREAD_LOCK(&mca_btl_openib_component.ib_lock); >>>> for (ib_proc = (mca_btl_openib_proc_t*) >>>> >>>> Modified: trunk/ompi/mca/coll/sm2/coll_sm2_module.c >>>> ============================================================================== >>>> --- trunk/ompi/mca/coll/sm2/coll_sm2_module.c (original) >>>> +++ trunk/ompi/mca/coll/sm2/coll_sm2_module.c 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 2011) >>>> @@ -1208,7 +1208,8 @@ >>>> peer = OBJ_NEW(orte_namelist_t); >>>> peer->name.jobid = >>>> comm->c_local_group->grp_proc_pointers[i]->proc_name.jobid; >>>> peer->name.vpid = >>>> comm->c_local_group->grp_proc_pointers[i]->proc_name.vpid; >>>> - peer->name.epoch = >>>> comm->c_local_group->grp_proc_pointers[i]->proc_name.epoch; >>>> + >>>> ORTE_EPOCH_SET(peer->name.epoch,comm->c_local_group->grp_proc_pointers[i]->proc_name.epoch); >>>> + >>>> opal_list_append(&peers, &peer->item); >>>> } >>>> /* prepare send data */ >>>> >>>> Modified: trunk/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c >>>> ============================================================================== >>>> --- trunk/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c (original) >>>> +++ trunk/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 2011) >>>> @@ -702,7 +702,7 @@ >>>> void ompi_crcp_bkmrk_pml_peer_ref_construct(ompi_crcp_bkmrk_pml_peer_ref_t >>>> *peer_ref) { >>>> peer_ref->proc_name.jobid = ORTE_JOBID_INVALID; >>>> peer_ref->proc_name.vpid = ORTE_VPID_INVALID; >>>> - peer_ref->proc_name.epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(peer_ref->proc_name.epoch,ORTE_EPOCH_MIN); >>>> >>>> OBJ_CONSTRUCT(&peer_ref->send_list, opal_list_t); >>>> OBJ_CONSTRUCT(&peer_ref->isend_list, opal_list_t); >>>> @@ -730,7 
+730,7 @@ >>>> >>>> peer_ref->proc_name.jobid = ORTE_JOBID_INVALID; >>>> peer_ref->proc_name.vpid = ORTE_VPID_INVALID; >>>> - peer_ref->proc_name.epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(peer_ref->proc_name.epoch,ORTE_EPOCH_MIN); >>>> >>>> while( NULL != (item = opal_list_remove_first(&peer_ref->send_list)) ) { >>>> HOKE_TRAFFIC_MSG_REF_RETURN(item); >>>> @@ -840,7 +840,7 @@ >>>> >>>> msg_ref->proc_name.jobid = ORTE_JOBID_INVALID; >>>> msg_ref->proc_name.vpid = ORTE_VPID_INVALID; >>>> - msg_ref->proc_name.epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(msg_ref->proc_name.epoch,ORTE_EPOCH_MIN); >>>> >>>> msg_ref->matched = INVALID_INT; >>>> msg_ref->done = INVALID_INT; >>>> @@ -868,7 +868,7 @@ >>>> >>>> msg_ref->proc_name.jobid = ORTE_JOBID_INVALID; >>>> msg_ref->proc_name.vpid = ORTE_VPID_INVALID; >>>> - msg_ref->proc_name.epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(msg_ref->proc_name.epoch,ORTE_EPOCH_MIN); >>>> >>>> msg_ref->matched = INVALID_INT; >>>> msg_ref->done = INVALID_INT; >>>> @@ -902,7 +902,7 @@ >>>> >>>> msg_ref->proc_name.jobid = ORTE_JOBID_INVALID; >>>> msg_ref->proc_name.vpid = ORTE_VPID_INVALID; >>>> - msg_ref->proc_name.epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(msg_ref->proc_name.epoch,ORTE_EPOCH_MIN); >>>> >>>> msg_ref->done = INVALID_INT; >>>> msg_ref->active = INVALID_INT; >>>> @@ -934,7 +934,7 @@ >>>> >>>> msg_ref->proc_name.jobid = ORTE_JOBID_INVALID; >>>> msg_ref->proc_name.vpid = ORTE_VPID_INVALID; >>>> - msg_ref->proc_name.epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(msg_ref->proc_name.epoch,ORTE_EPOCH_MIN); >>>> >>>> msg_ref->done = INVALID_INT; >>>> msg_ref->active = INVALID_INT; >>>> @@ -954,7 +954,7 @@ >>>> >>>> msg_ack_ref->peer.jobid = ORTE_JOBID_INVALID; >>>> msg_ack_ref->peer.vpid = ORTE_VPID_INVALID; >>>> - msg_ack_ref->peer.epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(msg_ack_ref->peer.epoch,ORTE_EPOCH_MIN); >>>> } >>>> >>>> void ompi_crcp_bkmrk_pml_drain_message_ack_ref_destruct( >>>> 
ompi_crcp_bkmrk_pml_drain_message_ack_ref_t *msg_ack_ref) { >>>> @@ -962,7 +962,7 @@ >>>> >>>> msg_ack_ref->peer.jobid = ORTE_JOBID_INVALID; >>>> msg_ack_ref->peer.vpid = ORTE_VPID_INVALID; >>>> - msg_ack_ref->peer.epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(msg_ack_ref->peer.epoch,ORTE_EPOCH_MIN); >>>> } >>>> >>>> >>>> @@ -1015,7 +1015,7 @@ >>>> } >>>> >>>> >>>> -#define CREATE_NEW_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, >>>> v_rank, v_comm, p_jobid, p_vpid, p_epoch) \ >>>> +#define CREATE_NEW_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, >>>> v_rank, v_comm, p_jobid, p_vpid) \ >>>> { \ >>>> HOKE_TRAFFIC_MSG_REF_ALLOC(msg_ref, ret); \ >>>> \ >>>> @@ -1034,7 +1034,7 @@ >>>> \ >>>> msg_ref->proc_name.jobid = p_jobid; \ >>>> msg_ref->proc_name.vpid = p_vpid; \ >>>> - msg_ref->proc_name.epoch = p_epoch; \ >>>> + >>>> ORTE_EPOCH_SET(msg_ref->proc_name.epoch,orte_ess.proc_get_epoch(&(msg_ref->proc_name))); >>>> \ >>>> \ >>>> msg_ref->matched = 0; \ >>>> msg_ref->done = 0; \ >>>> @@ -1043,7 +1043,7 @@ >>>> msg_ref->active_drain = 0; \ >>>> } >>>> >>>> -#define CREATE_NEW_DRAIN_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, >>>> v_rank, v_comm, p_jobid, p_vpid, p_epoch) \ >>>> +#define CREATE_NEW_DRAIN_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, >>>> v_rank, v_comm, p_jobid, p_vpid) \ >>>> { \ >>>> HOKE_DRAIN_MSG_REF_ALLOC(msg_ref, ret); \ >>>> \ >>>> @@ -1063,7 +1063,7 @@ >>>> \ >>>> msg_ref->proc_name.jobid = p_jobid; \ >>>> msg_ref->proc_name.vpid = p_vpid; \ >>>> - msg_ref->proc_name.epoch = p_epoch; \ >>>> + >>>> ORTE_EPOCH_SET(msg_ref->proc_name.epoch,orte_ess.proc_get_epoch(&(msg_ref->proc_name))); >>>> \ >>>> } >>>> >>>> >>>> @@ -1466,7 +1466,7 @@ >>>> >>>> new_peer_ref->proc_name.jobid = procs[i]->proc_name.jobid; >>>> new_peer_ref->proc_name.vpid = procs[i]->proc_name.vpid; >>>> - new_peer_ref->proc_name.epoch = procs[i]->proc_name.epoch; >>>> + >>>> ORTE_EPOCH_SET(new_peer_ref->proc_name.epoch,procs[i]->proc_name.epoch); >>>> >>>> 
opal_list_append(&ompi_crcp_bkmrk_pml_peer_refs, >>>> &(new_peer_ref->super)); >>>> } >>>> @@ -3237,13 +3237,11 @@ >>>> CREATE_NEW_MSG((*msg_ref), msg_type, >>>> count, ddt_size, tag, dest, comm, >>>> peer_ref->proc_name.jobid, >>>> - peer_ref->proc_name.vpid, >>>> - peer_ref->proc_name.epoch); >>>> + peer_ref->proc_name.vpid); >>>> } else { >>>> CREATE_NEW_MSG((*msg_ref), msg_type, >>>> count, ddt_size, tag, dest, comm, >>>> - ORTE_JOBID_INVALID, ORTE_VPID_INVALID, >>>> - ORTE_EPOCH_INVALID); >>>> + ORTE_JOBID_INVALID, ORTE_VPID_INVALID); >>>> } >>>> >>>> if( msg_type == COORD_MSG_TYPE_P_SEND || >>>> @@ -3377,7 +3375,7 @@ >>>> if( NULL == from_peer_ref && NULL != to_peer_ref ) { >>>> (*new_msg_ref)->proc_name.jobid = to_peer_ref->proc_name.jobid; >>>> (*new_msg_ref)->proc_name.vpid = to_peer_ref->proc_name.vpid; >>>> - (*new_msg_ref)->proc_name.epoch = to_peer_ref->proc_name.epoch; >>>> + >>>> ORTE_EPOCH_SET((*new_msg_ref)->proc_name.epoch,to_peer_ref->proc_name.epoch); >>>> } >>>> >>>> return exit_status; >>>> @@ -3808,8 +3806,7 @@ >>>> CREATE_NEW_DRAIN_MSG((*msg_ref), msg_type, >>>> count, NULL, tag, dest, comm, >>>> peer_ref->proc_name.jobid, >>>> - peer_ref->proc_name.vpid, >>>> - peer_ref->proc_name.epoch); >>>> + peer_ref->proc_name.vpid); >>>> >>>> (*msg_ref)->done = 0; >>>> (*msg_ref)->active = 0; >>>> @@ -5284,8 +5281,7 @@ >>>> */ >>>> peer_name.jobid = ORTE_PROC_MY_NAME->jobid; >>>> peer_name.vpid = peer_idx; >>>> - peer_name.epoch = ORTE_EPOCH_INVALID; >>>> - peer_name.epoch = orte_ess.proc_get_epoch(&peer_name); >>>> + ORTE_EPOCH_SET(peer_name.epoch,orte_ess.proc_get_epoch(&peer_name)); >>>> >>>> if( NULL == (peer_ref = find_peer(peer_name))) { >>>> opal_output(mca_crcp_bkmrk_component.super.output_handle, >>>> @@ -5346,8 +5342,7 @@ >>>> >>>> peer_name.jobid = ORTE_PROC_MY_NAME->jobid; >>>> peer_name.vpid = peer_idx; >>>> - peer_name.epoch = ORTE_EPOCH_INVALID; >>>> - peer_name.epoch = orte_ess.proc_get_epoch(&peer_name); >>>> + 
ORTE_EPOCH_SET(peer_name.epoch,orte_ess.proc_get_epoch(&peer_name)); >>>> >>>> if ( 0 > (ret = orte_rml.recv_buffer_nb(&peer_name, >>>> OMPI_CRCP_COORD_BOOKMARK_TAG, >>>> @@ -5529,7 +5524,8 @@ >>>> HOKE_DRAIN_ACK_MSG_REF_ALLOC(d_msg_ack, ret); >>>> d_msg_ack->peer.jobid = peer_ref->proc_name.jobid; >>>> d_msg_ack->peer.vpid = peer_ref->proc_name.vpid; >>>> - d_msg_ack->peer.epoch = peer_ref->proc_name.epoch; >>>> + ORTE_EPOCH_SET(d_msg_ack->peer.epoch,peer_ref->proc_name.epoch); >>>> + >>>> d_msg_ack->complete = false; >>>> opal_list_append(&drained_msg_ack_list, &(d_msg_ack->super)); >>>> OPAL_OUTPUT_VERBOSE((10, mca_crcp_bkmrk_component.super.output_handle, >>>> @@ -6169,8 +6165,7 @@ >>>> count, datatype_size, tag, rank, >>>> ompi_comm_lookup(comm_id), >>>> peer_ref->proc_name.jobid, >>>> - peer_ref->proc_name.vpid, >>>> - peer_ref->proc_name.epoch); >>>> + peer_ref->proc_name.vpid); >>>> >>>> traffic_message_create_drain_message(true, num_left_unresolved, >>>> peer_ref, >>>> >>>> Modified: trunk/ompi/mca/dpm/orte/dpm_orte.c >>>> ============================================================================== >>>> --- trunk/ompi/mca/dpm/orte/dpm_orte.c (original) >>>> +++ trunk/ompi/mca/dpm/orte/dpm_orte.c 2011-08-26 18:16:14 EDT (Fri, >>>> 26 Aug 2011) >>>> @@ -1130,7 +1130,7 @@ >>>> /* flag the identity of the remote proc */ >>>> carport.jobid = mev->sender.jobid; >>>> carport.vpid = mev->sender.vpid; >>>> - carport.epoch = mev->sender.epoch; >>>> + ORTE_EPOCH_SET(carport.epoch,mev->sender.epoch); >>>> >>>> /* release the event */ >>>> OBJ_RELEASE(mev); >>>> >>>> Modified: trunk/ompi/mca/pml/bfo/pml_bfo_failover.c >>>> ============================================================================== >>>> --- trunk/ompi/mca/pml/bfo/pml_bfo_failover.c (original) >>>> +++ trunk/ompi/mca/pml/bfo/pml_bfo_failover.c 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 2011) >>>> @@ -1,8 +1,5 @@ >>>> /* >>>> * Copyright (c) 2010 Oracle and/or its affiliates. All rights >>>> reserved. 
>>>> - * Copyright (c) 2004-2011 The University of Tennessee and The University >>>> - * of Tennessee Research Foundation. All rights >>>> - * reserved. >>>> * $COPYRIGHT$ >>>> * >>>> * Additional copyrights may follow >>>> @@ -398,13 +395,13 @@ >>>> (hdr->hdr_match.hdr_seq != (uint16_t)recvreq->req_msgseq)) { >>>> orte_proc.jobid = hdr->hdr_restart.hdr_jobid; >>>> orte_proc.vpid = hdr->hdr_restart.hdr_vpid; >>>> - orte_proc.epoch = hdr->hdr_restart.hdr_epoch; >>>> + >>>> ompi_proc = ompi_proc_find(&orte_proc); >>>> opal_output_verbose(20, mca_pml_bfo_output, >>>> "RNDVRESTARTNOTIFY: received: does not match >>>> request, sending NACK back " >>>> "PML:req=%d,hdr=%d CTX:req=%d,hdr=%d >>>> SRC:req=%d,hdr=%d " >>>> "RQS:req=%d,hdr=%d src_req=%p, dst_req=%p, >>>> peer=%d, hdr->hdr_jobid=%d, " >>>> - "hdr->hdr_vpid=%d, hdr->hdr_epoch=%d, >>>> ompi_proc->proc_hostname=%s", >>>> + "hdr->hdr_vpid=%d, >>>> ompi_proc->proc_hostname=%s", >>>> (uint16_t)recvreq->req_msgseq, >>>> hdr->hdr_match.hdr_seq, >>>> recvreq->req_recv.req_base.req_comm->c_contextid, >>>> hdr->hdr_match.hdr_ctx, >>>> >>>> recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE, >>>> @@ -413,7 +410,7 @@ >>>> recvreq->remote_req_send.pval, (void *)recvreq, >>>> >>>> recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE, >>>> hdr->hdr_restart.hdr_jobid, >>>> hdr->hdr_restart.hdr_vpid, >>>> - hdr->hdr_restart.hdr_epoch, >>>> ompi_proc->proc_hostname); >>>> + ompi_proc->proc_hostname); >>>> mca_pml_bfo_recv_request_rndvrestartnack(des, ompi_proc, false); >>>> return; >>>> } >>>> @@ -715,7 +712,6 @@ >>>> restart->hdr_dst_rank = sendreq->req_send.req_base.req_peer; /* Needed >>>> for NACKs */ >>>> restart->hdr_jobid = ORTE_PROC_MY_NAME->jobid; >>>> restart->hdr_vpid = ORTE_PROC_MY_NAME->vpid; >>>> - restart->hdr_epoch = ORTE_PROC_MY_NAME->epoch; >>>> >>>> bfo_hdr_hton(restart, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY, proc); >>>> >>>> >>>> Modified: trunk/ompi/mca/pml/bfo/pml_bfo_hdr.h >>>> 
============================================================================== >>>> --- trunk/ompi/mca/pml/bfo/pml_bfo_hdr.h (original) >>>> +++ trunk/ompi/mca/pml/bfo/pml_bfo_hdr.h 2011-08-26 18:16:14 EDT (Fri, >>>> 26 Aug 2011) >>>> @@ -2,9 +2,6 @@ >>>> * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana >>>> * University Research and Technology >>>> * Corporation. All rights reserved. >>>> - * Copyright (c) 2004-2011 The University of Tennessee and The University >>>> - * of Tennessee Research Foundation. All rights >>>> - * reserved. >>>> * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, >>>> * University of Stuttgart. All rights reserved. >>>> * Copyright (c) 2004-2005 The Regents of the University of California. >>>> @@ -415,7 +412,6 @@ >>>> int32_t hdr_dst_rank; /**< needed to send NACK */ >>>> uint32_t hdr_jobid; /**< needed to send NACK */ >>>> uint32_t hdr_vpid; /**< needed to send NACK */ >>>> - uint32_t hdr_epoch; /**< needed to send NACK */ >>>> }; >>>> typedef struct mca_pml_bfo_restart_hdr_t mca_pml_bfo_restart_hdr_t; >>>> >>>> @@ -428,7 +424,6 @@ >>>> (h).hdr_dst_rank = ntohl((h).hdr_dst_rank); \ >>>> (h).hdr_jobid = ntohl((h).hdr_jobid); \ >>>> (h).hdr_vpid = ntohl((h).hdr_vpid); \ >>>> - (h).hdr_epoch = ntohl((h).hdr_epoch); \ >>>> } while (0) >>>> >>>> #define MCA_PML_BFO_RESTART_HDR_HTON(h) \ >>>> @@ -437,7 +432,6 @@ >>>> (h).hdr_dst_rank = htonl((h).hdr_dst_rank); \ >>>> (h).hdr_jobid = htonl((h).hdr_jobid); \ >>>> (h).hdr_vpid = htonl((h).hdr_vpid); \ >>>> - (h).hdr_epoch = htonl((h).hdr_epoch); \ >>>> } while (0) >>>> >>>> #endif /* PML_BFO */ >>>> >>>> Modified: trunk/ompi/proc/proc.c >>>> ============================================================================== >>>> --- trunk/ompi/proc/proc.c (original) >>>> +++ trunk/ompi/proc/proc.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug 2011) >>>> @@ -108,7 +108,8 @@ >>>> >>>> proc->proc_name.jobid = ORTE_PROC_MY_NAME->jobid; >>>> proc->proc_name.vpid = i; 
>>>> - proc->proc_name.epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(proc->proc_name.epoch,ORTE_EPOCH_MIN); >>>> + >>>> if (i == ORTE_PROC_MY_NAME->vpid) { >>>> ompi_proc_local_proc = proc; >>>> proc->proc_flags = OPAL_PROC_ALL_LOCAL; >>>> @@ -362,8 +363,7 @@ >>>> >>>> /* Does not change: proc->proc_name.vpid */ >>>> proc->proc_name.jobid = ORTE_PROC_MY_NAME->jobid; >>>> - proc->proc_name.epoch = ORTE_EPOCH_INVALID; >>>> - proc->proc_name.epoch = orte_ess.proc_get_epoch(&proc->proc_name); >>>> + >>>> ORTE_EPOCH_SET(proc->proc_name.epoch,orte_ess.proc_get_epoch(&proc->proc_name)); >>>> >>>> /* Make sure to clear the local flag before we set it below */ >>>> proc->proc_flags = 0; >>>> >>>> Modified: trunk/opal/config/opal_configure_options.m4 >>>> ============================================================================== >>>> --- trunk/opal/config/opal_configure_options.m4 (original) >>>> +++ trunk/opal/config/opal_configure_options.m4 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 2011) >>>> @@ -416,6 +416,14 @@ >>>> AM_CONDITIONAL(WANT_FT_CR, test "$opal_want_ft_cr" = "1") >>>> >>>> # >>>> +# Compile in resilient runtime code >>>> +# >>>> +AC_ARG_ENABLE(resilient-orte, >>>> + [AC_HELP_STRING([--enable-resilient-orte], [Enable the resilient >>>> runtime code.])]) >>>> +AM_CONDITIONAL(ORTE_RESIL_ORTE, [test "$enable_resilient_orte" = "yes"]) >>>> +AM_CONDITIONAL(ORTE_ENABLE_EPOCH, [test "$enable_resilient_orte" = "yes"]) >>>> + >>>> +# >>>> # Do we want to install binaries? 
>>>> # >>>> AC_ARG_ENABLE([binaries], >>>> >>>> Modified: trunk/orte/include/orte/types.h >>>> ============================================================================== >>>> --- trunk/orte/include/orte/types.h (original) >>>> +++ trunk/orte/include/orte/types.h 2011-08-26 18:16:14 EDT (Fri, >>>> 26 Aug 2011) >>>> @@ -81,24 +81,43 @@ >>>> #define ORTE_VPID_T OPAL_UINT32 >>>> #define ORTE_VPID_MAX UINT32_MAX-2 >>>> #define ORTE_VPID_MIN 0 >>>> + >>>> +#if ORTE_ENABLE_EPOCH >>>> typedef uint32_t orte_epoch_t; >>>> #define ORTE_EPOCH_T OPAL_UINT32 >>>> #define ORTE_EPOCH_MAX UINT32_MAX-2 >>>> #define ORTE_EPOCH_MIN 0 >>>> +#endif >>>> >>>> +#if ORTE_ENABLE_EPOCH >>>> #define ORTE_PROCESS_NAME_HTON(n) \ >>>> do { \ >>>> n.jobid = htonl(n.jobid); \ >>>> n.vpid = htonl(n.vpid); \ >>>> n.epoch = htonl(n.epoch); \ >>>> } while (0) >>>> +#else >>>> +#define ORTE_PROCESS_NAME_HTON(n) \ >>>> +do { \ >>>> + n.jobid = htonl(n.jobid); \ >>>> + n.vpid = htonl(n.vpid); \ >>>> +} while (0) >>>> +#endif >>>> >>>> +#if ORTE_ENABLE_EPOCH >>>> #define ORTE_PROCESS_NAME_NTOH(n) \ >>>> do { \ >>>> n.jobid = ntohl(n.jobid); \ >>>> n.vpid = ntohl(n.vpid); \ >>>> n.epoch = ntohl(n.epoch); \ >>>> } while (0) >>>> +#else >>>> +#define ORTE_PROCESS_NAME_NTOH(n) \ >>>> +do { \ >>>> + n.jobid = ntohl(n.jobid); \ >>>> + n.vpid = ntohl(n.vpid); \ >>>> +} while (0) >>>> +#endif >>>> >>>> #define ORTE_NAME_ARGS(n) \ >>>> (unsigned long) ((NULL == n) ? (unsigned long)ORTE_JOBID_INVALID : >>>> (unsigned long)(n)->jobid), \ >>>> @@ -127,6 +146,7 @@ >>>> struct orte_process_name_t { >>>> orte_jobid_t jobid; /**< Job number */ >>>> orte_vpid_t vpid; /**< Process id - equivalent to rank */ >>>> +#if ORTE_ENABLE_EPOCH >>>> orte_epoch_t epoch; /**< Epoch - used to measure the generation of a >>>> recovered process. 
>>>> * The epoch will start at ORTE_EPOCH_MIN and >>>> * increment every time the process is detected >>>> as >>>> @@ -135,6 +155,7 @@ >>>> * processes that did not directly detect the >>>> * failure to increment their epochs. >>>> */ >>>> +#endif >>>> }; >>>> typedef struct orte_process_name_t orte_process_name_t; >>>> >>>> @@ -157,7 +178,10 @@ >>>> #define ORTE_NAME (OPAL_DSS_ID_DYNAMIC + 2) /**< an >>>> orte_process_name_t */ >>>> #define ORTE_VPID (OPAL_DSS_ID_DYNAMIC + 3) /**< a vpid >>>> */ >>>> #define ORTE_JOBID (OPAL_DSS_ID_DYNAMIC + 4) /**< a >>>> jobid */ >>>> + >>>> +#if ORTE_ENABLE_EPOCH >>>> #define ORTE_EPOCH (OPAL_DSS_ID_DYNAMIC + 5) /**< an >>>> epoch */ >>>> +#endif >>>> >>>> #if !ORTE_DISABLE_FULL_SUPPORT >>>> /* State-related types */ >>>> >>>> Modified: trunk/orte/mca/db/daemon/db_daemon.c >>>> ============================================================================== >>>> --- trunk/orte/mca/db/daemon/db_daemon.c (original) >>>> +++ trunk/orte/mca/db/daemon/db_daemon.c 2011-08-26 18:16:14 EDT (Fri, >>>> 26 Aug 2011) >>>> @@ -386,7 +386,7 @@ >>>> dat = OBJ_NEW(orte_db_data_t); >>>> dat->name.jobid = sender->jobid; >>>> dat->name.vpid = sender->vpid; >>>> - dat->name.epoch= sender->epoch; >>>> + ORTE_EPOCH_SET(dat->name.epoch,sender->epoch); >>>> dat->key = key; >>>> count=1; >>>> opal_dss.unpack(buf, &dat->size, &count, OPAL_INT32); >>>> >>>> Modified: trunk/orte/mca/errmgr/app/errmgr_app.c >>>> ============================================================================== >>>> --- trunk/orte/mca/errmgr/app/errmgr_app.c (original) >>>> +++ trunk/orte/mca/errmgr/app/errmgr_app.c 2011-08-26 18:16:14 EDT (Fri, >>>> 26 Aug 2011) >>>> @@ -82,8 +82,10 @@ >>>> NULL, >>>> NULL, >>>> NULL, >>>> - orte_errmgr_base_register_migration_warning, >>>> - orte_errmgr_base_set_fault_callback >>>> + orte_errmgr_base_register_migration_warning >>>> +#if ORTE_RESIL_ORTE >>>> + ,orte_errmgr_base_set_fault_callback >>>> +#endif >>>> }; >>>> >>>> 
/************************ >>>> @@ -93,18 +95,23 @@ >>>> { >>>> int ret = ORTE_SUCCESS; >>>> >>>> +#if ORTE_RESIL_ORTE >>>> ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, >>>> ORTE_RML_TAG_EPOCH_CHANGE, >>>> ORTE_RML_PERSISTENT, >>>> epoch_change_recv, >>>> NULL); >>>> +#endif >>>> + >>>> return ret; >>>> } >>>> >>>> static int finalize(void) >>>> { >>>> +#if ORTE_RESIL_ORTE >>>> orte_rml.recv_cancel(ORTE_NAME_WILDCARD, >>>> ORTE_RML_TAG_EPOCH_CHANGE); >>>> +#endif >>>> >>>> return ORTE_SUCCESS; >>>> } >>>> @@ -151,6 +158,7 @@ >>>> return ORTE_SUCCESS; >>>> } >>>> >>>> +#if ORTE_RESIL_ORTE >>>> void epoch_change_recv(int status, >>>> orte_process_name_t *sender, >>>> opal_buffer_t *buffer, >>>> @@ -209,15 +217,20 @@ >>>> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); >>>> >>>> (*fault_cbfunc)(procs); >>>> + } else if (NULL == fault_cbfunc) { >>>> + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, >>>> + "%s errmgr:app Calling fault callback failed (NULL >>>> pointer)!", >>>> + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); >>>> } else { >>>> OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, >>>> - "%s errmgr:app Calling fault callback failed!", >>>> + "%s errmgr:app Calling fault callback failed >>>> (num_dead <= 0)!", >>>> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); >>>> } >>>> >>>> free(proc); >>>> OBJ_RELEASE(procs); >>>> } >>>> +#endif >>>> >>>> static int orte_errmgr_app_abort_peers(orte_process_name_t *procs, >>>> orte_std_cntr_t num_procs) >>>> { >>>> >>>> Modified: trunk/orte/mca/errmgr/base/errmgr_base_fns.c >>>> ============================================================================== >>>> --- trunk/orte/mca/errmgr/base/errmgr_base_fns.c (original) >>>> +++ trunk/orte/mca/errmgr/base/errmgr_base_fns.c 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 2011) >>>> @@ -97,13 +97,13 @@ >>>> { >>>> item->proc_name.vpid = ORTE_VPID_INVALID; >>>> item->proc_name.jobid = ORTE_JOBID_INVALID; >>>> - item->proc_name.epoch = ORTE_EPOCH_MIN; >>>> + 
ORTE_EPOCH_SET(item->proc_name.epoch,ORTE_EPOCH_MIN); >>>> } >>>> >>>> void orte_errmgr_predicted_proc_destruct( orte_errmgr_predicted_proc_t >>>> *item) >>>> { >>>> item->proc_name.vpid = ORTE_VPID_INVALID; >>>> - item->proc_name.epoch = ORTE_EPOCH_INVALID; >>>> + ORTE_EPOCH_SET(item->proc_name.epoch,ORTE_EPOCH_INVALID); >>>> item->proc_name.jobid = ORTE_JOBID_INVALID; >>>> } >>>> >>>> @@ -139,13 +139,13 @@ >>>> void orte_errmgr_predicted_map_construct(orte_errmgr_predicted_map_t *item) >>>> { >>>> item->proc_name.vpid = ORTE_VPID_INVALID; >>>> - item->proc_name.epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(item->proc_name.epoch,ORTE_EPOCH_MIN); >>>> item->proc_name.jobid = ORTE_JOBID_INVALID; >>>> >>>> item->node_name = NULL; >>>> >>>> item->map_proc_name.vpid = ORTE_VPID_INVALID; >>>> - item->map_proc_name.epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(item->map_proc_name.epoch,ORTE_EPOCH_MIN); >>>> item->map_proc_name.jobid = ORTE_JOBID_INVALID; >>>> >>>> item->map_node_name = NULL; >>>> @@ -156,7 +156,7 @@ >>>> void orte_errmgr_predicted_map_destruct( orte_errmgr_predicted_map_t *item) >>>> { >>>> item->proc_name.vpid = ORTE_VPID_INVALID; >>>> - item->proc_name.epoch = ORTE_EPOCH_INVALID; >>>> + ORTE_EPOCH_SET(item->proc_name.epoch,ORTE_EPOCH_INVALID); >>>> item->proc_name.jobid = ORTE_JOBID_INVALID; >>>> >>>> if( NULL != item->node_name ) { >>>> @@ -165,7 +165,7 @@ >>>> } >>>> >>>> item->map_proc_name.vpid = ORTE_VPID_INVALID; >>>> - item->map_proc_name.epoch = ORTE_EPOCH_INVALID; >>>> + ORTE_EPOCH_SET(item->map_proc_name.epoch,ORTE_EPOCH_INVALID); >>>> item->map_proc_name.jobid = ORTE_JOBID_INVALID; >>>> >>>> if( NULL != item->map_node_name ) { >>>> >>>> Modified: trunk/orte/mca/errmgr/base/errmgr_base_tool.c >>>> ============================================================================== >>>> --- trunk/orte/mca/errmgr/base/errmgr_base_tool.c (original) >>>> +++ trunk/orte/mca/errmgr/base/errmgr_base_tool.c 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 2011) 
>>>> @@ -267,7 +267,7 @@ >>>> */ >>>> errmgr_cmdline_sender.jobid = ORTE_JOBID_INVALID; >>>> errmgr_cmdline_sender.vpid = ORTE_VPID_INVALID; >>>> - errmgr_cmdline_sender.epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(errmgr_cmdline_sender.epoch,ORTE_EPOCH_MIN); >>>> if (ORTE_SUCCESS != (ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, >>>> ORTE_RML_TAG_MIGRATE, >>>> 0, >>>> @@ -379,14 +379,14 @@ >>>> if( OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, >>>> ORTE_NAME_INVALID, &errmgr_cmdline_sender) ) { >>>> swap_dest.jobid = errmgr_cmdline_sender.jobid; >>>> swap_dest.vpid = errmgr_cmdline_sender.vpid; >>>> - swap_dest.epoch = errmgr_cmdline_sender.epoch; >>>> + ORTE_EPOCH_SET(swap_dest.epoch,errmgr_cmdline_sender.epoch); >>>> >>>> errmgr_cmdline_sender = *sender; >>>> >>>> orte_errmgr_base_migrate_update(ORTE_ERRMGR_MIGRATE_STATE_ERR_INPROGRESS); >>>> >>>> errmgr_cmdline_sender.jobid = swap_dest.jobid; >>>> errmgr_cmdline_sender.vpid = swap_dest.vpid; >>>> - errmgr_cmdline_sender.epoch = swap_dest.epoch; >>>> + ORTE_EPOCH_SET(errmgr_cmdline_sender.epoch,swap_dest.epoch); >>>> >>>> goto cleanup; >>>> } >>>> >>>> Modified: trunk/orte/mca/errmgr/hnp/errmgr_hnp.c >>>> ============================================================================== >>>> --- trunk/orte/mca/errmgr/hnp/errmgr_hnp.c (original) >>>> +++ trunk/orte/mca/errmgr/hnp/errmgr_hnp.c 2011-08-26 18:16:14 EDT (Fri, >>>> 26 Aug 2011) >>>> @@ -53,6 +53,7 @@ >>>> #include "orte/runtime/orte_globals.h" >>>> #include "orte/runtime/orte_locks.h" >>>> #include "orte/runtime/orte_quit.h" >>>> +#include "orte/runtime/data_type_support/orte_dt_support.h" >>>> >>>> #include "orte/mca/errmgr/errmgr.h" >>>> #include "orte/mca/errmgr/base/base.h" >>>> @@ -83,9 +84,11 @@ >>>> orte_errmgr_hnp_global_suggest_map_targets, >>>> /* FT Event hook */ >>>> orte_errmgr_hnp_global_ft_event, >>>> - orte_errmgr_base_register_migration_warning, >>>> + orte_errmgr_base_register_migration_warning >>>> +#if 
ORTE_RESIL_ORTE >>>> /* Set the callback */ >>>> - orte_errmgr_base_set_fault_callback >>>> + ,orte_errmgr_base_set_fault_callback >>>> +#endif >>>> }; >>>> >>>> >>>> @@ -97,14 +100,16 @@ >>>> static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t >>>> jobstate, >>>> orte_proc_state_t state, >>>> orte_exit_code_t exit_code); >>>> static void check_job_complete(orte_job_t *jdata); >>>> -static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t >>>> epoch); >>>> +static void killprocs(orte_jobid_t job, orte_vpid_t vpid); >>>> static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc, >>>> orte_proc_state_t state, orte_exit_code_t exit_code); >>>> static orte_odls_child_t* proc_is_local(orte_process_name_t *proc); >>>> +#if ORTE_RESIL_ORTE >>>> static int send_to_local_applications(opal_pointer_array_t *dead_names); >>>> static void failure_notification(int status, orte_process_name_t* sender, >>>> opal_buffer_t *buffer, orte_rml_tag_t tag, >>>> void* cbdata); >>>> +#endif >>>> >>>> /************************ >>>> * API Definitions >>>> @@ -380,16 +385,21 @@ >>>> **********************/ >>>> int orte_errmgr_hnp_base_global_init(void) >>>> { >>>> - int ret; >>>> + int ret = ORTE_SUCCESS; >>>> >>>> +#if ORTE_RESIL_ORTE >>>> ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, >>>> ORTE_RML_TAG_FAILURE_NOTICE, >>>> ORTE_RML_PERSISTENT, failure_notification, >>>> NULL); >>>> +#endif >>>> + >>>> return ret; >>>> } >>>> >>>> int orte_errmgr_hnp_base_global_finalize(void) >>>> { >>>> +#if ORTE_RESIL_ORTE >>>> orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_FAILURE_NOTICE); >>>> +#endif >>>> >>>> return ORTE_SUCCESS; >>>> } >>>> @@ -406,6 +416,7 @@ >>>> orte_odls_child_t *child; >>>> int rc; >>>> orte_app_context_t *app; >>>> + orte_proc_t *pdat; >>>> >>>> OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, >>>> "%s errmgr:hnp: job %s reported state %s" >>>> @@ -538,7 +549,7 @@ >>>> ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED, >>>> 
exit_code); >>>> /* order all local procs for this job to be killed */ >>>> - killprocs(jdata->jobid, ORTE_VPID_WILDCARD, >>>> ORTE_EPOCH_WILDCARD); >>>> + killprocs(jdata->jobid, ORTE_VPID_WILDCARD); >>>> check_job_complete(jdata); /* set the local proc states */ >>>> /* the job object for this job will have been NULL'd >>>> * in the array if the job was solely local. If it isn't >>>> @@ -550,7 +561,7 @@ >>>> break; >>>> case ORTE_JOB_STATE_COMM_FAILED: >>>> /* order all local procs for this job to be killed */ >>>> - killprocs(jdata->jobid, ORTE_VPID_WILDCARD, >>>> ORTE_EPOCH_WILDCARD); >>>> + killprocs(jdata->jobid, ORTE_VPID_WILDCARD); >>>> check_job_complete(jdata); /* set the local proc states */ >>>> /* the job object for this job will have been NULL'd >>>> * in the array if the job was solely local. If it isn't >>>> @@ -562,7 +573,7 @@ >>>> break; >>>> case ORTE_JOB_STATE_HEARTBEAT_FAILED: >>>> /* order all local procs for this job to be killed */ >>>> - killprocs(jdata->jobid, ORTE_VPID_WILDCARD, >>>> ORTE_EPOCH_WILDCARD); >>>> + killprocs(jdata->jobid, ORTE_VPID_WILDCARD); >>>> check_job_complete(jdata); /* set the local proc states */ >>>> /* the job object for this job will have been NULL'd >>>> * in the array if the job was solely local. 
If it isn't >>>> @@ -632,10 +643,6 @@ >>>> } >>>> } >>>> >>>> - if (ORTE_PROC_STATE_ABORTED_BY_SIG == state) { >>>> - exit_code = 0; >>>> - } >>>> - >>>> orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); >>>> check_job_complete(jdata); /* need to set the job state */ >>>> /* the job object for this job will have been NULL'd >>>> @@ -679,7 +686,7 @@ >>>> >>>> case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: >>>> if (jdata->enable_recovery) { >>>> - killprocs(proc->jobid, proc->vpid, proc->epoch); >>>> + killprocs(proc->jobid, proc->vpid); >>>> /* is this a local proc */ >>>> if (NULL != (child = proc_is_local(proc))) { >>>> /* local proc - see if it has reached its restart limit */ >>>> @@ -778,18 +785,37 @@ >>>> opal_output(0, "%s UNABLE TO RELOCATE PROCS FROM >>>> FAILED DAEMON %s", >>>> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), >>>> ORTE_NAME_PRINT(proc)); >>>> /* kill all local procs */ >>>> - killprocs(ORTE_JOBID_WILDCARD, >>>> ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD); >>>> + killprocs(ORTE_JOBID_WILDCARD, >>>> ORTE_VPID_WILDCARD); >>>> /* kill all jobs */ >>>> hnp_abort(ORTE_JOBID_WILDCARD, exit_code); >>>> /* check if all is complete so we can terminate */ >>>> check_job_complete(jdata); >>>> } >>>> } else { >>>> +#if !ORTE_RESIL_ORTE >>>> + if (NULL == (pdat = >>>> (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) { >>>> + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); >>>> + orte_show_help("help-orte-errmgr-hnp.txt", >>>> "errmgr-hnp:daemon-died", true, >>>> + ORTE_VPID_PRINT(proc->vpid), >>>> "Unknown"); >>>> + } else { >>>> + orte_show_help("help-orte-errmgr-hnp.txt", >>>> "errmgr-hnp:daemon-died", true, >>>> + ORTE_VPID_PRINT(proc->vpid), >>>> + (NULL == pdat->node) ? "Unknown" : >>>> + ((NULL == pdat->node->name) ? >>>> "Unknown" : pdat->node->name)); >>>> + } >>>> +#endif >>>> if (ORTE_SUCCESS != >>>> orte_errmgr_hnp_record_dead_process(proc)) { >>>> /* The process is already dead so don't keep trying >>>> to do >>>> * this stuff. 
*/ >>>> return ORTE_SUCCESS; >>>> } >>>> + >>>> +#if !ORTE_RESIL_ORTE >>>> + /* kill all local procs */ >>>> + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); >>>> + /* kill all jobs */ >>>> + hnp_abort(ORTE_JOBID_WILDCARD, exit_code); >>>> +#endif >>>> /* We'll check if the job was complete when we get the >>>> * message back from the HNP notifying us of the dead >>>> * process */ >>>> @@ -805,7 +831,7 @@ >>>> } else { >>>> orte_errmgr_hnp_record_dead_process(proc); >>>> /* kill all local procs */ >>>> - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, >>>> ORTE_EPOCH_WILDCARD); >>>> + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); >>>> /* kill all jobs */ >>>> hnp_abort(ORTE_JOBID_WILDCARD, exit_code); >>>> return ORTE_ERR_UNRECOVERABLE; >>>> @@ -824,6 +850,7 @@ >>>> return ORTE_SUCCESS; >>>> } >>>> >>>> +#if ORTE_RESIL_ORTE >>>> static void failure_notification(int status, orte_process_name_t* sender, >>>> opal_buffer_t *buffer, orte_rml_tag_t tag, >>>> void* cbdata) >>>> @@ -984,6 +1011,7 @@ >>>> >>>> OBJ_RELEASE(dead_names); >>>> } >>>> +#endif >>>> >>>> /***************** >>>> * Local Functions >>>> @@ -1354,7 +1382,6 @@ >>>> ORTE_UPDATE_EXIT_STATUS(proc->exit_code); >>>> } >>>> break; >>>> -#if 0 >>>> case ORTE_PROC_STATE_ABORTED_BY_SIG: >>>> OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, >>>> "%s errmgr:hnp:check_job_completed proc %s >>>> aborted by signal", >>>> @@ -1370,7 +1397,6 @@ >>>> ORTE_UPDATE_EXIT_STATUS(proc->exit_code); >>>> } >>>> break; >>>> -#endif >>>> case ORTE_PROC_STATE_TERM_WO_SYNC: >>>> OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, >>>> "%s errmgr:hnp:check_job_completed proc %s >>>> terminated without sync", >>>> @@ -1393,7 +1419,6 @@ >>>> } >>>> break; >>>> case ORTE_PROC_STATE_COMM_FAILED: >>>> -#if 1 >>>> if (!jdata->abort) { >>>> jdata->state = ORTE_JOB_STATE_COMM_FAILED; >>>> /* point to the lowest rank to cause the problem */ >>>> @@ -1403,7 +1428,6 @@ >>>> jdata->abort = true; >>>> 
ORTE_UPDATE_EXIT_STATUS(proc->exit_code); >>>> } >>>> -#endif >>>> break; >>>> case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: >>>> if (!jdata->abort) { >>>> @@ -1530,9 +1554,6 @@ >>>> */ >>>> CHECK_DAEMONS: >>>> if (jdata == NULL || jdata->jobid == ORTE_PROC_MY_NAME->jobid) { >>>> -#if 0 >>>> - if ((jdata->num_procs - 1) <= jdata->num_terminated) { /* >>>> Subtract one for the HNP */ >>>> -#endif >>>> if (0 == orte_routed.num_routes()) { >>>> /* orteds are done! */ >>>> OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, >>>> @@ -1696,7 +1717,7 @@ >>>> } >>>> } >>>> >>>> -static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t >>>> epoch) >>>> +static void killprocs(orte_jobid_t job, orte_vpid_t vpid) >>>> { >>>> opal_pointer_array_t cmd; >>>> orte_proc_t proc; >>>> @@ -1707,7 +1728,9 @@ >>>> orte_sensor.stop(job); >>>> } >>>> >>>> - if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid && >>>> ORTE_EPOCH_WILDCARD == epoch) { >>>> + if (ORTE_JOBID_WILDCARD == job >>>> + && ORTE_VPID_WILDCARD == vpid >>>> + && ORTE_EPOCH_CMP(ORTE_EPOCH_WILDCARD,epoch)) { >>>> if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) { >>>> ORTE_ERROR_LOG(rc); >>>> } >>>> @@ -1718,7 +1741,7 @@ >>>> OBJ_CONSTRUCT(&proc, orte_proc_t); >>>> proc.name.jobid = job; >>>> proc.name.vpid = vpid; >>>> - proc.name.epoch = epoch; >>>> + ORTE_EPOCH_SET(proc.name.epoch,epoch); >>>> opal_pointer_array_add(&cmd, &proc); >>>> if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) { >>>> ORTE_ERROR_LOG(rc); >>>> @@ -1913,13 +1936,15 @@ >>>> } >>>> >>>> if (NULL != (pdat = >>>> (orte_proc_t*)opal_pointer_array_get_item(jdat->procs, proc->vpid)) && >>>> - ORTE_PROC_STATE_TERMINATED < pdat->state) { >>>> + ORTE_PROC_STATE_TERMINATED > pdat->state) { >>>> >>>> +#if ORTE_ENABLE_EPOCH >>>> /* Make sure that the epochs match. */ >>>> if (proc->epoch != pdat->name.epoch) { >>>> opal_output(1, "The epoch does not match the current epoch. 
>>>> Throwing the request out."); >>>> return ORTE_SUCCESS; >>>> } >>>> +#endif >>>> >>>> dead_names = OBJ_NEW(opal_pointer_array_t); >>>> >>>> @@ -1935,6 +1960,7 @@ >>>> } >>>> } >>>> >>>> +#if ORTE_RESIL_ORTE >>>> if (!mca_errmgr_hnp_component.term_in_progress) { >>>> /* >>>> * Send a message to the other daemons so they know that a daemon >>>> has >>>> @@ -1949,7 +1975,7 @@ >>>> OBJ_RELEASE(buffer); >>>> } else { >>>> >>>> - /* Iterate of the list of dead procs and send them along >>>> with >>>> + /* Iterate over the list of dead procs and send them >>>> along with >>>> * the rest. The HNP needs this info so it can tell the other >>>> * ORTEDs and they can inform the appropriate applications. >>>> */ >>>> @@ -1973,6 +1999,9 @@ >>>> } else { >>>> orte_errmgr_hnp_global_mark_processes_as_dead(dead_names); >>>> } >>>> +#else >>>> + orte_errmgr_hnp_global_mark_processes_as_dead(dead_names); >>>> +#endif >>>> } >>>> >>>> return ORTE_SUCCESS; >>>> @@ -2011,6 +2040,7 @@ >>>> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), >>>> ORTE_NAME_PRINT(&pdat->name))); >>>> >>>> +#if ORTE_RESIL_ORTE >>>> /* Make sure the epochs match, if not it probably means that we >>>> * already reported this failure. */ >>>> if (name_item->epoch != pdat->name.epoch) { >>>> @@ -2018,6 +2048,7 @@ >>>> } >>>> >>>> orte_util_set_epoch(name_item, name_item->epoch + 1); >>>> +#endif >>>> >>>> /* Remove it from the job array */ >>>> opal_pointer_array_set_item(jdat->procs, name_item->vpid, NULL); >>>> @@ -2034,6 +2065,7 @@ >>>> >>>> OBJ_RELEASE(pdat); >>>> >>>> +#if ORTE_RESIL_ORTE >>>> /* Create a new proc object that will keep track of the epoch >>>> * information */ >>>> pdat = OBJ_NEW(orte_proc_t); >>>> @@ -2041,14 +2073,15 @@ >>>> pdat->name.vpid = name_item->vpid; >>>> pdat->name.epoch = name_item->epoch + 1; >>>> >>>> - /* Set the state as terminated so we'll know the process isn't >>>> - * actually there. 
*/ >>>> - pdat->state = ORTE_PROC_STATE_TERMINATED; >>>> - >>>> opal_pointer_array_set_item(jdat->procs, name_item->vpid, pdat); >>>> jdat->num_procs++; >>>> jdat->num_terminated++; >>>> +#endif >>>> + /* Set the state as terminated so we'll know the process isn't >>>> + * actually there. */ >>>> + pdat->state = ORTE_PROC_STATE_TERMINATED; >>>> } else { >>>> +#if ORTE_RESIL_ORTE >>>> opal_output(0, "Proc data not found for %s", >>>> ORTE_NAME_PRINT(name_item)); >>>> /* Create a new proc object that will keep track of the epoch >>>> * information */ >>>> @@ -2064,11 +2097,13 @@ >>>> opal_pointer_array_set_item(jdat->procs, name_item->vpid, pdat); >>>> jdat->num_procs++; >>>> jdat->num_terminated++; >>>> +#endif >>>> } >>>> >>>> check_job_complete(jdat); >>>> } >>>> >>>> +#if ORTE_RESIL_ORTE >>>> if (!orte_orteds_term_ordered) { >>>> /* Need to update the orted routing module. */ >>>> orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid); >>>> @@ -2077,10 +2112,12 @@ >>>> (*fault_cbfunc)(dead_procs); >>>> } >>>> } >>>> +#endif >>>> >>>> return ORTE_SUCCESS; >>>> } >>>> >>>> +#if ORTE_RESIL_ORTE >>>> int send_to_local_applications(opal_pointer_array_t *dead_names) { >>>> opal_buffer_t *buf; >>>> int ret = ORTE_SUCCESS; >>>> @@ -2121,3 +2158,5 @@ >>>> >>>> return ret; >>>> } >>>> +#endif >>>> + >>>> >>>> Modified: trunk/orte/mca/errmgr/hnp/errmgr_hnp_autor.c >>>> ============================================================================== >>>> --- trunk/orte/mca/errmgr/hnp/errmgr_hnp_autor.c (original) >>>> +++ trunk/orte/mca/errmgr/hnp/errmgr_hnp_autor.c 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 2011) >>>> @@ -522,7 +522,7 @@ >>>> wp_item = OBJ_NEW(errmgr_autor_wp_item_t); >>>> wp_item->name.jobid = proc->jobid; >>>> wp_item->name.vpid = proc->vpid; >>>> - wp_item->name.epoch = proc->epoch; >>>> + ORTE_EPOCH_SET(wp_item->name.epoch,proc->epoch); >>>> wp_item->state = state; >>>> >>>> opal_list_append(procs_pending_recovery, &(wp_item->super)); >>>> @@ -626,7 
+626,7 @@ >>>> { >>>> wp->name.jobid = ORTE_JOBID_INVALID; >>>> wp->name.vpid = ORTE_VPID_INVALID; >>>> - wp->name.epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(wp->name.epoch,ORTE_EPOCH_MIN); >>>> >>>> wp->state = 0; >>>> } >>>> @@ -635,7 +635,7 @@ >>>> { >>>> wp->name.jobid = ORTE_JOBID_INVALID; >>>> wp->name.vpid = ORTE_VPID_INVALID; >>>> - wp->name.epoch = ORTE_EPOCH_INVALID; >>>> + ORTE_EPOCH_SET(wp->name.epoch,ORTE_EPOCH_INVALID); >>>> >>>> wp->state = 0; >>>> } >>>> >>>> Modified: trunk/orte/mca/errmgr/hnp/errmgr_hnp_crmig.c >>>> ============================================================================== >>>> --- trunk/orte/mca/errmgr/hnp/errmgr_hnp_crmig.c (original) >>>> +++ trunk/orte/mca/errmgr/hnp/errmgr_hnp_crmig.c 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 2011) >>>> @@ -750,7 +750,7 @@ >>>> close_iof_stdin = true; >>>> iof_name.jobid = proc->name.jobid; >>>> iof_name.vpid = proc->name.vpid; >>>> - iof_name.epoch = proc->name.epoch; >>>> + ORTE_EPOCH_SET(iof_name.epoch,proc->name.epoch); >>>> } >>>> } >>>> } >>>> @@ -807,7 +807,7 @@ >>>> close_iof_stdin = true; >>>> iof_name.jobid = proc->name.jobid; >>>> iof_name.vpid = proc->name.vpid; >>>> - iof_name.epoch = proc->name.epoch; >>>> + ORTE_EPOCH_SET(iof_name.epoch,proc->name.epoch); >>>> } >>>> } >>>> } >>>> @@ -855,7 +855,7 @@ >>>> close_iof_stdin = true; >>>> iof_name.jobid = proc->name.jobid; >>>> iof_name.vpid = proc->name.vpid; >>>> - iof_name.epoch = proc->name.epoch; >>>> + ORTE_EPOCH_SET(iof_name.epoch,proc->name.epoch); >>>> } >>>> } >>>> } >>>> >>>> Modified: trunk/orte/mca/errmgr/orted/errmgr_orted.c >>>> ============================================================================== >>>> --- trunk/orte/mca/errmgr/orted/errmgr_orted.c (original) >>>> +++ trunk/orte/mca/errmgr/orted/errmgr_orted.c 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 2011) >>>> @@ -34,6 +34,7 @@ >>>> #include "orte/util/show_help.h" >>>> #include "orte/util/nidmap.h" >>>> #include "orte/runtime/orte_globals.h" >>>> 
+#include "orte/runtime/data_type_support/orte_dt_support.h" >>>> #include "orte/mca/rml/rml.h" >>>> #include "orte/mca/odls/odls.h" >>>> #include "orte/mca/odls/base/base.h" >>>> @@ -41,7 +42,9 @@ >>>> #include "orte/mca/plm/plm_types.h" >>>> #include "orte/mca/routed/routed.h" >>>> #include "orte/mca/sensor/sensor.h" >>>> +#include "orte/mca/ess/ess.h" >>>> #include "orte/runtime/orte_quit.h" >>>> +#include "orte/runtime/orte_globals.h" >>>> >>>> #include "orte/mca/errmgr/errmgr.h" >>>> #include "orte/mca/errmgr/base/base.h" >>>> @@ -59,13 +62,15 @@ >>>> static void update_local_children(orte_odls_job_t *jobdat, >>>> orte_job_state_t jobstate, >>>> orte_proc_state_t state); >>>> -static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t >>>> epoch); >>>> +static void killprocs(orte_jobid_t job, orte_vpid_t vpid); >>>> static int record_dead_process(orte_process_name_t *proc); >>>> -static int send_to_local_applications(opal_pointer_array_t *dead_names); >>>> static int mark_processes_as_dead(opal_pointer_array_t *dead_procs); >>>> +#if ORTE_RESIL_ORTE >>>> +static int send_to_local_applications(opal_pointer_array_t *dead_names); >>>> static void failure_notification(int status, orte_process_name_t* sender, >>>> opal_buffer_t *buffer, orte_rml_tag_t tag, >>>> void* cbdata); >>>> +#endif >>>> >>>> /* >>>> * Module functions: Global >>>> @@ -104,8 +109,10 @@ >>>> predicted_fault, >>>> suggest_map_targets, >>>> ft_event, >>>> - orte_errmgr_base_register_migration_warning, >>>> - orte_errmgr_base_set_fault_callback /* Set callback function */ >>>> + orte_errmgr_base_register_migration_warning >>>> +#if ORTE_RESIL_ORTE >>>> + ,orte_errmgr_base_set_fault_callback /* Set callback function */ >>>> +#endif >>>> }; >>>> >>>> /************************ >>>> @@ -113,16 +120,22 @@ >>>> ************************/ >>>> static int init(void) >>>> { >>>> - int ret; >>>> + int ret = ORTE_SUCCESS; >>>> >>>> +#if ORTE_RESIL_ORTE >>>> ret = 
orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, >>>> ORTE_RML_TAG_FAILURE_NOTICE, >>>> ORTE_RML_PERSISTENT, failure_notification, >>>> NULL); >>>> +#endif >>>> + >>>> return ret; >>>> } >>>> >>>> static int finalize(void) >>>> { >>>> +#if ORTE_RESIL_ORTE >>>> orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_FAILURE_NOTICE); >>>> +#endif >>>> + >>>> return ORTE_SUCCESS; >>>> } >>>> >>>> @@ -228,10 +241,10 @@ >>>> /* update all procs in job */ >>>> update_local_children(jobdat, jobstate, >>>> ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED); >>>> /* order all local procs for this job to be killed */ >>>> - killprocs(jobdat->jobid, ORTE_VPID_WILDCARD, >>>> ORTE_EPOCH_WILDCARD); >>>> + killprocs(jobdat->jobid, ORTE_VPID_WILDCARD); >>>> case ORTE_JOB_STATE_COMM_FAILED: >>>> /* kill all local procs */ >>>> - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, >>>> ORTE_EPOCH_WILDCARD); >>>> + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); >>>> /* tell the caller we can't recover */ >>>> return ORTE_ERR_UNRECOVERABLE; >>>> break; >>>> @@ -276,7 +289,7 @@ >>>> /* see if this was a lifeline */ >>>> if (ORTE_SUCCESS != orte_routed.route_lost(proc)) { >>>> /* kill our children */ >>>> - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, >>>> ORTE_EPOCH_WILDCARD); >>>> + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); >>>> /* terminate - our routed children will see >>>> * us leave and automatically die >>>> */ >>>> @@ -290,10 +303,18 @@ >>>> if (0 == orte_routed.num_routes() && >>>> 0 == opal_list_get_size(&orte_local_children)) { >>>> orte_quit(); >>>> + } else { >>>> + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, >>>> + "%s errmgr:orted not exiting, num_routes() == >>>> %d, num children == %d", >>>> + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), >>>> + orte_routed.num_routes(), >>>> + opal_list_get_size(&orte_local_children))); >>>> } >>>> } >>>> >>>> +#if ORTE_RESIL_ORTE >>>> record_dead_process(proc); >>>> +#endif >>>> >>>> /* if not, then indicate we can continue */ >>>> 
return ORTE_SUCCESS; >>>> @@ -344,7 +365,7 @@ >>>> /* Decrement the number of local procs */ >>>> jobdat->num_local_procs--; >>>> /* kill this proc */ >>>> - killprocs(proc->jobid, proc->vpid, proc->epoch); >>>> + killprocs(proc->jobid, proc->vpid); >>>> } >>>> app = >>>> (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, >>>> child->app_idx); >>>> if( jobdat->enable_recovery && child->restarts < >>>> app->max_restarts ) { >>>> @@ -526,10 +547,12 @@ >>>> ORTE_ERROR_LOG(rc); >>>> goto FINAL_CLEANUP; >>>> } >>>> +#if ORTE_ENABLE_EPOCH >>>> if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, >>>> &child->name->epoch, 1, ORTE_EPOCH))) { >>>> ORTE_ERROR_LOG(rc); >>>> goto FINAL_CLEANUP; >>>> } >>>> +#endif >>>> } >>>> } >>>> /* pack an invalid marker */ >>>> @@ -660,7 +683,7 @@ >>>> continue; >>>> } >>>> >>>> - if (name_item->epoch < orte_util_lookup_epoch(name_item)) { >>>> + if (0 < >>>> ORTE_EPOCH_CMP(name_item->epoch,orte_ess.proc_get_epoch(name_item))) { >>>> continue; >>>> } >>>> >>>> @@ -669,9 +692,11 @@ >>>> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), >>>> ORTE_NAME_PRINT(name_item))); >>>> >>>> +#if ORTE_ENABLE_EPOCH >>>> /* Increment the epoch */ >>>> orte_util_set_proc_state(name_item, ORTE_PROC_STATE_TERMINATED); >>>> orte_util_set_epoch(name_item, name_item->epoch + 1); >>>> +#endif >>>> >>>> OPAL_THREAD_LOCK(&orte_odls_globals.mutex); >>>> >>>> @@ -706,6 +731,7 @@ >>>> return ORTE_SUCCESS; >>>> } >>>> >>>> +#if ORTE_RESIL_ORTE >>>> static void failure_notification(int status, orte_process_name_t* sender, >>>> opal_buffer_t *buffer, orte_rml_tag_t tag, >>>> void* cbdata) >>>> @@ -714,7 +740,7 @@ >>>> orte_std_cntr_t n; >>>> int ret = ORTE_SUCCESS, num_failed; >>>> int32_t i; >>>> - orte_process_name_t *name_item, proc; >>>> + orte_process_name_t *name_item; >>>> >>>> dead_names = OBJ_NEW(opal_pointer_array_t); >>>> >>>> @@ -746,7 +772,7 @@ >>>> /* There shouldn't be an issue of receiving this message multiple >>>> * times but it doesn't hurt to double 
check. >>>> */ >>>> - if (proc.epoch < orte_util_lookup_epoch(name_item)) { >>>> + if (0 < >>>> ORTE_EPOCH_CMP(name_item->epoch,orte_ess.proc_get_epoch(name_item))) { >>>> opal_output(1, "Received from proc %s local epoch %d", >>>> ORTE_NAME_PRINT(name_item), orte_util_lookup_epoch(name_item)); >>>> continue; >>>> } >>>> @@ -767,6 +793,7 @@ >>>> free(name_item); >>>> } >>>> } >>>> +#endif >>>> >>>> /***************** >>>> * Local Functions >>>> @@ -948,11 +975,13 @@ >>>> ORTE_ERROR_LOG(rc); >>>> return rc; >>>> } >>>> +#if ORTE_ENABLE_EPOCH >>>> /* Pack the child's epoch. */ >>>> if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, >>>> &(child->name->epoch), 1, ORTE_EPOCH))) { >>>> ORTE_ERROR_LOG(rc); >>>> return rc; >>>> } >>>> +#endif >>>> /* pack the contact info */ >>>> if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &child->rml_uri, 1, >>>> OPAL_STRING))) { >>>> ORTE_ERROR_LOG(rc); >>>> @@ -1015,7 +1044,7 @@ >>>> } >>>> } >>>> >>>> -static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t >>>> epoch) >>>> +static void killprocs(orte_jobid_t job, orte_vpid_t vpid) >>>> { >>>> opal_pointer_array_t cmd; >>>> orte_proc_t proc; >>>> @@ -1026,7 +1055,9 @@ >>>> orte_sensor.stop(job); >>>> } >>>> >>>> - if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid && >>>> ORTE_EPOCH_WILDCARD == epoch) { >>>> + if (ORTE_JOBID_WILDCARD == job >>>> + && ORTE_VPID_WILDCARD == vpid >>>> + && 0 == ORTE_EPOCH_CMP(ORTE_EPOCH_WILDCARD,epoch)) { >>>> if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) { >>>> ORTE_ERROR_LOG(rc); >>>> } >>>> @@ -1037,7 +1068,7 @@ >>>> OBJ_CONSTRUCT(&proc, orte_proc_t); >>>> proc.name.jobid = job; >>>> proc.name.vpid = vpid; >>>> - proc.name.epoch = epoch; >>>> + ORTE_EPOCH_SET(proc.name.epoch,epoch); >>>> opal_pointer_array_add(&cmd, &proc); >>>> if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) { >>>> ORTE_ERROR_LOG(rc); >>>> @@ -1082,20 +1113,21 @@ >>>> return rc; >>>> } >>>> >>>> +#if ORTE_RESIL_ORTE >>>> int 
send_to_local_applications(opal_pointer_array_t *dead_names) { >>>> opal_buffer_t *buf; >>>> int ret; >>>> orte_process_name_t *name_item; >>>> int size, i; >>>> >>>> - OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, >>>> - "%s Sending failure to local applications.", >>>> - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); >>>> - >>>> buf = OBJ_NEW(opal_buffer_t); >>>> >>>> size = opal_pointer_array_get_size(dead_names); >>>> >>>> + OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, >>>> + "%s Sending %d failure(s) to local applications.", >>>> + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), size)); >>>> + >>>> if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, &size, 1, ORTE_VPID))) { >>>> ORTE_ERROR_LOG(ret); >>>> OBJ_RELEASE(buf); >>>> @@ -1122,4 +1154,5 @@ >>>> >>>> return ORTE_SUCCESS; >>>> } >>>> +#endif >>>> >>>> >>>> Modified: trunk/orte/mca/ess/alps/ess_alps_module.c >>>> ============================================================================== >>>> --- trunk/orte/mca/ess/alps/ess_alps_module.c (original) >>>> +++ trunk/orte/mca/ess/alps/ess_alps_module.c 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 2011) >>>> @@ -363,8 +363,8 @@ >>>> >>>> ORTE_PROC_MY_NAME->jobid = jobid; >>>> ORTE_PROC_MY_NAME->vpid = (orte_vpid_t) cnos_get_rank() + starting_vpid; >>>> - ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_INVALID; >>>> - ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME); >>>> + ORTE_EPOCH_PRINT(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_INVALID); >>>> + >>>> ORTE_EPOCH_PRINT(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); >>>> >>>> OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, >>>> "ess:alps set name to %s", >>>> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); >>>> >>>> Modified: trunk/orte/mca/ess/base/base.h >>>> ============================================================================== >>>> --- trunk/orte/mca/ess/base/base.h (original) >>>> +++ trunk/orte/mca/ess/base/base.h 2011-08-26 18:16:14 EDT (Fri, 26 Aug >>>> 2011) >>>> @@ -57,7 +57,11 @@ >>>> >>>> 
ORTE_DECLSPEC extern opal_list_t orte_ess_base_components_available; >>>> >>>> +#if ORTE_ENABLE_EPOCH >>>> ORTE_DECLSPEC orte_epoch_t >>>> orte_ess_base_proc_get_epoch(orte_process_name_t *proc); >>>> +#else >>>> +ORTE_DECLSPEC int orte_ess_base_proc_get_epoch(orte_process_name_t *proc); >>>> +#endif >>>> >>>> #if !ORTE_DISABLE_FULL_SUPPORT >>>> >>>> >>>> Modified: trunk/orte/mca/ess/base/ess_base_select.c >>>> ============================================================================== >>>> --- trunk/orte/mca/ess/base/ess_base_select.c (original) >>>> +++ trunk/orte/mca/ess/base/ess_base_select.c 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 2011) >>>> @@ -36,21 +36,19 @@ >>>> * Generic function to retrieve the epoch of a specific process >>>> * from the job data. >>>> */ >>>> +#if !ORTE_ENABLE_EPOCH >>>> +int orte_ess_base_proc_get_epoch(orte_process_name_t *proc) { >>>> + return 0; >>>> +} >>>> +#else >>>> orte_epoch_t orte_ess_base_proc_get_epoch(orte_process_name_t *proc) { >>>> orte_epoch_t epoch = ORTE_EPOCH_INVALID; >>>> >>>> -#if !ORTE_DISABLE_FULL_SUPPORT >>>> epoch = orte_util_lookup_epoch(proc); >>>> -#endif >>>> - >>>> - OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output, >>>> - "%s ess:generic: proc %s has epoch %d", >>>> - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), >>>> - ORTE_NAME_PRINT(proc), >>>> - epoch)); >>>> >>>> return epoch; >>>> } >>>> +#endif >>>> >>>> int >>>> orte_ess_base_select(void) >>>> >>>> Modified: trunk/orte/mca/ess/env/ess_env_module.c >>>> ============================================================================== >>>> --- trunk/orte/mca/ess/env/ess_env_module.c (original) >>>> +++ trunk/orte/mca/ess/env/ess_env_module.c 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 2011) >>>> @@ -392,8 +392,7 @@ >>>> >>>> ORTE_PROC_MY_NAME->jobid = jobid; >>>> ORTE_PROC_MY_NAME->vpid = vpid; >>>> - ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_INVALID; >>>> - ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME); >>>> + >>>> 
ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); >>>> >>>> OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, >>>> "ess:env set name to %s", >>>> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); >>>> >>>> Modified: trunk/orte/mca/ess/ess.h >>>> ============================================================================== >>>> --- trunk/orte/mca/ess/ess.h (original) >>>> +++ trunk/orte/mca/ess/ess.h 2011-08-26 18:16:14 EDT (Fri, 26 Aug >>>> 2011) >>>> @@ -111,7 +111,11 @@ >>>> * will get the most up to date version stored within the orte_proc_t >>>> struct. >>>> * Obviously the epoch of the proc that is passed in will be ignored. >>>> */ >>>> +#if ORTE_ENABLE_EPOCH >>>> typedef orte_epoch_t >>>> (*orte_ess_base_module_proc_get_epoch_fn_t)(orte_process_name_t *proc); >>>> +#else >>>> +typedef int >>>> (*orte_ess_base_module_proc_get_epoch_fn_t)(orte_process_name_t *proc); >>>> +#endif >>>> >>>> /** >>>> * Update the pidmap >>>> >>>> Modified: trunk/orte/mca/ess/generic/ess_generic_module.c >>>> ============================================================================== >>>> --- trunk/orte/mca/ess/generic/ess_generic_module.c (original) >>>> +++ trunk/orte/mca/ess/generic/ess_generic_module.c 2011-08-26 >>>> 18:16:14 EDT (Fri, 26 Aug 2011) >>>> @@ -155,7 +155,7 @@ >>>> goto error; >>>> } >>>> ORTE_PROC_MY_NAME->vpid = strtol(envar, NULL, 10); >>>> - ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN); >>>> >>>> OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, >>>> "%s completed name definition", >>>> @@ -273,7 +273,7 @@ >>>> if (vpid == ORTE_PROC_MY_NAME->vpid) { >>>> ORTE_PROC_MY_DAEMON->jobid = 0; >>>> ORTE_PROC_MY_DAEMON->vpid = i; >>>> - ORTE_PROC_MY_DAEMON->epoch = ORTE_PROC_MY_NAME->epoch; >>>> + >>>> ORTE_EPOCH_SET(ORTE_PROC_MY_DAEMON->epoch,ORTE_PROC_MY_NAME->epoch); >>>> } >>>> OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, >>>> "%s node %d name %s rank %s", >>>> @@ -304,7 +304,7 @@ 
>>>> if (vpid == ORTE_PROC_MY_NAME->vpid) { >>>> ORTE_PROC_MY_DAEMON->jobid = 0; >>>> ORTE_PROC_MY_DAEMON->vpid = i; >>>> - ORTE_PROC_MY_DAEMON->epoch = >>>> ORTE_PROC_MY_NAME->epoch; >>>> + >>>> ORTE_EPOCH_SET(ORTE_PROC_MY_DAEMON->epoch,ORTE_PROC_MY_NAME->epoch); >>>> } >>>> OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, >>>> "%s node %d name %s rank %d", >>>> >>>> Modified: trunk/orte/mca/ess/hnp/ess_hnp_module.c >>>> ============================================================================== >>>> --- trunk/orte/mca/ess/hnp/ess_hnp_module.c (original) >>>> +++ trunk/orte/mca/ess/hnp/ess_hnp_module.c 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 2011) >>>> @@ -494,7 +494,7 @@ >>>> proc = OBJ_NEW(orte_proc_t); >>>> proc->name.jobid = ORTE_PROC_MY_NAME->jobid; >>>> proc->name.vpid = ORTE_PROC_MY_NAME->vpid; >>>> - proc->name.epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN); >>>> >>>> proc->pid = orte_process_info.pid; >>>> proc->rml_uri = orte_rml.get_contact_info(); >>>> >>>> Modified: trunk/orte/mca/ess/lsf/ess_lsf_module.c >>>> ============================================================================== >>>> --- trunk/orte/mca/ess/lsf/ess_lsf_module.c (original) >>>> +++ trunk/orte/mca/ess/lsf/ess_lsf_module.c 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 2011) >>>> @@ -357,8 +357,7 @@ >>>> >>>> ORTE_PROC_MY_NAME->jobid = jobid; >>>> ORTE_PROC_MY_NAME->vpid = vpid; >>>> - ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_INVALID; >>>> - ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME); >>>> + >>>> ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); >>>> >>>> /* fix up the base name and make it the "real" name */ >>>> lsf_nodeid = atoi(getenv("LSF_PM_TASKID")); >>>> >>>> Modified: trunk/orte/mca/ess/singleton/ess_singleton_module.c >>>> ============================================================================== >>>> --- trunk/orte/mca/ess/singleton/ess_singleton_module.c (original) >>>> +++ 
trunk/orte/mca/ess/singleton/ess_singleton_module.c 2011-08-26 >>>> 18:16:14 EDT (Fri, 26 Aug 2011) >>>> @@ -188,7 +188,7 @@ >>>> /* set the name */ >>>> ORTE_PROC_MY_NAME->jobid = 0xffff0000 & ((uint32_t)jobfam << 16); >>>> ORTE_PROC_MY_NAME->vpid = 0; >>>> - ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN); >>>> >>>> } else { >>>> /* >>>> >>>> Modified: trunk/orte/mca/ess/slave/ess_slave_module.c >>>> ============================================================================== >>>> --- trunk/orte/mca/ess/slave/ess_slave_module.c (original) >>>> +++ trunk/orte/mca/ess/slave/ess_slave_module.c 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 2011) >>>> @@ -280,8 +280,7 @@ >>>> >>>> ORTE_PROC_MY_NAME->jobid = jobid; >>>> ORTE_PROC_MY_NAME->vpid = vpid; >>>> - ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_INVALID; >>>> - ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME); >>>> + >>>> ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); >>>> >>>> OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, >>>> "ess:slave set name to %s", >>>> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); >>>> >>>> Modified: trunk/orte/mca/ess/slurm/ess_slurm_module.c >>>> ============================================================================== >>>> --- trunk/orte/mca/ess/slurm/ess_slurm_module.c (original) >>>> +++ trunk/orte/mca/ess/slurm/ess_slurm_module.c 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 2011) >>>> @@ -368,8 +368,7 @@ >>>> /* fix up the vpid and make it the "real" vpid */ >>>> slurm_nodeid = atoi(getenv("SLURM_NODEID")); >>>> ORTE_PROC_MY_NAME->vpid = vpid + slurm_nodeid; >>>> - ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_INVALID; >>>> - ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME); >>>> + >>>> ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); >>>> >>>> OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, >>>> "ess:slurm set name to %s", >>>> 
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); >>>> >>>> Modified: trunk/orte/mca/ess/slurmd/ess_slurmd_module.c >>>> ============================================================================== >>>> --- trunk/orte/mca/ess/slurmd/ess_slurmd_module.c (original) >>>> +++ trunk/orte/mca/ess/slurmd/ess_slurmd_module.c 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 2011) >>>> @@ -195,7 +195,7 @@ >>>> } >>>> ORTE_PROC_MY_NAME->vpid = strtol(envar, NULL, 10); >>>> #endif >>>> - ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN); >>>> /* get our local rank */ >>>> if (NULL == (envar = getenv("SLURM_LOCALID"))) { >>>> error = "could not get SLURM_LOCALID"; >>>> @@ -260,7 +260,7 @@ >>>> nodeid = strtol(envar, NULL, 10); >>>> ORTE_PROC_MY_DAEMON->jobid = 0; >>>> ORTE_PROC_MY_DAEMON->vpid = nodeid; >>>> - ORTE_PROC_MY_DAEMON->epoch = ORTE_PROC_MY_NAME->epoch; >>>> + ORTE_EPOCH_SET(ORTE_PROC_MY_DAEMON->epoch,ORTE_PROC_MY_NAME->epoch); >>>> >>>> /* get the number of ppn */ >>>> if (NULL == (tasks_per_node = getenv("SLURM_STEP_TASKS_PER_NODE"))) { >>>> >>>> Modified: trunk/orte/mca/ess/tm/ess_tm_module.c >>>> ============================================================================== >>>> --- trunk/orte/mca/ess/tm/ess_tm_module.c (original) >>>> +++ trunk/orte/mca/ess/tm/ess_tm_module.c 2011-08-26 18:16:14 EDT (Fri, >>>> 26 Aug 2011) >>>> @@ -364,7 +364,7 @@ >>>> >>>> ORTE_PROC_MY_NAME->jobid = jobid; >>>> ORTE_PROC_MY_NAME->vpid = vpid; >>>> - ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME); >>>> + >>>> ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); >>>> >>>> OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, >>>> "ess:tm set name to %s", >>>> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); >>>> >>>> Modified: trunk/orte/mca/filem/rsh/filem_rsh_module.c >>>> ============================================================================== >>>> --- trunk/orte/mca/filem/rsh/filem_rsh_module.c 
(original) >>>> +++ trunk/orte/mca/filem/rsh/filem_rsh_module.c 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 2011) >>>> @@ -1097,11 +1097,11 @@ >>>> if( NULL != proc_set ) { >>>> wp_item->proc_set.source.jobid = proc_set->source.jobid; >>>> wp_item->proc_set.source.vpid = proc_set->source.vpid; >>>> - wp_item->proc_set.source.epoch = proc_set->source.epoch; >>>> + >>>> ORTE_EPOCH_SET(wp_item->proc_set.source.epoch,proc_set->source.epoch); >>>> >>>> wp_item->proc_set.sink.jobid = proc_set->sink.jobid; >>>> wp_item->proc_set.sink.vpid = proc_set->sink.vpid; >>>> - wp_item->proc_set.sink.epoch = proc_set->sink.epoch; >>>> + ORTE_EPOCH_SET(wp_item->proc_set.sink.epoch,proc_set->sink.epoch); >>>> } >>>> /* Copy the File Set */ >>>> if( NULL != file_set ) { >>>> @@ -1396,7 +1396,7 @@ >>>> wp_item = OBJ_NEW(orte_filem_rsh_work_pool_item_t); >>>> wp_item->proc_set.source.jobid = sender->jobid; >>>> wp_item->proc_set.source.vpid = sender->vpid; >>>> - wp_item->proc_set.source.epoch = sender->epoch; >>>> + ORTE_EPOCH_SET(wp_item->proc_set.source.epoch,sender->epoch); >>>> >>>> opal_list_append(&work_pool_waiting, &(wp_item->super)); >>>> } >>>> >>>> Modified: trunk/orte/mca/grpcomm/base/grpcomm_base_coll.c >>>> ============================================================================== >>>> --- trunk/orte/mca/grpcomm/base/grpcomm_base_coll.c (original) >>>> +++ trunk/orte/mca/grpcomm/base/grpcomm_base_coll.c 2011-08-26 >>>> 18:16:14 EDT (Fri, 26 Aug 2011) >>>> @@ -168,8 +168,7 @@ >>>> if (vpids[0] == ORTE_PROC_MY_NAME->vpid) { >>>> /* I send first */ >>>> peer.vpid = vpids[1]; >>>> - peer.epoch = ORTE_EPOCH_INVALID; >>>> - peer.epoch = orte_ess.proc_get_epoch(&peer); >>>> + ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); >>>> >>>> /* setup a temp buffer so I can inform the other side as to the >>>> * number of entries in my buffer >>>> @@ -226,8 +225,7 @@ >>>> opal_dss.pack(&buf, &num_entries, 1, OPAL_INT32); >>>> opal_dss.copy_payload(&buf, sendbuf); >>>> 
peer.vpid = vpids[0]; >>>> - peer.epoch = ORTE_EPOCH_INVALID; >>>> - peer.epoch = orte_ess.proc_get_epoch(&peer); >>>> + ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); >>>> >>>> OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, >>>> "%s grpcomm:coll:two-proc sending to %s", >>>> @@ -320,8 +318,7 @@ >>>> /* first send my current contents */ >>>> nv = (rank - distance + np) % np; >>>> peer.vpid = vpids[nv]; >>>> - peer.epoch = ORTE_EPOCH_INVALID; >>>> - peer.epoch = orte_ess.proc_get_epoch(&peer); >>>> + ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); >>>> >>>> OBJ_CONSTRUCT(&buf, opal_buffer_t); >>>> opal_dss.pack(&buf, &total_entries, 1, OPAL_INT32); >>>> @@ -340,8 +337,7 @@ >>>> num_recvd = 0; >>>> nv = (rank + distance) % np; >>>> peer.vpid = vpids[nv]; >>>> - peer.epoch = ORTE_EPOCH_INVALID; >>>> - peer.epoch = orte_ess.proc_get_epoch(&peer); >>>> + ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); >>>> >>>> OBJ_CONSTRUCT(&bucket, opal_buffer_t); >>>> if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(&peer, >>>> @@ -439,8 +435,7 @@ >>>> /* first send my current contents */ >>>> nv = rank ^ distance; >>>> peer.vpid = vpids[nv]; >>>> - peer.epoch = ORTE_EPOCH_INVALID; >>>> - peer.epoch = orte_ess.proc_get_epoch(&peer); >>>> + ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); >>>> >>>> OBJ_CONSTRUCT(&buf, opal_buffer_t); >>>> opal_dss.pack(&buf, &total_entries, 1, OPAL_INT32); >>>> @@ -646,8 +641,7 @@ >>>> proc.jobid = jobid; >>>> proc.vpid = 0; >>>> while (proc.vpid < jobdat->num_procs && 0 < >>>> opal_list_get_size(&daemon_tree)) { >>>> - proc.epoch = ORTE_EPOCH_INVALID; >>>> - proc.epoch = orte_ess.proc_get_epoch(&proc); >>>> + ORTE_EPOCH_SET(proc.epoch,orte_ess.proc_get_epoch(&proc)); >>>> >>>> /* get the daemon that hosts this proc */ >>>> daemonvpid = orte_ess.proc_get_daemon(&proc); >>>> @@ -713,8 +707,7 @@ >>>> /* send it */ >>>> my_parent.jobid = ORTE_PROC_MY_NAME->jobid; >>>> my_parent.vpid = 
orte_routed.get_routing_tree(NULL); >>>> - my_parent.epoch = ORTE_EPOCH_INVALID; >>>> - my_parent.epoch = orte_ess.proc_get_epoch(&my_parent); >>>> + >>>> ORTE_EPOCH_SET(my_parent.epoch,orte_ess.proc_get_epoch(&my_parent)); >>>> >>>> OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, >>>> "%s grpcomm:base:daemon_coll: daemon collective >>>> not the HNP - sending to parent %s", >>>> >>>> Modified: trunk/orte/mca/grpcomm/hier/grpcomm_hier_module.c >>>> ============================================================================== >>>> --- trunk/orte/mca/grpcomm/hier/grpcomm_hier_module.c (original) >>>> +++ trunk/orte/mca/grpcomm/hier/grpcomm_hier_module.c 2011-08-26 >>>> 18:16:14 EDT (Fri, 26 Aug 2011) >>>> @@ -95,7 +95,7 @@ >>>> >>>> my_local_rank_zero_proc.jobid = ORTE_PROC_MY_NAME->jobid; >>>> my_local_rank_zero_proc.vpid = ORTE_VPID_INVALID; >>>> - my_local_rank_zero_proc.epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(my_local_rank_zero_proc.epoch,ORTE_EPOCH_MIN); >>>> >>>> if (ORTE_SUCCESS != (rc = orte_grpcomm_base_modex_init())) { >>>> ORTE_ERROR_LOG(rc); >>>> @@ -270,7 +270,7 @@ >>>> proc.jobid = ORTE_PROC_MY_NAME->jobid; >>>> for (v=0; v < orte_process_info.num_procs; v++) { >>>> proc.vpid = v; >>>> - proc.epoch = orte_util_lookup_epoch(&proc); >>>> + ORTE_EPOCH_SET(proc.epoch,orte_util_lookup_epoch(&proc)); >>>> >>>> /* is this proc local_rank=0 on its node? */ >>>> if (0 == my_local_rank && 0 == orte_ess.get_local_rank(&proc)) { >>>> @@ -285,7 +285,7 @@ >>>> nm = OBJ_NEW(orte_namelist_t); >>>> nm->name.jobid = proc.jobid; >>>> nm->name.vpid = proc.vpid; >>>> - nm->name.epoch = proc.epoch; >>>> + ORTE_EPOCH_SET(nm->name.epoch,proc.epoch); >>>> >>>> opal_list_append(&my_local_peers, &nm->item); >>>> /* if I am not local_rank=0, is this one? 
*/ >>>> @@ -293,7 +293,7 @@ >>>> 0 == orte_ess.get_local_rank(&proc)) { >>>> my_local_rank_zero_proc.jobid = proc.jobid; >>>> my_local_rank_zero_proc.vpid = proc.vpid; >>>> - my_local_rank_zero_proc.epoch = proc.epoch; >>>> + ORTE_EPOCH_SET(my_local_rank_zero_proc.epoch,proc.epoch); >>>> } >>>> } >>>> >>>> >>>> Modified: trunk/orte/mca/iof/base/base.h >>>> ============================================================================== >>>> --- trunk/orte/mca/iof/base/base.h (original) >>>> +++ trunk/orte/mca/iof/base/base.h 2011-08-26 18:16:14 EDT (Fri, 26 Aug >>>> 2011) >>>> @@ -135,7 +135,7 @@ >>>> ep = OBJ_NEW(orte_iof_sink_t); \ >>>> ep->name.jobid = (nm)->jobid; \ >>>> ep->name.vpid = (nm)->vpid; \ >>>> - ep->name.epoch = (nm)->epoch; \ >>>> + ORTE_EPOCH_SET(ep->name.epoch,(nm)->epoch); \ >>>> ep->tag = (tg); \ >>>> if (0 <= (fid)) { \ >>>> ep->wev->fd = (fid); \ >>>> @@ -169,7 +169,7 @@ >>>> rev = OBJ_NEW(orte_iof_read_event_t); \ >>>> rev->name.jobid = (nm)->jobid; \ >>>> rev->name.vpid = (nm)->vpid; \ >>>> - rev->name.epoch = (nm)->epoch; \ >>>> + ORTE_EPOCH_SET(rev->name.epoch,(nm)->epoch); \ >>>> rev->tag = (tg); \ >>>> rev->fd = (fid); \ >>>> *(rv) = rev; \ >>>> @@ -194,7 +194,7 @@ >>>> ep = OBJ_NEW(orte_iof_sink_t); \ >>>> ep->name.jobid = (nm)->jobid; \ >>>> ep->name.vpid = (nm)->vpid; \ >>>> - ep->name.epoch = (nm)->epoch; \ >>>> + ORTE_EPOCH_SET(ep->name.epoch,(nm)->epoch); \ >>>> ep->tag = (tg); \ >>>> if (0 <= (fid)) { \ >>>> ep->wev->fd = (fid); \ >>>> @@ -215,7 +215,7 @@ >>>> rev = OBJ_NEW(orte_iof_read_event_t); \ >>>> rev->name.jobid = (nm)->jobid; \ >>>> rev->name.vpid = (nm)->vpid; \ >>>> - rev->name.epoch= (nm)->epoch; \ >>>> + ORTE_EPOCH_SET(rev->name.epoch,(nm)->epoch); \ >>>> rev->tag = (tg); \ >>>> *(rv) = rev; \ >>>> opal_event_set(opal_event_base, \ >>>> >>>> Modified: trunk/orte/mca/iof/base/iof_base_open.c >>>> ============================================================================== >>>> --- 
trunk/orte/mca/iof/base/iof_base_open.c (original) >>>> +++ trunk/orte/mca/iof/base/iof_base_open.c 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 2011) >>>> @@ -91,7 +91,7 @@ >>>> { >>>> ptr->daemon.jobid = ORTE_JOBID_INVALID; >>>> ptr->daemon.vpid = ORTE_VPID_INVALID; >>>> - ptr->daemon.epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(ptr->daemon.epoch,ORTE_EPOCH_MIN); >>>> ptr->wev = OBJ_NEW(orte_iof_write_event_t); >>>> } >>>> static void orte_iof_base_sink_destruct(orte_iof_sink_t* ptr) >>>> >>>> Modified: trunk/orte/mca/iof/hnp/iof_hnp.c >>>> ============================================================================== >>>> --- trunk/orte/mca/iof/hnp/iof_hnp.c (original) >>>> +++ trunk/orte/mca/iof/hnp/iof_hnp.c 2011-08-26 18:16:14 EDT (Fri, >>>> 26 Aug 2011) >>>> @@ -186,7 +186,7 @@ >>>> proct = OBJ_NEW(orte_iof_proc_t); >>>> proct->name.jobid = dst_name->jobid; >>>> proct->name.vpid = dst_name->vpid; >>>> - proct->name.epoch = dst_name->epoch; >>>> + ORTE_EPOCH_SET(proct->name.epoch,dst_name->epoch); >>>> opal_list_append(&mca_iof_hnp_component.procs, &proct->super); >>>> /* see if we are to output to a file */ >>>> if (NULL != orte_output_filename) { >>>> @@ -281,8 +281,7 @@ >>>> &mca_iof_hnp_component.sinks); >>>> sink->daemon.jobid = ORTE_PROC_MY_NAME->jobid; >>>> sink->daemon.vpid = proc->node->daemon->name.vpid; >>>> - sink->daemon.epoch = ORTE_EPOCH_INVALID; >>>> - sink->daemon.epoch = orte_ess.proc_get_epoch(&sink->daemon); >>>> + >>>> ORTE_EPOCH_SET(sink->daemon.epoch,orte_ess.proc_get_epoch(&sink->daemon)); >>>> } >>>> } >>>> >>>> @@ -389,7 +388,7 @@ >>>> &mca_iof_hnp_component.sinks); >>>> sink->daemon.jobid = ORTE_PROC_MY_NAME->jobid; >>>> sink->daemon.vpid = ORTE_PROC_MY_NAME->vpid; >>>> - sink->daemon.epoch = ORTE_PROC_MY_NAME->epoch; >>>> + ORTE_EPOCH_SET(sink->daemon.epoch,ORTE_PROC_MY_NAME->epoch); >>>> >>>> return ORTE_SUCCESS; >>>> } >>>> >>>> Modified: trunk/orte/mca/iof/hnp/iof_hnp_receive.c >>>> 
============================================================================== >>>> --- trunk/orte/mca/iof/hnp/iof_hnp_receive.c (original) >>>> +++ trunk/orte/mca/iof/hnp/iof_hnp_receive.c 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 2011) >>>> @@ -109,21 +109,21 @@ >>>> NULL, &mca_iof_hnp_component.sinks); >>>> sink->daemon.jobid = mev->sender.jobid; >>>> sink->daemon.vpid = mev->sender.vpid; >>>> - sink->daemon.epoch = mev->sender.epoch; >>>> + ORTE_EPOCH_SET(sink->daemon.epoch,mev->sender.epoch); >>>> } >>>> if (ORTE_IOF_STDERR & stream) { >>>> ORTE_IOF_SINK_DEFINE(&sink, &origin, -1, ORTE_IOF_STDERR, >>>> NULL, &mca_iof_hnp_component.sinks); >>>> sink->daemon.jobid = mev->sender.jobid; >>>> sink->daemon.vpid = mev->sender.vpid; >>>> - sink->daemon.epoch = mev->sender.epoch; >>>> + ORTE_EPOCH_SET(sink->daemon.epoch,mev->sender.epoch); >>>> } >>>> if (ORTE_IOF_STDDIAG & stream) { >>>> ORTE_IOF_SINK_DEFINE(&sink, &origin, -1, ORTE_IOF_STDDIAG, >>>> NULL, &mca_iof_hnp_component.sinks); >>>> sink->daemon.jobid = mev->sender.jobid; >>>> sink->daemon.vpid = mev->sender.vpid; >>>> - sink->daemon.epoch = mev->sender.epoch; >>>> + ORTE_EPOCH_SET(sink->daemon.epoch,mev->sender.epoch); >>>> } >>>> goto CLEAN_RETURN; >>>> } >>>> >>>> Modified: trunk/orte/mca/iof/orted/iof_orted.c >>>> ============================================================================== >>>> --- trunk/orte/mca/iof/orted/iof_orted.c (original) >>>> +++ trunk/orte/mca/iof/orted/iof_orted.c 2011-08-26 18:16:14 EDT (Fri, >>>> 26 Aug 2011) >>>> @@ -163,7 +163,7 @@ >>>> proct = OBJ_NEW(orte_iof_proc_t); >>>> proct->name.jobid = dst_name->jobid; >>>> proct->name.vpid = dst_name->vpid; >>>> - proct->name.epoch = dst_name->epoch; >>>> + ORTE_EPOCH_SET(proct->name.epoch,dst_name->epoch); >>>> opal_list_append(&mca_iof_orted_component.procs, &proct->super); >>>> /* see if we are to output to a file */ >>>> if (NULL != orte_output_filename) { >>>> >>>> Modified: trunk/orte/mca/odls/base/odls_base_default_fns.c 
>>>> ============================================================================== >>>> --- trunk/orte/mca/odls/base/odls_base_default_fns.c (original) >>>> +++ trunk/orte/mca/odls/base/odls_base_default_fns.c 2011-08-26 >>>> 18:16:14 EDT (Fri, 26 Aug 2011) >>>> @@ -734,8 +734,7 @@ >>>> proc.jobid = jobdat->jobid; >>>> for (j=0; j < jobdat->num_procs; j++) { >>>> proc.vpid = j; >>>> - proc.epoch = ORTE_EPOCH_INVALID; >>>> - proc.epoch = orte_ess.proc_get_epoch(&proc); >>>> + ORTE_EPOCH_SET(proc.epoch,orte_ess.proc_get_epoch(&proc)); >>>> /* get the vpid of the daemon that is to host this proc */ >>>> if (ORTE_VPID_INVALID == (host_daemon = >>>> orte_ess.proc_get_daemon(&proc))) { >>>> ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); >>>> @@ -1044,6 +1043,7 @@ >>>> free(param); >>>> free(value); >>>> >>>> +#if ORTE_ENABLE_EPOCH >>>> /* setup the epoch */ >>>> if (ORTE_SUCCESS != (rc = orte_util_convert_epoch_to_string(&value, >>>> child->name->epoch))) { >>>> ORTE_ERROR_LOG(rc); >>>> @@ -1057,6 +1057,7 @@ >>>> opal_setenv(param, value, true, env); >>>> free(param); >>>> free(value); >>>> +#endif >>>> >>>> /* setup the vpid */ >>>> if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&value, >>>> child->name->vpid))) { >>>> @@ -2721,7 +2722,7 @@ >>>> OBJ_CONSTRUCT(&proctmp, orte_proc_t); >>>> proctmp.name.jobid = ORTE_JOBID_WILDCARD; >>>> proctmp.name.vpid = ORTE_VPID_WILDCARD; >>>> - proctmp.name.epoch = ORTE_EPOCH_WILDCARD; >>>> + ORTE_EPOCH_SET(proctmp.name.epoch,ORTE_EPOCH_WILDCARD); >>>> opal_pointer_array_add(&procarray, &proctmp); >>>> procptr = &procarray; >>>> do_cleanup = true; >>>> >>>> Modified: trunk/orte/mca/odls/base/odls_base_open.c >>>> ============================================================================== >>>> --- trunk/orte/mca/odls/base/odls_base_open.c (original) >>>> +++ trunk/orte/mca/odls/base/odls_base_open.c 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 2011) >>>> @@ -187,7 +187,7 @@ >>>> if (-1 == rank) { >>>> /* wildcard */ >>>> 
nm->name.vpid = ORTE_VPID_WILDCARD; >>>> - nm->name.epoch = ORTE_EPOCH_WILDCARD; >>>> + ORTE_EPOCH_SET(nm->name.epoch,ORTE_EPOCH_WILDCARD); >>>> } else if (rank < 0) { >>>> /* error out on bozo case */ >>>> orte_show_help("help-odls-base.txt", >>>> @@ -200,8 +200,7 @@ >>>> * will be in the job - we'll check later >>>> */ >>>> nm->name.vpid = rank; >>>> - nm->name.epoch = ORTE_EPOCH_INVALID; >>>> - nm->name.epoch = orte_ess.proc_get_epoch(&nm->name); >>>> + >>>> ORTE_EPOCH_SET(nm->name.epoch,orte_ess.proc_get_epoch(&nm->name)); >>>> } >>>> opal_list_append(&orte_odls_globals.xterm_ranks, &nm->item); >>>> } >>>> >>>> Modified: trunk/orte/mca/odls/base/odls_base_state.c >>>> ============================================================================== >>>> --- trunk/orte/mca/odls/base/odls_base_state.c (original) >>>> +++ trunk/orte/mca/odls/base/odls_base_state.c 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 2011) >>>> @@ -77,17 +77,17 @@ >>>> /* if I am the HNP, then use me as the source */ >>>> p_set->source.jobid = ORTE_PROC_MY_NAME->jobid; >>>> p_set->source.vpid = ORTE_PROC_MY_NAME->vpid; >>>> - p_set->source.epoch = ORTE_PROC_MY_NAME->epoch; >>>> + ORTE_EPOCH_SET(p_set->source.epoch,ORTE_PROC_MY_NAME->epoch); >>>> } >>>> else { >>>> /* otherwise, set the HNP as the source */ >>>> p_set->source.jobid = ORTE_PROC_MY_HNP->jobid; >>>> p_set->source.vpid = ORTE_PROC_MY_HNP->vpid; >>>> - p_set->source.epoch = ORTE_PROC_MY_HNP->epoch; >>>> + ORTE_EPOCH_SET(p_set->source.epoch,ORTE_PROC_MY_HNP->epoch); >>>> } >>>> p_set->sink.jobid = ORTE_PROC_MY_NAME->jobid; >>>> p_set->sink.vpid = ORTE_PROC_MY_NAME->vpid; >>>> - p_set->sink.epoch = ORTE_PROC_MY_NAME->epoch; >>>> + ORTE_EPOCH_SET(p_set->sink.epoch,ORTE_PROC_MY_NAME->epoch); >>>> >>>> opal_list_append(&(filem_request->process_sets), &(p_set->super) ); >>>> >>>> >>>> Modified: trunk/orte/mca/oob/tcp/oob_tcp_msg.c >>>> ============================================================================== >>>> --- 
trunk/orte/mca/oob/tcp/oob_tcp_msg.c (original) >>>> +++ trunk/orte/mca/oob/tcp/oob_tcp_msg.c 2011-08-26 18:16:14 EDT (Fri, >>>> 26 Aug 2011) >>>> @@ -137,6 +137,7 @@ >>>> bool mca_oob_tcp_msg_send_handler(mca_oob_tcp_msg_t* msg, struct >>>> mca_oob_tcp_peer_t * peer) >>>> { >>>> int rc; >>>> + >>>> while(1) { >>>> rc = writev(peer->peer_sd, msg->msg_rwptr, msg->msg_rwnum); >>>> if(rc < 0) { >>>> @@ -338,6 +339,7 @@ >>>> orte_process_name_t src = msg->msg_hdr.msg_src; >>>> >>>> OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock); >>>> + >>>> if (orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &peer->peer_name, >>>> &src) != OPAL_EQUAL) { >>>> opal_hash_table_remove_value_uint64(&mca_oob_tcp_component.tcp_peers, >>>> >>>> orte_util_hash_name(&peer->peer_name)); >>>> >>>> Modified: trunk/orte/mca/oob/tcp/oob_tcp_peer.c >>>> ============================================================================== >>>> --- trunk/orte/mca/oob/tcp/oob_tcp_peer.c (original) >>>> +++ trunk/orte/mca/oob/tcp/oob_tcp_peer.c 2011-08-26 18:16:14 EDT (Fri, >>>> 26 Aug 2011) >>>> @@ -903,6 +903,11 @@ >>>> static void mca_oob_tcp_peer_recv_handler(int sd, short flags, void* user) >>>> { >>>> mca_oob_tcp_peer_t* peer = (mca_oob_tcp_peer_t *)user; >>>> + >>>> + if (orte_abnormal_term_ordered) { >>>> + return; >>>> + } >>>> + >>>> OPAL_THREAD_LOCK(&peer->peer_lock); >>>> switch(peer->peer_state) { >>>> case MCA_OOB_TCP_CONNECT_ACK: >>>> >>>> Modified: trunk/orte/mca/plm/base/plm_base_jobid.c >>>> ============================================================================== >>>> --- trunk/orte/mca/plm/base/plm_base_jobid.c (original) >>>> +++ trunk/orte/mca/plm/base/plm_base_jobid.c 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 2011) >>>> @@ -62,12 +62,12 @@ >>>> /* set the name */ >>>> ORTE_PROC_MY_NAME->jobid = 0xffff0000 & ((uint32_t)jobfam << 16); >>>> ORTE_PROC_MY_NAME->vpid = 0; >>>> - ORTE_PROC_MY_NAME->epoch= ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN); >>>> 
>>>> /* copy it to the HNP field */ >>>> ORTE_PROC_MY_HNP->jobid = ORTE_PROC_MY_NAME->jobid; >>>> ORTE_PROC_MY_HNP->vpid = ORTE_PROC_MY_NAME->vpid; >>>> - ORTE_PROC_MY_HNP->epoch = ORTE_PROC_MY_NAME->epoch; >>>> + ORTE_EPOCH_SET(ORTE_PROC_MY_HNP->epoch,ORTE_PROC_MY_NAME->epoch); >>>> >>>> /* done */ >>>> return ORTE_SUCCESS; >>>> >>>> Modified: trunk/orte/mca/plm/base/plm_base_launch_support.c >>>> ============================================================================== >>>> --- trunk/orte/mca/plm/base/plm_base_launch_support.c (original) >>>> +++ trunk/orte/mca/plm/base/plm_base_launch_support.c 2011-08-26 >>>> 18:16:14 EDT (Fri, 26 Aug 2011) >>>> @@ -377,8 +377,7 @@ >>>> /* push stdin - the IOF will know what to do with the specified target */ >>>> name.jobid = job; >>>> name.vpid = jdata->stdin_target; >>>> - name.epoch = ORTE_EPOCH_INVALID; >>>> - name.epoch = orte_ess.proc_get_epoch(&name); >>>> + ORTE_EPOCH_SET(name.epoch,orte_ess.proc_get_epoch(&name)); >>>> >>>> if (ORTE_SUCCESS != (rc = orte_iof.push(&name, ORTE_IOF_STDIN, 0))) { >>>> ORTE_ERROR_LOG(rc); >>>> >>>> Modified: trunk/orte/mca/plm/base/plm_base_orted_cmds.c >>>> ============================================================================== >>>> --- trunk/orte/mca/plm/base/plm_base_orted_cmds.c (original) >>>> +++ trunk/orte/mca/plm/base/plm_base_orted_cmds.c 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 2011) >>>> @@ -163,8 +163,7 @@ >>>> continue; >>>> } >>>> peer.vpid = v; >>>> - peer.epoch = ORTE_EPOCH_INVALID; >>>> - peer.epoch = orte_ess.proc_get_epoch(&peer); >>>> + ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); >>>> >>>> /* don't worry about errors on the send here - just >>>> * issue it and keep going >>>> @@ -242,7 +241,7 @@ >>>> OBJ_CONSTRUCT(&proc, orte_proc_t); >>>> proc.name.jobid = jobid; >>>> proc.name.vpid = ORTE_VPID_WILDCARD; >>>> - proc.name.epoch = ORTE_EPOCH_WILDCARD; >>>> + ORTE_EPOCH_SET(proc.name.epoch,ORTE_EPOCH_WILDCARD); >>>> 
opal_pointer_array_add(&procs, &proc); >>>> if (ORTE_SUCCESS != (rc = orte_plm_base_orted_kill_local_procs(&procs))) { >>>> ORTE_ERROR_LOG(rc); >>>> @@ -340,8 +339,7 @@ >>>> continue; >>>> } >>>> peer.vpid = v; >>>> - peer.epoch = ORTE_EPOCH_INVALID; >>>> - peer.epoch = orte_ess.proc_get_epoch(&peer); >>>> + ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); >>>> /* check to see if this daemon is known to be "dead" */ >>>> if (proc->state > ORTE_PROC_STATE_UNTERMINATED) { >>>> /* don't try to send this */ >>>> >>>> Modified: trunk/orte/mca/plm/base/plm_base_receive.c >>>> ============================================================================== >>>> --- trunk/orte/mca/plm/base/plm_base_receive.c (original) >>>> +++ trunk/orte/mca/plm/base/plm_base_receive.c 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 2011) >>>> @@ -146,7 +146,9 @@ >>>> orte_job_t *jdata, *parent; >>>> opal_buffer_t answer; >>>> orte_vpid_t vpid; >>>> +#if ORTE_ENABLE_EPOCH >>>> orte_epoch_t epoch; >>>> +#endif >>>> orte_proc_t *proc; >>>> orte_proc_state_t state; >>>> orte_exit_code_t exit_code; >>>> @@ -394,8 +396,7 @@ >>>> break; >>>> } >>>> name.vpid = vpid; >>>> - name.epoch = ORTE_EPOCH_INVALID; >>>> - name.epoch = orte_ess.proc_get_epoch(&name); >>>> + >>>> ORTE_EPOCH_SET(name.epoch,orte_ess.proc_get_epoch(&name)); >>>> >>>> /* unpack the pid */ >>>> count = 1; >>>> @@ -488,9 +489,11 @@ >>>> } >>>> name.vpid = vpid; >>>> >>>> +#if ORTE_ENABLE_EPOCH >>>> count=1; >>>> opal_dss.unpack(msgpkt->buffer, &epoch, &count, ORTE_EPOCH); >>>> name.epoch = epoch; >>>> +#endif >>>> >>>> OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, >>>> "%s plm:base:receive Described rank %s", >>>> >>>> Modified: trunk/orte/mca/plm/base/plm_base_rsh_support.c >>>> ============================================================================== >>>> --- trunk/orte/mca/plm/base/plm_base_rsh_support.c (original) >>>> +++ trunk/orte/mca/plm/base/plm_base_rsh_support.c 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 
2011) >>>> @@ -1527,7 +1527,9 @@ >>>> { >>>> char *param, *path, *tmp, *cmd, *basename, *dest_dir; >>>> int i; >>>> +#if ORTE_ENABLE_EPOCH >>>> orte_epoch_t epoch; >>>> +#endif >>>> orte_process_name_t proc; >>>> >>>> /* if a prefix is set, pass it to the bootproxy in a special way */ >>>> @@ -1638,6 +1640,7 @@ >>>> opal_setenv("OMPI_COMM_WORLD_RANK", cmd, true, argv); >>>> free(cmd); >>>> >>>> +#if ORTE_ENABLE_EPOCH >>>> /* set the epoch */ >>>> proc.jobid = jobid; >>>> proc.vpid = vpid; >>>> @@ -1648,6 +1651,7 @@ >>>> opal_setenv(param, cmd, true, argv); >>>> free(param); >>>> free(cmd); >>>> +#endif >>>> >>>> /* set the number of procs */ >>>> asprintf(&cmd, "%d", (int)num_procs); >>>> >>>> Modified: trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c >>>> ============================================================================== >>>> --- trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c (original) >>>> +++ trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c 2011-08-26 >>>> 18:16:14 EDT (Fri, 26 Aug 2011) >>>> @@ -33,12 +33,14 @@ >>>> #include "orte/mca/ess/ess.h" >>>> #include "opal/mca/sysinfo/sysinfo_types.h" >>>> >>>> +#include "orte/types.h" >>>> #include "orte/util/show_help.h" >>>> #include "orte/util/name_fns.h" >>>> #include "orte/runtime/orte_globals.h" >>>> #include "orte/util/hostfile/hostfile.h" >>>> #include "orte/util/dash_host/dash_host.h" >>>> #include "orte/mca/errmgr/errmgr.h" >>>> +#include "orte/runtime/data_type_support/orte_dt_support.h" >>>> >>>> #include "orte/mca/rmaps/base/rmaps_private.h" >>>> #include "orte/mca/rmaps/base/base.h" >>>> @@ -454,7 +456,7 @@ >>>> */ >>>> >>>> /* We do set the epoch here since they all start with the same value. 
>>>> */ >>>> - proc->name.epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN); >>>> >>>> proc->app_idx = app_idx; >>>> OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output, >>>> @@ -559,11 +561,12 @@ >>>> } >>>> } >>>> proc->name.vpid = vpid; >>>> - proc->name.epoch = ORTE_EPOCH_INVALID; >>>> - proc->name.epoch = >>>> orte_ess.proc_get_epoch(&proc->name); >>>> + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID); >>>> + >>>> ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name)); >>>> + >>>> /* If there is an invalid epoch here, it's because it >>>> doesn't exist yet. */ >>>> - if (ORTE_NODE_RANK_INVALID == proc->name.epoch) { >>>> - proc->name.epoch = ORTE_EPOCH_MIN; >>>> + if (0 == >>>> ORTE_EPOCH_CMP(ORTE_EPOCH_INVALID,proc->name.epoch)) { >>>> + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN); >>>> } >>>> } >>>> if (NULL == opal_pointer_array_get_item(jdata->procs, >>>> proc->name.vpid)) { >>>> @@ -601,8 +604,8 @@ >>>> } >>>> } >>>> proc->name.vpid = vpid; >>>> - proc->name.epoch = ORTE_EPOCH_INVALID; >>>> - proc->name.epoch = >>>> orte_ess.proc_get_epoch(&proc->name); >>>> + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID); >>>> + >>>> ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name)); >>>> } >>>> if (NULL == opal_pointer_array_get_item(jdata->procs, >>>> proc->name.vpid)) { >>>> if (ORTE_SUCCESS != (rc = >>>> opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { >>>> @@ -835,7 +838,7 @@ >>>> return ORTE_ERR_OUT_OF_RESOURCE; >>>> } >>>> proc->name.vpid = daemons->num_procs; /* take the next available >>>> vpid */ >>>> - proc->name.epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN); >>>> proc->node = node; >>>> proc->nodename = node->name; >>>> OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output, >>>> @@ -1014,8 +1017,8 @@ >>>> return ORTE_ERR_OUT_OF_RESOURCE; >>>> } >>>> proc->name.vpid = jdata->num_procs; /* take the next available vpid >>>> */ >>>> - 
proc->name.epoch = ORTE_EPOCH_INVALID; >>>> - proc->name.epoch = orte_ess.proc_get_epoch(&proc->name); >>>> + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID); >>>> + >>>> ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name)); >>>> proc->node = node; >>>> proc->nodename = node->name; >>>> OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output, >>>> >>>> Modified: trunk/orte/mca/rmaps/rank_file/rmaps_rank_file.c >>>> ============================================================================== >>>> --- trunk/orte/mca/rmaps/rank_file/rmaps_rank_file.c (original) >>>> +++ trunk/orte/mca/rmaps/rank_file/rmaps_rank_file.c 2011-08-26 >>>> 18:16:14 EDT (Fri, 26 Aug 2011) >>>> @@ -502,8 +502,7 @@ >>>> } >>>> proc->name.vpid = rank; >>>> /* Either init or update the epoch. */ >>>> - proc->name.epoch = ORTE_EPOCH_INVALID; >>>> - proc->name.epoch = orte_ess.proc_get_epoch(&proc->name); >>>> + >>>> ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name)); >>>> >>>> proc->slot_list = strdup(rfmap->slot_list); >>>> /* insert the proc into the proper place */ >>>> >>>> Modified: trunk/orte/mca/rmaps/seq/rmaps_seq.c >>>> ============================================================================== >>>> --- trunk/orte/mca/rmaps/seq/rmaps_seq.c (original) >>>> +++ trunk/orte/mca/rmaps/seq/rmaps_seq.c 2011-08-26 18:16:14 EDT (Fri, >>>> 26 Aug 2011) >>>> @@ -235,8 +235,7 @@ >>>> } >>>> /* assign the vpid */ >>>> proc->name.vpid = vpid++; >>>> - proc->name.epoch = ORTE_EPOCH_INVALID; >>>> - proc->name.epoch = orte_ess.proc_get_epoch(&proc->name); >>>> + >>>> ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name)); >>>> >>>> /* add to the jdata proc array */ >>>> if (ORTE_SUCCESS != (rc = >>>> opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { >>>> >>>> Modified: trunk/orte/mca/rmcast/base/rmcast_base_open.c >>>> ============================================================================== >>>> --- 
trunk/orte/mca/rmcast/base/rmcast_base_open.c (original) >>>> +++ trunk/orte/mca/rmcast/base/rmcast_base_open.c 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 2011) >>>> @@ -341,7 +341,7 @@ >>>> { >>>> ptr->name.jobid = ORTE_JOBID_INVALID; >>>> ptr->name.vpid = ORTE_VPID_INVALID; >>>> - ptr->name.epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(ptr->name.epoch,ORTE_EPOCH_MIN); >>>> ptr->channel = ORTE_RMCAST_INVALID_CHANNEL; >>>> OBJ_CONSTRUCT(&ptr->ctl, orte_thread_ctl_t); >>>> ptr->seq_num = ORTE_RMCAST_SEQ_INVALID; >>>> @@ -430,7 +430,7 @@ >>>> { >>>> ptr->name.jobid = ORTE_JOBID_INVALID; >>>> ptr->name.vpid = ORTE_VPID_INVALID; >>>> - ptr->name.epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(ptr->name.epoch,ORTE_EPOCH_MIN); >>>> OBJ_CONSTRUCT(&ptr->last_msg, opal_list_t); >>>> } >>>> static void recvlog_destruct(rmcast_recv_log_t *ptr) >>>> @@ -439,7 +439,7 @@ >>>> >>>> ptr->name.jobid = ORTE_JOBID_INVALID; >>>> ptr->name.vpid = ORTE_VPID_INVALID; >>>> - ptr->name.epoch = ORTE_EPOCH_INVALID; >>>> + ORTE_EPOCH_SET(ptr->name.epoch,ORTE_EPOCH_INVALID); >>>> while (NULL != (item = opal_list_remove_first(&ptr->last_msg))) { >>>> OBJ_RELEASE(item); >>>> } >>>> >>>> Modified: trunk/orte/mca/rmcast/tcp/rmcast_tcp.c >>>> ============================================================================== >>>> --- trunk/orte/mca/rmcast/tcp/rmcast_tcp.c (original) >>>> +++ trunk/orte/mca/rmcast/tcp/rmcast_tcp.c 2011-08-26 18:16:14 EDT (Fri, >>>> 26 Aug 2011) >>>> @@ -681,7 +681,7 @@ >>>> /* caller requested id of sender */ >>>> name->jobid = recvptr->name.jobid; >>>> name->vpid = recvptr->name.vpid; >>>> - name->epoch= recvptr->name.epoch; >>>> + ORTE_EPOCH_SET(name->epoch,recvptr->name.epoch); >>>> } >>>> *seq_num = recvptr->seq_num; >>>> *msg = recvptr->iovec_array; >>>> @@ -776,7 +776,7 @@ >>>> /* caller requested id of sender */ >>>> name->jobid = recvptr->name.jobid; >>>> name->vpid = recvptr->name.vpid; >>>> - name->epoch= recvptr->name.epoch; >>>> + 
ORTE_EPOCH_SET(name->epoch,recvptr->name.epoch); >>>> } >>>> *seq_num = recvptr->seq_num; >>>> if (ORTE_SUCCESS != (ret = opal_dss.copy_payload(buf, recvptr->buf))) { >>>> >>>> Modified: trunk/orte/mca/rmcast/udp/rmcast_udp.c >>>> ============================================================================== >>>> --- trunk/orte/mca/rmcast/udp/rmcast_udp.c (original) >>>> +++ trunk/orte/mca/rmcast/udp/rmcast_udp.c 2011-08-26 18:16:14 EDT (Fri, >>>> 26 Aug 2011) >>>> @@ -460,7 +460,7 @@ >>>> /* caller requested id of sender */ >>>> name->jobid = recvptr->name.jobid; >>>> name->vpid = recvptr->name.vpid; >>>> - name->epoch= recvptr->name.epoch; >>>> + ORTE_EPOCH_SET(name->epoch,recvptr->name.epoch); >>>> } >>>> *seq_num = recvptr->seq_num; >>>> *msg = recvptr->iovec_array; >>>> @@ -553,7 +553,7 @@ >>>> /* caller requested id of sender */ >>>> name->jobid = recvptr->name.jobid; >>>> name->vpid = recvptr->name.vpid; >>>> - name->epoch= recvptr->name.epoch; >>>> + ORTE_EPOCH_SET(name->epoch,recvptr->name.epoch); >>>> } >>>> *seq_num = recvptr->seq_num; >>>> if (ORTE_SUCCESS != (ret = opal_dss.copy_payload(buf, recvptr->buf))) { >>>> >>>> Modified: trunk/orte/mca/rml/base/rml_base_components.c >>>> ============================================================================== >>>> --- trunk/orte/mca/rml/base/rml_base_components.c (original) >>>> +++ trunk/orte/mca/rml/base/rml_base_components.c 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 2011) >>>> @@ -20,6 +20,7 @@ >>>> #include "opal/util/output.h" >>>> >>>> #include "orte/mca/rml/rml.h" >>>> +#include "orte/util/name_fns.h" >>>> >>>> #if !ORTE_DISABLE_FULL_SUPPORT >>>> >>>> @@ -67,14 +68,14 @@ >>>> { >>>> pkt->sender.jobid = ORTE_JOBID_INVALID; >>>> pkt->sender.vpid = ORTE_VPID_INVALID; >>>> - pkt->sender.epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(pkt->sender.epoch,ORTE_EPOCH_MIN); >>>> pkt->buffer = NULL; >>>> } >>>> static void msg_pkt_destructor(orte_msg_packet_t *pkt) >>>> { >>>> pkt->sender.jobid = 
ORTE_JOBID_INVALID; >>>> pkt->sender.vpid = ORTE_VPID_INVALID; >>>> - pkt->sender.epoch = ORTE_EPOCH_INVALID; >>>> + ORTE_EPOCH_SET(pkt->sender.epoch,ORTE_EPOCH_INVALID); >>>> if (NULL != pkt->buffer) { >>>> OBJ_RELEASE(pkt->buffer); >>>> } >>>> >>>> Modified: trunk/orte/mca/rml/rml_types.h >>>> ============================================================================== >>>> --- trunk/orte/mca/rml/rml_types.h (original) >>>> +++ trunk/orte/mca/rml/rml_types.h 2011-08-26 18:16:14 EDT (Fri, 26 Aug >>>> 2011) >>>> @@ -62,7 +62,7 @@ >>>> pkt = OBJ_NEW(orte_msg_packet_t); \ >>>> pkt->sender.jobid = (sndr)->jobid; \ >>>> pkt->sender.vpid = (sndr)->vpid; \ >>>> - pkt->sender.epoch = (sndr)->epoch; \ >>>> + ORTE_EPOCH_SET(pkt->sender.epoch,(sndr)->epoch); \ >>>> if ((crt)) { \ >>>> pkt->buffer = OBJ_NEW(opal_buffer_t); \ >>>> opal_dss.copy_payload(pkt->buffer, *(buf)); \ >>>> @@ -85,7 +85,7 @@ >>>> pkt = OBJ_NEW(orte_msg_packet_t); \ >>>> pkt->sender.jobid = (sndr)->jobid; \ >>>> pkt->sender.vpid = (sndr)->vpid; \ >>>> - pkt->sender.epoch = (sndr)->epoch; \ >>>> + ORTE_EPOCH_SET(pkt->sender.epoch,(sndr)->epoch); \ >>>> if ((crt)) { \ >>>> pkt->buffer = OBJ_NEW(opal_buffer_t); \ >>>> opal_dss.copy_payload(pkt->buffer, *(buf)); \ >>>> @@ -191,8 +191,10 @@ >>>> >>>> #define ORTE_RML_TAG_SUBSCRIBE 46 >>>> >>>> +#if ORTE_ENABLE_EPOCH >>>> /* For Epoch Updates */ >>>> #define ORTE_RML_TAG_EPOCH_CHANGE 47 >>>> +#endif >>>> >>>> /* Notify of failed processes */ >>>> #define ORTE_RML_TAG_FAILURE_NOTICE 48 >>>> >>>> Modified: trunk/orte/mca/routed/base/routed_base_components.c >>>> ============================================================================== >>>> --- trunk/orte/mca/routed/base/routed_base_components.c (original) >>>> +++ trunk/orte/mca/routed/base/routed_base_components.c 2011-08-26 >>>> 18:16:14 EDT (Fri, 26 Aug 2011) >>>> @@ -65,7 +65,7 @@ >>>> { >>>> ptr->route.jobid = ORTE_JOBID_INVALID; >>>> ptr->route.vpid = ORTE_VPID_INVALID; >>>> - ptr->route.epoch = 
ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(ptr->route.epoch,ORTE_EPOCH_MIN); >>>> ptr->hnp_uri = NULL; >>>> } >>>> static void jfamdest(orte_routed_jobfam_t *ptr) >>>> @@ -117,7 +117,7 @@ >>>> jfam = OBJ_NEW(orte_routed_jobfam_t); >>>> jfam->route.jobid = ORTE_PROC_MY_HNP->jobid; >>>> jfam->route.vpid = ORTE_PROC_MY_HNP->vpid; >>>> - jfam->route.epoch = ORTE_PROC_MY_HNP->epoch; >>>> + ORTE_EPOCH_SET(jfam->route.epoch,ORTE_PROC_MY_HNP->epoch); >>>> jfam->job_family = ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid); >>>> if (NULL != orte_process_info.my_hnp_uri) { >>>> jfam->hnp_uri = strdup(orte_process_info.my_hnp_uri); >>>> @@ -252,7 +252,7 @@ >>>> jfam->job_family = jobfamily; >>>> jfam->route.jobid = name.jobid; >>>> jfam->route.vpid = name.vpid; >>>> - jfam->route.epoch = name.epoch; >>>> + ORTE_EPOCH_SET(jfam->route.epoch,name.epoch); >>>> jfam->hnp_uri = strdup(uri); >>>> done: >>>> free(uri); >>>> >>>> Modified: trunk/orte/mca/routed/base/routed_base_register_sync.c >>>> ============================================================================== >>>> --- trunk/orte/mca/routed/base/routed_base_register_sync.c (original) >>>> +++ trunk/orte/mca/routed/base/routed_base_register_sync.c 2011-08-26 >>>> 18:16:14 EDT (Fri, 26 Aug 2011) >>>> @@ -127,7 +127,9 @@ >>>> orte_std_cntr_t cnt; >>>> char *rml_uri; >>>> orte_vpid_t vpid; >>>> +#if ORTE_ENABLE_EPOCH >>>> orte_epoch_t epoch; >>>> +#endif >>>> int rc; >>>> >>>> if (ORTE_JOB_FAMILY(job) == ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) { >>>> @@ -146,11 +148,13 @@ >>>> cnt = 1; >>>> while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &vpid, &cnt, >>>> ORTE_VPID))) { >>>> >>>> +#if ORTE_ENABLE_EPOCH >>>> cnt = 1; >>>> if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &epoch, &cnt, >>>> ORTE_EPOCH))) { >>>> ORTE_ERROR_LOG(rc); >>>> continue; >>>> } >>>> +#endif >>>> >>>> if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &rml_uri, &cnt, >>>> OPAL_STRING))) { >>>> ORTE_ERROR_LOG(rc); >>>> >>>> Modified: 
trunk/orte/mca/routed/binomial/routed_binomial.c >>>> ============================================================================== >>>> --- trunk/orte/mca/routed/binomial/routed_binomial.c (original) >>>> +++ trunk/orte/mca/routed/binomial/routed_binomial.c 2011-08-26 >>>> 18:16:14 EDT (Fri, 26 Aug 2011) >>>> @@ -33,6 +33,7 @@ >>>> #include "orte/runtime/orte_globals.h" >>>> #include "orte/runtime/orte_wait.h" >>>> #include "orte/runtime/runtime.h" >>>> +#include "orte/runtime/data_type_support/orte_dt_support.h" >>>> >>>> #include "orte/mca/rml/base/rml_contact.h" >>>> >>>> @@ -147,7 +148,7 @@ >>>> >>>> if (proc->jobid == ORTE_JOBID_INVALID || >>>> proc->vpid == ORTE_VPID_INVALID || >>>> - proc->epoch == ORTE_EPOCH_INVALID) { >>>> + 0 == ORTE_EPOCH_CMP(proc->epoch,ORTE_EPOCH_INVALID)) { >>>> return ORTE_ERR_BAD_PARAM; >>>> } >>>> >>>> @@ -216,7 +217,7 @@ >>>> >>>> if (target->jobid == ORTE_JOBID_INVALID || >>>> target->vpid == ORTE_VPID_INVALID || >>>> - target->epoch == ORTE_EPOCH_INVALID) { >>>> + 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { >>>> return ORTE_ERR_BAD_PARAM; >>>> } >>>> >>>> @@ -274,8 +275,7 @@ >>>> ORTE_NAME_PRINT(route))); >>>> jfam->route.jobid = route->jobid; >>>> jfam->route.vpid = route->vpid; >>>> - jfam->route.epoch = ORTE_EPOCH_INVALID; >>>> - jfam->route.epoch = orte_ess.proc_get_epoch(&jfam->route); >>>> + >>>> ORTE_EPOCH_SET(jfam->route.epoch,orte_ess.proc_get_epoch(&jfam->route)); >>>> >>>> return ORTE_SUCCESS; >>>> } >>>> @@ -290,8 +290,7 @@ >>>> jfam->job_family = jfamily; >>>> jfam->route.jobid = route->jobid; >>>> jfam->route.vpid = route->vpid; >>>> - jfam->route.epoch = ORTE_EPOCH_INVALID; >>>> - jfam->route.epoch = orte_ess.proc_get_epoch(&jfam->route); >>>> + >>>> ORTE_EPOCH_SET(jfam->route.epoch,orte_ess.proc_get_epoch(&jfam->route)); >>>> >>>> opal_pointer_array_add(&orte_routed_jobfams, jfam); >>>> return ORTE_SUCCESS; >>>> @@ -317,11 +316,21 @@ >>>> /* initialize */ >>>> daemon.jobid = 
ORTE_PROC_MY_DAEMON->jobid; >>>> daemon.vpid = ORTE_PROC_MY_DAEMON->vpid; >>>> - daemon.epoch = ORTE_PROC_MY_DAEMON->epoch; >>>> + ORTE_EPOCH_SET(daemon.epoch,ORTE_PROC_MY_DAEMON->epoch); >>>> >>>> +#if ORTE_ENABLE_EPOCH >>>> if (target->jobid == ORTE_JOBID_INVALID || >>>> target->vpid == ORTE_VPID_INVALID || >>>> target->epoch == ORTE_EPOCH_INVALID) { >>>> +#else >>>> + if (target->jobid == ORTE_JOBID_INVALID || >>>> + target->vpid == ORTE_VPID_INVALID) { >>>> +#endif >>>> + ret = ORTE_NAME_INVALID; >>>> + goto found; >>>> + } >>>> + >>>> + if (0 > ORTE_EPOCH_CMP(target->epoch, >>>> orte_ess.proc_get_epoch(target))) { >>>> ret = ORTE_NAME_INVALID; >>>> goto found; >>>> } >>>> @@ -443,7 +452,7 @@ >>>> >>>> /* If the daemon to which we should be routing is dead, then >>>> update >>>> * the routing tree and start over. */ >>>> - if (!orte_util_proc_is_running(&daemon)) { >>>> + if (!PROC_IS_RUNNING(&daemon)) { >>>> update_routing_tree(daemon.jobid); >>>> goto startover; >>>> } >>>> @@ -461,8 +470,7 @@ >>>> ret = &daemon; >>>> >>>> found: >>>> - daemon.epoch = ORTE_EPOCH_INVALID; >>>> - daemon.epoch = orte_ess.proc_get_epoch(&daemon); >>>> + ORTE_EPOCH_SET(daemon.epoch,orte_ess.proc_get_epoch(&daemon)); >>>> >>>> OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output, >>>> "%s routed_binomial_get(%s) --> %s", >>>> @@ -879,7 +887,7 @@ >>>> */ >>>> local_lifeline.jobid = proc->jobid; >>>> local_lifeline.vpid = proc->vpid; >>>> - local_lifeline.epoch = proc->epoch; >>>> + ORTE_EPOCH_SET(local_lifeline.epoch,proc->epoch); >>>> lifeline = &local_lifeline; >>>> >>>> return ORTE_SUCCESS; >>>> @@ -924,11 +932,11 @@ >>>> * that process so we can check it's state. 
>>>> */ >>>> proc_name.vpid = peer; >>>> - proc_name.epoch = orte_util_lookup_epoch(&proc_name); >>>> + >>>> ORTE_EPOCH_SET(proc_name.epoch,orte_util_lookup_epoch(&proc_name)); >>>> >>>> - if (!orte_util_proc_is_running(&proc_name) >>>> - && ORTE_EPOCH_MIN < proc_name.epoch >>>> - && ORTE_EPOCH_INVALID != proc_name.epoch) { >>>> + if (!PROC_IS_RUNNING(&proc_name) >>>> + && 0 < ORTE_EPOCH_CMP(ORTE_EPOCH_MIN,proc_name.epoch) >>>> + && 0 != >>>> ORTE_EPOCH_CMP(ORTE_EPOCH_INVALID,proc_name.epoch)) { >>>> OPAL_OUTPUT_VERBOSE((3, orte_routed_base_output, >>>> "%s routed:binomial child %s is >>>> dead", >>>> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), >>>> @@ -967,7 +975,7 @@ >>>> } >>>> >>>> /* find the children of this rank */ >>>> - OPAL_OUTPUT_VERBOSE((3, orte_routed_base_output, >>>> + OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, >>>> "%s routed:binomial find children of rank %d", >>>> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rank)); >>>> bitmap = opal_cube_dim(num_procs); >>>> @@ -977,24 +985,25 @@ >>>> >>>> for (i = hibit + 1, mask = 1 << i; i <= bitmap; ++i, mask <<= 1) { >>>> peer = rank | mask; >>>> - OPAL_OUTPUT_VERBOSE((3, orte_routed_base_output, >>>> + OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, >>>> "%s routed:binomial find children checking peer >>>> %d", >>>> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), peer)); >>>> if (peer < num_procs) { >>>> - OPAL_OUTPUT_VERBOSE((3, orte_routed_base_output, >>>> + OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, >>>> "%s routed:binomial find children computing >>>> tree", >>>> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); >>>> /* execute compute on this child */ >>>> if (0 <= (found = binomial_tree(peer, rank, me, num_procs, >>>> nchildren, childrn, relatives, mine, jobid))) { >>>> proc_name.vpid = found; >>>> >>>> - if (!orte_util_proc_is_running(&proc_name) && >>>> ORTE_EPOCH_MIN < orte_util_lookup_epoch(&proc_name)) { >>>> - OPAL_OUTPUT_VERBOSE((3, orte_routed_base_output, >>>> + if (!PROC_IS_RUNNING(&proc_name) >>>> + && 0 < >>>> 
ORTE_EPOCH_CMP(ORTE_EPOCH_MIN,orte_util_lookup_epoch(&proc_name))) { >>>> + OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, >>>> "%s routed:binomial find children >>>> proc out of date - returning parent %d", >>>> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), >>>> parent)); >>>> return parent; >>>> } >>>> - OPAL_OUTPUT_VERBOSE((3, orte_routed_base_output, >>>> + OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, >>>> "%s routed:binomial find children >>>> returning found value %d", >>>> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), >>>> found)); >>>> return found; >>>> @@ -1029,8 +1038,7 @@ >>>> ORTE_PROC_MY_PARENT->vpid = binomial_tree(0, 0, ORTE_PROC_MY_NAME->vpid, >>>> orte_process_info.max_procs, >>>> &num_children, &my_children, NULL, true, >>>> jobid); >>>> - ORTE_PROC_MY_PARENT->epoch = ORTE_EPOCH_INVALID; >>>> - ORTE_PROC_MY_PARENT->epoch = >>>> orte_ess.proc_get_epoch(ORTE_PROC_MY_PARENT); >>>> + >>>> ORTE_EPOCH_SET(ORTE_PROC_MY_PARENT->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_PARENT)); >>>> >>>> if (0 < opal_output_get_verbosity(orte_routed_base_output)) { >>>> opal_output(0, "%s: parent %d num_children %d", >>>> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_PROC_MY_PARENT->vpid, >>>> num_children); >>>> >>>> Modified: trunk/orte/mca/routed/cm/routed_cm.c >>>> ============================================================================== >>>> --- trunk/orte/mca/routed/cm/routed_cm.c (original) >>>> +++ trunk/orte/mca/routed/cm/routed_cm.c 2011-08-26 18:16:14 EDT (Fri, >>>> 26 Aug 2011) >>>> @@ -35,6 +35,7 @@ >>>> #include "orte/runtime/orte_globals.h" >>>> #include "orte/runtime/orte_wait.h" >>>> #include "orte/runtime/runtime.h" >>>> +#include "orte/runtime/data_type_support/orte_dt_support.h" >>>> >>>> #include "orte/mca/rml/base/rml_contact.h" >>>> >>>> @@ -139,7 +140,7 @@ >>>> >>>> if (proc->jobid == ORTE_JOBID_INVALID || >>>> proc->vpid == ORTE_VPID_INVALID || >>>> - proc->epoch == ORTE_EPOCH_INVALID) { >>>> + 0 == ORTE_EPOCH_CMP(proc->epoch,ORTE_EPOCH_INVALID)) { >>>> 
return ORTE_ERR_BAD_PARAM; >>>> } >>>> >>>> @@ -200,7 +201,7 @@ >>>> >>>> if (target->jobid == ORTE_JOBID_INVALID || >>>> target->vpid == ORTE_VPID_INVALID || >>>> - target->epoch == ORTE_EPOCH_INVALID) { >>>> + 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { >>>> return ORTE_ERR_BAD_PARAM; >>>> } >>>> >>>> @@ -257,8 +258,7 @@ >>>> ORTE_NAME_PRINT(route))); >>>> jfam->route.jobid = route->jobid; >>>> jfam->route.vpid = route->vpid; >>>> - jfam->route.epoch = ORTE_EPOCH_INVALID; >>>> - jfam->route.epoch = orte_ess.proc_get_epoch(&jfam->route); >>>> + >>>> ORTE_EPOCH_SET(jfam->route.epoch,orte_ess.proc_get_epoch(&jfam->route)); >>>> >>>> return ORTE_SUCCESS; >>>> } >>>> @@ -273,8 +273,7 @@ >>>> jfam->job_family = jfamily; >>>> jfam->route.jobid = route->jobid; >>>> jfam->route.vpid = route->vpid; >>>> - jfam->route.epoch = ORTE_EPOCH_INVALID; >>>> - jfam->route.epoch = orte_ess.proc_get_epoch(&jfam->route); >>>> + >>>> ORTE_EPOCH_SET(jfam->route.epoch,orte_ess.proc_get_epoch(&jfam->route)); >>>> >>>> opal_pointer_array_add(&orte_routed_jobfams, jfam); >>>> return ORTE_SUCCESS; >>>> @@ -299,7 +298,7 @@ >>>> >>>> if (target->jobid == ORTE_JOBID_INVALID || >>>> target->vpid == ORTE_VPID_INVALID || >>>> - target->epoch == ORTE_EPOCH_INVALID) { >>>> + 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { >>>> ret = ORTE_NAME_INVALID; >>>> goto found; >>>> } >>>> @@ -367,8 +366,7 @@ >>>> } >>>> >>>> /* Initialize daemon's epoch, based on its current vpid/jobid */ >>>> - daemon.epoch = ORTE_EPOCH_INVALID; >>>> - daemon.epoch = orte_ess.proc_get_epoch(&daemon); >>>> + ORTE_EPOCH_SET(daemon.epoch,orte_ess.proc_get_epoch(&daemon)); >>>> >>>> /* if the daemon is me, then send direct to the target! 
*/ >>>> if (ORTE_PROC_MY_NAME->vpid == daemon.vpid) { >>>> @@ -814,8 +812,7 @@ >>>> */ >>>> local_lifeline.jobid = proc->jobid; >>>> local_lifeline.vpid = proc->vpid; >>>> - local_lifeline.epoch = ORTE_EPOCH_INVALID; >>>> - local_lifeline.epoch = orte_ess.proc_get_epoch(&local_lifeline); >>>> + >>>> ORTE_EPOCH_SET(local_lifeline.epoch,orte_ess.proc_get_epoch(&local_lifeline)); >>>> >>>> lifeline = &local_lifeline; >>>> >>>> >>>> Modified: trunk/orte/mca/routed/direct/routed_direct.c >>>> ============================================================================== >>>> --- trunk/orte/mca/routed/direct/routed_direct.c (original) >>>> +++ trunk/orte/mca/routed/direct/routed_direct.c 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 2011) >>>> @@ -24,6 +24,7 @@ >>>> #include "orte/util/name_fns.h" >>>> #include "orte/util/proc_info.h" >>>> #include "orte/runtime/orte_globals.h" >>>> +#include "orte/runtime/data_type_support/orte_dt_support.h" >>>> >>>> #include "orte/mca/rml/base/rml_contact.h" >>>> >>>> @@ -135,7 +136,7 @@ >>>> >>>> if (target->jobid == ORTE_JOBID_INVALID || >>>> target->vpid == ORTE_VPID_INVALID || >>>> - target->epoch == ORTE_EPOCH_INVALID) { >>>> + 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { >>>> ret = ORTE_NAME_INVALID; >>>> } else { >>>> /* all routes are direct */ >>>> >>>> Modified: trunk/orte/mca/routed/linear/routed_linear.c >>>> ============================================================================== >>>> --- trunk/orte/mca/routed/linear/routed_linear.c (original) >>>> +++ trunk/orte/mca/routed/linear/routed_linear.c 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 2011) >>>> @@ -31,6 +31,7 @@ >>>> #include "orte/runtime/orte_globals.h" >>>> #include "orte/runtime/orte_wait.h" >>>> #include "orte/runtime/runtime.h" >>>> +#include "orte/runtime/data_type_support/orte_dt_support.h" >>>> >>>> #include "orte/mca/rml/base/rml_contact.h" >>>> >>>> @@ -132,7 +133,7 @@ >>>> >>>> if (proc->jobid == ORTE_JOBID_INVALID || >>>> proc->vpid == 
ORTE_VPID_INVALID || >>>> - proc->epoch == ORTE_EPOCH_INVALID) { >>>> + 0 == ORTE_EPOCH_CMP(proc->epoch,ORTE_EPOCH_INVALID)) { >>>> return ORTE_ERR_BAD_PARAM; >>>> } >>>> >>>> @@ -201,7 +202,7 @@ >>>> >>>> if (target->jobid == ORTE_JOBID_INVALID || >>>> target->vpid == ORTE_VPID_INVALID || >>>> - target->epoch == ORTE_EPOCH_INVALID) { >>>> + 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { >>>> return ORTE_ERR_BAD_PARAM; >>>> } >>>> >>>> @@ -259,7 +260,7 @@ >>>> ORTE_NAME_PRINT(route))); >>>> jfam->route.jobid = route->jobid; >>>> jfam->route.vpid = route->vpid; >>>> - jfam->route.epoch = route->epoch; >>>> + ORTE_EPOCH_SET(jfam->route.epoch,route->epoch); >>>> return ORTE_SUCCESS; >>>> } >>>> } >>>> @@ -273,7 +274,7 @@ >>>> jfam->job_family = jfamily; >>>> jfam->route.jobid = route->jobid; >>>> jfam->route.vpid = route->vpid; >>>> - jfam->route.epoch = route->epoch; >>>> + ORTE_EPOCH_SET(jfam->route.epoch,route->epoch); >>>> opal_pointer_array_add(&orte_routed_jobfams, jfam); >>>> return ORTE_SUCCESS; >>>> } >>>> @@ -373,8 +374,7 @@ >>>> } >>>> >>>> /* Initialize daemon's epoch, based on its current vpid/jobid */ >>>> - daemon.epoch = ORTE_EPOCH_INVALID; >>>> - daemon.epoch = orte_ess.proc_get_epoch(&daemon); >>>> + ORTE_EPOCH_SET(daemon.epoch,orte_ess.proc_get_epoch(&daemon)); >>>> >>>> /* if the daemon is me, then send direct to the target! 
*/ >>>> if (ORTE_PROC_MY_NAME->vpid == daemon.vpid) { >>>> @@ -395,8 +395,7 @@ >>>> /* we are at end of chain - wrap around */ >>>> daemon.vpid = 0; >>>> } >>>> - daemon.epoch = ORTE_EPOCH_INVALID; >>>> - daemon.epoch = orte_ess.proc_get_epoch(&daemon); >>>> + ORTE_EPOCH_SET(daemon.epoch,orte_ess.proc_get_epoch(&daemon)); >>>> ret = &daemon; >>>> } >>>> } >>>> @@ -741,7 +740,7 @@ >>>> */ >>>> local_lifeline.jobid = proc->jobid; >>>> local_lifeline.vpid = proc->vpid; >>>> - local_lifeline.epoch = proc->epoch; >>>> + ORTE_EPOCH_SET(local_lifeline.epoch,proc->epoch); >>>> lifeline = &local_lifeline; >>>> >>>> return ORTE_SUCCESS; >>>> >>>> Modified: trunk/orte/mca/routed/radix/routed_radix.c >>>> ============================================================================== >>>> --- trunk/orte/mca/routed/radix/routed_radix.c (original) >>>> +++ trunk/orte/mca/routed/radix/routed_radix.c 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 2011) >>>> @@ -31,6 +31,7 @@ >>>> #include "orte/runtime/orte_globals.h" >>>> #include "orte/runtime/orte_wait.h" >>>> #include "orte/runtime/runtime.h" >>>> +#include "orte/runtime/data_type_support/orte_dt_support.h" >>>> >>>> #include "orte/mca/rml/base/rml_contact.h" >>>> >>>> @@ -145,7 +146,7 @@ >>>> >>>> if (proc->jobid == ORTE_JOBID_INVALID || >>>> proc->vpid == ORTE_VPID_INVALID || >>>> - proc->epoch == ORTE_EPOCH_INVALID) { >>>> + 0 == ORTE_EPOCH_CMP(proc->epoch,ORTE_EPOCH_INVALID)) { >>>> return ORTE_ERR_BAD_PARAM; >>>> } >>>> >>>> @@ -214,7 +215,7 @@ >>>> >>>> if (target->jobid == ORTE_JOBID_INVALID || >>>> target->vpid == ORTE_VPID_INVALID || >>>> - target->epoch == ORTE_EPOCH_INVALID) { >>>> + 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { >>>> return ORTE_ERR_BAD_PARAM; >>>> } >>>> >>>> @@ -272,7 +273,7 @@ >>>> ORTE_NAME_PRINT(route))); >>>> jfam->route.jobid = route->jobid; >>>> jfam->route.vpid = route->vpid; >>>> - jfam->route.epoch = route->epoch; >>>> + ORTE_EPOCH_SET(jfam->route.epoch,route->epoch); >>>> return 
ORTE_SUCCESS; >>>> } >>>> } >>>> @@ -286,7 +287,7 @@ >>>> jfam->job_family = jfamily; >>>> jfam->route.jobid = route->jobid; >>>> jfam->route.vpid = route->vpid; >>>> - jfam->route.epoch = route->epoch; >>>> + ORTE_EPOCH_SET(jfam->route.epoch,route->epoch); >>>> opal_pointer_array_add(&orte_routed_jobfams, jfam); >>>> return ORTE_SUCCESS; >>>> } >>>> @@ -310,7 +311,7 @@ >>>> >>>> if (target->jobid == ORTE_JOBID_INVALID || >>>> target->vpid == ORTE_VPID_INVALID || >>>> - target->epoch == ORTE_EPOCH_INVALID) { >>>> + 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { >>>> ret = ORTE_NAME_INVALID; >>>> goto found; >>>> } >>>> @@ -413,8 +414,7 @@ >>>> if (opal_bitmap_is_set_bit(&child->relatives, daemon.vpid)) { >>>> /* yep - we need to step through this child */ >>>> daemon.vpid = child->vpid; >>>> - daemon.epoch = ORTE_EPOCH_INVALID; >>>> - daemon.epoch = orte_ess.proc_get_epoch(&daemon); >>>> + >>>> ORTE_EPOCH_SET(daemon.epoch,orte_ess.proc_get_epoch(&daemon)); >>>> ret = &daemon; >>>> goto found; >>>> } >>>> @@ -425,8 +425,7 @@ >>>> * any of our children, so we have to step up through our parent >>>> */ >>>> daemon.vpid = ORTE_PROC_MY_PARENT->vpid; >>>> - daemon.epoch = ORTE_EPOCH_INVALID; >>>> - daemon.epoch = orte_ess.proc_get_epoch(&daemon); >>>> + ORTE_EPOCH_SET(daemon.epoch,orte_ess.proc_get_epoch(&daemon)); >>>> >>>> ret = &daemon; >>>> >>>> @@ -788,7 +787,7 @@ >>>> */ >>>> local_lifeline.jobid = proc->jobid; >>>> local_lifeline.vpid = proc->vpid; >>>> - local_lifeline.epoch = proc->epoch; >>>> + ORTE_EPOCH_SET(local_lifeline.epoch,proc->epoch); >>>> lifeline = &local_lifeline; >>>> >>>> return ORTE_SUCCESS; >>>> @@ -881,8 +880,7 @@ >>>> ORTE_PROC_MY_PARENT->vpid = (Ii-Sum) % NInPrevLevel; >>>> ORTE_PROC_MY_PARENT->vpid += (Sum - NInPrevLevel); >>>> } >>>> - ORTE_PROC_MY_PARENT->epoch = ORTE_EPOCH_INVALID; >>>> - ORTE_PROC_MY_PARENT->epoch = >>>> orte_ess.proc_get_epoch(ORTE_PROC_MY_PARENT); >>>> + >>>> 
ORTE_EPOCH_SET(ORTE_PROC_MY_PARENT->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_PARENT)); >>>> >>>> /* compute my direct children and the bitmap that shows which vpids >>>> * lie underneath their branch >>>> >>>> Modified: trunk/orte/mca/routed/slave/routed_slave.c >>>> ============================================================================== >>>> --- trunk/orte/mca/routed/slave/routed_slave.c (original) >>>> +++ trunk/orte/mca/routed/slave/routed_slave.c 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 2011) >>>> @@ -26,6 +26,7 @@ >>>> #include "orte/runtime/orte_globals.h" >>>> #include "orte/runtime/orte_wait.h" >>>> #include "orte/runtime/runtime.h" >>>> +#include "orte/runtime/data_type_support/orte_dt_support.h" >>>> >>>> #include "orte/mca/rml/base/rml_contact.h" >>>> >>>> @@ -134,7 +135,7 @@ >>>> >>>> if (target->jobid == ORTE_JOBID_INVALID || >>>> target->vpid == ORTE_VPID_INVALID || >>>> - target->epoch == ORTE_EPOCH_INVALID) { >>>> + 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { >>>> ret = ORTE_NAME_INVALID; >>>> } else { >>>> /* a slave must always route via its parent daemon */ >>>> @@ -275,8 +276,7 @@ >>>> */ >>>> local_lifeline.jobid = proc->jobid; >>>> local_lifeline.vpid = proc->vpid; >>>> - local_lifeline.epoch = ORTE_EPOCH_INVALID; >>>> - local_lifeline.epoch = orte_ess.proc_get_epoch(&local_lifeline); >>>> + >>>> ORTE_EPOCH_SET(local_lifeline.epoch,orte_ess.proc_get_epoch(&local_lifeline)); >>>> >>>> lifeline = &local_lifeline; >>>> >>>> >>>> Modified: trunk/orte/mca/sensor/file/sensor_file.c >>>> ============================================================================== >>>> --- trunk/orte/mca/sensor/file/sensor_file.c (original) >>>> +++ trunk/orte/mca/sensor/file/sensor_file.c 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 2011) >>>> @@ -70,7 +70,9 @@ >>>> opal_list_item_t super; >>>> orte_jobid_t jobid; >>>> orte_vpid_t vpid; >>>> +#if ORTE_ENABLE_EPOCH >>>> orte_epoch_t epoch; >>>> +#endif >>>> char *file; >>>> int tick; >>>> bool 
check_size; >>>> >>>> Modified: trunk/orte/mca/snapc/base/snapc_base_fns.c >>>> ============================================================================== >>>> --- trunk/orte/mca/snapc/base/snapc_base_fns.c (original) >>>> +++ trunk/orte/mca/snapc/base/snapc_base_fns.c 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 2011) >>>> @@ -81,7 +81,7 @@ >>>> { >>>> snapshot->process_name.jobid = 0; >>>> snapshot->process_name.vpid = 0; >>>> - snapshot->process_name.epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(snapshot->process_name.epoch,ORTE_EPOCH_MIN); >>>> >>>> snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE; >>>> >>>> @@ -92,7 +92,7 @@ >>>> { >>>> snapshot->process_name.jobid = 0; >>>> snapshot->process_name.vpid = 0; >>>> - snapshot->process_name.epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(snapshot->process_name.epoch,ORTE_EPOCH_MIN); >>>> >>>> snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE; >>>> >>>> >>>> Modified: trunk/orte/mca/snapc/full/snapc_full_global.c >>>> ============================================================================== >>>> --- trunk/orte/mca/snapc/full/snapc_full_global.c (original) >>>> +++ trunk/orte/mca/snapc/full/snapc_full_global.c 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 2011) >>>> @@ -427,7 +427,7 @@ >>>> new_proc = OBJ_NEW(orte_proc_t); >>>> new_proc->name.jobid = proc->name.jobid; >>>> new_proc->name.vpid = proc->name.vpid; >>>> - new_proc->name.epoch = proc->name.epoch; >>>> + ORTE_EPOCH_SET(new_proc->name.epoch,proc->name.epoch); >>>> new_proc->node = OBJ_NEW(orte_node_t); >>>> new_proc->node->name = proc->node->name; >>>> opal_list_append(migrating_procs, &new_proc->super); >>>> @@ -618,7 +618,7 @@ >>>> >>>> orted_snapshot->process_name.jobid = cur_node->daemon->name.jobid; >>>> orted_snapshot->process_name.vpid = cur_node->daemon->name.vpid; >>>> - orted_snapshot->process_name.epoch = >>>> cur_node->daemon->name.epoch; >>>> + >>>> ORTE_EPOCH_SET(orted_snapshot->process_name.epoch,cur_node->daemon->name.epoch); >>>> >>>> mask = 
ORTE_NS_CMP_JOBID; >>>> >>>> @@ -636,7 +636,7 @@ >>>> >>>> app_snapshot->process_name.jobid = procs[p]->name.jobid; >>>> app_snapshot->process_name.vpid = procs[p]->name.vpid; >>>> - app_snapshot->process_name.epoch = procs[p]->name.epoch; >>>> + >>>> ORTE_EPOCH_SET(app_snapshot->process_name.epoch,procs[p]->name.epoch); >>>> >>>> opal_list_append(&(orted_snapshot->super.local_snapshots), >>>> &(app_snapshot->super)); >>>> } >>>> @@ -800,7 +800,7 @@ >>>> >>>> app_snapshot->process_name.jobid = procs[p]->name.jobid; >>>> app_snapshot->process_name.vpid = procs[p]->name.vpid; >>>> - app_snapshot->process_name.epoch = procs[p]->name.epoch; >>>> + >>>> ORTE_EPOCH_SET(app_snapshot->process_name.epoch,procs[p]->name.epoch); >>>> >>>> opal_list_append(&(orted_snapshot->super.local_snapshots), >>>> &(app_snapshot->super)); >>>> } >>>> @@ -816,7 +816,7 @@ >>>> >>>> orted_snapshot->process_name.jobid = cur_node->daemon->name.jobid; >>>> orted_snapshot->process_name.vpid = cur_node->daemon->name.vpid; >>>> - orted_snapshot->process_name.epoch = >>>> cur_node->daemon->name.epoch; >>>> + >>>> ORTE_EPOCH_SET(orted_snapshot->process_name.epoch,cur_node->daemon->name.epoch); >>>> >>>> mask = ORTE_NS_CMP_ALL; >>>> >>>> @@ -837,7 +837,7 @@ >>>> >>>> app_snapshot->process_name.jobid = procs[p]->name.jobid; >>>> app_snapshot->process_name.vpid = procs[p]->name.vpid; >>>> - app_snapshot->process_name.epoch = procs[p]->name.epoch; >>>> + >>>> ORTE_EPOCH_SET(app_snapshot->process_name.epoch,procs[p]->name.epoch); >>>> >>>> opal_list_append(&(orted_snapshot->super.local_snapshots), >>>> &(app_snapshot->super)); >>>> } >>>> >>>> Modified: trunk/orte/mca/snapc/full/snapc_full_local.c >>>> ============================================================================== >>>> --- trunk/orte/mca/snapc/full/snapc_full_local.c (original) >>>> +++ trunk/orte/mca/snapc/full/snapc_full_local.c 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 2011) >>>> @@ -2033,7 +2033,7 @@ >>>> vpid_snapshot->process_pid = 
child->pid; >>>> vpid_snapshot->super.process_name.jobid = child->name->jobid; >>>> vpid_snapshot->super.process_name.vpid = child->name->vpid; >>>> - vpid_snapshot->super.process_name.epoch = child->name->epoch; >>>> + >>>> ORTE_EPOCH_SET(vpid_snapshot->super.process_name.epoch,child->name->epoch); >>>> } >>>> } >>>> >>>> @@ -2095,7 +2095,7 @@ >>>> vpid_snapshot->process_pid = child->pid; >>>> vpid_snapshot->super.process_name.jobid = child->name->jobid; >>>> vpid_snapshot->super.process_name.vpid = child->name->vpid; >>>> - vpid_snapshot->super.process_name.epoch = child->name->epoch; >>>> + >>>> ORTE_EPOCH_SET(vpid_snapshot->super.process_name.epoch,child->name->epoch); >>>> /*vpid_snapshot->migrating = true;*/ >>>> >>>> opal_list_append(&(local_global_snapshot.local_snapshots), >>>> &(vpid_snapshot->super.super)); >>>> @@ -2111,7 +2111,7 @@ >>>> vpid_snapshot->process_pid = child->pid; >>>> vpid_snapshot->super.process_name.jobid = child->name->jobid; >>>> vpid_snapshot->super.process_name.vpid = child->name->vpid; >>>> - vpid_snapshot->super.process_name.epoch = child->name->epoch; >>>> + >>>> ORTE_EPOCH_SET(vpid_snapshot->super.process_name.epoch,child->name->epoch); >>>> } >>>> } >>>> >>>> >>>> Modified: trunk/orte/mca/snapc/full/snapc_full_module.c >>>> ============================================================================== >>>> --- trunk/orte/mca/snapc/full/snapc_full_module.c (original) >>>> +++ trunk/orte/mca/snapc/full/snapc_full_module.c 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 2011) >>>> @@ -83,7 +83,7 @@ >>>> void orte_snapc_full_orted_construct(orte_snapc_full_orted_snapshot_t >>>> *snapshot) { >>>> snapshot->process_name.jobid = 0; >>>> snapshot->process_name.vpid = 0; >>>> - snapshot->process_name.epoch = 0; >>>> + ORTE_EPOCH_SET(snapshot->process_name.epoch,0); >>>> >>>> snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE; >>>> } >>>> @@ -91,7 +91,7 @@ >>>> void orte_snapc_full_orted_destruct( orte_snapc_full_orted_snapshot_t >>>> *snapshot) { 
>>>> snapshot->process_name.jobid = 0; >>>> snapshot->process_name.vpid = 0; >>>> - snapshot->process_name.epoch = 0; >>>> + ORTE_EPOCH_SET(snapshot->process_name.epoch,0); >>>> >>>> snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE; >>>> } >>>> >>>> Modified: trunk/orte/mca/sstore/base/sstore_base_fns.c >>>> ============================================================================== >>>> --- trunk/orte/mca/sstore/base/sstore_base_fns.c (original) >>>> +++ trunk/orte/mca/sstore/base/sstore_base_fns.c 2011-08-26 18:16:14 EDT >>>> (Fri, 26 Aug 2011) >>>> @@ -62,7 +62,7 @@ >>>> { >>>> snapshot->process_name.jobid = 0; >>>> snapshot->process_name.vpid = 0; >>>> - snapshot->process_name.epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(snapshot->process_name.epoch,ORTE_EPOCH_MIN); >>>> >>>> snapshot->crs_comp = NULL; >>>> snapshot->compress_comp = NULL; >>>> @@ -76,7 +76,7 @@ >>>> { >>>> snapshot->process_name.jobid = 0; >>>> snapshot->process_name.vpid = 0; >>>> - snapshot->process_name.epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(snapshot->process_name.epoch,ORTE_EPOCH_MIN); >>>> >>>> if( NULL != snapshot->crs_comp ) { >>>> free(snapshot->crs_comp); >>>> @@ -637,7 +637,7 @@ >>>> >>>> vpid_snapshot->process_name.jobid = proc.jobid; >>>> vpid_snapshot->process_name.vpid = proc.vpid; >>>> - vpid_snapshot->process_name.epoch = proc.epoch; >>>> + ORTE_EPOCH_SET(vpid_snapshot->process_name.epoch,proc.epoch); >>>> } >>>> else if(0 == strncmp(token, SSTORE_METADATA_LOCAL_CRS_COMP_STR, >>>> strlen(SSTORE_METADATA_LOCAL_CRS_COMP_STR))) { >>>> vpid_snapshot->crs_comp = strdup(value); >>>> >>>> Modified: trunk/orte/mca/sstore/central/sstore_central_global.c >>>> ============================================================================== >>>> --- trunk/orte/mca/sstore/central/sstore_central_global.c (original) >>>> +++ trunk/orte/mca/sstore/central/sstore_central_global.c 2011-08-26 >>>> 18:16:14 EDT (Fri, 26 Aug 2011) >>>> @@ -1216,8 +1216,7 @@ >>>> >>>> 
vpid_snapshot->process_name.jobid = handle_info->jobid; >>>> vpid_snapshot->process_name.vpid = i; >>>> - vpid_snapshot->process_name.epoch = ORTE_EPOCH_INVALID; >>>> - vpid_snapshot->process_name.epoch = >>>> orte_ess.proc_get_epoch(&vpid_snapshot->process_name); >>>> + >>>> ORTE_EPOCH_SET(vpid_snapshot->process_name.epoch,orte_ess.proc_get_epoch(&vpid_snapshot->process_name)); >>>> >>>> vpid_snapshot->crs_comp = NULL; >>>> global_snapshot->start_time = NULL; >>>> >>>> Modified: trunk/orte/mca/sstore/central/sstore_central_local.c >>>> ============================================================================== >>>> --- trunk/orte/mca/sstore/central/sstore_central_local.c (original) >>>> +++ trunk/orte/mca/sstore/central/sstore_central_local.c 2011-08-26 >>>> 18:16:14 EDT (Fri, 26 Aug 2011) >>>> @@ -210,7 +210,7 @@ >>>> { >>>> info->name.jobid = ORTE_JOBID_INVALID; >>>> info->name.vpid = ORTE_VPID_INVALID; >>>> - info->name.epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(info->name.epoch,ORTE_EPOCH_MIN); >>>> >>>> info->local_location = NULL; >>>> info->metadata_filename = NULL; >>>> @@ -222,7 +222,7 @@ >>>> { >>>> info->name.jobid = ORTE_JOBID_INVALID; >>>> info->name.vpid = ORTE_VPID_INVALID; >>>> - info->name.epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(info->name.epoch,ORTE_EPOCH_MIN); >>>> >>>> if( NULL != info->local_location ) { >>>> free(info->local_location); >>>> @@ -535,7 +535,7 @@ >>>> >>>> app_info->name.jobid = name->jobid; >>>> app_info->name.vpid = name->vpid; >>>> - app_info->name.epoch = name->epoch; >>>> + ORTE_EPOCH_SET(app_info->name.epoch,name->epoch); >>>> >>>> opal_list_append(handle_info->app_info_handle, &(app_info->super)); >>>> >>>> >>>> Modified: trunk/orte/mca/sstore/stage/sstore_stage_global.c >>>> ============================================================================== >>>> --- trunk/orte/mca/sstore/stage/sstore_stage_global.c (original) >>>> +++ trunk/orte/mca/sstore/stage/sstore_stage_global.c 2011-08-26 >>>> 18:16:14 EDT 
(Fri, 26 Aug 2011) >>>> @@ -1218,10 +1218,10 @@ >>>> p_set = OBJ_NEW(orte_filem_base_process_set_t); >>>> p_set->source.jobid = peer->jobid; >>>> p_set->source.vpid = peer->vpid; >>>> - p_set->source.epoch = peer->epoch; >>>> + ORTE_EPOCH_SET(p_set->source.epoch,peer->epoch); >>>> p_set->sink.jobid = ORTE_PROC_MY_NAME->jobid; >>>> p_set->sink.vpid = ORTE_PROC_MY_NAME->vpid; >>>> - p_set->sink.epoch = ORTE_PROC_MY_NAME->epoch; >>>> + ORTE_EPOCH_SET(p_set->sink.epoch,ORTE_PROC_MY_NAME->epoch); >>>> opal_list_append(&(filem_request->process_sets), &(p_set->super) ); >>>> } >>>> >>>> @@ -1706,8 +1706,7 @@ >>>> >>>> vpid_snapshot->process_name.jobid = handle_info->jobid; >>>> vpid_snapshot->process_name.vpid = i; >>>> - vpid_snapshot->process_name.epoch = ORTE_EPOCH_INVALID; >>>> - vpid_snapshot->process_name.epoch = >>>> orte_ess.proc_get_epoch(&vpid_snapshot->process_name); >>>> + >>>> ORTE_EPOCH_SET(vpid_snapshot->process_name.epoch,orte_ess.proc_get_epoch(&vpid_snapshot->process_name)); >>>> >>>> /* JJH: Currently we do not have this information since we do not save >>>> * individual vpid info in the Global SStore. 
It is in the metadata >>>> >>>> Modified: trunk/orte/mca/sstore/stage/sstore_stage_local.c >>>> ============================================================================== >>>> --- trunk/orte/mca/sstore/stage/sstore_stage_local.c (original) >>>> +++ trunk/orte/mca/sstore/stage/sstore_stage_local.c 2011-08-26 >>>> 18:16:14 EDT (Fri, 26 Aug 2011) >>>> @@ -287,7 +287,7 @@ >>>> { >>>> info->name.jobid = ORTE_JOBID_INVALID; >>>> info->name.vpid = ORTE_VPID_INVALID; >>>> - info->name.epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(info->name.epoch,ORTE_EPOCH_MIN); >>>> >>>> info->local_location = NULL; >>>> info->compressed_local_location = NULL; >>>> @@ -302,7 +302,7 @@ >>>> { >>>> info->name.jobid = ORTE_JOBID_INVALID; >>>> info->name.vpid = ORTE_VPID_INVALID; >>>> - info->name.epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(info->name.epoch,ORTE_EPOCH_MIN); >>>> >>>> if( NULL != info->local_location ) { >>>> free(info->local_location); >>>> @@ -1014,7 +1014,7 @@ >>>> >>>> app_info->name.jobid = name->jobid; >>>> app_info->name.vpid = name->vpid; >>>> - app_info->name.epoch = name->epoch; >>>> + ORTE_EPOCH_SET(app_info->name.epoch,name->epoch); >>>> >>>> opal_list_append(handle_info->app_info_handle, &(app_info->super)); >>>> >>>> @@ -2057,17 +2057,17 @@ >>>> /* if I am the HNP, then use me as the source */ >>>> p_set->source.jobid = ORTE_PROC_MY_NAME->jobid; >>>> p_set->source.vpid = ORTE_PROC_MY_NAME->vpid; >>>> - p_set->source.epoch = ORTE_PROC_MY_NAME->epoch; >>>> + ORTE_EPOCH_SET(p_set->source.epoch,ORTE_PROC_MY_NAME->epoch); >>>> } >>>> else { >>>> /* otherwise, set the HNP as the source */ >>>> p_set->source.jobid = ORTE_PROC_MY_HNP->jobid; >>>> p_set->source.vpid = ORTE_PROC_MY_HNP->vpid; >>>> - p_set->source.epoch = ORTE_PROC_MY_HNP->epoch; >>>> + ORTE_EPOCH_SET(p_set->source.epoch,ORTE_PROC_MY_HNP->epoch); >>>> } >>>> p_set->sink.jobid = ORTE_PROC_MY_NAME->jobid; >>>> p_set->sink.vpid = ORTE_PROC_MY_NAME->vpid; >>>> - p_set->sink.epoch = 
ORTE_PROC_MY_NAME->epoch; >>>> + ORTE_EPOCH_SET(p_set->sink.epoch,ORTE_PROC_MY_NAME->epoch); >>>> opal_list_append(&(filem_request->process_sets), &(p_set->super) ); >>>> >>>> /* Define the file set */ >>>> >>>> Modified: trunk/orte/orted/orted_comm.c >>>> ============================================================================== >>>> --- trunk/orte/orted/orted_comm.c (original) >>>> +++ trunk/orte/orted/orted_comm.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug >>>> 2011) >>>> @@ -123,18 +123,13 @@ >>>> nm = (orte_routed_tree_t*)item; >>>> >>>> target.vpid = nm->vpid; >>>> - target.epoch = orte_util_lookup_epoch(&target); >>>> + ORTE_EPOCH_SET(target.epoch,orte_ess.proc_get_epoch(&target)); >>>> >>>> - if (!orte_util_proc_is_running(&target)) { >>>> + if (!PROC_IS_RUNNING(&target)) { >>>> continue; >>>> } >>>> >>>> - target.epoch = ORTE_EPOCH_INVALID; >>>> - if (ORTE_NODE_RANK_INVALID == (target.epoch = >>>> orte_ess.proc_get_epoch(&target))) { >>>> - /* If we are trying to send to a previously failed process >>>> it's >>>> - * better to fail silently. 
*/ >>>> - continue; >>>> - } >>>> + ORTE_EPOCH_SET(target.epoch,orte_ess.proc_get_epoch(&target)); >>>> >>>> OPAL_OUTPUT_VERBOSE((1, orte_debug_output, >>>> "%s orte:daemon:send_relay sending relay msg to >>>> %s", >>>> @@ -422,7 +417,8 @@ >>>> proct = OBJ_NEW(orte_proc_t); >>>> proct->name.jobid = proc.jobid; >>>> proct->name.vpid = proc.vpid; >>>> - proct->name.epoch = proc.epoch; >>>> + ORTE_EPOCH_SET(proct->name.epoch,proc.epoch); >>>> + >>>> opal_pointer_array_add(&procarray, proct); >>>> num_replies++; >>>> } >>>> @@ -1059,7 +1055,9 @@ >>>> orte_job_t *jdata; >>>> orte_proc_t *proc; >>>> orte_vpid_t vpid; >>>> +#if ORTE_ENABLE_EPOCH >>>> orte_epoch_t epoch; >>>> +#endif >>>> int32_t i, num_procs; >>>> >>>> /* setup the answer */ >>>> @@ -1086,12 +1084,14 @@ >>>> goto CLEANUP; >>>> } >>>> >>>> +#if ORTE_ENABLE_EPOCH >>>> /* unpack the epoch */ >>>> n = 1; >>>> if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &epoch, &n, >>>> ORTE_EPOCH))) { >>>> ORTE_ERROR_LOG(ret); >>>> goto CLEANUP; >>>> } >>>> +#endif >>>> >>>> /* if they asked for a specific proc, then just get that info */ >>>> if (ORTE_VPID_WILDCARD != vpid) { >>>> @@ -1201,7 +1201,7 @@ >>>> /* loop across all daemons */ >>>> proc2.jobid = ORTE_PROC_MY_NAME->jobid; >>>> for (proc2.vpid=1; proc2.vpid < >>>> orte_process_info.num_procs; proc2.vpid++) { >>>> - proc2.epoch = orte_util_lookup_epoch(&proc2); >>>> + >>>> ORTE_EPOCH_SET(proc2.epoch,orte_util_lookup_epoch(&proc2)); >>>> >>>> /* setup the cmd */ >>>> relay_msg = OBJ_NEW(opal_buffer_t); >>>> >>>> Modified: trunk/orte/orted/orted_main.c >>>> ============================================================================== >>>> --- trunk/orte/orted/orted_main.c (original) >>>> +++ trunk/orte/orted/orted_main.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug >>>> 2011) >>>> @@ -388,14 +388,14 @@ >>>> orte_process_info.my_daemon_uri = orte_rml.get_contact_info(); >>>> ORTE_PROC_MY_DAEMON->jobid = ORTE_PROC_MY_NAME->jobid; >>>> ORTE_PROC_MY_DAEMON->vpid = 
ORTE_PROC_MY_NAME->vpid; >>>> - ORTE_PROC_MY_DAEMON->epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(ORTE_PROC_MY_DAEMON->epoch,ORTE_EPOCH_MIN); >>>> >>>> /* if I am also the hnp, then update that contact info field too */ >>>> if (ORTE_PROC_IS_HNP) { >>>> orte_process_info.my_hnp_uri = orte_rml.get_contact_info(); >>>> ORTE_PROC_MY_HNP->jobid = ORTE_PROC_MY_NAME->jobid; >>>> ORTE_PROC_MY_HNP->vpid = ORTE_PROC_MY_NAME->vpid; >>>> - ORTE_PROC_MY_HNP->epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(ORTE_PROC_MY_HNP->epoch,ORTE_EPOCH_MIN); >>>> } >>>> >>>> /* setup the primary daemon command receive function */ >>>> @@ -495,7 +495,8 @@ >>>> proc = OBJ_NEW(orte_proc_t); >>>> proc->name.jobid = jdata->jobid; >>>> proc->name.vpid = 0; >>>> - proc->name.epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN); >>>> + >>>> proc->state = ORTE_PROC_STATE_RUNNING; >>>> proc->app_idx = 0; >>>> proc->node = nodes[0]; /* hnp node must be there */ >>>> >>>> Modified: trunk/orte/runtime/data_type_support/orte_dt_compare_fns.c >>>> ============================================================================== >>>> --- trunk/orte/runtime/data_type_support/orte_dt_compare_fns.c >>>> (original) >>>> +++ trunk/orte/runtime/data_type_support/orte_dt_compare_fns.c >>>> 2011-08-26 18:16:14 EDT (Fri, 26 Aug 2011) >>>> @@ -76,6 +76,7 @@ >>>> } >>>> } >>>> >>>> +#if ORTE_ENABLE_EPOCH >>>> /** check the epochs - if one of them is WILDCARD, then ignore >>>> * this field since anything is okay >>>> */ >>>> @@ -87,6 +88,7 @@ >>>> return OPAL_VALUE1_GREATER; >>>> } >>>> } >>>> +#endif >>>> >>>> /** only way to get here is if all fields are equal or WILDCARD */ >>>> return OPAL_EQUAL; >>>> @@ -122,6 +124,7 @@ >>>> return OPAL_EQUAL; >>>> } >>>> >>>> +#if ORTE_ENABLE_EPOCH >>>> int orte_dt_compare_epoch(orte_epoch_t *value1, >>>> orte_epoch_t *value2, >>>> opal_data_type_t type) >>>> @@ -136,6 +139,7 @@ >>>> >>>> return OPAL_EQUAL; >>>> } >>>> +#endif >>>> >>>> #if 
!ORTE_DISABLE_FULL_SUPPORT >>>> /** >>>> >>>> Modified: trunk/orte/runtime/data_type_support/orte_dt_copy_fns.c >>>> ============================================================================== >>>> --- trunk/orte/runtime/data_type_support/orte_dt_copy_fns.c >>>> (original) >>>> +++ trunk/orte/runtime/data_type_support/orte_dt_copy_fns.c >>>> 2011-08-26 18:16:14 EDT (Fri, 26 Aug 2011) >>>> @@ -61,7 +61,7 @@ >>>> >>>> val->jobid = src->jobid; >>>> val->vpid = src->vpid; >>>> - val->epoch = src->epoch; >>>> + ORTE_EPOCH_SET(val->epoch,src->epoch); >>>> >>>> *dest = val; >>>> return ORTE_SUCCESS; >>>> @@ -105,6 +105,7 @@ >>>> return ORTE_SUCCESS; >>>> } >>>> >>>> +#if ORTE_ENABLE_EPOCH >>>> /* >>>> * EPOCH >>>> */ >>>> @@ -123,6 +124,7 @@ >>>> >>>> return ORTE_SUCCESS; >>>> } >>>> +#endif >>>> >>>> #if !ORTE_DISABLE_FULL_SUPPORT >>>> >>>> >>>> Modified: trunk/orte/runtime/data_type_support/orte_dt_packing_fns.c >>>> ============================================================================== >>>> --- trunk/orte/runtime/data_type_support/orte_dt_packing_fns.c >>>> (original) >>>> +++ trunk/orte/runtime/data_type_support/orte_dt_packing_fns.c >>>> 2011-08-26 18:16:14 EDT (Fri, 26 Aug 2011) >>>> @@ -58,7 +58,9 @@ >>>> orte_process_name_t* proc; >>>> orte_jobid_t *jobid; >>>> orte_vpid_t *vpid; >>>> +#if ORTE_ENABLE_EPOCH >>>> orte_epoch_t *epoch; >>>> +#endif >>>> >>>> /* collect all the jobids in a contiguous array */ >>>> jobid = (orte_jobid_t*)malloc(num_vals * sizeof(orte_jobid_t)); >>>> @@ -100,6 +102,7 @@ >>>> } >>>> free(vpid); >>>> >>>> +#if ORTE_ENABLE_EPOCH >>>> /* Collect all the epochs in a contiguous array */ >>>> epoch = (orte_epoch_t *) malloc(num_vals * sizeof(orte_epoch_t)); >>>> if (NULL == epoch) { >>>> @@ -118,6 +121,7 @@ >>>> return rc; >>>> } >>>> free(epoch); >>>> +#endif >>>> >>>> return ORTE_SUCCESS; >>>> } >>>> @@ -156,6 +160,7 @@ >>>> return ret; >>>> } >>>> >>>> +#if ORTE_ENABLE_EPOCH >>>> /* >>>> * EPOCH >>>> */ >>>> @@ -171,6 +176,7 @@ 
>>>> >>>> return ret; >>>> } >>>> +#endif >>>> >>>> #if !ORTE_DISABLE_FULL_SUPPORT >>>> /* >>>> >>>> Modified: trunk/orte/runtime/data_type_support/orte_dt_print_fns.c >>>> ============================================================================== >>>> --- trunk/orte/runtime/data_type_support/orte_dt_print_fns.c >>>> (original) >>>> +++ trunk/orte/runtime/data_type_support/orte_dt_print_fns.c >>>> 2011-08-26 18:16:14 EDT (Fri, 26 Aug 2011) >>>> @@ -125,8 +125,10 @@ >>>> orte_dt_quick_print(output, "ORTE_STD_CNTR", prefix, src, >>>> ORTE_STD_CNTR_T); >>>> break; >>>> >>>> +#if ORTE_ENABLE_EPOCH >>>> case ORTE_EPOCH: >>>> orte_dt_quick_print(output, "ORTE_EPOCH", prefix, src, >>>> ORTE_EPOCH_T); >>>> +#endif >>>> >>>> case ORTE_VPID: >>>> orte_dt_quick_print(output, "ORTE_VPID", prefix, src, >>>> ORTE_VPID_T); >>>> @@ -478,11 +480,21 @@ >>>> if (orte_xml_output) { >>>> /* need to create the output in XML format */ >>>> if (0 == src->pid) { >>>> +#if ORTE_ENABLE_EPOCH >>>> asprintf(output, "%s<process rank=\"%s\" status=\"%s\" >>>> epoch=\"%s\"/>\n", pfx2, >>>> ORTE_VPID_PRINT(src->name.vpid), >>>> orte_proc_state_to_str(src->state), ORTE_EPOCH_PRINT(src->name.epoch)); >>>> +#else >>>> + asprintf(output, "%s<process rank=\"%s\" status=\"%s\"/>\n", >>>> pfx2, >>>> + ORTE_VPID_PRINT(src->name.vpid), >>>> orte_proc_state_to_str(src->state)); >>>> +#endif >>>> } else { >>>> +#if ORTE_ENABLE_EPOCH >>>> asprintf(output, "%s<process rank=\"%s\" pid=\"%d\" status=\"%s\" >>>> epoch=\"%s\"/>\n", pfx2, >>>> ORTE_VPID_PRINT(src->name.vpid), (int)src->pid, >>>> orte_proc_state_to_str(src->state), ORTE_EPOCH_PRINT(src->name.epoch)); >>>> +#else >>>> + asprintf(output, "%s<process rank=\"%s\" pid=\"%d\" >>>> status=\"%s\"/>\n", pfx2, >>>> + ORTE_VPID_PRINT(src->name.vpid), (int)src->pid, >>>> orte_proc_state_to_str(src->state)); >>>> +#endif >>>> } >>>> free(pfx2); >>>> return ORTE_SUCCESS; >>>> @@ -490,10 +502,17 @@ >>>> >>>> if (!orte_devel_level_output) { >>>> /* just print a 
very simple output for users */ >>>> +#if ORTE_ENABLE_EPOCH >>>> asprintf(&tmp, "\n%sProcess OMPI jobid: %s Process rank: %s Epoch: >>>> %s", pfx2, >>>> ORTE_JOBID_PRINT(src->name.jobid), >>>> ORTE_VPID_PRINT(src->name.vpid), >>>> ORTE_EPOCH_PRINT(src->name.epoch)); >>>> +#else >>>> + asprintf(&tmp, "\n%sProcess OMPI jobid: %s Process rank: %s >>>> Epoch: %s", pfx2, >>>> + ORTE_JOBID_PRINT(src->name.jobid), >>>> + ORTE_VPID_PRINT(src->name.vpid)); >>>> +#endif >>>> + >>>> /* set the return */ >>>> *output = tmp; >>>> free(pfx2); >>>> >>>> Modified: trunk/orte/runtime/data_type_support/orte_dt_size_fns.c >>>> ============================================================================== >>>> --- trunk/orte/runtime/data_type_support/orte_dt_size_fns.c >>>> (original) >>>> +++ trunk/orte/runtime/data_type_support/orte_dt_size_fns.c >>>> 2011-08-26 18:16:14 EDT (Fri, 26 Aug 2011) >>>> @@ -45,9 +45,11 @@ >>>> *size = sizeof(orte_std_cntr_t); >>>> break; >>>> >>>> +#if ORTE_ENABLE_EPOCH >>>> case ORTE_EPOCH: >>>> *size = sizeof(orte_epoch_t); >>>> break; >>>> +#endif >>>> >>>> case ORTE_VPID: >>>> *size = sizeof(orte_vpid_t); >>>> >>>> Modified: trunk/orte/runtime/data_type_support/orte_dt_support.h >>>> ============================================================================== >>>> --- trunk/orte/runtime/data_type_support/orte_dt_support.h (original) >>>> +++ trunk/orte/runtime/data_type_support/orte_dt_support.h 2011-08-26 >>>> 18:16:14 EDT (Fri, 26 Aug 2011) >>>> @@ -52,9 +52,14 @@ >>>> int orte_dt_compare_vpid(orte_vpid_t *value1, >>>> orte_vpid_t *value2, >>>> opal_data_type_t type); >>>> +#if ORTE_ENABLE_EPOCH >>>> int orte_dt_compare_epoch(orte_epoch_t *value1, >>>> orte_epoch_t *value2, >>>> opal_data_type_t type); >>>> +#define ORTE_EPOCH_CMP(n,m) ( (m) - (n) ) >>>> +#else >>>> +#define ORTE_EPOCH_CMP(n,m) ( 0 ) >>>> +#endif >>>> #if !ORTE_DISABLE_FULL_SUPPORT >>>> int orte_dt_compare_job(orte_job_t *value1, orte_job_t *value2, >>>> opal_data_type_t type); 
>>>> int orte_dt_compare_node(orte_node_t *value1, orte_node_t *value2, >>>> opal_data_type_t type); >>>> @@ -86,7 +91,9 @@ >>>> int orte_dt_copy_name(orte_process_name_t **dest, orte_process_name_t >>>> *src, opal_data_type_t type); >>>> int orte_dt_copy_jobid(orte_jobid_t **dest, orte_jobid_t *src, >>>> opal_data_type_t type); >>>> int orte_dt_copy_vpid(orte_vpid_t **dest, orte_vpid_t *src, >>>> opal_data_type_t type); >>>> +#if ORTE_ENABLE_EPOCH >>>> int orte_dt_copy_epoch(orte_epoch_t **dest, orte_epoch_t *src, >>>> opal_data_type_t type); >>>> +#endif >>>> #if !ORTE_DISABLE_FULL_SUPPORT >>>> int orte_dt_copy_job(orte_job_t **dest, orte_job_t *src, opal_data_type_t >>>> type); >>>> int orte_dt_copy_node(orte_node_t **dest, orte_node_t *src, >>>> opal_data_type_t type); >>>> @@ -116,8 +123,10 @@ >>>> int32_t num_vals, opal_data_type_t type); >>>> int orte_dt_pack_vpid(opal_buffer_t *buffer, const void *src, >>>> int32_t num_vals, opal_data_type_t type); >>>> +#if ORTE_ENABLE_EPOCH >>>> int orte_dt_pack_epoch(opal_buffer_t *buffer, const void *src, >>>> int32_t num_vals, opal_data_type_t type); >>>> +#endif >>>> #if !ORTE_DISABLE_FULL_SUPPORT >>>> int orte_dt_pack_job(opal_buffer_t *buffer, const void *src, >>>> int32_t num_vals, opal_data_type_t type); >>>> @@ -185,8 +194,10 @@ >>>> int32_t *num_vals, opal_data_type_t type); >>>> int orte_dt_unpack_vpid(opal_buffer_t *buffer, void *dest, >>>> int32_t *num_vals, opal_data_type_t type); >>>> +#if ORTE_ENABLE_EPOCH >>>> int orte_dt_unpack_epoch(opal_buffer_t *buffer, void *dest, >>>> int32_t *num_vals, opal_data_type_t type); >>>> +#endif >>>> #if !ORTE_DISABLE_FULL_SUPPORT >>>> int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest, >>>> int32_t *num_vals, opal_data_type_t type); >>>> >>>> Modified: trunk/orte/runtime/data_type_support/orte_dt_unpacking_fns.c >>>> ============================================================================== >>>> --- trunk/orte/runtime/data_type_support/orte_dt_unpacking_fns.c 
>>>> (original) >>>> +++ trunk/orte/runtime/data_type_support/orte_dt_unpacking_fns.c >>>> 2011-08-26 18:16:14 EDT (Fri, 26 Aug 2011) >>>> @@ -54,7 +54,9 @@ >>>> orte_process_name_t* proc; >>>> orte_jobid_t *jobid; >>>> orte_vpid_t *vpid; >>>> +#if ORTE_ENABLE_EPOCH >>>> orte_epoch_t *epoch; >>>> +#endif >>>> >>>> num = *num_vals; >>>> >>>> @@ -92,6 +94,7 @@ >>>> return rc; >>>> } >>>> >>>> +#if ORTE_ENABLE_EPOCH >>>> /* collect all the epochs in a contiguous array */ >>>> epoch= (orte_epoch_t*)malloc(num * sizeof(orte_epoch_t)); >>>> if (NULL == epoch) { >>>> @@ -109,18 +112,21 @@ >>>> free(jobid); >>>> return rc; >>>> } >>>> +#endif >>>> >>>> /* build the names from the jobid/vpid/epoch arrays */ >>>> proc = (orte_process_name_t*)dest; >>>> for (i=0; i < num; i++) { >>>> proc->jobid = jobid[i]; >>>> proc->vpid = vpid[i]; >>>> - proc->epoch = epoch[i]; >>>> + ORTE_EPOCH_SET(proc->epoch,epoch[i]); >>>> proc++; >>>> } >>>> >>>> /* cleanup */ >>>> +#if ORTE_ENABLE_EPOCH >>>> free(epoch); >>>> +#endif >>>> free(vpid); >>>> free(jobid); >>>> >>>> @@ -159,6 +165,7 @@ >>>> return ret; >>>> } >>>> >>>> +#if ORTE_ENABLE_EPOCH >>>> /* >>>> * EPOCH >>>> */ >>>> @@ -174,6 +181,7 @@ >>>> >>>> return ret; >>>> } >>>> +#endif >>>> >>>> #if !ORTE_DISABLE_FULL_SUPPORT >>>> /* >>>> >>>> Modified: trunk/orte/runtime/orte_data_server.c >>>> ============================================================================== >>>> --- trunk/orte/runtime/orte_data_server.c (original) >>>> +++ trunk/orte/runtime/orte_data_server.c 2011-08-26 18:16:14 EDT (Fri, >>>> 26 Aug 2011) >>>> @@ -220,7 +220,7 @@ >>>> data->port = port_name; >>>> data->owner.jobid = sender->jobid; >>>> data->owner.vpid = sender->vpid; >>>> - data->owner.epoch = sender->epoch; >>>> + ORTE_EPOCH_SET(data->owner.epoch,sender->epoch); >>>> >>>> /* store the data */ >>>> data->index = opal_pointer_array_add(orte_data_server_store, >>>> data); >>>> >>>> Modified: trunk/orte/runtime/orte_globals.c >>>> 
============================================================================== >>>> --- trunk/orte/runtime/orte_globals.c (original) >>>> +++ trunk/orte/runtime/orte_globals.c 2011-08-26 18:16:14 EDT (Fri, >>>> 26 Aug 2011) >>>> @@ -277,6 +277,7 @@ >>>> return rc; >>>> } >>>> >>>> +#if ORTE_ENABLE_EPOCH >>>> tmp = ORTE_EPOCH; >>>> if (ORTE_SUCCESS != (rc = opal_dss.register_type(orte_dt_pack_epoch, >>>> orte_dt_unpack_epoch, >>>> @@ -290,6 +291,7 @@ >>>> ORTE_ERROR_LOG(rc); >>>> return rc; >>>> } >>>> +#endif >>>> >>>> #if !ORTE_DISABLE_FULL_SUPPORT >>>> tmp = ORTE_JOB; >>>> @@ -933,7 +935,7 @@ >>>> proc->beat = 0; >>>> OBJ_CONSTRUCT(&proc->stats, opal_ring_buffer_t); >>>> opal_ring_buffer_init(&proc->stats, orte_stat_history_size); >>>> - proc->name.epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN); >>>> #if OPAL_ENABLE_FT_CR == 1 >>>> proc->ckpt_state = 0; >>>> proc->ckpt_snapshot_ref = NULL; >>>> >>>> Modified: trunk/orte/runtime/orte_init.c >>>> ============================================================================== >>>> --- trunk/orte/runtime/orte_init.c (original) >>>> +++ trunk/orte/runtime/orte_init.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug >>>> 2011) >>>> @@ -57,8 +57,17 @@ >>>> char *orte_prohibited_session_dirs = NULL; >>>> bool orte_create_session_dirs = true; >>>> >>>> +#if ORTE_ENABLE_EPOCH >>>> +orte_process_name_t orte_name_wildcard = {ORTE_JOBID_WILDCARD, >>>> ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD}; >>>> +#else >>>> orte_process_name_t orte_name_wildcard = {ORTE_JOBID_WILDCARD, >>>> ORTE_VPID_WILDCARD}; >>>> +#endif >>>> + >>>> +#if ORTE_ENABLE_EPOCH >>>> +orte_process_name_t orte_name_invalid = {ORTE_JOBID_INVALID, >>>> ORTE_VPID_INVALID, ORTE_EPOCH_INVALID}; >>>> +#else >>>> orte_process_name_t orte_name_invalid = {ORTE_JOBID_INVALID, >>>> ORTE_VPID_INVALID}; >>>> +#endif >>>> >>>> >>>> #if OPAL_CC_USE_PRAGMA_IDENT >>>> >>>> Modified: trunk/orte/runtime/orte_wait.h >>>> 
============================================================================== >>>> --- trunk/orte/runtime/orte_wait.h (original) >>>> +++ trunk/orte/runtime/orte_wait.h 2011-08-26 18:16:14 EDT (Fri, 26 Aug >>>> 2011) >>>> @@ -204,7 +204,7 @@ >>>> mev = OBJ_NEW(orte_message_event_t); \ >>>> mev->sender.jobid = (sndr)->jobid; \ >>>> mev->sender.vpid = (sndr)->vpid; \ >>>> - mev->sender.epoch = (sndr)->epoch; \ >>>> + ORTE_EPOCH_SET(mev->sender.epoch,(sndr)->epoch); \ >>>> opal_dss.copy_payload(mev->buffer, (buf)); \ >>>> mev->tag = (tg); \ >>>> mev->file = strdup((buf)->parent.cls_init_file_name); \ >>>> @@ -228,7 +228,7 @@ >>>> mev = OBJ_NEW(orte_message_event_t); \ >>>> mev->sender.jobid = (sndr)->jobid; \ >>>> mev->sender.vpid = (sndr)->vpid; \ >>>> - mev->sender.epoch = (sndr)->epoch; \ >>>> + ORTE_EPOCH_SET(mev->sender.epoch,(sndr)->epoch); \ >>>> opal_dss.copy_payload(mev->buffer, (buf)); \ >>>> mev->tag = (tg); \ >>>> opal_event_evtimer_set(opal_event_base, \ >>>> @@ -258,7 +258,7 @@ >>>> tmp = OBJ_NEW(orte_notify_event_t); \ >>>> tmp->proc.jobid = (data)->jobid; \ >>>> tmp->proc.vpid = (data)->vpid; \ >>>> - tmp->proc.epoch = (data)->epoch; \ >>>> + ORTE_EPOCH_SET(tmp->proc.epoch,(data)->epoch); \ >>>> opal_event.evtimer_set(opal_event_base, \ >>>> tmp->ev, (cbfunc), tmp); \ >>>> now.tv_sec = 0; \ >>>> >>>> Modified: trunk/orte/test/system/oob_stress.c >>>> ============================================================================== >>>> --- trunk/orte/test/system/oob_stress.c (original) >>>> +++ trunk/orte/test/system/oob_stress.c 2011-08-26 18:16:14 EDT (Fri, >>>> 26 Aug 2011) >>>> @@ -74,8 +74,7 @@ >>>> >>>> for (j=1; j < count+1; j++) { >>>> peer.vpid = (ORTE_PROC_MY_NAME->vpid + j) % >>>> orte_process_info.num_procs; >>>> - peer.epoch = ORTE_EPOCH_INVALID; >>>> - peer.epoch = orte_ess.proc_get_epoch(&peer); >>>> + ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); >>>> >>>> /* rank0 starts ring */ >>>> if (ORTE_PROC_MY_NAME->vpid == 0) { >>>> 
>>>> Modified: trunk/orte/test/system/orte_ring.c >>>> ============================================================================== >>>> --- trunk/orte/test/system/orte_ring.c (original) >>>> +++ trunk/orte/test/system/orte_ring.c 2011-08-26 18:16:14 EDT (Fri, >>>> 26 Aug 2011) >>>> @@ -41,16 +41,14 @@ >>>> if( right_peer_orte_name.vpid >= num_peers ) { >>>> right_peer_orte_name.vpid = 0; >>>> } >>>> - right_peer_orte_name.epoch = ORTE_EPOCH_INVALID; >>>> - right_peer_orte_name.epoch = >>>> orte_ess.proc_get_epoch(&right_peer_orte_name); >>>> + >>>> ORTE_EPOCH_SET(right_peer_orte_name.epoch,orte_ess.proc_get_epoch(&right_peer_orte_name)); >>>> >>>> left_peer_orte_name.jobid = ORTE_PROC_MY_NAME->jobid; >>>> left_peer_orte_name.vpid = ORTE_PROC_MY_NAME->vpid - 1; >>>> if( ORTE_PROC_MY_NAME->vpid == 0 ) { >>>> left_peer_orte_name.vpid = num_peers - 1; >>>> } >>>> - left_peer_orte_name.epoch = ORTE_EPOCH_INVALID; >>>> - left_peer_orte_name.epoch = >>>> orte_ess.proc_get_epoch(&left_peer_orte_name); >>>> + >>>> ORTE_EPOCH_SET(left_peer_orte_name.epoch,orte_ess.proc_get_epoch(&left_peer_orte_name)); >>>> >>>> printf("My name is: %s -- PID %d\tMy Left Peer is %s\tMy Right Peer is >>>> %s\n", >>>> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), getpid(), >>>> >>>> Modified: trunk/orte/test/system/orte_spawn.c >>>> ============================================================================== >>>> --- trunk/orte/test/system/orte_spawn.c (original) >>>> +++ trunk/orte/test/system/orte_spawn.c 2011-08-26 18:16:14 EDT (Fri, >>>> 26 Aug 2011) >>>> @@ -74,8 +74,8 @@ >>>> for (i=0; i < app->num_procs; i++) { >>>> name.vpid = i; >>>> >>>> - name.epoch = ORTE_EPOCH_INVALID; >>>> - name.epoch = orte_ess.proc_get_epoch(&name); >>>> + ORTE_EPOCH_SET(name.epoch,orte_ess.proc_get_epoch(&name)); >>>> + >>>> fprintf(stderr, "Parent: sending message to child %s\n", >>>> ORTE_NAME_PRINT(&name)); >>>> if (0 > (rc = orte_rml.send(&name, &msg, 1, MY_TAG, 0))) { >>>> ORTE_ERROR_LOG(rc); >>>> >>>> 
Modified: trunk/orte/tools/orte-ps/orte-ps.c >>>> ============================================================================== >>>> --- trunk/orte/tools/orte-ps/orte-ps.c (original) >>>> +++ trunk/orte/tools/orte-ps/orte-ps.c 2011-08-26 18:16:14 EDT (Fri, >>>> 26 Aug 2011) >>>> @@ -869,8 +869,14 @@ >>>> } >>>> >>>> /* query the HNP for info on the procs in this job */ >>>> - if (ORTE_SUCCESS != (ret = >>>> orte_util_comm_query_proc_info(&(hnpinfo->hnp->name), job->jobid, >>>> - >>>> ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD, &cnt, &procs))) { >>>> + if (ORTE_SUCCESS != (ret = >>>> orte_util_comm_query_proc_info(&(hnpinfo->hnp->name), >>>> + >>>> job->jobid, >>>> + >>>> ORTE_VPID_WILDCARD, >>>> +#if ORTE_ENABLE_EPOCH >>>> + >>>> ORTE_EPOCH_WILDCARD, >>>> +#endif >>>> + &cnt, >>>> + >>>> &procs))) { >>>> ORTE_ERROR_LOG(ret); >>>> } >>>> job->procs->addr = (void**)procs; >>>> >>>> Modified: trunk/orte/tools/orte-top/orte-top.c >>>> ============================================================================== >>>> --- trunk/orte/tools/orte-top/orte-top.c (original) >>>> +++ trunk/orte/tools/orte-top/orte-top.c 2011-08-26 18:16:14 EDT (Fri, >>>> 26 Aug 2011) >>>> @@ -471,7 +471,7 @@ >>>> if (NULL == ranks) { >>>> /* take all ranks */ >>>> proc.vpid = ORTE_VPID_WILDCARD; >>>> - proc.epoch = ORTE_EPOCH_WILDCARD; >>>> + ORTE_EPOCH_SET(proc.epoch,ORTE_EPOCH_WILDCARD); >>>> if (ORTE_SUCCESS != (ret = opal_dss.pack(&cmdbuf, &proc, 1, >>>> ORTE_NAME))) { >>>> ORTE_ERROR_LOG(ret); >>>> goto cleanup; >>>> >>>> Modified: trunk/orte/util/comm/comm.c >>>> ============================================================================== >>>> --- trunk/orte/util/comm/comm.c (original) >>>> +++ trunk/orte/util/comm/comm.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug >>>> 2011) >>>> @@ -433,8 +433,13 @@ >>>> return ORTE_SUCCESS; >>>> } >>>> >>>> +#if ORTE_ENABLE_EPOCH >>>> int orte_util_comm_query_proc_info(const orte_process_name_t *hnp, >>>> orte_jobid_t job, orte_vpid_t vpid, >>>> 
orte_epoch_t epoch, int *num_procs, >>>> orte_proc_t ***proc_info_array) >>>> +#else >>>> +int orte_util_comm_query_proc_info(const orte_process_name_t *hnp, >>>> orte_jobid_t job, orte_vpid_t vpid, >>>> + int *num_procs, orte_proc_t >>>> ***proc_info_array) >>>> +#endif >>>> { >>>> int ret; >>>> int32_t cnt, cnt_procs, n; >>>> @@ -463,11 +468,13 @@ >>>> OBJ_RELEASE(cmd); >>>> return ret; >>>> } >>>> +#if ORTE_ENABLE_EPOCH >>>> if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &epoch, 1, ORTE_EPOCH))) { >>>> ORTE_ERROR_LOG(ret); >>>> OBJ_RELEASE(cmd); >>>> return ret; >>>> } >>>> +#endif >>>> /* define a max time to wait for send to complete */ >>>> timer_fired = false; >>>> error_exit = ORTE_SUCCESS; >>>> >>>> Modified: trunk/orte/util/comm/comm.h >>>> ============================================================================== >>>> --- trunk/orte/util/comm/comm.h (original) >>>> +++ trunk/orte/util/comm/comm.h 2011-08-26 18:16:14 EDT (Fri, 26 Aug >>>> 2011) >>>> @@ -52,7 +52,10 @@ >>>> int *num_nodes, orte_node_t >>>> ***node_info_array); >>>> >>>> ORTE_DECLSPEC int orte_util_comm_query_proc_info(const orte_process_name_t >>>> *hnp, orte_jobid_t job, orte_vpid_t vpid, >>>> - orte_epoch_t epoch, int >>>> *num_procs, orte_proc_t ***proc_info_array); >>>> +#if ORTE_ENABLE_EPOCH >>>> + orte_epoch_t epoch, >>>> +#endif >>>> + int *num_procs, >>>> orte_proc_t ***proc_info_array); >>>> >>>> ORTE_DECLSPEC int orte_util_comm_spawn_job(const orte_process_name_t *hnp, >>>> orte_job_t *jdata); >>>> >>>> >>>> Modified: trunk/orte/util/hnp_contact.c >>>> ============================================================================== >>>> --- trunk/orte/util/hnp_contact.c (original) >>>> +++ trunk/orte/util/hnp_contact.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug >>>> 2011) >>>> @@ -55,7 +55,8 @@ >>>> { >>>> ptr->name.jobid = ORTE_JOBID_INVALID; >>>> ptr->name.vpid = ORTE_VPID_INVALID; >>>> - ptr->name.epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(ptr->name.epoch,ORTE_EPOCH_MIN); >>>> + 
>>>> ptr->rml_uri = NULL; >>>> } >>>> static void orte_hnp_contact_destruct(orte_hnp_contact_t *ptr) >>>> >>>> Modified: trunk/orte/util/name_fns.c >>>> ============================================================================== >>>> --- trunk/orte/util/name_fns.c (original) >>>> +++ trunk/orte/util/name_fns.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug >>>> 2011) >>>> @@ -46,7 +46,7 @@ >>>> { >>>> list->name.jobid = ORTE_JOBID_INVALID; >>>> list->name.vpid = ORTE_VPID_INVALID; >>>> - list->name.epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(list->name.epoch,ORTE_EPOCH_MIN); >>>> } >>>> >>>> /* destructor - used to free any resources held by instance */ >>>> @@ -116,7 +116,10 @@ >>>> char* orte_util_print_name_args(const orte_process_name_t *name) >>>> { >>>> orte_print_args_buffers_t *ptr; >>>> - char *job, *vpid, *epoch; >>>> + char *job, *vpid; >>>> +#if ORTE_ENABLE_EPOCH >>>> + char *epoch; >>>> +#endif >>>> >>>> /* protect against NULL names */ >>>> if (NULL == name) { >>>> @@ -141,7 +144,7 @@ >>>> */ >>>> job = orte_util_print_jobids(name->jobid); >>>> vpid = orte_util_print_vpids(name->vpid); >>>> - epoch = orte_util_print_epoch(name->epoch); >>>> + ORTE_EPOCH_SET(epoch,orte_util_print_epoch(name->epoch)); >>>> >>>> /* get the next buffer */ >>>> ptr = get_print_name_buffer(); >>>> @@ -156,9 +159,15 @@ >>>> ptr->cntr = 0; >>>> } >>>> >>>> +#if ORTE_ENABLE_EPOCH >>>> snprintf(ptr->buffers[ptr->cntr++], >>>> ORTE_PRINT_NAME_ARGS_MAX_SIZE, >>>> "[%s,%s,%s]", job, vpid, epoch); >>>> +#else >>>> + snprintf(ptr->buffers[ptr->cntr++], >>>> + ORTE_PRINT_NAME_ARGS_MAX_SIZE, >>>> + "[%s,%s]", job, vpid); >>>> +#endif >>>> >>>> return ptr->buffers[ptr->cntr-1]; >>>> } >>>> @@ -282,6 +291,7 @@ >>>> return ptr->buffers[ptr->cntr-1]; >>>> } >>>> >>>> +#if ORTE_ENABLE_EPOCH >>>> char* orte_util_print_epoch(const orte_epoch_t epoch) >>>> { >>>> orte_print_args_buffers_t *ptr; >>>> @@ -309,6 +319,7 @@ >>>> } >>>> return ptr->buffers[ptr->cntr-1]; >>>> } >>>> +#endif >>>> >>>> 
>>>> >>>> @@ -403,6 +414,7 @@ >>>> return ORTE_SUCCESS; >>>> } >>>> >>>> +#if ORTE_ENABLE_EPOCH >>>> int orte_util_convert_epoch_to_string(char **epoch_string, const >>>> orte_epoch_t epoch) >>>> { >>>> /* check for wildcard value - handle appropriately */ >>>> @@ -425,7 +437,6 @@ >>>> return ORTE_SUCCESS; >>>> } >>>> >>>> - >>>> int orte_util_convert_string_to_epoch(orte_epoch_t *epoch, const char* >>>> epoch_string) >>>> { >>>> if (NULL == epoch_string) { /* got an error */ >>>> @@ -450,6 +461,7 @@ >>>> >>>> return ORTE_SUCCESS; >>>> } >>>> +#endif >>>> >>>> int orte_util_convert_string_to_process_name(orte_process_name_t *name, >>>> const char* name_string) >>>> @@ -457,13 +469,15 @@ >>>> char *temp, *token; >>>> orte_jobid_t job; >>>> orte_vpid_t vpid; >>>> +#if ORTE_ENABLE_EPOCH >>>> orte_epoch_t epoch; >>>> +#endif >>>> int return_code=ORTE_SUCCESS; >>>> - >>>> + >>>> /* set default */ >>>> name->jobid = ORTE_JOBID_INVALID; >>>> name->vpid = ORTE_VPID_INVALID; >>>> - name->epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(name->epoch,ORTE_EPOCH_MIN); >>>> >>>> /* check for NULL string - error */ >>>> if (NULL == name_string) { >>>> @@ -510,6 +524,7 @@ >>>> vpid = strtoul(token, NULL, 10); >>>> } >>>> >>>> +#if ORTE_ENABLE_EPOCH >>>> token = strtok(NULL, ORTE_SCHEMA_DELIMITER_STRING); /** get next field >>>> -> epoch*/ >>>> >>>> /* check for error */ >>>> @@ -528,10 +543,11 @@ >>>> } else { >>>> epoch = strtoul(token, NULL, 10); >>>> } >>>> +#endif >>>> >>>> name->jobid = job; >>>> name->vpid = vpid; >>>> - name->epoch = epoch; >>>> + ORTE_EPOCH_SET(name->epoch,epoch); >>>> >>>> free(temp); >>>> >>>> @@ -568,6 +584,7 @@ >>>> asprintf(&tmp2, "%s%c%lu", tmp, ORTE_SCHEMA_DELIMITER_CHAR, (unsigned >>>> long)name->vpid); >>>> } >>>> >>>> +#if ORTE_ENABLE_EPOCH >>>> if (ORTE_EPOCH_WILDCARD == name->epoch) { >>>> asprintf(name_string, "%s%c%s", tmp2, ORTE_SCHEMA_DELIMITER_CHAR, >>>> ORTE_SCHEMA_WILDCARD_STRING); >>>> } else if (ORTE_EPOCH_INVALID == name->epoch) { >>>> 
@@ -575,6 +592,10 @@ >>>> } else { >>>> asprintf(name_string, "%s%c%lu", tmp2, ORTE_SCHEMA_DELIMITER_CHAR, >>>> (unsigned long)name->epoch); >>>> } >>>> +#else >>>> + asprintf(name_string, "%s", tmp2); >>>> +#endif >>>> + >>>> free(tmp); >>>> free(tmp2); >>>> >>>> @@ -585,8 +606,11 @@ >>>> /**** CREATE PROCESS NAME ****/ >>>> int orte_util_create_process_name(orte_process_name_t **name, >>>> orte_jobid_t job, >>>> - orte_vpid_t vpid, >>>> - orte_epoch_t epoch) >>>> + orte_vpid_t vpid >>>> +#if ORTE_ENABLE_EPOCH >>>> + ,orte_epoch_t epoch >>>> +#endif >>>> + ) >>>> { >>>> *name = NULL; >>>> >>>> @@ -598,7 +622,8 @@ >>>> >>>> (*name)->jobid = job; >>>> (*name)->vpid = vpid; >>>> - (*name)->epoch = epoch; >>>> + ORTE_EPOCH_SET((*name)->epoch,epoch); >>>> + >>>> return ORTE_SUCCESS; >>>> } >>>> >>>> @@ -655,6 +680,7 @@ >>>> } >>>> } >>>> >>>> +#if ORTE_ENABLE_EPOCH >>>> /* Get here if jobid's and vpid's are equal, or not being checked. >>>> * Now check epoch. >>>> */ >>>> @@ -666,6 +692,7 @@ >>>> return OPAL_VALUE1_GREATER; >>>> } >>>> } >>>> +#endif >>>> >>>> /* only way to get here is if all fields are being checked and are equal, >>>> * or jobid not checked, but vpid equal, >>>> >>>> Modified: trunk/orte/util/name_fns.h >>>> ============================================================================== >>>> --- trunk/orte/util/name_fns.h (original) >>>> +++ trunk/orte/util/name_fns.h 2011-08-26 18:16:14 EDT (Fri, 26 Aug >>>> 2011) >>>> @@ -61,9 +61,13 @@ >>>> #define ORTE_VPID_PRINT(n) \ >>>> orte_util_print_vpids(n) >>>> >>>> +#if ORTE_ENABLE_EPOCH >>>> ORTE_DECLSPEC char* orte_util_print_epoch(const orte_epoch_t epoch); >>>> #define ORTE_EPOCH_PRINT(n) \ >>>> orte_util_print_epoch(n) >>>> +#else >>>> +#define ORTE_EPOCH_PRINT(n) >>>> +#endif >>>> >>>> ORTE_DECLSPEC char* orte_util_print_job_family(const orte_jobid_t job); >>>> #define ORTE_JOB_FAMILY_PRINT(n) \ >>>> @@ -104,6 +108,24 @@ >>>> #define ORTE_JOBID_IS_DAEMON(n) \ >>>> !((n) & 0x0000ffff) >>>> >>>> +/* 
Macro for getting the epoch out of the process name */ >>>> +#if ORTE_ENABLE_EPOCH >>>> +#define ORTE_EPOCH_GET(n) \ >>>> + ((n)->epoch) >>>> +#else >>>> +#define ORTE_EPOCH_GET(n) >>>> +#endif >>>> + >>>> +/* Macro for setting the epoch in the process name */ >>>> +#if ORTE_ENABLE_EPOCH >>>> +#define ORTE_EPOCH_SET(n,m) \ >>>> + ( (n) = (m) ) >>>> +#else >>>> +#define ORTE_EPOCH_SET(n,m) \ >>>> + do { \ >>>> + } while(0); >>>> +#endif >>>> + >>>> /* List of names for general use */ >>>> struct orte_namelist_t { >>>> opal_list_item_t item; /**< Allows this item to be placed on a list >>>> */ >>>> @@ -117,16 +139,24 @@ >>>> ORTE_DECLSPEC int orte_util_convert_string_to_jobid(orte_jobid_t *jobid, >>>> const char* jobidstring); >>>> ORTE_DECLSPEC int orte_util_convert_vpid_to_string(char **vpid_string, >>>> const orte_vpid_t vpid); >>>> ORTE_DECLSPEC int orte_util_convert_string_to_vpid(orte_vpid_t *vpid, >>>> const char* vpidstring); >>>> +#if ORTE_ENABLE_EPOCH >>>> ORTE_DECLSPEC int orte_util_convert_epoch_to_string(char **epoch_string, >>>> const orte_epoch_t epoch); >>>> ORTE_DECLSPEC int orte_util_convert_string_to_epoch(orte_vpid_t *epoch, >>>> const char* epochstring); >>>> +#endif >>>> ORTE_DECLSPEC int >>>> orte_util_convert_string_to_process_name(orte_process_name_t *name, >>>> const char* name_string); >>>> ORTE_DECLSPEC int orte_util_convert_process_name_to_string(char** >>>> name_string, >>>> const orte_process_name_t *name); >>>> +#if ORTE_ENABLE_EPOCH >>>> ORTE_DECLSPEC int orte_util_create_process_name(orte_process_name_t **name, >>>> orte_jobid_t job, >>>> orte_vpid_t vpid, >>>> orte_epoch_t epoch); >>>> +#else >>>> +ORTE_DECLSPEC int orte_util_create_process_name(orte_process_name_t >>>> **name, >>>> + orte_jobid_t job, >>>> + orte_vpid_t vpid); >>>> +#endif >>>> ORTE_DECLSPEC int orte_util_compare_name_fields(orte_ns_cmp_bitmask_t >>>> fields, >>>> const orte_process_name_t* name1, >>>> const orte_process_name_t* name2); >>>> >>>> Modified: 
trunk/orte/util/nidmap.c >>>> ============================================================================== >>>> --- trunk/orte/util/nidmap.c (original) >>>> +++ trunk/orte/util/nidmap.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug >>>> 2011) >>>> @@ -249,7 +249,7 @@ >>>> */ >>>> /* construct the URI */ >>>> proc.vpid = node->daemon; >>>> - proc.epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(proc.epoch,ORTE_EPOCH_MIN); >>>> >>>> orte_util_convert_process_name_to_string(&proc_name, &proc); >>>> asprintf(&uri, "%s;tcp://%s:%d", proc_name, addr, >>>> (int)orte_process_info.my_port); >>>> @@ -1001,6 +1001,7 @@ >>>> } >>>> #endif >>>> >>>> +#if ORTE_ENABLE_EPOCH >>>> /* Look up the current epoch value that we have stored locally. >>>> * >>>> * Note that this will not ping the HNP to get the most up to date epoch >>>> stored >>>> @@ -1023,7 +1024,9 @@ >>>> /*print_orte_job_data();*/ >>>> return e; >>>> } >>>> +#endif >>>> >>>> +#if ORTE_RESIL_ORTE >>>> bool orte_util_proc_is_running(orte_process_name_t *proc) { >>>> int i; >>>> unsigned int j; >>>> @@ -1078,7 +1081,9 @@ >>>> >>>> return ORTE_ERROR; >>>> } >>>> +#endif >>>> >>>> +#if ORTE_ENABLE_EPOCH >>>> /* >>>> * This function performs both the get and set operations on the epoch for a >>>> * sepcific process name. If the epoch passed into the function is >>>> @@ -1091,6 +1096,11 @@ >>>> orte_job_t *jdata; >>>> orte_proc_t *pdata; >>>> >>>> + if (ORTE_JOBID_INVALID == proc->jobid || >>>> + ORTE_VPID_INVALID == proc->vpid) { >>>> + return ORTE_EPOCH_INVALID; >>>> + } >>>> + >>>> /* Sanity check just to make sure we don't overwrite our existing >>>> * orte_job_data. 
>>>> */ >>>> @@ -1165,4 +1175,5 @@ >>>> return ORTE_EPOCH_MIN; >>>> } >>>> } >>>> +#endif >>>> >>>> >>>> Modified: trunk/orte/util/nidmap.h >>>> ============================================================================== >>>> --- trunk/orte/util/nidmap.h (original) >>>> +++ trunk/orte/util/nidmap.h 2011-08-26 18:16:14 EDT (Fri, 26 Aug >>>> 2011) >>>> @@ -48,11 +48,19 @@ >>>> ORTE_DECLSPEC orte_pmap_t* orte_util_lookup_pmap(orte_process_name_t >>>> *proc); >>>> ORTE_DECLSPEC orte_nid_t* orte_util_lookup_nid(orte_process_name_t *proc); >>>> >>>> +#if ORTE_ENABLE_EPOCH >>>> ORTE_DECLSPEC orte_epoch_t orte_util_lookup_epoch(orte_process_name_t >>>> *proc); >>>> ORTE_DECLSPEC orte_epoch_t orte_util_set_epoch(orte_process_name_t *proc, >>>> orte_epoch_t epoch); >>>> +#endif >>>> >>>> ORTE_DECLSPEC int orte_util_set_proc_state(orte_process_name_t *proc, >>>> orte_proc_state_t state); >>>> + >>>> +#if ORTE_RESIL_ORTE >>>> +#define PROC_IS_RUNNING(n) orte_util_proc_is_running(n) >>>> ORTE_DECLSPEC bool orte_util_proc_is_running(orte_process_name_t *proc); >>>> +#else >>>> +#define PROC_IS_RUNNING(n) ( true ) >>>> +#endif >>>> >>>> ORTE_DECLSPEC int orte_util_encode_nodemap(opal_byte_object_t *boptr); >>>> ORTE_DECLSPEC int orte_util_decode_nodemap(opal_byte_object_t *boptr); >>>> @@ -72,5 +80,8 @@ >>>> END_C_DECLS >>>> >>>> /* Local functions */ >>>> +#if ORTE_ENABLE_EPOCH >>>> orte_epoch_t get_epoch_from_orte_job_data(orte_process_name_t *proc, >>>> orte_epoch_t epoch); >>>> #endif >>>> + >>>> +#endif >>>> >>>> Modified: trunk/orte/util/proc_info.c >>>> ============================================================================== >>>> --- trunk/orte/util/proc_info.c (original) >>>> +++ trunk/orte/util/proc_info.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug >>>> 2011) >>>> @@ -36,13 +36,19 @@ >>>> >>>> #include "orte/util/proc_info.h" >>>> >>>> +#if ORTE_ENABLE_EPOCH >>>> +#define ORTE_NAME_INVALID {ORTE_JOBID_INVALID, ORTE_VPID_INVALID, >>>> ORTE_EPOCH_MIN} >>>> +#else >>>> 
+#define ORTE_NAME_INVALID {ORTE_JOBID_INVALID, ORTE_VPID_INVALID} >>>> +#endif >>>> + >>>> ORTE_DECLSPEC orte_proc_info_t orte_process_info = { >>>> - /* .my_name = */ {ORTE_JOBID_INVALID, >>>> ORTE_VPID_INVALID, ORTE_EPOCH_MIN}, >>>> - /* .my_daemon = */ {ORTE_JOBID_INVALID, >>>> ORTE_VPID_INVALID, ORTE_EPOCH_MIN}, >>>> + /* .my_name = */ ORTE_NAME_INVALID, >>>> + /* .my_daemon = */ ORTE_NAME_INVALID, >>>> /* .my_daemon_uri = */ NULL, >>>> - /* .my_hnp = */ {ORTE_JOBID_INVALID, >>>> ORTE_VPID_INVALID, ORTE_EPOCH_MIN}, >>>> + /* .my_hnp = */ ORTE_NAME_INVALID, >>>> /* .my_hnp_uri = */ NULL, >>>> - /* .my_parent = */ {ORTE_JOBID_INVALID, >>>> ORTE_VPID_INVALID, ORTE_EPOCH_MIN}, >>>> + /* .my_parent = */ ORTE_NAME_INVALID, >>>> /* .hnp_pid = */ 0, >>>> /* .app_num = */ 0, >>>> /* .num_procs = */ 1, >>>> >>>> Modified: trunk/test/util/orte_session_dir.c >>>> ============================================================================== >>>> --- trunk/test/util/orte_session_dir.c (original) >>>> +++ trunk/test/util/orte_session_dir.c 2011-08-26 18:16:14 EDT (Fri, >>>> 26 Aug 2011) >>>> @@ -57,7 +57,7 @@ >>>> orte_process_info.my_name->cellid = 0; >>>> orte_process_info.my_name->jobid = 0; >>>> orte_process_info.my_name->vpid = 0; >>>> - orte_process_info.my_name->epoch = ORTE_EPOCH_MIN; >>>> + ORTE_EPOCH_SET(orte_process_info.my_name->epoch,ORTE_EPOCH_MIN); >>>> >>>> test_init("orte_session_dir_t"); >>>> test_out = fopen( "test_session_dir_out", "w+" ); >>>> _______________________________________________ >>>> svn-full mailing list >>>> svn-f...@open-mpi.org >>>> http://www.open-mpi.org/mailman/listinfo.cgi/svn-full >>> >>> >>> _______________________________________________ >>> devel mailing list >>> de...@open-mpi.org >>> http://www.open-mpi.org/mailman/listinfo.cgi/devel >> >> >> _______________________________________________ >> devel mailing list >> de...@open-mpi.org >> http://www.open-mpi.org/mailman/listinfo.cgi/devel > > > 
_______________________________________________ > devel mailing list > de...@open-mpi.org > http://www.open-mpi.org/mailman/listinfo.cgi/devel