The epoch and resilient ORTE code is now macro'd away. To enable it, use --enable-resilient-orte
which defines: ORTE_ENABLE_EPOCH ORTE_RESIL_ORTE -- Wesley On Aug 26, 2011, at 6:16 PM, wbl...@osl.iu.edu wrote: > Author: wbland > Date: 2011-08-26 18:16:14 EDT (Fri, 26 Aug 2011) > New Revision: 25093 > URL: https://svn.open-mpi.org/trac/ompi/changeset/25093 > > Log: > By popular demand the epoch code is now disabled by default. > > To enable the epochs and the resilient orte code, use the configure flag: > > --enable-resilient-orte > > This will define both: > > ORTE_ENABLE_EPOCH > ORTE_RESIL_ORTE > > Text files modified: > trunk/ompi/mca/btl/openib/connect/btl_openib_connect_xoob.c | 12 ++++ > > trunk/ompi/mca/coll/sm2/coll_sm2_module.c | 3 > > trunk/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c | 49 > ++++++++---------- > trunk/ompi/mca/dpm/orte/dpm_orte.c | 2 > > trunk/ompi/mca/pml/bfo/pml_bfo_failover.c | 10 +-- > > trunk/ompi/mca/pml/bfo/pml_bfo_hdr.h | 6 -- > > trunk/ompi/proc/proc.c | 6 +- > > trunk/opal/config/opal_configure_options.m4 | 8 +++ > > trunk/orte/include/orte/types.h | 24 > +++++++++ > trunk/orte/mca/db/daemon/db_daemon.c | 2 > > trunk/orte/mca/errmgr/app/errmgr_app.c | 19 > ++++++- > trunk/orte/mca/errmgr/base/errmgr_base_fns.c | 12 ++-- > > trunk/orte/mca/errmgr/base/errmgr_base_tool.c | 6 +- > > trunk/orte/mca/errmgr/hnp/errmgr_hnp.c | 99 > +++++++++++++++++++++++++++------------ > trunk/orte/mca/errmgr/hnp/errmgr_hnp_autor.c | 6 +- > > trunk/orte/mca/errmgr/hnp/errmgr_hnp_crmig.c | 6 +- > > trunk/orte/mca/errmgr/orted/errmgr_orted.c | 71 > +++++++++++++++++++++------- > trunk/orte/mca/ess/alps/ess_alps_module.c | 4 > > trunk/orte/mca/ess/base/base.h | 4 + > > trunk/orte/mca/ess/base/ess_base_select.c | 14 ++--- > > trunk/orte/mca/ess/env/ess_env_module.c | 3 > > trunk/orte/mca/ess/ess.h | 4 + > > trunk/orte/mca/ess/generic/ess_generic_module.c | 6 +- > > trunk/orte/mca/ess/hnp/ess_hnp_module.c | 2 > > trunk/orte/mca/ess/lsf/ess_lsf_module.c | 3 > > trunk/orte/mca/ess/singleton/ess_singleton_module.c | 2 > > 
trunk/orte/mca/ess/slave/ess_slave_module.c | 3 > > trunk/orte/mca/ess/slurm/ess_slurm_module.c | 3 > > trunk/orte/mca/ess/slurmd/ess_slurmd_module.c | 4 > > trunk/orte/mca/ess/tm/ess_tm_module.c | 2 > > trunk/orte/mca/filem/rsh/filem_rsh_module.c | 6 +- > > trunk/orte/mca/grpcomm/base/grpcomm_base_coll.c | 21 > ++----- > trunk/orte/mca/grpcomm/hier/grpcomm_hier_module.c | 8 +- > > trunk/orte/mca/iof/base/base.h | 8 +- > > trunk/orte/mca/iof/base/iof_base_open.c | 2 > > trunk/orte/mca/iof/hnp/iof_hnp.c | 7 +- > > trunk/orte/mca/iof/hnp/iof_hnp_receive.c | 6 +- > > trunk/orte/mca/iof/orted/iof_orted.c | 2 > > trunk/orte/mca/odls/base/odls_base_default_fns.c | 7 +- > > trunk/orte/mca/odls/base/odls_base_open.c | 5 - > > trunk/orte/mca/odls/base/odls_base_state.c | 6 +- > > trunk/orte/mca/oob/tcp/oob_tcp_msg.c | 2 > > trunk/orte/mca/oob/tcp/oob_tcp_peer.c | 5 ++ > > trunk/orte/mca/plm/base/plm_base_jobid.c | 4 > > trunk/orte/mca/plm/base/plm_base_launch_support.c | 3 > > trunk/orte/mca/plm/base/plm_base_orted_cmds.c | 8 +-- > > trunk/orte/mca/plm/base/plm_base_receive.c | 7 ++ > > trunk/orte/mca/plm/base/plm_base_rsh_support.c | 4 + > > trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c | 23 > +++++---- > trunk/orte/mca/rmaps/rank_file/rmaps_rank_file.c | 3 > > trunk/orte/mca/rmaps/seq/rmaps_seq.c | 3 > > trunk/orte/mca/rmcast/base/rmcast_base_open.c | 6 +- > > trunk/orte/mca/rmcast/tcp/rmcast_tcp.c | 4 > > trunk/orte/mca/rmcast/udp/rmcast_udp.c | 4 > > trunk/orte/mca/rml/base/rml_base_components.c | 5 + > > trunk/orte/mca/rml/rml_types.h | 6 + > > trunk/orte/mca/routed/base/routed_base_components.c | 6 +- > > trunk/orte/mca/routed/base/routed_base_register_sync.c | 4 + > > trunk/orte/mca/routed/binomial/routed_binomial.c | 54 > ++++++++++++--------- > trunk/orte/mca/routed/cm/routed_cm.c | 19 > +++---- > trunk/orte/mca/routed/direct/routed_direct.c | 3 > > trunk/orte/mca/routed/linear/routed_linear.c | 17 +++--- > > trunk/orte/mca/routed/radix/routed_radix.c | 22 > 
++++---- > trunk/orte/mca/routed/slave/routed_slave.c | 6 +- > > trunk/orte/mca/sensor/file/sensor_file.c | 2 > > trunk/orte/mca/snapc/base/snapc_base_fns.c | 4 > > trunk/orte/mca/snapc/full/snapc_full_global.c | 12 ++-- > > trunk/orte/mca/snapc/full/snapc_full_local.c | 6 +- > > trunk/orte/mca/snapc/full/snapc_full_module.c | 4 > > trunk/orte/mca/sstore/base/sstore_base_fns.c | 6 +- > > trunk/orte/mca/sstore/central/sstore_central_global.c | 3 > > trunk/orte/mca/sstore/central/sstore_central_local.c | 6 +- > > trunk/orte/mca/sstore/stage/sstore_stage_global.c | 7 +- > > trunk/orte/mca/sstore/stage/sstore_stage_local.c | 12 ++-- > > trunk/orte/orted/orted_comm.c | 20 > ++++---- > trunk/orte/orted/orted_main.c | 7 +- > > trunk/orte/runtime/data_type_support/orte_dt_compare_fns.c | 4 + > > trunk/orte/runtime/data_type_support/orte_dt_copy_fns.c | 4 + > > trunk/orte/runtime/data_type_support/orte_dt_packing_fns.c | 6 ++ > > trunk/orte/runtime/data_type_support/orte_dt_print_fns.c | 19 > +++++++ > trunk/orte/runtime/data_type_support/orte_dt_size_fns.c | 2 > > trunk/orte/runtime/data_type_support/orte_dt_support.h | 11 ++++ > > trunk/orte/runtime/data_type_support/orte_dt_unpacking_fns.c | 10 +++ > > trunk/orte/runtime/orte_data_server.c | 2 > > trunk/orte/runtime/orte_globals.c | 4 + > > trunk/orte/runtime/orte_init.c | 9 +++ > > trunk/orte/runtime/orte_wait.h | 6 +- > > trunk/orte/test/system/oob_stress.c | 3 > > trunk/orte/test/system/orte_ring.c | 6 - > > trunk/orte/test/system/orte_spawn.c | 4 > > trunk/orte/tools/orte-ps/orte-ps.c | 10 +++ > > trunk/orte/tools/orte-top/orte-top.c | 2 > > trunk/orte/util/comm/comm.c | 7 ++ > > trunk/orte/util/comm/comm.h | 5 + > > trunk/orte/util/hnp_contact.c | 3 > > trunk/orte/util/name_fns.c | 47 > ++++++++++++++---- > trunk/orte/util/name_fns.h | 30 > ++++++++++++ > trunk/orte/util/nidmap.c | 13 ++++ > > trunk/orte/util/nidmap.h | 11 ++++ > > trunk/orte/util/proc_info.c | 14 ++++- > > trunk/test/util/orte_session_dir.c | 2 > > 
101 files changed, 652 insertions(+), 362 deletions(-) > > Modified: trunk/ompi/mca/btl/openib/connect/btl_openib_connect_xoob.c > ============================================================================== > --- trunk/ompi/mca/btl/openib/connect/btl_openib_connect_xoob.c > (original) > +++ trunk/ompi/mca/btl/openib/connect/btl_openib_connect_xoob.c > 2011-08-26 18:16:14 EDT (Fri, 26 Aug 2011) > @@ -693,8 +693,16 @@ > bool found = false; > > BTL_VERBOSE(("Searching for ep and proc with follow parameters:" > - "jobid %d, vpid %d, epoch %d, sid %" PRIx64 ", lid %d", > - process_name->jobid, process_name->vpid, > process_name->epoch, subnet_id, lid)); > + "jobid %d, vpid %d, " > +#if ORTE_ENABLE_EPOCH > + "epoch %d, " > +#endif > + "sid %" PRIx64 ", lid %d", > + process_name->jobid, process_name->vpid, > +#if ORTE_ENABLE_EPOCH > + process_name->epoch, > +#endif > + subnet_id, lid)); > /* find ibproc */ > OPAL_THREAD_LOCK(&mca_btl_openib_component.ib_lock); > for (ib_proc = (mca_btl_openib_proc_t*) > > Modified: trunk/ompi/mca/coll/sm2/coll_sm2_module.c > ============================================================================== > --- trunk/ompi/mca/coll/sm2/coll_sm2_module.c (original) > +++ trunk/ompi/mca/coll/sm2/coll_sm2_module.c 2011-08-26 18:16:14 EDT (Fri, > 26 Aug 2011) > @@ -1208,7 +1208,8 @@ > peer = OBJ_NEW(orte_namelist_t); > peer->name.jobid = > comm->c_local_group->grp_proc_pointers[i]->proc_name.jobid; > peer->name.vpid = > comm->c_local_group->grp_proc_pointers[i]->proc_name.vpid; > - peer->name.epoch = > comm->c_local_group->grp_proc_pointers[i]->proc_name.epoch; > + > ORTE_EPOCH_SET(peer->name.epoch,comm->c_local_group->grp_proc_pointers[i]->proc_name.epoch); > + > opal_list_append(&peers, &peer->item); > } > /* prepare send data */ > > Modified: trunk/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c > ============================================================================== > --- trunk/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c (original) > +++ 
trunk/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c 2011-08-26 18:16:14 EDT > (Fri, 26 Aug 2011) > @@ -702,7 +702,7 @@ > void ompi_crcp_bkmrk_pml_peer_ref_construct(ompi_crcp_bkmrk_pml_peer_ref_t > *peer_ref) { > peer_ref->proc_name.jobid = ORTE_JOBID_INVALID; > peer_ref->proc_name.vpid = ORTE_VPID_INVALID; > - peer_ref->proc_name.epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(peer_ref->proc_name.epoch,ORTE_EPOCH_MIN); > > OBJ_CONSTRUCT(&peer_ref->send_list, opal_list_t); > OBJ_CONSTRUCT(&peer_ref->isend_list, opal_list_t); > @@ -730,7 +730,7 @@ > > peer_ref->proc_name.jobid = ORTE_JOBID_INVALID; > peer_ref->proc_name.vpid = ORTE_VPID_INVALID; > - peer_ref->proc_name.epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(peer_ref->proc_name.epoch,ORTE_EPOCH_MIN); > > while( NULL != (item = opal_list_remove_first(&peer_ref->send_list)) ) { > HOKE_TRAFFIC_MSG_REF_RETURN(item); > @@ -840,7 +840,7 @@ > > msg_ref->proc_name.jobid = ORTE_JOBID_INVALID; > msg_ref->proc_name.vpid = ORTE_VPID_INVALID; > - msg_ref->proc_name.epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(msg_ref->proc_name.epoch,ORTE_EPOCH_MIN); > > msg_ref->matched = INVALID_INT; > msg_ref->done = INVALID_INT; > @@ -868,7 +868,7 @@ > > msg_ref->proc_name.jobid = ORTE_JOBID_INVALID; > msg_ref->proc_name.vpid = ORTE_VPID_INVALID; > - msg_ref->proc_name.epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(msg_ref->proc_name.epoch,ORTE_EPOCH_MIN); > > msg_ref->matched = INVALID_INT; > msg_ref->done = INVALID_INT; > @@ -902,7 +902,7 @@ > > msg_ref->proc_name.jobid = ORTE_JOBID_INVALID; > msg_ref->proc_name.vpid = ORTE_VPID_INVALID; > - msg_ref->proc_name.epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(msg_ref->proc_name.epoch,ORTE_EPOCH_MIN); > > msg_ref->done = INVALID_INT; > msg_ref->active = INVALID_INT; > @@ -934,7 +934,7 @@ > > msg_ref->proc_name.jobid = ORTE_JOBID_INVALID; > msg_ref->proc_name.vpid = ORTE_VPID_INVALID; > - msg_ref->proc_name.epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(msg_ref->proc_name.epoch,ORTE_EPOCH_MIN); > > msg_ref->done = 
INVALID_INT; > msg_ref->active = INVALID_INT; > @@ -954,7 +954,7 @@ > > msg_ack_ref->peer.jobid = ORTE_JOBID_INVALID; > msg_ack_ref->peer.vpid = ORTE_VPID_INVALID; > - msg_ack_ref->peer.epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(msg_ack_ref->peer.epoch,ORTE_EPOCH_MIN); > } > > void ompi_crcp_bkmrk_pml_drain_message_ack_ref_destruct( > ompi_crcp_bkmrk_pml_drain_message_ack_ref_t *msg_ack_ref) { > @@ -962,7 +962,7 @@ > > msg_ack_ref->peer.jobid = ORTE_JOBID_INVALID; > msg_ack_ref->peer.vpid = ORTE_VPID_INVALID; > - msg_ack_ref->peer.epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(msg_ack_ref->peer.epoch,ORTE_EPOCH_MIN); > } > > > @@ -1015,7 +1015,7 @@ > } > > > -#define CREATE_NEW_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, v_rank, > v_comm, p_jobid, p_vpid, p_epoch) \ > +#define CREATE_NEW_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, v_rank, > v_comm, p_jobid, p_vpid) \ > { \ > HOKE_TRAFFIC_MSG_REF_ALLOC(msg_ref, ret); \ > \ > @@ -1034,7 +1034,7 @@ > \ > msg_ref->proc_name.jobid = p_jobid; \ > msg_ref->proc_name.vpid = p_vpid; \ > - msg_ref->proc_name.epoch = p_epoch; \ > + > ORTE_EPOCH_SET(msg_ref->proc_name.epoch,orte_ess.proc_get_epoch(&(msg_ref->proc_name))); > \ > \ > msg_ref->matched = 0; \ > msg_ref->done = 0; \ > @@ -1043,7 +1043,7 @@ > msg_ref->active_drain = 0; \ > } > > -#define CREATE_NEW_DRAIN_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, > v_rank, v_comm, p_jobid, p_vpid, p_epoch) \ > +#define CREATE_NEW_DRAIN_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, > v_rank, v_comm, p_jobid, p_vpid) \ > { \ > HOKE_DRAIN_MSG_REF_ALLOC(msg_ref, ret); \ > \ > @@ -1063,7 +1063,7 @@ > \ > msg_ref->proc_name.jobid = p_jobid; \ > msg_ref->proc_name.vpid = p_vpid; \ > - msg_ref->proc_name.epoch = p_epoch; \ > + > ORTE_EPOCH_SET(msg_ref->proc_name.epoch,orte_ess.proc_get_epoch(&(msg_ref->proc_name))); > \ > } > > > @@ -1466,7 +1466,7 @@ > > new_peer_ref->proc_name.jobid = procs[i]->proc_name.jobid; > new_peer_ref->proc_name.vpid = procs[i]->proc_name.vpid; > - 
new_peer_ref->proc_name.epoch = procs[i]->proc_name.epoch; > + > ORTE_EPOCH_SET(new_peer_ref->proc_name.epoch,procs[i]->proc_name.epoch); > > opal_list_append(&ompi_crcp_bkmrk_pml_peer_refs, > &(new_peer_ref->super)); > } > @@ -3237,13 +3237,11 @@ > CREATE_NEW_MSG((*msg_ref), msg_type, > count, ddt_size, tag, dest, comm, > peer_ref->proc_name.jobid, > - peer_ref->proc_name.vpid, > - peer_ref->proc_name.epoch); > + peer_ref->proc_name.vpid); > } else { > CREATE_NEW_MSG((*msg_ref), msg_type, > count, ddt_size, tag, dest, comm, > - ORTE_JOBID_INVALID, ORTE_VPID_INVALID, > - ORTE_EPOCH_INVALID); > + ORTE_JOBID_INVALID, ORTE_VPID_INVALID); > } > > if( msg_type == COORD_MSG_TYPE_P_SEND || > @@ -3377,7 +3375,7 @@ > if( NULL == from_peer_ref && NULL != to_peer_ref ) { > (*new_msg_ref)->proc_name.jobid = to_peer_ref->proc_name.jobid; > (*new_msg_ref)->proc_name.vpid = to_peer_ref->proc_name.vpid; > - (*new_msg_ref)->proc_name.epoch = to_peer_ref->proc_name.epoch; > + > ORTE_EPOCH_SET((*new_msg_ref)->proc_name.epoch,to_peer_ref->proc_name.epoch); > } > > return exit_status; > @@ -3808,8 +3806,7 @@ > CREATE_NEW_DRAIN_MSG((*msg_ref), msg_type, > count, NULL, tag, dest, comm, > peer_ref->proc_name.jobid, > - peer_ref->proc_name.vpid, > - peer_ref->proc_name.epoch); > + peer_ref->proc_name.vpid); > > (*msg_ref)->done = 0; > (*msg_ref)->active = 0; > @@ -5284,8 +5281,7 @@ > */ > peer_name.jobid = ORTE_PROC_MY_NAME->jobid; > peer_name.vpid = peer_idx; > - peer_name.epoch = ORTE_EPOCH_INVALID; > - peer_name.epoch = orte_ess.proc_get_epoch(&peer_name); > + ORTE_EPOCH_SET(peer_name.epoch,orte_ess.proc_get_epoch(&peer_name)); > > if( NULL == (peer_ref = find_peer(peer_name))) { > opal_output(mca_crcp_bkmrk_component.super.output_handle, > @@ -5346,8 +5342,7 @@ > > peer_name.jobid = ORTE_PROC_MY_NAME->jobid; > peer_name.vpid = peer_idx; > - peer_name.epoch = ORTE_EPOCH_INVALID; > - peer_name.epoch = orte_ess.proc_get_epoch(&peer_name); > + 
ORTE_EPOCH_SET(peer_name.epoch,orte_ess.proc_get_epoch(&peer_name)); > > if ( 0 > (ret = orte_rml.recv_buffer_nb(&peer_name, > OMPI_CRCP_COORD_BOOKMARK_TAG, > @@ -5529,7 +5524,8 @@ > HOKE_DRAIN_ACK_MSG_REF_ALLOC(d_msg_ack, ret); > d_msg_ack->peer.jobid = peer_ref->proc_name.jobid; > d_msg_ack->peer.vpid = peer_ref->proc_name.vpid; > - d_msg_ack->peer.epoch = peer_ref->proc_name.epoch; > + ORTE_EPOCH_SET(d_msg_ack->peer.epoch,peer_ref->proc_name.epoch); > + > d_msg_ack->complete = false; > opal_list_append(&drained_msg_ack_list, &(d_msg_ack->super)); > OPAL_OUTPUT_VERBOSE((10, mca_crcp_bkmrk_component.super.output_handle, > @@ -6169,8 +6165,7 @@ > count, datatype_size, tag, rank, > ompi_comm_lookup(comm_id), > peer_ref->proc_name.jobid, > - peer_ref->proc_name.vpid, > - peer_ref->proc_name.epoch); > + peer_ref->proc_name.vpid); > > traffic_message_create_drain_message(true, num_left_unresolved, > peer_ref, > > Modified: trunk/ompi/mca/dpm/orte/dpm_orte.c > ============================================================================== > --- trunk/ompi/mca/dpm/orte/dpm_orte.c (original) > +++ trunk/ompi/mca/dpm/orte/dpm_orte.c 2011-08-26 18:16:14 EDT (Fri, > 26 Aug 2011) > @@ -1130,7 +1130,7 @@ > /* flag the identity of the remote proc */ > carport.jobid = mev->sender.jobid; > carport.vpid = mev->sender.vpid; > - carport.epoch = mev->sender.epoch; > + ORTE_EPOCH_SET(carport.epoch,mev->sender.epoch); > > /* release the event */ > OBJ_RELEASE(mev); > > Modified: trunk/ompi/mca/pml/bfo/pml_bfo_failover.c > ============================================================================== > --- trunk/ompi/mca/pml/bfo/pml_bfo_failover.c (original) > +++ trunk/ompi/mca/pml/bfo/pml_bfo_failover.c 2011-08-26 18:16:14 EDT (Fri, > 26 Aug 2011) > @@ -1,8 +1,5 @@ > /* > * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. > - * Copyright (c) 2004-2011 The University of Tennessee and The University > - * of Tennessee Research Foundation. All rights > - * reserved. 
> * $COPYRIGHT$ > * > * Additional copyrights may follow > @@ -398,13 +395,13 @@ > (hdr->hdr_match.hdr_seq != (uint16_t)recvreq->req_msgseq)) { > orte_proc.jobid = hdr->hdr_restart.hdr_jobid; > orte_proc.vpid = hdr->hdr_restart.hdr_vpid; > - orte_proc.epoch = hdr->hdr_restart.hdr_epoch; > + > ompi_proc = ompi_proc_find(&orte_proc); > opal_output_verbose(20, mca_pml_bfo_output, > "RNDVRESTARTNOTIFY: received: does not match > request, sending NACK back " > "PML:req=%d,hdr=%d CTX:req=%d,hdr=%d > SRC:req=%d,hdr=%d " > "RQS:req=%d,hdr=%d src_req=%p, dst_req=%p, > peer=%d, hdr->hdr_jobid=%d, " > - "hdr->hdr_vpid=%d, hdr->hdr_epoch=%d, > ompi_proc->proc_hostname=%s", > + "hdr->hdr_vpid=%d, ompi_proc->proc_hostname=%s", > (uint16_t)recvreq->req_msgseq, > hdr->hdr_match.hdr_seq, > recvreq->req_recv.req_base.req_comm->c_contextid, > hdr->hdr_match.hdr_ctx, > > recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE, > @@ -413,7 +410,7 @@ > recvreq->remote_req_send.pval, (void *)recvreq, > > recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE, > hdr->hdr_restart.hdr_jobid, > hdr->hdr_restart.hdr_vpid, > - hdr->hdr_restart.hdr_epoch, > ompi_proc->proc_hostname); > + ompi_proc->proc_hostname); > mca_pml_bfo_recv_request_rndvrestartnack(des, ompi_proc, false); > return; > } > @@ -715,7 +712,6 @@ > restart->hdr_dst_rank = sendreq->req_send.req_base.req_peer; /* Needed > for NACKs */ > restart->hdr_jobid = ORTE_PROC_MY_NAME->jobid; > restart->hdr_vpid = ORTE_PROC_MY_NAME->vpid; > - restart->hdr_epoch = ORTE_PROC_MY_NAME->epoch; > > bfo_hdr_hton(restart, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY, proc); > > > Modified: trunk/ompi/mca/pml/bfo/pml_bfo_hdr.h > ============================================================================== > --- trunk/ompi/mca/pml/bfo/pml_bfo_hdr.h (original) > +++ trunk/ompi/mca/pml/bfo/pml_bfo_hdr.h 2011-08-26 18:16:14 EDT (Fri, > 26 Aug 2011) > @@ -2,9 +2,6 @@ > * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana > * 
University Research and Technology > * Corporation. All rights reserved. > - * Copyright (c) 2004-2011 The University of Tennessee and The University > - * of Tennessee Research Foundation. All rights > - * reserved. > * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, > * University of Stuttgart. All rights reserved. > * Copyright (c) 2004-2005 The Regents of the University of California. > @@ -415,7 +412,6 @@ > int32_t hdr_dst_rank; /**< needed to send NACK */ > uint32_t hdr_jobid; /**< needed to send NACK */ > uint32_t hdr_vpid; /**< needed to send NACK */ > - uint32_t hdr_epoch; /**< needed to send NACK */ > }; > typedef struct mca_pml_bfo_restart_hdr_t mca_pml_bfo_restart_hdr_t; > > @@ -428,7 +424,6 @@ > (h).hdr_dst_rank = ntohl((h).hdr_dst_rank); \ > (h).hdr_jobid = ntohl((h).hdr_jobid); \ > (h).hdr_vpid = ntohl((h).hdr_vpid); \ > - (h).hdr_epoch = ntohl((h).hdr_epoch); \ > } while (0) > > #define MCA_PML_BFO_RESTART_HDR_HTON(h) \ > @@ -437,7 +432,6 @@ > (h).hdr_dst_rank = htonl((h).hdr_dst_rank); \ > (h).hdr_jobid = htonl((h).hdr_jobid); \ > (h).hdr_vpid = htonl((h).hdr_vpid); \ > - (h).hdr_epoch = htonl((h).hdr_epoch); \ > } while (0) > > #endif /* PML_BFO */ > > Modified: trunk/ompi/proc/proc.c > ============================================================================== > --- trunk/ompi/proc/proc.c (original) > +++ trunk/ompi/proc/proc.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug 2011) > @@ -108,7 +108,8 @@ > > proc->proc_name.jobid = ORTE_PROC_MY_NAME->jobid; > proc->proc_name.vpid = i; > - proc->proc_name.epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(proc->proc_name.epoch,ORTE_EPOCH_MIN); > + > if (i == ORTE_PROC_MY_NAME->vpid) { > ompi_proc_local_proc = proc; > proc->proc_flags = OPAL_PROC_ALL_LOCAL; > @@ -362,8 +363,7 @@ > > /* Does not change: proc->proc_name.vpid */ > proc->proc_name.jobid = ORTE_PROC_MY_NAME->jobid; > - proc->proc_name.epoch = ORTE_EPOCH_INVALID; > - proc->proc_name.epoch = orte_ess.proc_get_epoch(&proc->proc_name); > 
+ > ORTE_EPOCH_SET(proc->proc_name.epoch,orte_ess.proc_get_epoch(&proc->proc_name)); > > /* Make sure to clear the local flag before we set it below */ > proc->proc_flags = 0; > > Modified: trunk/opal/config/opal_configure_options.m4 > ============================================================================== > --- trunk/opal/config/opal_configure_options.m4 (original) > +++ trunk/opal/config/opal_configure_options.m4 2011-08-26 18:16:14 EDT > (Fri, 26 Aug 2011) > @@ -416,6 +416,14 @@ > AM_CONDITIONAL(WANT_FT_CR, test "$opal_want_ft_cr" = "1") > > # > +# Compile in resilient runtime code > +# > +AC_ARG_ENABLE(resilient-orte, > + [AC_HELP_STRING([--enable-resilient-orte], [Enable the resilient runtime > code.])]) > +AM_CONDITIONAL(ORTE_RESIL_ORTE, [test "$enable_resilient_orte" = "yes"]) > +AM_CONDITIONAL(ORTE_ENABLE_EPOCH, [test "$enable_resilient_orte" = "yes"]) > + > +# > # Do we want to install binaries? > # > AC_ARG_ENABLE([binaries], > > Modified: trunk/orte/include/orte/types.h > ============================================================================== > --- trunk/orte/include/orte/types.h (original) > +++ trunk/orte/include/orte/types.h 2011-08-26 18:16:14 EDT (Fri, 26 Aug > 2011) > @@ -81,24 +81,43 @@ > #define ORTE_VPID_T OPAL_UINT32 > #define ORTE_VPID_MAX UINT32_MAX-2 > #define ORTE_VPID_MIN 0 > + > +#if ORTE_ENABLE_EPOCH > typedef uint32_t orte_epoch_t; > #define ORTE_EPOCH_T OPAL_UINT32 > #define ORTE_EPOCH_MAX UINT32_MAX-2 > #define ORTE_EPOCH_MIN 0 > +#endif > > +#if ORTE_ENABLE_EPOCH > #define ORTE_PROCESS_NAME_HTON(n) \ > do { \ > n.jobid = htonl(n.jobid); \ > n.vpid = htonl(n.vpid); \ > n.epoch = htonl(n.epoch); \ > } while (0) > +#else > +#define ORTE_PROCESS_NAME_HTON(n) \ > +do { \ > + n.jobid = htonl(n.jobid); \ > + n.vpid = htonl(n.vpid); \ > +} while (0) > +#endif > > +#if ORTE_ENABLE_EPOCH > #define ORTE_PROCESS_NAME_NTOH(n) \ > do { \ > n.jobid = ntohl(n.jobid); \ > n.vpid = ntohl(n.vpid); \ > n.epoch = ntohl(n.epoch); \ > } while 
(0) > +#else > +#define ORTE_PROCESS_NAME_NTOH(n) \ > +do { \ > + n.jobid = ntohl(n.jobid); \ > + n.vpid = ntohl(n.vpid); \ > +} while (0) > +#endif > > #define ORTE_NAME_ARGS(n) \ > (unsigned long) ((NULL == n) ? (unsigned long)ORTE_JOBID_INVALID : > (unsigned long)(n)->jobid), \ > @@ -127,6 +146,7 @@ > struct orte_process_name_t { > orte_jobid_t jobid; /**< Job number */ > orte_vpid_t vpid; /**< Process id - equivalent to rank */ > +#if ORTE_ENABLE_EPOCH > orte_epoch_t epoch; /**< Epoch - used to measure the generation of a > recovered process. > * The epoch will start at ORTE_EPOCH_MIN and > * increment every time the process is detected > as > @@ -135,6 +155,7 @@ > * processes that did not directly detect the > * failure to increment their epochs. > */ > +#endif > }; > typedef struct orte_process_name_t orte_process_name_t; > > @@ -157,7 +178,10 @@ > #define ORTE_NAME (OPAL_DSS_ID_DYNAMIC + 2) /**< an > orte_process_name_t */ > #define ORTE_VPID (OPAL_DSS_ID_DYNAMIC + 3) /**< a vpid */ > #define ORTE_JOBID (OPAL_DSS_ID_DYNAMIC + 4) /**< a jobid */ > + > +#if ORTE_ENABLE_EPOCH > #define ORTE_EPOCH (OPAL_DSS_ID_DYNAMIC + 5) /**< an epoch > */ > +#endif > > #if !ORTE_DISABLE_FULL_SUPPORT > /* State-related types */ > > Modified: trunk/orte/mca/db/daemon/db_daemon.c > ============================================================================== > --- trunk/orte/mca/db/daemon/db_daemon.c (original) > +++ trunk/orte/mca/db/daemon/db_daemon.c 2011-08-26 18:16:14 EDT (Fri, > 26 Aug 2011) > @@ -386,7 +386,7 @@ > dat = OBJ_NEW(orte_db_data_t); > dat->name.jobid = sender->jobid; > dat->name.vpid = sender->vpid; > - dat->name.epoch= sender->epoch; > + ORTE_EPOCH_SET(dat->name.epoch,sender->epoch); > dat->key = key; > count=1; > opal_dss.unpack(buf, &dat->size, &count, OPAL_INT32); > > Modified: trunk/orte/mca/errmgr/app/errmgr_app.c > ============================================================================== > --- trunk/orte/mca/errmgr/app/errmgr_app.c (original) > 
+++ trunk/orte/mca/errmgr/app/errmgr_app.c 2011-08-26 18:16:14 EDT (Fri, > 26 Aug 2011) > @@ -82,8 +82,10 @@ > NULL, > NULL, > NULL, > - orte_errmgr_base_register_migration_warning, > - orte_errmgr_base_set_fault_callback > + orte_errmgr_base_register_migration_warning > +#if ORTE_RESIL_ORTE > + ,orte_errmgr_base_set_fault_callback > +#endif > }; > > /************************ > @@ -93,18 +95,23 @@ > { > int ret = ORTE_SUCCESS; > > +#if ORTE_RESIL_ORTE > ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, > ORTE_RML_TAG_EPOCH_CHANGE, > ORTE_RML_PERSISTENT, > epoch_change_recv, > NULL); > +#endif > + > return ret; > } > > static int finalize(void) > { > +#if ORTE_RESIL_ORTE > orte_rml.recv_cancel(ORTE_NAME_WILDCARD, > ORTE_RML_TAG_EPOCH_CHANGE); > +#endif > > return ORTE_SUCCESS; > } > @@ -151,6 +158,7 @@ > return ORTE_SUCCESS; > } > > +#if ORTE_RESIL_ORTE > void epoch_change_recv(int status, > orte_process_name_t *sender, > opal_buffer_t *buffer, > @@ -209,15 +217,20 @@ > ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); > > (*fault_cbfunc)(procs); > + } else if (NULL == fault_cbfunc) { > + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, > + "%s errmgr:app Calling fault callback failed (NULL > pointer)!", > + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); > } else { > OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, > - "%s errmgr:app Calling fault callback failed!", > + "%s errmgr:app Calling fault callback failed (num_dead > <= 0)!", > ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); > } > > free(proc); > OBJ_RELEASE(procs); > } > +#endif > > static int orte_errmgr_app_abort_peers(orte_process_name_t *procs, > orte_std_cntr_t num_procs) > { > > Modified: trunk/orte/mca/errmgr/base/errmgr_base_fns.c > ============================================================================== > --- trunk/orte/mca/errmgr/base/errmgr_base_fns.c (original) > +++ trunk/orte/mca/errmgr/base/errmgr_base_fns.c 2011-08-26 18:16:14 EDT > (Fri, 26 Aug 2011) > @@ -97,13 +97,13 @@ > { > item->proc_name.vpid = 
ORTE_VPID_INVALID; > item->proc_name.jobid = ORTE_JOBID_INVALID; > - item->proc_name.epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(item->proc_name.epoch,ORTE_EPOCH_MIN); > } > > void orte_errmgr_predicted_proc_destruct( orte_errmgr_predicted_proc_t *item) > { > item->proc_name.vpid = ORTE_VPID_INVALID; > - item->proc_name.epoch = ORTE_EPOCH_INVALID; > + ORTE_EPOCH_SET(item->proc_name.epoch,ORTE_EPOCH_INVALID); > item->proc_name.jobid = ORTE_JOBID_INVALID; > } > > @@ -139,13 +139,13 @@ > void orte_errmgr_predicted_map_construct(orte_errmgr_predicted_map_t *item) > { > item->proc_name.vpid = ORTE_VPID_INVALID; > - item->proc_name.epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(item->proc_name.epoch,ORTE_EPOCH_MIN); > item->proc_name.jobid = ORTE_JOBID_INVALID; > > item->node_name = NULL; > > item->map_proc_name.vpid = ORTE_VPID_INVALID; > - item->map_proc_name.epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(item->map_proc_name.epoch,ORTE_EPOCH_MIN); > item->map_proc_name.jobid = ORTE_JOBID_INVALID; > > item->map_node_name = NULL; > @@ -156,7 +156,7 @@ > void orte_errmgr_predicted_map_destruct( orte_errmgr_predicted_map_t *item) > { > item->proc_name.vpid = ORTE_VPID_INVALID; > - item->proc_name.epoch = ORTE_EPOCH_INVALID; > + ORTE_EPOCH_SET(item->proc_name.epoch,ORTE_EPOCH_INVALID); > item->proc_name.jobid = ORTE_JOBID_INVALID; > > if( NULL != item->node_name ) { > @@ -165,7 +165,7 @@ > } > > item->map_proc_name.vpid = ORTE_VPID_INVALID; > - item->map_proc_name.epoch = ORTE_EPOCH_INVALID; > + ORTE_EPOCH_SET(item->map_proc_name.epoch,ORTE_EPOCH_INVALID); > item->map_proc_name.jobid = ORTE_JOBID_INVALID; > > if( NULL != item->map_node_name ) { > > Modified: trunk/orte/mca/errmgr/base/errmgr_base_tool.c > ============================================================================== > --- trunk/orte/mca/errmgr/base/errmgr_base_tool.c (original) > +++ trunk/orte/mca/errmgr/base/errmgr_base_tool.c 2011-08-26 18:16:14 EDT > (Fri, 26 Aug 2011) > @@ -267,7 +267,7 @@ > */ > 
errmgr_cmdline_sender.jobid = ORTE_JOBID_INVALID; > errmgr_cmdline_sender.vpid = ORTE_VPID_INVALID; > - errmgr_cmdline_sender.epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(errmgr_cmdline_sender.epoch,ORTE_EPOCH_MIN); > if (ORTE_SUCCESS != (ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, > ORTE_RML_TAG_MIGRATE, > 0, > @@ -379,14 +379,14 @@ > if( OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, > ORTE_NAME_INVALID, &errmgr_cmdline_sender) ) { > swap_dest.jobid = errmgr_cmdline_sender.jobid; > swap_dest.vpid = errmgr_cmdline_sender.vpid; > - swap_dest.epoch = errmgr_cmdline_sender.epoch; > + ORTE_EPOCH_SET(swap_dest.epoch,errmgr_cmdline_sender.epoch); > > errmgr_cmdline_sender = *sender; > > orte_errmgr_base_migrate_update(ORTE_ERRMGR_MIGRATE_STATE_ERR_INPROGRESS); > > errmgr_cmdline_sender.jobid = swap_dest.jobid; > errmgr_cmdline_sender.vpid = swap_dest.vpid; > - errmgr_cmdline_sender.epoch = swap_dest.epoch; > + ORTE_EPOCH_SET(errmgr_cmdline_sender.epoch,swap_dest.epoch); > > goto cleanup; > } > > Modified: trunk/orte/mca/errmgr/hnp/errmgr_hnp.c > ============================================================================== > --- trunk/orte/mca/errmgr/hnp/errmgr_hnp.c (original) > +++ trunk/orte/mca/errmgr/hnp/errmgr_hnp.c 2011-08-26 18:16:14 EDT (Fri, > 26 Aug 2011) > @@ -53,6 +53,7 @@ > #include "orte/runtime/orte_globals.h" > #include "orte/runtime/orte_locks.h" > #include "orte/runtime/orte_quit.h" > +#include "orte/runtime/data_type_support/orte_dt_support.h" > > #include "orte/mca/errmgr/errmgr.h" > #include "orte/mca/errmgr/base/base.h" > @@ -83,9 +84,11 @@ > orte_errmgr_hnp_global_suggest_map_targets, > /* FT Event hook */ > orte_errmgr_hnp_global_ft_event, > - orte_errmgr_base_register_migration_warning, > + orte_errmgr_base_register_migration_warning > +#if ORTE_RESIL_ORTE > /* Set the callback */ > - orte_errmgr_base_set_fault_callback > + ,orte_errmgr_base_set_fault_callback > +#endif > }; > > > @@ -97,14 +100,16 @@ > static void 
update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t > jobstate, > orte_proc_state_t state, > orte_exit_code_t exit_code); > static void check_job_complete(orte_job_t *jdata); > -static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t > epoch); > +static void killprocs(orte_jobid_t job, orte_vpid_t vpid); > static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc, > orte_proc_state_t state, orte_exit_code_t exit_code); > static orte_odls_child_t* proc_is_local(orte_process_name_t *proc); > +#if ORTE_RESIL_ORTE > static int send_to_local_applications(opal_pointer_array_t *dead_names); > static void failure_notification(int status, orte_process_name_t* sender, > opal_buffer_t *buffer, orte_rml_tag_t tag, > void* cbdata); > +#endif > > /************************ > * API Definitions > @@ -380,16 +385,21 @@ > **********************/ > int orte_errmgr_hnp_base_global_init(void) > { > - int ret; > + int ret = ORTE_SUCCESS; > > +#if ORTE_RESIL_ORTE > ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, > ORTE_RML_TAG_FAILURE_NOTICE, > ORTE_RML_PERSISTENT, failure_notification, > NULL); > +#endif > + > return ret; > } > > int orte_errmgr_hnp_base_global_finalize(void) > { > +#if ORTE_RESIL_ORTE > orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_FAILURE_NOTICE); > +#endif > > return ORTE_SUCCESS; > } > @@ -406,6 +416,7 @@ > orte_odls_child_t *child; > int rc; > orte_app_context_t *app; > + orte_proc_t *pdat; > > OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, > "%s errmgr:hnp: job %s reported state %s" > @@ -538,7 +549,7 @@ > ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED, > exit_code); > /* order all local procs for this job to be killed */ > - killprocs(jdata->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD); > + killprocs(jdata->jobid, ORTE_VPID_WILDCARD); > check_job_complete(jdata); /* set the local proc states */ > /* the job object for this job will have been NULL'd > * in the array if the job was solely local. 
If it isn't > @@ -550,7 +561,7 @@ > break; > case ORTE_JOB_STATE_COMM_FAILED: > /* order all local procs for this job to be killed */ > - killprocs(jdata->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD); > + killprocs(jdata->jobid, ORTE_VPID_WILDCARD); > check_job_complete(jdata); /* set the local proc states */ > /* the job object for this job will have been NULL'd > * in the array if the job was solely local. If it isn't > @@ -562,7 +573,7 @@ > break; > case ORTE_JOB_STATE_HEARTBEAT_FAILED: > /* order all local procs for this job to be killed */ > - killprocs(jdata->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD); > + killprocs(jdata->jobid, ORTE_VPID_WILDCARD); > check_job_complete(jdata); /* set the local proc states */ > /* the job object for this job will have been NULL'd > * in the array if the job was solely local. If it isn't > @@ -632,10 +643,6 @@ > } > } > > - if (ORTE_PROC_STATE_ABORTED_BY_SIG == state) { > - exit_code = 0; > - } > - > orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); > check_job_complete(jdata); /* need to set the job state */ > /* the job object for this job will have been NULL'd > @@ -679,7 +686,7 @@ > > case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: > if (jdata->enable_recovery) { > - killprocs(proc->jobid, proc->vpid, proc->epoch); > + killprocs(proc->jobid, proc->vpid); > /* is this a local proc */ > if (NULL != (child = proc_is_local(proc))) { > /* local proc - see if it has reached its restart limit */ > @@ -778,18 +785,37 @@ > opal_output(0, "%s UNABLE TO RELOCATE PROCS FROM > FAILED DAEMON %s", > ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), > ORTE_NAME_PRINT(proc)); > /* kill all local procs */ > - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, > ORTE_EPOCH_WILDCARD); > + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); > /* kill all jobs */ > hnp_abort(ORTE_JOBID_WILDCARD, exit_code); > /* check if all is complete so we can terminate */ > check_job_complete(jdata); > } > } else { > +#if !ORTE_RESIL_ORTE > + if (NULL 
== (pdat = > (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) { > + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); > + orte_show_help("help-orte-errmgr-hnp.txt", > "errmgr-hnp:daemon-died", true, > + ORTE_VPID_PRINT(proc->vpid), > "Unknown"); > + } else { > + orte_show_help("help-orte-errmgr-hnp.txt", > "errmgr-hnp:daemon-died", true, > + ORTE_VPID_PRINT(proc->vpid), > + (NULL == pdat->node) ? "Unknown" : > + ((NULL == pdat->node->name) ? > "Unknown" : pdat->node->name)); > + } > +#endif > if (ORTE_SUCCESS != > orte_errmgr_hnp_record_dead_process(proc)) { > /* The process is already dead so don't keep trying > to do > * this stuff. */ > return ORTE_SUCCESS; > } > + > +#if !ORTE_RESIL_ORTE > + /* kill all local procs */ > + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); > + /* kill all jobs */ > + hnp_abort(ORTE_JOBID_WILDCARD, exit_code); > +#endif > /* We'll check if the job was complete when we get the > * message back from the HNP notifying us of the dead > * process */ > @@ -805,7 +831,7 @@ > } else { > orte_errmgr_hnp_record_dead_process(proc); > /* kill all local procs */ > - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, > ORTE_EPOCH_WILDCARD); > + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); > /* kill all jobs */ > hnp_abort(ORTE_JOBID_WILDCARD, exit_code); > return ORTE_ERR_UNRECOVERABLE; > @@ -824,6 +850,7 @@ > return ORTE_SUCCESS; > } > > +#if ORTE_RESIL_ORTE > static void failure_notification(int status, orte_process_name_t* sender, > opal_buffer_t *buffer, orte_rml_tag_t tag, > void* cbdata) > @@ -984,6 +1011,7 @@ > > OBJ_RELEASE(dead_names); > } > +#endif > > /***************** > * Local Functions > @@ -1354,7 +1382,6 @@ > ORTE_UPDATE_EXIT_STATUS(proc->exit_code); > } > break; > -#if 0 > case ORTE_PROC_STATE_ABORTED_BY_SIG: > OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, > "%s errmgr:hnp:check_job_completed proc %s > aborted by signal", > @@ -1370,7 +1397,6 @@ > ORTE_UPDATE_EXIT_STATUS(proc->exit_code); > } > break; > 
-#endif > case ORTE_PROC_STATE_TERM_WO_SYNC: > OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, > "%s errmgr:hnp:check_job_completed proc %s > terminated without sync", > @@ -1393,7 +1419,6 @@ > } > break; > case ORTE_PROC_STATE_COMM_FAILED: > -#if 1 > if (!jdata->abort) { > jdata->state = ORTE_JOB_STATE_COMM_FAILED; > /* point to the lowest rank to cause the problem */ > @@ -1403,7 +1428,6 @@ > jdata->abort = true; > ORTE_UPDATE_EXIT_STATUS(proc->exit_code); > } > -#endif > break; > case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: > if (!jdata->abort) { > @@ -1530,9 +1554,6 @@ > */ > CHECK_DAEMONS: > if (jdata == NULL || jdata->jobid == ORTE_PROC_MY_NAME->jobid) { > -#if 0 > - if ((jdata->num_procs - 1) <= jdata->num_terminated) { /* Subtract > one for the HNP */ > -#endif > if (0 == orte_routed.num_routes()) { > /* orteds are done! */ > OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, > @@ -1696,7 +1717,7 @@ > } > } > > -static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch) > +static void killprocs(orte_jobid_t job, orte_vpid_t vpid) > { > opal_pointer_array_t cmd; > orte_proc_t proc; > @@ -1707,7 +1728,9 @@ > orte_sensor.stop(job); > } > > - if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid && > ORTE_EPOCH_WILDCARD == epoch) { > + if (ORTE_JOBID_WILDCARD == job > + && ORTE_VPID_WILDCARD == vpid > + && ORTE_EPOCH_CMP(ORTE_EPOCH_WILDCARD,epoch)) { > if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) { > ORTE_ERROR_LOG(rc); > } > @@ -1718,7 +1741,7 @@ > OBJ_CONSTRUCT(&proc, orte_proc_t); > proc.name.jobid = job; > proc.name.vpid = vpid; > - proc.name.epoch = epoch; > + ORTE_EPOCH_SET(proc.name.epoch,epoch); > opal_pointer_array_add(&cmd, &proc); > if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) { > ORTE_ERROR_LOG(rc); > @@ -1913,13 +1936,15 @@ > } > > if (NULL != (pdat = > (orte_proc_t*)opal_pointer_array_get_item(jdat->procs, proc->vpid)) && > - ORTE_PROC_STATE_TERMINATED < pdat->state) { > + 
ORTE_PROC_STATE_TERMINATED > pdat->state) { > > +#if ORTE_ENABLE_EPOCH > /* Make sure that the epochs match. */ > if (proc->epoch != pdat->name.epoch) { > opal_output(1, "The epoch does not match the current epoch. > Throwing the request out."); > return ORTE_SUCCESS; > } > +#endif > > dead_names = OBJ_NEW(opal_pointer_array_t); > > @@ -1935,6 +1960,7 @@ > } > } > > +#if ORTE_RESIL_ORTE > if (!mca_errmgr_hnp_component.term_in_progress) { > /* > * Send a message to the other daemons so they know that a daemon > has > @@ -1949,7 +1975,7 @@ > OBJ_RELEASE(buffer); > } else { > > - /* Iterate of the list of dead procs and send them along with > + /* Iterate over the list of dead procs and send them along > with > * the rest. The HNP needs this info so it can tell the other > * ORTEDs and they can inform the appropriate applications. > */ > @@ -1973,6 +1999,9 @@ > } else { > orte_errmgr_hnp_global_mark_processes_as_dead(dead_names); > } > +#else > + orte_errmgr_hnp_global_mark_processes_as_dead(dead_names); > +#endif > } > > return ORTE_SUCCESS; > @@ -2011,6 +2040,7 @@ > ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), > ORTE_NAME_PRINT(&pdat->name))); > > +#if ORTE_RESIL_ORTE > /* Make sure the epochs match, if not it probably means that we > * already reported this failure. */ > if (name_item->epoch != pdat->name.epoch) { > @@ -2018,6 +2048,7 @@ > } > > orte_util_set_epoch(name_item, name_item->epoch + 1); > +#endif > > /* Remove it from the job array */ > opal_pointer_array_set_item(jdat->procs, name_item->vpid, NULL); > @@ -2034,6 +2065,7 @@ > > OBJ_RELEASE(pdat); > > +#if ORTE_RESIL_ORTE > /* Create a new proc object that will keep track of the epoch > * information */ > pdat = OBJ_NEW(orte_proc_t); > @@ -2041,14 +2073,15 @@ > pdat->name.vpid = name_item->vpid; > pdat->name.epoch = name_item->epoch + 1; > > - /* Set the state as terminated so we'll know the process isn't > - * actually there. 
*/ > - pdat->state = ORTE_PROC_STATE_TERMINATED; > - > opal_pointer_array_set_item(jdat->procs, name_item->vpid, pdat); > jdat->num_procs++; > jdat->num_terminated++; > +#endif > + /* Set the state as terminated so we'll know the process isn't > + * actually there. */ > + pdat->state = ORTE_PROC_STATE_TERMINATED; > } else { > +#if ORTE_RESIL_ORTE > opal_output(0, "Proc data not found for %s", > ORTE_NAME_PRINT(name_item)); > /* Create a new proc object that will keep track of the epoch > * information */ > @@ -2064,11 +2097,13 @@ > opal_pointer_array_set_item(jdat->procs, name_item->vpid, pdat); > jdat->num_procs++; > jdat->num_terminated++; > +#endif > } > > check_job_complete(jdat); > } > > +#if ORTE_RESIL_ORTE > if (!orte_orteds_term_ordered) { > /* Need to update the orted routing module. */ > orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid); > @@ -2077,10 +2112,12 @@ > (*fault_cbfunc)(dead_procs); > } > } > +#endif > > return ORTE_SUCCESS; > } > > +#if ORTE_RESIL_ORTE > int send_to_local_applications(opal_pointer_array_t *dead_names) { > opal_buffer_t *buf; > int ret = ORTE_SUCCESS; > @@ -2121,3 +2158,5 @@ > > return ret; > } > +#endif > + > > Modified: trunk/orte/mca/errmgr/hnp/errmgr_hnp_autor.c > ============================================================================== > --- trunk/orte/mca/errmgr/hnp/errmgr_hnp_autor.c (original) > +++ trunk/orte/mca/errmgr/hnp/errmgr_hnp_autor.c 2011-08-26 18:16:14 EDT > (Fri, 26 Aug 2011) > @@ -522,7 +522,7 @@ > wp_item = OBJ_NEW(errmgr_autor_wp_item_t); > wp_item->name.jobid = proc->jobid; > wp_item->name.vpid = proc->vpid; > - wp_item->name.epoch = proc->epoch; > + ORTE_EPOCH_SET(wp_item->name.epoch,proc->epoch); > wp_item->state = state; > > opal_list_append(procs_pending_recovery, &(wp_item->super)); > @@ -626,7 +626,7 @@ > { > wp->name.jobid = ORTE_JOBID_INVALID; > wp->name.vpid = ORTE_VPID_INVALID; > - wp->name.epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(wp->name.epoch,ORTE_EPOCH_MIN); > > wp->state = 
0; > } > @@ -635,7 +635,7 @@ > { > wp->name.jobid = ORTE_JOBID_INVALID; > wp->name.vpid = ORTE_VPID_INVALID; > - wp->name.epoch = ORTE_EPOCH_INVALID; > + ORTE_EPOCH_SET(wp->name.epoch,ORTE_EPOCH_INVALID); > > wp->state = 0; > } > > Modified: trunk/orte/mca/errmgr/hnp/errmgr_hnp_crmig.c > ============================================================================== > --- trunk/orte/mca/errmgr/hnp/errmgr_hnp_crmig.c (original) > +++ trunk/orte/mca/errmgr/hnp/errmgr_hnp_crmig.c 2011-08-26 18:16:14 EDT > (Fri, 26 Aug 2011) > @@ -750,7 +750,7 @@ > close_iof_stdin = true; > iof_name.jobid = proc->name.jobid; > iof_name.vpid = proc->name.vpid; > - iof_name.epoch = proc->name.epoch; > + ORTE_EPOCH_SET(iof_name.epoch,proc->name.epoch); > } > } > } > @@ -807,7 +807,7 @@ > close_iof_stdin = true; > iof_name.jobid = proc->name.jobid; > iof_name.vpid = proc->name.vpid; > - iof_name.epoch = proc->name.epoch; > + ORTE_EPOCH_SET(iof_name.epoch,proc->name.epoch); > } > } > } > @@ -855,7 +855,7 @@ > close_iof_stdin = true; > iof_name.jobid = proc->name.jobid; > iof_name.vpid = proc->name.vpid; > - iof_name.epoch = proc->name.epoch; > + ORTE_EPOCH_SET(iof_name.epoch,proc->name.epoch); > } > } > } > > Modified: trunk/orte/mca/errmgr/orted/errmgr_orted.c > ============================================================================== > --- trunk/orte/mca/errmgr/orted/errmgr_orted.c (original) > +++ trunk/orte/mca/errmgr/orted/errmgr_orted.c 2011-08-26 18:16:14 EDT > (Fri, 26 Aug 2011) > @@ -34,6 +34,7 @@ > #include "orte/util/show_help.h" > #include "orte/util/nidmap.h" > #include "orte/runtime/orte_globals.h" > +#include "orte/runtime/data_type_support/orte_dt_support.h" > #include "orte/mca/rml/rml.h" > #include "orte/mca/odls/odls.h" > #include "orte/mca/odls/base/base.h" > @@ -41,7 +42,9 @@ > #include "orte/mca/plm/plm_types.h" > #include "orte/mca/routed/routed.h" > #include "orte/mca/sensor/sensor.h" > +#include "orte/mca/ess/ess.h" > #include "orte/runtime/orte_quit.h" > 
+#include "orte/runtime/orte_globals.h" > > #include "orte/mca/errmgr/errmgr.h" > #include "orte/mca/errmgr/base/base.h" > @@ -59,13 +62,15 @@ > static void update_local_children(orte_odls_job_t *jobdat, > orte_job_state_t jobstate, > orte_proc_state_t state); > -static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t > epoch); > +static void killprocs(orte_jobid_t job, orte_vpid_t vpid); > static int record_dead_process(orte_process_name_t *proc); > -static int send_to_local_applications(opal_pointer_array_t *dead_names); > static int mark_processes_as_dead(opal_pointer_array_t *dead_procs); > +#if ORTE_RESIL_ORTE > +static int send_to_local_applications(opal_pointer_array_t *dead_names); > static void failure_notification(int status, orte_process_name_t* sender, > opal_buffer_t *buffer, orte_rml_tag_t tag, > void* cbdata); > +#endif > > /* > * Module functions: Global > @@ -104,8 +109,10 @@ > predicted_fault, > suggest_map_targets, > ft_event, > - orte_errmgr_base_register_migration_warning, > - orte_errmgr_base_set_fault_callback /* Set callback function */ > + orte_errmgr_base_register_migration_warning > +#if ORTE_RESIL_ORTE > + ,orte_errmgr_base_set_fault_callback /* Set callback function */ > +#endif > }; > > /************************ > @@ -113,16 +120,22 @@ > ************************/ > static int init(void) > { > - int ret; > + int ret = ORTE_SUCCESS; > > +#if ORTE_RESIL_ORTE > ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, > ORTE_RML_TAG_FAILURE_NOTICE, > ORTE_RML_PERSISTENT, failure_notification, > NULL); > +#endif > + > return ret; > } > > static int finalize(void) > { > +#if ORTE_RESIL_ORTE > orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_FAILURE_NOTICE); > +#endif > + > return ORTE_SUCCESS; > } > > @@ -228,10 +241,10 @@ > /* update all procs in job */ > update_local_children(jobdat, jobstate, > ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED); > /* order all local procs for this job to be killed */ > - killprocs(jobdat->jobid, 
ORTE_VPID_WILDCARD, > ORTE_EPOCH_WILDCARD); > + killprocs(jobdat->jobid, ORTE_VPID_WILDCARD); > case ORTE_JOB_STATE_COMM_FAILED: > /* kill all local procs */ > - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, > ORTE_EPOCH_WILDCARD); > + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); > /* tell the caller we can't recover */ > return ORTE_ERR_UNRECOVERABLE; > break; > @@ -276,7 +289,7 @@ > /* see if this was a lifeline */ > if (ORTE_SUCCESS != orte_routed.route_lost(proc)) { > /* kill our children */ > - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, > ORTE_EPOCH_WILDCARD); > + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); > /* terminate - our routed children will see > * us leave and automatically die > */ > @@ -290,10 +303,18 @@ > if (0 == orte_routed.num_routes() && > 0 == opal_list_get_size(&orte_local_children)) { > orte_quit(); > + } else { > + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, > + "%s errmgr:orted not exiting, num_routes() == > %d, num children == %d", > + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), > + orte_routed.num_routes(), > + opal_list_get_size(&orte_local_children))); > } > } > > +#if ORTE_RESIL_ORTE > record_dead_process(proc); > +#endif > > /* if not, then indicate we can continue */ > return ORTE_SUCCESS; > @@ -344,7 +365,7 @@ > /* Decrement the number of local procs */ > jobdat->num_local_procs--; > /* kill this proc */ > - killprocs(proc->jobid, proc->vpid, proc->epoch); > + killprocs(proc->jobid, proc->vpid); > } > app = > (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, > child->app_idx); > if( jobdat->enable_recovery && child->restarts < > app->max_restarts ) { > @@ -526,10 +547,12 @@ > ORTE_ERROR_LOG(rc); > goto FINAL_CLEANUP; > } > +#if ORTE_ENABLE_EPOCH > if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, > &child->name->epoch, 1, ORTE_EPOCH))) { > ORTE_ERROR_LOG(rc); > goto FINAL_CLEANUP; > } > +#endif > } > } > /* pack an invalid marker */ > @@ -660,7 +683,7 @@ > continue; > } > > - if (name_item->epoch 
< orte_util_lookup_epoch(name_item)) { > + if (0 < > ORTE_EPOCH_CMP(name_item->epoch,orte_ess.proc_get_epoch(name_item))) { > continue; > } > > @@ -669,9 +692,11 @@ > ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), > ORTE_NAME_PRINT(name_item))); > > +#if ORTE_ENABLE_EPOCH > /* Increment the epoch */ > orte_util_set_proc_state(name_item, ORTE_PROC_STATE_TERMINATED); > orte_util_set_epoch(name_item, name_item->epoch + 1); > +#endif > > OPAL_THREAD_LOCK(&orte_odls_globals.mutex); > > @@ -706,6 +731,7 @@ > return ORTE_SUCCESS; > } > > +#if ORTE_RESIL_ORTE > static void failure_notification(int status, orte_process_name_t* sender, > opal_buffer_t *buffer, orte_rml_tag_t tag, > void* cbdata) > @@ -714,7 +740,7 @@ > orte_std_cntr_t n; > int ret = ORTE_SUCCESS, num_failed; > int32_t i; > - orte_process_name_t *name_item, proc; > + orte_process_name_t *name_item; > > dead_names = OBJ_NEW(opal_pointer_array_t); > > @@ -746,7 +772,7 @@ > /* There shouldn't be an issue of receiving this message multiple > * times but it doesn't hurt to double check. > */ > - if (proc.epoch < orte_util_lookup_epoch(name_item)) { > + if (0 < > ORTE_EPOCH_CMP(name_item->epoch,orte_ess.proc_get_epoch(name_item))) { > opal_output(1, "Received from proc %s local epoch %d", > ORTE_NAME_PRINT(name_item), orte_util_lookup_epoch(name_item)); > continue; > } > @@ -767,6 +793,7 @@ > free(name_item); > } > } > +#endif > > /***************** > * Local Functions > @@ -948,11 +975,13 @@ > ORTE_ERROR_LOG(rc); > return rc; > } > +#if ORTE_ENABLE_EPOCH > /* Pack the child's epoch. 
*/ > if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, > &(child->name->epoch), 1, ORTE_EPOCH))) { > ORTE_ERROR_LOG(rc); > return rc; > } > +#endif > /* pack the contact info */ > if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &child->rml_uri, 1, > OPAL_STRING))) { > ORTE_ERROR_LOG(rc); > @@ -1015,7 +1044,7 @@ > } > } > > -static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch) > +static void killprocs(orte_jobid_t job, orte_vpid_t vpid) > { > opal_pointer_array_t cmd; > orte_proc_t proc; > @@ -1026,7 +1055,9 @@ > orte_sensor.stop(job); > } > > - if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid && > ORTE_EPOCH_WILDCARD == epoch) { > + if (ORTE_JOBID_WILDCARD == job > + && ORTE_VPID_WILDCARD == vpid > + && 0 == ORTE_EPOCH_CMP(ORTE_EPOCH_WILDCARD,epoch)) { > if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) { > ORTE_ERROR_LOG(rc); > } > @@ -1037,7 +1068,7 @@ > OBJ_CONSTRUCT(&proc, orte_proc_t); > proc.name.jobid = job; > proc.name.vpid = vpid; > - proc.name.epoch = epoch; > + ORTE_EPOCH_SET(proc.name.epoch,epoch); > opal_pointer_array_add(&cmd, &proc); > if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) { > ORTE_ERROR_LOG(rc); > @@ -1082,20 +1113,21 @@ > return rc; > } > > +#if ORTE_RESIL_ORTE > int send_to_local_applications(opal_pointer_array_t *dead_names) { > opal_buffer_t *buf; > int ret; > orte_process_name_t *name_item; > int size, i; > > - OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, > - "%s Sending failure to local applications.", > - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); > - > buf = OBJ_NEW(opal_buffer_t); > > size = opal_pointer_array_get_size(dead_names); > > + OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, > + "%s Sending %d failure(s) to local applications.", > + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), size)); > + > if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, &size, 1, ORTE_VPID))) { > ORTE_ERROR_LOG(ret); > OBJ_RELEASE(buf); > @@ -1122,4 +1154,5 @@ > > return ORTE_SUCCESS; > } > +#endif > > > 
Modified: trunk/orte/mca/ess/alps/ess_alps_module.c > ============================================================================== > --- trunk/orte/mca/ess/alps/ess_alps_module.c (original) > +++ trunk/orte/mca/ess/alps/ess_alps_module.c 2011-08-26 18:16:14 EDT (Fri, > 26 Aug 2011) > @@ -363,8 +363,8 @@ > > ORTE_PROC_MY_NAME->jobid = jobid; > ORTE_PROC_MY_NAME->vpid = (orte_vpid_t) cnos_get_rank() + starting_vpid; > - ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_INVALID; > - ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME); > + ORTE_EPOCH_PRINT(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_INVALID); > + > ORTE_EPOCH_PRINT(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); > > OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, > "ess:alps set name to %s", > ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); > > Modified: trunk/orte/mca/ess/base/base.h > ============================================================================== > --- trunk/orte/mca/ess/base/base.h (original) > +++ trunk/orte/mca/ess/base/base.h 2011-08-26 18:16:14 EDT (Fri, 26 Aug > 2011) > @@ -57,7 +57,11 @@ > > ORTE_DECLSPEC extern opal_list_t orte_ess_base_components_available; > > +#if ORTE_ENABLE_EPOCH > ORTE_DECLSPEC orte_epoch_t orte_ess_base_proc_get_epoch(orte_process_name_t > *proc); > +#else > +ORTE_DECLSPEC int orte_ess_base_proc_get_epoch(orte_process_name_t *proc); > +#endif > > #if !ORTE_DISABLE_FULL_SUPPORT > > > Modified: trunk/orte/mca/ess/base/ess_base_select.c > ============================================================================== > --- trunk/orte/mca/ess/base/ess_base_select.c (original) > +++ trunk/orte/mca/ess/base/ess_base_select.c 2011-08-26 18:16:14 EDT (Fri, > 26 Aug 2011) > @@ -36,21 +36,19 @@ > * Generic function to retrieve the epoch of a specific process > * from the job data. 
> */ > +#if !ORTE_ENABLE_EPOCH > +int orte_ess_base_proc_get_epoch(orte_process_name_t *proc) { > + return 0; > +} > +#else > orte_epoch_t orte_ess_base_proc_get_epoch(orte_process_name_t *proc) { > orte_epoch_t epoch = ORTE_EPOCH_INVALID; > > -#if !ORTE_DISABLE_FULL_SUPPORT > epoch = orte_util_lookup_epoch(proc); > -#endif > - > - OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output, > - "%s ess:generic: proc %s has epoch %d", > - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), > - ORTE_NAME_PRINT(proc), > - epoch)); > > return epoch; > } > +#endif > > int > orte_ess_base_select(void) > > Modified: trunk/orte/mca/ess/env/ess_env_module.c > ============================================================================== > --- trunk/orte/mca/ess/env/ess_env_module.c (original) > +++ trunk/orte/mca/ess/env/ess_env_module.c 2011-08-26 18:16:14 EDT (Fri, > 26 Aug 2011) > @@ -392,8 +392,7 @@ > > ORTE_PROC_MY_NAME->jobid = jobid; > ORTE_PROC_MY_NAME->vpid = vpid; > - ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_INVALID; > - ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME); > + > ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); > > OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, > "ess:env set name to %s", > ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); > > Modified: trunk/orte/mca/ess/ess.h > ============================================================================== > --- trunk/orte/mca/ess/ess.h (original) > +++ trunk/orte/mca/ess/ess.h 2011-08-26 18:16:14 EDT (Fri, 26 Aug 2011) > @@ -111,7 +111,11 @@ > * will get the most up to date version stored within the orte_proc_t struct. > * Obviously the epoch of the proc that is passed in will be ignored. 
> */ > +#if ORTE_ENABLE_EPOCH > typedef orte_epoch_t > (*orte_ess_base_module_proc_get_epoch_fn_t)(orte_process_name_t *proc); > +#else > +typedef int (*orte_ess_base_module_proc_get_epoch_fn_t)(orte_process_name_t > *proc); > +#endif > > /** > * Update the pidmap > > Modified: trunk/orte/mca/ess/generic/ess_generic_module.c > ============================================================================== > --- trunk/orte/mca/ess/generic/ess_generic_module.c (original) > +++ trunk/orte/mca/ess/generic/ess_generic_module.c 2011-08-26 18:16:14 EDT > (Fri, 26 Aug 2011) > @@ -155,7 +155,7 @@ > goto error; > } > ORTE_PROC_MY_NAME->vpid = strtol(envar, NULL, 10); > - ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN); > > OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, > "%s completed name definition", > @@ -273,7 +273,7 @@ > if (vpid == ORTE_PROC_MY_NAME->vpid) { > ORTE_PROC_MY_DAEMON->jobid = 0; > ORTE_PROC_MY_DAEMON->vpid = i; > - ORTE_PROC_MY_DAEMON->epoch = ORTE_PROC_MY_NAME->epoch; > + > ORTE_EPOCH_SET(ORTE_PROC_MY_DAEMON->epoch,ORTE_PROC_MY_NAME->epoch); > } > OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, > "%s node %d name %s rank %s", > @@ -304,7 +304,7 @@ > if (vpid == ORTE_PROC_MY_NAME->vpid) { > ORTE_PROC_MY_DAEMON->jobid = 0; > ORTE_PROC_MY_DAEMON->vpid = i; > - ORTE_PROC_MY_DAEMON->epoch = > ORTE_PROC_MY_NAME->epoch; > + > ORTE_EPOCH_SET(ORTE_PROC_MY_DAEMON->epoch,ORTE_PROC_MY_NAME->epoch); > } > OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, > "%s node %d name %s rank %d", > > Modified: trunk/orte/mca/ess/hnp/ess_hnp_module.c > ============================================================================== > --- trunk/orte/mca/ess/hnp/ess_hnp_module.c (original) > +++ trunk/orte/mca/ess/hnp/ess_hnp_module.c 2011-08-26 18:16:14 EDT (Fri, > 26 Aug 2011) > @@ -494,7 +494,7 @@ > proc = OBJ_NEW(orte_proc_t); > proc->name.jobid = ORTE_PROC_MY_NAME->jobid; > proc->name.vpid = ORTE_PROC_MY_NAME->vpid; > - 
proc->name.epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN); > > proc->pid = orte_process_info.pid; > proc->rml_uri = orte_rml.get_contact_info(); > > Modified: trunk/orte/mca/ess/lsf/ess_lsf_module.c > ============================================================================== > --- trunk/orte/mca/ess/lsf/ess_lsf_module.c (original) > +++ trunk/orte/mca/ess/lsf/ess_lsf_module.c 2011-08-26 18:16:14 EDT (Fri, > 26 Aug 2011) > @@ -357,8 +357,7 @@ > > ORTE_PROC_MY_NAME->jobid = jobid; > ORTE_PROC_MY_NAME->vpid = vpid; > - ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_INVALID; > - ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME); > + > ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); > > /* fix up the base name and make it the "real" name */ > lsf_nodeid = atoi(getenv("LSF_PM_TASKID")); > > Modified: trunk/orte/mca/ess/singleton/ess_singleton_module.c > ============================================================================== > --- trunk/orte/mca/ess/singleton/ess_singleton_module.c (original) > +++ trunk/orte/mca/ess/singleton/ess_singleton_module.c 2011-08-26 > 18:16:14 EDT (Fri, 26 Aug 2011) > @@ -188,7 +188,7 @@ > /* set the name */ > ORTE_PROC_MY_NAME->jobid = 0xffff0000 & ((uint32_t)jobfam << 16); > ORTE_PROC_MY_NAME->vpid = 0; > - ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN); > > } else { > /* > > Modified: trunk/orte/mca/ess/slave/ess_slave_module.c > ============================================================================== > --- trunk/orte/mca/ess/slave/ess_slave_module.c (original) > +++ trunk/orte/mca/ess/slave/ess_slave_module.c 2011-08-26 18:16:14 EDT > (Fri, 26 Aug 2011) > @@ -280,8 +280,7 @@ > > ORTE_PROC_MY_NAME->jobid = jobid; > ORTE_PROC_MY_NAME->vpid = vpid; > - ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_INVALID; > - ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME); > + > 
ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); > > OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, > "ess:slave set name to %s", > ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); > > Modified: trunk/orte/mca/ess/slurm/ess_slurm_module.c > ============================================================================== > --- trunk/orte/mca/ess/slurm/ess_slurm_module.c (original) > +++ trunk/orte/mca/ess/slurm/ess_slurm_module.c 2011-08-26 18:16:14 EDT > (Fri, 26 Aug 2011) > @@ -368,8 +368,7 @@ > /* fix up the vpid and make it the "real" vpid */ > slurm_nodeid = atoi(getenv("SLURM_NODEID")); > ORTE_PROC_MY_NAME->vpid = vpid + slurm_nodeid; > - ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_INVALID; > - ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME); > + > ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); > > OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, > "ess:slurm set name to %s", > ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); > > Modified: trunk/orte/mca/ess/slurmd/ess_slurmd_module.c > ============================================================================== > --- trunk/orte/mca/ess/slurmd/ess_slurmd_module.c (original) > +++ trunk/orte/mca/ess/slurmd/ess_slurmd_module.c 2011-08-26 18:16:14 EDT > (Fri, 26 Aug 2011) > @@ -195,7 +195,7 @@ > } > ORTE_PROC_MY_NAME->vpid = strtol(envar, NULL, 10); > #endif > - ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN); > /* get our local rank */ > if (NULL == (envar = getenv("SLURM_LOCALID"))) { > error = "could not get SLURM_LOCALID"; > @@ -260,7 +260,7 @@ > nodeid = strtol(envar, NULL, 10); > ORTE_PROC_MY_DAEMON->jobid = 0; > ORTE_PROC_MY_DAEMON->vpid = nodeid; > - ORTE_PROC_MY_DAEMON->epoch = ORTE_PROC_MY_NAME->epoch; > + ORTE_EPOCH_SET(ORTE_PROC_MY_DAEMON->epoch,ORTE_PROC_MY_NAME->epoch); > > /* get the number of ppn */ > if (NULL == (tasks_per_node = getenv("SLURM_STEP_TASKS_PER_NODE"))) { > > 
Modified: trunk/orte/mca/ess/tm/ess_tm_module.c > ============================================================================== > --- trunk/orte/mca/ess/tm/ess_tm_module.c (original) > +++ trunk/orte/mca/ess/tm/ess_tm_module.c 2011-08-26 18:16:14 EDT (Fri, > 26 Aug 2011) > @@ -364,7 +364,7 @@ > > ORTE_PROC_MY_NAME->jobid = jobid; > ORTE_PROC_MY_NAME->vpid = vpid; > - ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME); > + > ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); > > OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, > "ess:tm set name to %s", > ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); > > Modified: trunk/orte/mca/filem/rsh/filem_rsh_module.c > ============================================================================== > --- trunk/orte/mca/filem/rsh/filem_rsh_module.c (original) > +++ trunk/orte/mca/filem/rsh/filem_rsh_module.c 2011-08-26 18:16:14 EDT > (Fri, 26 Aug 2011) > @@ -1097,11 +1097,11 @@ > if( NULL != proc_set ) { > wp_item->proc_set.source.jobid = proc_set->source.jobid; > wp_item->proc_set.source.vpid = proc_set->source.vpid; > - wp_item->proc_set.source.epoch = proc_set->source.epoch; > + > ORTE_EPOCH_SET(wp_item->proc_set.source.epoch,proc_set->source.epoch); > > wp_item->proc_set.sink.jobid = proc_set->sink.jobid; > wp_item->proc_set.sink.vpid = proc_set->sink.vpid; > - wp_item->proc_set.sink.epoch = proc_set->sink.epoch; > + ORTE_EPOCH_SET(wp_item->proc_set.sink.epoch,proc_set->sink.epoch); > } > /* Copy the File Set */ > if( NULL != file_set ) { > @@ -1396,7 +1396,7 @@ > wp_item = OBJ_NEW(orte_filem_rsh_work_pool_item_t); > wp_item->proc_set.source.jobid = sender->jobid; > wp_item->proc_set.source.vpid = sender->vpid; > - wp_item->proc_set.source.epoch = sender->epoch; > + ORTE_EPOCH_SET(wp_item->proc_set.source.epoch,sender->epoch); > > opal_list_append(&work_pool_waiting, &(wp_item->super)); > } > > Modified: trunk/orte/mca/grpcomm/base/grpcomm_base_coll.c > 
============================================================================== > --- trunk/orte/mca/grpcomm/base/grpcomm_base_coll.c (original) > +++ trunk/orte/mca/grpcomm/base/grpcomm_base_coll.c 2011-08-26 18:16:14 EDT > (Fri, 26 Aug 2011) > @@ -168,8 +168,7 @@ > if (vpids[0] == ORTE_PROC_MY_NAME->vpid) { > /* I send first */ > peer.vpid = vpids[1]; > - peer.epoch = ORTE_EPOCH_INVALID; > - peer.epoch = orte_ess.proc_get_epoch(&peer); > + ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); > > /* setup a temp buffer so I can inform the other side as to the > * number of entries in my buffer > @@ -226,8 +225,7 @@ > opal_dss.pack(&buf, &num_entries, 1, OPAL_INT32); > opal_dss.copy_payload(&buf, sendbuf); > peer.vpid = vpids[0]; > - peer.epoch = ORTE_EPOCH_INVALID; > - peer.epoch = orte_ess.proc_get_epoch(&peer); > + ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); > > OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, > "%s grpcomm:coll:two-proc sending to %s", > @@ -320,8 +318,7 @@ > /* first send my current contents */ > nv = (rank - distance + np) % np; > peer.vpid = vpids[nv]; > - peer.epoch = ORTE_EPOCH_INVALID; > - peer.epoch = orte_ess.proc_get_epoch(&peer); > + ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); > > OBJ_CONSTRUCT(&buf, opal_buffer_t); > opal_dss.pack(&buf, &total_entries, 1, OPAL_INT32); > @@ -340,8 +337,7 @@ > num_recvd = 0; > nv = (rank + distance) % np; > peer.vpid = vpids[nv]; > - peer.epoch = ORTE_EPOCH_INVALID; > - peer.epoch = orte_ess.proc_get_epoch(&peer); > + ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); > > OBJ_CONSTRUCT(&bucket, opal_buffer_t); > if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(&peer, > @@ -439,8 +435,7 @@ > /* first send my current contents */ > nv = rank ^ distance; > peer.vpid = vpids[nv]; > - peer.epoch = ORTE_EPOCH_INVALID; > - peer.epoch = orte_ess.proc_get_epoch(&peer); > + ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); > > OBJ_CONSTRUCT(&buf, 
opal_buffer_t); > opal_dss.pack(&buf, &total_entries, 1, OPAL_INT32); > @@ -646,8 +641,7 @@ > proc.jobid = jobid; > proc.vpid = 0; > while (proc.vpid < jobdat->num_procs && 0 < > opal_list_get_size(&daemon_tree)) { > - proc.epoch = ORTE_EPOCH_INVALID; > - proc.epoch = orte_ess.proc_get_epoch(&proc); > + ORTE_EPOCH_SET(proc.epoch,orte_ess.proc_get_epoch(&proc)); > > /* get the daemon that hosts this proc */ > daemonvpid = orte_ess.proc_get_daemon(&proc); > @@ -713,8 +707,7 @@ > /* send it */ > my_parent.jobid = ORTE_PROC_MY_NAME->jobid; > my_parent.vpid = orte_routed.get_routing_tree(NULL); > - my_parent.epoch = ORTE_EPOCH_INVALID; > - my_parent.epoch = orte_ess.proc_get_epoch(&my_parent); > + ORTE_EPOCH_SET(my_parent.epoch,orte_ess.proc_get_epoch(&my_parent)); > > OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, > "%s grpcomm:base:daemon_coll: daemon collective > not the HNP - sending to parent %s", > > Modified: trunk/orte/mca/grpcomm/hier/grpcomm_hier_module.c > ============================================================================== > --- trunk/orte/mca/grpcomm/hier/grpcomm_hier_module.c (original) > +++ trunk/orte/mca/grpcomm/hier/grpcomm_hier_module.c 2011-08-26 18:16:14 EDT > (Fri, 26 Aug 2011) > @@ -95,7 +95,7 @@ > > my_local_rank_zero_proc.jobid = ORTE_PROC_MY_NAME->jobid; > my_local_rank_zero_proc.vpid = ORTE_VPID_INVALID; > - my_local_rank_zero_proc.epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(my_local_rank_zero_proc.epoch,ORTE_EPOCH_MIN); > > if (ORTE_SUCCESS != (rc = orte_grpcomm_base_modex_init())) { > ORTE_ERROR_LOG(rc); > @@ -270,7 +270,7 @@ > proc.jobid = ORTE_PROC_MY_NAME->jobid; > for (v=0; v < orte_process_info.num_procs; v++) { > proc.vpid = v; > - proc.epoch = orte_util_lookup_epoch(&proc); > + ORTE_EPOCH_SET(proc.epoch,orte_util_lookup_epoch(&proc)); > > /* is this proc local_rank=0 on its node? 
*/ > if (0 == my_local_rank && 0 == orte_ess.get_local_rank(&proc)) { > @@ -285,7 +285,7 @@ > nm = OBJ_NEW(orte_namelist_t); > nm->name.jobid = proc.jobid; > nm->name.vpid = proc.vpid; > - nm->name.epoch = proc.epoch; > + ORTE_EPOCH_SET(nm->name.epoch,proc.epoch); > > opal_list_append(&my_local_peers, &nm->item); > /* if I am not local_rank=0, is this one? */ > @@ -293,7 +293,7 @@ > 0 == orte_ess.get_local_rank(&proc)) { > my_local_rank_zero_proc.jobid = proc.jobid; > my_local_rank_zero_proc.vpid = proc.vpid; > - my_local_rank_zero_proc.epoch = proc.epoch; > + ORTE_EPOCH_SET(my_local_rank_zero_proc.epoch,proc.epoch); > } > } > > > Modified: trunk/orte/mca/iof/base/base.h > ============================================================================== > --- trunk/orte/mca/iof/base/base.h (original) > +++ trunk/orte/mca/iof/base/base.h 2011-08-26 18:16:14 EDT (Fri, 26 Aug > 2011) > @@ -135,7 +135,7 @@ > ep = OBJ_NEW(orte_iof_sink_t); \ > ep->name.jobid = (nm)->jobid; \ > ep->name.vpid = (nm)->vpid; \ > - ep->name.epoch = (nm)->epoch; \ > + ORTE_EPOCH_SET(ep->name.epoch,(nm)->epoch); \ > ep->tag = (tg); \ > if (0 <= (fid)) { \ > ep->wev->fd = (fid); \ > @@ -169,7 +169,7 @@ > rev = OBJ_NEW(orte_iof_read_event_t); \ > rev->name.jobid = (nm)->jobid; \ > rev->name.vpid = (nm)->vpid; \ > - rev->name.epoch = (nm)->epoch; \ > + ORTE_EPOCH_SET(rev->name.epoch,(nm)->epoch); \ > rev->tag = (tg); \ > rev->fd = (fid); \ > *(rv) = rev; \ > @@ -194,7 +194,7 @@ > ep = OBJ_NEW(orte_iof_sink_t); \ > ep->name.jobid = (nm)->jobid; \ > ep->name.vpid = (nm)->vpid; \ > - ep->name.epoch = (nm)->epoch; \ > + ORTE_EPOCH_SET(ep->name.epoch,(nm)->epoch); \ > ep->tag = (tg); \ > if (0 <= (fid)) { \ > ep->wev->fd = (fid); \ > @@ -215,7 +215,7 @@ > rev = OBJ_NEW(orte_iof_read_event_t); \ > rev->name.jobid = (nm)->jobid; \ > rev->name.vpid = (nm)->vpid; \ > - rev->name.epoch= (nm)->epoch; \ > + ORTE_EPOCH_SET(rev->name.epoch,(nm)->epoch); \ > rev->tag = (tg); \ > *(rv) = rev; \ > 
opal_event_set(opal_event_base, \ > > Modified: trunk/orte/mca/iof/base/iof_base_open.c > ============================================================================== > --- trunk/orte/mca/iof/base/iof_base_open.c (original) > +++ trunk/orte/mca/iof/base/iof_base_open.c 2011-08-26 18:16:14 EDT (Fri, > 26 Aug 2011) > @@ -91,7 +91,7 @@ > { > ptr->daemon.jobid = ORTE_JOBID_INVALID; > ptr->daemon.vpid = ORTE_VPID_INVALID; > - ptr->daemon.epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(ptr->daemon.epoch,ORTE_EPOCH_MIN); > ptr->wev = OBJ_NEW(orte_iof_write_event_t); > } > static void orte_iof_base_sink_destruct(orte_iof_sink_t* ptr) > > Modified: trunk/orte/mca/iof/hnp/iof_hnp.c > ============================================================================== > --- trunk/orte/mca/iof/hnp/iof_hnp.c (original) > +++ trunk/orte/mca/iof/hnp/iof_hnp.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug > 2011) > @@ -186,7 +186,7 @@ > proct = OBJ_NEW(orte_iof_proc_t); > proct->name.jobid = dst_name->jobid; > proct->name.vpid = dst_name->vpid; > - proct->name.epoch = dst_name->epoch; > + ORTE_EPOCH_SET(proct->name.epoch,dst_name->epoch); > opal_list_append(&mca_iof_hnp_component.procs, &proct->super); > /* see if we are to output to a file */ > if (NULL != orte_output_filename) { > @@ -281,8 +281,7 @@ > &mca_iof_hnp_component.sinks); > sink->daemon.jobid = ORTE_PROC_MY_NAME->jobid; > sink->daemon.vpid = proc->node->daemon->name.vpid; > - sink->daemon.epoch = ORTE_EPOCH_INVALID; > - sink->daemon.epoch = orte_ess.proc_get_epoch(&sink->daemon); > + > ORTE_EPOCH_SET(sink->daemon.epoch,orte_ess.proc_get_epoch(&sink->daemon)); > } > } > > @@ -389,7 +388,7 @@ > &mca_iof_hnp_component.sinks); > sink->daemon.jobid = ORTE_PROC_MY_NAME->jobid; > sink->daemon.vpid = ORTE_PROC_MY_NAME->vpid; > - sink->daemon.epoch = ORTE_PROC_MY_NAME->epoch; > + ORTE_EPOCH_SET(sink->daemon.epoch,ORTE_PROC_MY_NAME->epoch); > > return ORTE_SUCCESS; > } > > Modified: trunk/orte/mca/iof/hnp/iof_hnp_receive.c > 
============================================================================== > --- trunk/orte/mca/iof/hnp/iof_hnp_receive.c (original) > +++ trunk/orte/mca/iof/hnp/iof_hnp_receive.c 2011-08-26 18:16:14 EDT (Fri, > 26 Aug 2011) > @@ -109,21 +109,21 @@ > NULL, &mca_iof_hnp_component.sinks); > sink->daemon.jobid = mev->sender.jobid; > sink->daemon.vpid = mev->sender.vpid; > - sink->daemon.epoch = mev->sender.epoch; > + ORTE_EPOCH_SET(sink->daemon.epoch,mev->sender.epoch); > } > if (ORTE_IOF_STDERR & stream) { > ORTE_IOF_SINK_DEFINE(&sink, &origin, -1, ORTE_IOF_STDERR, > NULL, &mca_iof_hnp_component.sinks); > sink->daemon.jobid = mev->sender.jobid; > sink->daemon.vpid = mev->sender.vpid; > - sink->daemon.epoch = mev->sender.epoch; > + ORTE_EPOCH_SET(sink->daemon.epoch,mev->sender.epoch); > } > if (ORTE_IOF_STDDIAG & stream) { > ORTE_IOF_SINK_DEFINE(&sink, &origin, -1, ORTE_IOF_STDDIAG, > NULL, &mca_iof_hnp_component.sinks); > sink->daemon.jobid = mev->sender.jobid; > sink->daemon.vpid = mev->sender.vpid; > - sink->daemon.epoch = mev->sender.epoch; > + ORTE_EPOCH_SET(sink->daemon.epoch,mev->sender.epoch); > } > goto CLEAN_RETURN; > } > > Modified: trunk/orte/mca/iof/orted/iof_orted.c > ============================================================================== > --- trunk/orte/mca/iof/orted/iof_orted.c (original) > +++ trunk/orte/mca/iof/orted/iof_orted.c 2011-08-26 18:16:14 EDT (Fri, > 26 Aug 2011) > @@ -163,7 +163,7 @@ > proct = OBJ_NEW(orte_iof_proc_t); > proct->name.jobid = dst_name->jobid; > proct->name.vpid = dst_name->vpid; > - proct->name.epoch = dst_name->epoch; > + ORTE_EPOCH_SET(proct->name.epoch,dst_name->epoch); > opal_list_append(&mca_iof_orted_component.procs, &proct->super); > /* see if we are to output to a file */ > if (NULL != orte_output_filename) { > > Modified: trunk/orte/mca/odls/base/odls_base_default_fns.c > ============================================================================== > --- trunk/orte/mca/odls/base/odls_base_default_fns.c 
(original) > +++ trunk/orte/mca/odls/base/odls_base_default_fns.c 2011-08-26 18:16:14 EDT > (Fri, 26 Aug 2011) > @@ -734,8 +734,7 @@ > proc.jobid = jobdat->jobid; > for (j=0; j < jobdat->num_procs; j++) { > proc.vpid = j; > - proc.epoch = ORTE_EPOCH_INVALID; > - proc.epoch = orte_ess.proc_get_epoch(&proc); > + ORTE_EPOCH_SET(proc.epoch,orte_ess.proc_get_epoch(&proc)); > /* get the vpid of the daemon that is to host this proc */ > if (ORTE_VPID_INVALID == (host_daemon = > orte_ess.proc_get_daemon(&proc))) { > ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); > @@ -1044,6 +1043,7 @@ > free(param); > free(value); > > +#if ORTE_ENABLE_EPOCH > /* setup the epoch */ > if (ORTE_SUCCESS != (rc = orte_util_convert_epoch_to_string(&value, > child->name->epoch))) { > ORTE_ERROR_LOG(rc); > @@ -1057,6 +1057,7 @@ > opal_setenv(param, value, true, env); > free(param); > free(value); > +#endif > > /* setup the vpid */ > if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&value, > child->name->vpid))) { > @@ -2721,7 +2722,7 @@ > OBJ_CONSTRUCT(&proctmp, orte_proc_t); > proctmp.name.jobid = ORTE_JOBID_WILDCARD; > proctmp.name.vpid = ORTE_VPID_WILDCARD; > - proctmp.name.epoch = ORTE_EPOCH_WILDCARD; > + ORTE_EPOCH_SET(proctmp.name.epoch,ORTE_EPOCH_WILDCARD); > opal_pointer_array_add(&procarray, &proctmp); > procptr = &procarray; > do_cleanup = true; > > Modified: trunk/orte/mca/odls/base/odls_base_open.c > ============================================================================== > --- trunk/orte/mca/odls/base/odls_base_open.c (original) > +++ trunk/orte/mca/odls/base/odls_base_open.c 2011-08-26 18:16:14 EDT (Fri, > 26 Aug 2011) > @@ -187,7 +187,7 @@ > if (-1 == rank) { > /* wildcard */ > nm->name.vpid = ORTE_VPID_WILDCARD; > - nm->name.epoch = ORTE_EPOCH_WILDCARD; > + ORTE_EPOCH_SET(nm->name.epoch,ORTE_EPOCH_WILDCARD); > } else if (rank < 0) { > /* error out on bozo case */ > orte_show_help("help-odls-base.txt", > @@ -200,8 +200,7 @@ > * will be in the job - we'll check later > */ > 
nm->name.vpid = rank; > - nm->name.epoch = ORTE_EPOCH_INVALID; > - nm->name.epoch = orte_ess.proc_get_epoch(&nm->name); > + > ORTE_EPOCH_SET(nm->name.epoch,orte_ess.proc_get_epoch(&nm->name)); > } > opal_list_append(&orte_odls_globals.xterm_ranks, &nm->item); > } > > Modified: trunk/orte/mca/odls/base/odls_base_state.c > ============================================================================== > --- trunk/orte/mca/odls/base/odls_base_state.c (original) > +++ trunk/orte/mca/odls/base/odls_base_state.c 2011-08-26 18:16:14 EDT > (Fri, 26 Aug 2011) > @@ -77,17 +77,17 @@ > /* if I am the HNP, then use me as the source */ > p_set->source.jobid = ORTE_PROC_MY_NAME->jobid; > p_set->source.vpid = ORTE_PROC_MY_NAME->vpid; > - p_set->source.epoch = ORTE_PROC_MY_NAME->epoch; > + ORTE_EPOCH_SET(p_set->source.epoch,ORTE_PROC_MY_NAME->epoch); > } > else { > /* otherwise, set the HNP as the source */ > p_set->source.jobid = ORTE_PROC_MY_HNP->jobid; > p_set->source.vpid = ORTE_PROC_MY_HNP->vpid; > - p_set->source.epoch = ORTE_PROC_MY_HNP->epoch; > + ORTE_EPOCH_SET(p_set->source.epoch,ORTE_PROC_MY_HNP->epoch); > } > p_set->sink.jobid = ORTE_PROC_MY_NAME->jobid; > p_set->sink.vpid = ORTE_PROC_MY_NAME->vpid; > - p_set->sink.epoch = ORTE_PROC_MY_NAME->epoch; > + ORTE_EPOCH_SET(p_set->sink.epoch,ORTE_PROC_MY_NAME->epoch); > > opal_list_append(&(filem_request->process_sets), &(p_set->super) ); > > > Modified: trunk/orte/mca/oob/tcp/oob_tcp_msg.c > ============================================================================== > --- trunk/orte/mca/oob/tcp/oob_tcp_msg.c (original) > +++ trunk/orte/mca/oob/tcp/oob_tcp_msg.c 2011-08-26 18:16:14 EDT (Fri, > 26 Aug 2011) > @@ -137,6 +137,7 @@ > bool mca_oob_tcp_msg_send_handler(mca_oob_tcp_msg_t* msg, struct > mca_oob_tcp_peer_t * peer) > { > int rc; > + > while(1) { > rc = writev(peer->peer_sd, msg->msg_rwptr, msg->msg_rwnum); > if(rc < 0) { > @@ -338,6 +339,7 @@ > orte_process_name_t src = msg->msg_hdr.msg_src; > > 
OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock); > + > if (orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &peer->peer_name, > &src) != OPAL_EQUAL) { > opal_hash_table_remove_value_uint64(&mca_oob_tcp_component.tcp_peers, > > orte_util_hash_name(&peer->peer_name)); > > Modified: trunk/orte/mca/oob/tcp/oob_tcp_peer.c > ============================================================================== > --- trunk/orte/mca/oob/tcp/oob_tcp_peer.c (original) > +++ trunk/orte/mca/oob/tcp/oob_tcp_peer.c 2011-08-26 18:16:14 EDT (Fri, > 26 Aug 2011) > @@ -903,6 +903,11 @@ > static void mca_oob_tcp_peer_recv_handler(int sd, short flags, void* user) > { > mca_oob_tcp_peer_t* peer = (mca_oob_tcp_peer_t *)user; > + > + if (orte_abnormal_term_ordered) { > + return; > + } > + > OPAL_THREAD_LOCK(&peer->peer_lock); > switch(peer->peer_state) { > case MCA_OOB_TCP_CONNECT_ACK: > > Modified: trunk/orte/mca/plm/base/plm_base_jobid.c > ============================================================================== > --- trunk/orte/mca/plm/base/plm_base_jobid.c (original) > +++ trunk/orte/mca/plm/base/plm_base_jobid.c 2011-08-26 18:16:14 EDT (Fri, > 26 Aug 2011) > @@ -62,12 +62,12 @@ > /* set the name */ > ORTE_PROC_MY_NAME->jobid = 0xffff0000 & ((uint32_t)jobfam << 16); > ORTE_PROC_MY_NAME->vpid = 0; > - ORTE_PROC_MY_NAME->epoch= ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN); > > /* copy it to the HNP field */ > ORTE_PROC_MY_HNP->jobid = ORTE_PROC_MY_NAME->jobid; > ORTE_PROC_MY_HNP->vpid = ORTE_PROC_MY_NAME->vpid; > - ORTE_PROC_MY_HNP->epoch = ORTE_PROC_MY_NAME->epoch; > + ORTE_EPOCH_SET(ORTE_PROC_MY_HNP->epoch,ORTE_PROC_MY_NAME->epoch); > > /* done */ > return ORTE_SUCCESS; > > Modified: trunk/orte/mca/plm/base/plm_base_launch_support.c > ============================================================================== > --- trunk/orte/mca/plm/base/plm_base_launch_support.c (original) > +++ trunk/orte/mca/plm/base/plm_base_launch_support.c 2011-08-26 18:16:14 EDT 
> (Fri, 26 Aug 2011) > @@ -377,8 +377,7 @@ > /* push stdin - the IOF will know what to do with the specified target */ > name.jobid = job; > name.vpid = jdata->stdin_target; > - name.epoch = ORTE_EPOCH_INVALID; > - name.epoch = orte_ess.proc_get_epoch(&name); > + ORTE_EPOCH_SET(name.epoch,orte_ess.proc_get_epoch(&name)); > > if (ORTE_SUCCESS != (rc = orte_iof.push(&name, ORTE_IOF_STDIN, 0))) { > ORTE_ERROR_LOG(rc); > > Modified: trunk/orte/mca/plm/base/plm_base_orted_cmds.c > ============================================================================== > --- trunk/orte/mca/plm/base/plm_base_orted_cmds.c (original) > +++ trunk/orte/mca/plm/base/plm_base_orted_cmds.c 2011-08-26 18:16:14 EDT > (Fri, 26 Aug 2011) > @@ -163,8 +163,7 @@ > continue; > } > peer.vpid = v; > - peer.epoch = ORTE_EPOCH_INVALID; > - peer.epoch = orte_ess.proc_get_epoch(&peer); > + ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); > > /* don't worry about errors on the send here - just > * issue it and keep going > @@ -242,7 +241,7 @@ > OBJ_CONSTRUCT(&proc, orte_proc_t); > proc.name.jobid = jobid; > proc.name.vpid = ORTE_VPID_WILDCARD; > - proc.name.epoch = ORTE_EPOCH_WILDCARD; > + ORTE_EPOCH_SET(proc.name.epoch,ORTE_EPOCH_WILDCARD); > opal_pointer_array_add(&procs, &proc); > if (ORTE_SUCCESS != (rc = orte_plm_base_orted_kill_local_procs(&procs))) { > ORTE_ERROR_LOG(rc); > @@ -340,8 +339,7 @@ > continue; > } > peer.vpid = v; > - peer.epoch = ORTE_EPOCH_INVALID; > - peer.epoch = orte_ess.proc_get_epoch(&peer); > + ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); > /* check to see if this daemon is known to be "dead" */ > if (proc->state > ORTE_PROC_STATE_UNTERMINATED) { > /* don't try to send this */ > > Modified: trunk/orte/mca/plm/base/plm_base_receive.c > ============================================================================== > --- trunk/orte/mca/plm/base/plm_base_receive.c (original) > +++ trunk/orte/mca/plm/base/plm_base_receive.c 2011-08-26 18:16:14 EDT > (Fri, 26 
Aug 2011) > @@ -146,7 +146,9 @@ > orte_job_t *jdata, *parent; > opal_buffer_t answer; > orte_vpid_t vpid; > +#if ORTE_ENABLE_EPOCH > orte_epoch_t epoch; > +#endif > orte_proc_t *proc; > orte_proc_state_t state; > orte_exit_code_t exit_code; > @@ -394,8 +396,7 @@ > break; > } > name.vpid = vpid; > - name.epoch = ORTE_EPOCH_INVALID; > - name.epoch = orte_ess.proc_get_epoch(&name); > + > ORTE_EPOCH_SET(name.epoch,orte_ess.proc_get_epoch(&name)); > > /* unpack the pid */ > count = 1; > @@ -488,9 +489,11 @@ > } > name.vpid = vpid; > > +#if ORTE_ENABLE_EPOCH > count=1; > opal_dss.unpack(msgpkt->buffer, &epoch, &count, ORTE_EPOCH); > name.epoch = epoch; > +#endif > > OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, > "%s plm:base:receive Described rank %s", > > Modified: trunk/orte/mca/plm/base/plm_base_rsh_support.c > ============================================================================== > --- trunk/orte/mca/plm/base/plm_base_rsh_support.c (original) > +++ trunk/orte/mca/plm/base/plm_base_rsh_support.c 2011-08-26 18:16:14 EDT > (Fri, 26 Aug 2011) > @@ -1527,7 +1527,9 @@ > { > char *param, *path, *tmp, *cmd, *basename, *dest_dir; > int i; > +#if ORTE_ENABLE_EPOCH > orte_epoch_t epoch; > +#endif > orte_process_name_t proc; > > /* if a prefix is set, pass it to the bootproxy in a special way */ > @@ -1638,6 +1640,7 @@ > opal_setenv("OMPI_COMM_WORLD_RANK", cmd, true, argv); > free(cmd); > > +#if ORTE_ENABLE_EPOCH > /* set the epoch */ > proc.jobid = jobid; > proc.vpid = vpid; > @@ -1648,6 +1651,7 @@ > opal_setenv(param, cmd, true, argv); > free(param); > free(cmd); > +#endif > > /* set the number of procs */ > asprintf(&cmd, "%d", (int)num_procs); > > Modified: trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c > ============================================================================== > --- trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c (original) > +++ trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c 2011-08-26 > 18:16:14 EDT (Fri, 26 Aug 2011) > @@ 
-33,12 +33,14 @@ > #include "orte/mca/ess/ess.h" > #include "opal/mca/sysinfo/sysinfo_types.h" > > +#include "orte/types.h" > #include "orte/util/show_help.h" > #include "orte/util/name_fns.h" > #include "orte/runtime/orte_globals.h" > #include "orte/util/hostfile/hostfile.h" > #include "orte/util/dash_host/dash_host.h" > #include "orte/mca/errmgr/errmgr.h" > +#include "orte/runtime/data_type_support/orte_dt_support.h" > > #include "orte/mca/rmaps/base/rmaps_private.h" > #include "orte/mca/rmaps/base/base.h" > @@ -454,7 +456,7 @@ > */ > > /* We do set the epoch here since they all start with the same value. > */ > - proc->name.epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN); > > proc->app_idx = app_idx; > OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output, > @@ -559,11 +561,12 @@ > } > } > proc->name.vpid = vpid; > - proc->name.epoch = ORTE_EPOCH_INVALID; > - proc->name.epoch = orte_ess.proc_get_epoch(&proc->name); > + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID); > + > ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name)); > + > /* If there is an invalid epoch here, it's because it > doesn't exist yet. 
*/ > - if (ORTE_NODE_RANK_INVALID == proc->name.epoch) { > - proc->name.epoch = ORTE_EPOCH_MIN; > + if (0 == > ORTE_EPOCH_CMP(ORTE_EPOCH_INVALID,proc->name.epoch)) { > + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN); > } > } > if (NULL == opal_pointer_array_get_item(jdata->procs, > proc->name.vpid)) { > @@ -601,8 +604,8 @@ > } > } > proc->name.vpid = vpid; > - proc->name.epoch = ORTE_EPOCH_INVALID; > - proc->name.epoch = orte_ess.proc_get_epoch(&proc->name); > + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID); > + > ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name)); > } > if (NULL == opal_pointer_array_get_item(jdata->procs, > proc->name.vpid)) { > if (ORTE_SUCCESS != (rc = > opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { > @@ -835,7 +838,7 @@ > return ORTE_ERR_OUT_OF_RESOURCE; > } > proc->name.vpid = daemons->num_procs; /* take the next available > vpid */ > - proc->name.epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN); > proc->node = node; > proc->nodename = node->name; > OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output, > @@ -1014,8 +1017,8 @@ > return ORTE_ERR_OUT_OF_RESOURCE; > } > proc->name.vpid = jdata->num_procs; /* take the next available vpid > */ > - proc->name.epoch = ORTE_EPOCH_INVALID; > - proc->name.epoch = orte_ess.proc_get_epoch(&proc->name); > + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID); > + > ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name)); > proc->node = node; > proc->nodename = node->name; > OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output, > > Modified: trunk/orte/mca/rmaps/rank_file/rmaps_rank_file.c > ============================================================================== > --- trunk/orte/mca/rmaps/rank_file/rmaps_rank_file.c (original) > +++ trunk/orte/mca/rmaps/rank_file/rmaps_rank_file.c 2011-08-26 18:16:14 EDT > (Fri, 26 Aug 2011) > @@ -502,8 +502,7 @@ > } > proc->name.vpid = rank; > /* Either init or update the 
epoch. */ > - proc->name.epoch = ORTE_EPOCH_INVALID; > - proc->name.epoch = orte_ess.proc_get_epoch(&proc->name); > + > ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name)); > > proc->slot_list = strdup(rfmap->slot_list); > /* insert the proc into the proper place */ > > Modified: trunk/orte/mca/rmaps/seq/rmaps_seq.c > ============================================================================== > --- trunk/orte/mca/rmaps/seq/rmaps_seq.c (original) > +++ trunk/orte/mca/rmaps/seq/rmaps_seq.c 2011-08-26 18:16:14 EDT (Fri, > 26 Aug 2011) > @@ -235,8 +235,7 @@ > } > /* assign the vpid */ > proc->name.vpid = vpid++; > - proc->name.epoch = ORTE_EPOCH_INVALID; > - proc->name.epoch = orte_ess.proc_get_epoch(&proc->name); > + > ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name)); > > /* add to the jdata proc array */ > if (ORTE_SUCCESS != (rc = > opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { > > Modified: trunk/orte/mca/rmcast/base/rmcast_base_open.c > ============================================================================== > --- trunk/orte/mca/rmcast/base/rmcast_base_open.c (original) > +++ trunk/orte/mca/rmcast/base/rmcast_base_open.c 2011-08-26 18:16:14 EDT > (Fri, 26 Aug 2011) > @@ -341,7 +341,7 @@ > { > ptr->name.jobid = ORTE_JOBID_INVALID; > ptr->name.vpid = ORTE_VPID_INVALID; > - ptr->name.epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(ptr->name.epoch,ORTE_EPOCH_MIN); > ptr->channel = ORTE_RMCAST_INVALID_CHANNEL; > OBJ_CONSTRUCT(&ptr->ctl, orte_thread_ctl_t); > ptr->seq_num = ORTE_RMCAST_SEQ_INVALID; > @@ -430,7 +430,7 @@ > { > ptr->name.jobid = ORTE_JOBID_INVALID; > ptr->name.vpid = ORTE_VPID_INVALID; > - ptr->name.epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(ptr->name.epoch,ORTE_EPOCH_MIN); > OBJ_CONSTRUCT(&ptr->last_msg, opal_list_t); > } > static void recvlog_destruct(rmcast_recv_log_t *ptr) > @@ -439,7 +439,7 @@ > > ptr->name.jobid = ORTE_JOBID_INVALID; > ptr->name.vpid = ORTE_VPID_INVALID; > - 
ptr->name.epoch = ORTE_EPOCH_INVALID; > + ORTE_EPOCH_SET(ptr->name.epoch,ORTE_EPOCH_INVALID); > while (NULL != (item = opal_list_remove_first(&ptr->last_msg))) { > OBJ_RELEASE(item); > } > > Modified: trunk/orte/mca/rmcast/tcp/rmcast_tcp.c > ============================================================================== > --- trunk/orte/mca/rmcast/tcp/rmcast_tcp.c (original) > +++ trunk/orte/mca/rmcast/tcp/rmcast_tcp.c 2011-08-26 18:16:14 EDT (Fri, > 26 Aug 2011) > @@ -681,7 +681,7 @@ > /* caller requested id of sender */ > name->jobid = recvptr->name.jobid; > name->vpid = recvptr->name.vpid; > - name->epoch= recvptr->name.epoch; > + ORTE_EPOCH_SET(name->epoch,recvptr->name.epoch); > } > *seq_num = recvptr->seq_num; > *msg = recvptr->iovec_array; > @@ -776,7 +776,7 @@ > /* caller requested id of sender */ > name->jobid = recvptr->name.jobid; > name->vpid = recvptr->name.vpid; > - name->epoch= recvptr->name.epoch; > + ORTE_EPOCH_SET(name->epoch,recvptr->name.epoch); > } > *seq_num = recvptr->seq_num; > if (ORTE_SUCCESS != (ret = opal_dss.copy_payload(buf, recvptr->buf))) { > > Modified: trunk/orte/mca/rmcast/udp/rmcast_udp.c > ============================================================================== > --- trunk/orte/mca/rmcast/udp/rmcast_udp.c (original) > +++ trunk/orte/mca/rmcast/udp/rmcast_udp.c 2011-08-26 18:16:14 EDT (Fri, > 26 Aug 2011) > @@ -460,7 +460,7 @@ > /* caller requested id of sender */ > name->jobid = recvptr->name.jobid; > name->vpid = recvptr->name.vpid; > - name->epoch= recvptr->name.epoch; > + ORTE_EPOCH_SET(name->epoch,recvptr->name.epoch); > } > *seq_num = recvptr->seq_num; > *msg = recvptr->iovec_array; > @@ -553,7 +553,7 @@ > /* caller requested id of sender */ > name->jobid = recvptr->name.jobid; > name->vpid = recvptr->name.vpid; > - name->epoch= recvptr->name.epoch; > + ORTE_EPOCH_SET(name->epoch,recvptr->name.epoch); > } > *seq_num = recvptr->seq_num; > if (ORTE_SUCCESS != (ret = opal_dss.copy_payload(buf, recvptr->buf))) { > > 
Modified: trunk/orte/mca/rml/base/rml_base_components.c > ============================================================================== > --- trunk/orte/mca/rml/base/rml_base_components.c (original) > +++ trunk/orte/mca/rml/base/rml_base_components.c 2011-08-26 18:16:14 EDT > (Fri, 26 Aug 2011) > @@ -20,6 +20,7 @@ > #include "opal/util/output.h" > > #include "orte/mca/rml/rml.h" > +#include "orte/util/name_fns.h" > > #if !ORTE_DISABLE_FULL_SUPPORT > > @@ -67,14 +68,14 @@ > { > pkt->sender.jobid = ORTE_JOBID_INVALID; > pkt->sender.vpid = ORTE_VPID_INVALID; > - pkt->sender.epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(pkt->sender.epoch,ORTE_EPOCH_MIN); > pkt->buffer = NULL; > } > static void msg_pkt_destructor(orte_msg_packet_t *pkt) > { > pkt->sender.jobid = ORTE_JOBID_INVALID; > pkt->sender.vpid = ORTE_VPID_INVALID; > - pkt->sender.epoch = ORTE_EPOCH_INVALID; > + ORTE_EPOCH_SET(pkt->sender.epoch,ORTE_EPOCH_INVALID); > if (NULL != pkt->buffer) { > OBJ_RELEASE(pkt->buffer); > } > > Modified: trunk/orte/mca/rml/rml_types.h > ============================================================================== > --- trunk/orte/mca/rml/rml_types.h (original) > +++ trunk/orte/mca/rml/rml_types.h 2011-08-26 18:16:14 EDT (Fri, 26 Aug > 2011) > @@ -62,7 +62,7 @@ > pkt = OBJ_NEW(orte_msg_packet_t); \ > pkt->sender.jobid = (sndr)->jobid; \ > pkt->sender.vpid = (sndr)->vpid; \ > - pkt->sender.epoch = (sndr)->epoch; \ > + ORTE_EPOCH_SET(pkt->sender.epoch,(sndr)->epoch); \ > if ((crt)) { \ > pkt->buffer = OBJ_NEW(opal_buffer_t); \ > opal_dss.copy_payload(pkt->buffer, *(buf)); \ > @@ -85,7 +85,7 @@ > pkt = OBJ_NEW(orte_msg_packet_t); \ > pkt->sender.jobid = (sndr)->jobid; \ > pkt->sender.vpid = (sndr)->vpid; \ > - pkt->sender.epoch = (sndr)->epoch; \ > + ORTE_EPOCH_SET(pkt->sender.epoch,(sndr)->epoch); \ > if ((crt)) { \ > pkt->buffer = OBJ_NEW(opal_buffer_t); \ > opal_dss.copy_payload(pkt->buffer, *(buf)); \ > @@ -191,8 +191,10 @@ > > #define ORTE_RML_TAG_SUBSCRIBE 46 > > +#if 
ORTE_ENABLE_EPOCH > /* For Epoch Updates */ > #define ORTE_RML_TAG_EPOCH_CHANGE 47 > +#endif > > /* Notify of failed processes */ > #define ORTE_RML_TAG_FAILURE_NOTICE 48 > > Modified: trunk/orte/mca/routed/base/routed_base_components.c > ============================================================================== > --- trunk/orte/mca/routed/base/routed_base_components.c (original) > +++ trunk/orte/mca/routed/base/routed_base_components.c 2011-08-26 > 18:16:14 EDT (Fri, 26 Aug 2011) > @@ -65,7 +65,7 @@ > { > ptr->route.jobid = ORTE_JOBID_INVALID; > ptr->route.vpid = ORTE_VPID_INVALID; > - ptr->route.epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(ptr->route.epoch,ORTE_EPOCH_MIN); > ptr->hnp_uri = NULL; > } > static void jfamdest(orte_routed_jobfam_t *ptr) > @@ -117,7 +117,7 @@ > jfam = OBJ_NEW(orte_routed_jobfam_t); > jfam->route.jobid = ORTE_PROC_MY_HNP->jobid; > jfam->route.vpid = ORTE_PROC_MY_HNP->vpid; > - jfam->route.epoch = ORTE_PROC_MY_HNP->epoch; > + ORTE_EPOCH_SET(jfam->route.epoch,ORTE_PROC_MY_HNP->epoch); > jfam->job_family = ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid); > if (NULL != orte_process_info.my_hnp_uri) { > jfam->hnp_uri = strdup(orte_process_info.my_hnp_uri); > @@ -252,7 +252,7 @@ > jfam->job_family = jobfamily; > jfam->route.jobid = name.jobid; > jfam->route.vpid = name.vpid; > - jfam->route.epoch = name.epoch; > + ORTE_EPOCH_SET(jfam->route.epoch,name.epoch); > jfam->hnp_uri = strdup(uri); > done: > free(uri); > > Modified: trunk/orte/mca/routed/base/routed_base_register_sync.c > ============================================================================== > --- trunk/orte/mca/routed/base/routed_base_register_sync.c (original) > +++ trunk/orte/mca/routed/base/routed_base_register_sync.c 2011-08-26 > 18:16:14 EDT (Fri, 26 Aug 2011) > @@ -127,7 +127,9 @@ > orte_std_cntr_t cnt; > char *rml_uri; > orte_vpid_t vpid; > +#if ORTE_ENABLE_EPOCH > orte_epoch_t epoch; > +#endif > int rc; > > if (ORTE_JOB_FAMILY(job) == 
ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) { > @@ -146,11 +148,13 @@ > cnt = 1; > while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &vpid, &cnt, > ORTE_VPID))) { > > +#if ORTE_ENABLE_EPOCH > cnt = 1; > if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &epoch, &cnt, > ORTE_EPOCH))) { > ORTE_ERROR_LOG(rc); > continue; > } > +#endif > > if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &rml_uri, &cnt, > OPAL_STRING))) { > ORTE_ERROR_LOG(rc); > > Modified: trunk/orte/mca/routed/binomial/routed_binomial.c > ============================================================================== > --- trunk/orte/mca/routed/binomial/routed_binomial.c (original) > +++ trunk/orte/mca/routed/binomial/routed_binomial.c 2011-08-26 18:16:14 EDT > (Fri, 26 Aug 2011) > @@ -33,6 +33,7 @@ > #include "orte/runtime/orte_globals.h" > #include "orte/runtime/orte_wait.h" > #include "orte/runtime/runtime.h" > +#include "orte/runtime/data_type_support/orte_dt_support.h" > > #include "orte/mca/rml/base/rml_contact.h" > > @@ -147,7 +148,7 @@ > > if (proc->jobid == ORTE_JOBID_INVALID || > proc->vpid == ORTE_VPID_INVALID || > - proc->epoch == ORTE_EPOCH_INVALID) { > + 0 == ORTE_EPOCH_CMP(proc->epoch,ORTE_EPOCH_INVALID)) { > return ORTE_ERR_BAD_PARAM; > } > > @@ -216,7 +217,7 @@ > > if (target->jobid == ORTE_JOBID_INVALID || > target->vpid == ORTE_VPID_INVALID || > - target->epoch == ORTE_EPOCH_INVALID) { > + 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { > return ORTE_ERR_BAD_PARAM; > } > > @@ -274,8 +275,7 @@ > ORTE_NAME_PRINT(route))); > jfam->route.jobid = route->jobid; > jfam->route.vpid = route->vpid; > - jfam->route.epoch = ORTE_EPOCH_INVALID; > - jfam->route.epoch = orte_ess.proc_get_epoch(&jfam->route); > + > ORTE_EPOCH_SET(jfam->route.epoch,orte_ess.proc_get_epoch(&jfam->route)); > > return ORTE_SUCCESS; > } > @@ -290,8 +290,7 @@ > jfam->job_family = jfamily; > jfam->route.jobid = route->jobid; > jfam->route.vpid = route->vpid; > - jfam->route.epoch = ORTE_EPOCH_INVALID; > - 
jfam->route.epoch = orte_ess.proc_get_epoch(&jfam->route); > + > ORTE_EPOCH_SET(jfam->route.epoch,orte_ess.proc_get_epoch(&jfam->route)); > > opal_pointer_array_add(&orte_routed_jobfams, jfam); > return ORTE_SUCCESS; > @@ -317,11 +316,21 @@ > /* initialize */ > daemon.jobid = ORTE_PROC_MY_DAEMON->jobid; > daemon.vpid = ORTE_PROC_MY_DAEMON->vpid; > - daemon.epoch = ORTE_PROC_MY_DAEMON->epoch; > + ORTE_EPOCH_SET(daemon.epoch,ORTE_PROC_MY_DAEMON->epoch); > > +#if ORTE_ENABLE_EPOCH > if (target->jobid == ORTE_JOBID_INVALID || > target->vpid == ORTE_VPID_INVALID || > target->epoch == ORTE_EPOCH_INVALID) { > +#else > + if (target->jobid == ORTE_JOBID_INVALID || > + target->vpid == ORTE_VPID_INVALID) { > +#endif > + ret = ORTE_NAME_INVALID; > + goto found; > + } > + > + if (0 > ORTE_EPOCH_CMP(target->epoch, orte_ess.proc_get_epoch(target))) { > ret = ORTE_NAME_INVALID; > goto found; > } > @@ -443,7 +452,7 @@ > > /* If the daemon to which we should be routing is dead, then > update > * the routing tree and start over. */ > - if (!orte_util_proc_is_running(&daemon)) { > + if (!PROC_IS_RUNNING(&daemon)) { > update_routing_tree(daemon.jobid); > goto startover; > } > @@ -461,8 +470,7 @@ > ret = &daemon; > > found: > - daemon.epoch = ORTE_EPOCH_INVALID; > - daemon.epoch = orte_ess.proc_get_epoch(&daemon); > + ORTE_EPOCH_SET(daemon.epoch,orte_ess.proc_get_epoch(&daemon)); > > OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output, > "%s routed_binomial_get(%s) --> %s", > @@ -879,7 +887,7 @@ > */ > local_lifeline.jobid = proc->jobid; > local_lifeline.vpid = proc->vpid; > - local_lifeline.epoch = proc->epoch; > + ORTE_EPOCH_SET(local_lifeline.epoch,proc->epoch); > lifeline = &local_lifeline; > > return ORTE_SUCCESS; > @@ -924,11 +932,11 @@ > * that process so we can check it's state. 
> */ > proc_name.vpid = peer; > - proc_name.epoch = orte_util_lookup_epoch(&proc_name); > + > ORTE_EPOCH_SET(proc_name.epoch,orte_util_lookup_epoch(&proc_name)); > > - if (!orte_util_proc_is_running(&proc_name) > - && ORTE_EPOCH_MIN < proc_name.epoch > - && ORTE_EPOCH_INVALID != proc_name.epoch) { > + if (!PROC_IS_RUNNING(&proc_name) > + && 0 < ORTE_EPOCH_CMP(ORTE_EPOCH_MIN,proc_name.epoch) > + && 0 != > ORTE_EPOCH_CMP(ORTE_EPOCH_INVALID,proc_name.epoch)) { > OPAL_OUTPUT_VERBOSE((3, orte_routed_base_output, > "%s routed:binomial child %s is > dead", > ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), > @@ -967,7 +975,7 @@ > } > > /* find the children of this rank */ > - OPAL_OUTPUT_VERBOSE((3, orte_routed_base_output, > + OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, > "%s routed:binomial find children of rank %d", > ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rank)); > bitmap = opal_cube_dim(num_procs); > @@ -977,24 +985,25 @@ > > for (i = hibit + 1, mask = 1 << i; i <= bitmap; ++i, mask <<= 1) { > peer = rank | mask; > - OPAL_OUTPUT_VERBOSE((3, orte_routed_base_output, > + OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, > "%s routed:binomial find children checking peer > %d", > ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), peer)); > if (peer < num_procs) { > - OPAL_OUTPUT_VERBOSE((3, orte_routed_base_output, > + OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, > "%s routed:binomial find children computing > tree", > ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); > /* execute compute on this child */ > if (0 <= (found = binomial_tree(peer, rank, me, num_procs, > nchildren, childrn, relatives, mine, jobid))) { > proc_name.vpid = found; > > - if (!orte_util_proc_is_running(&proc_name) && ORTE_EPOCH_MIN > < orte_util_lookup_epoch(&proc_name)) { > - OPAL_OUTPUT_VERBOSE((3, orte_routed_base_output, > + if (!PROC_IS_RUNNING(&proc_name) > + && 0 < > ORTE_EPOCH_CMP(ORTE_EPOCH_MIN,orte_util_lookup_epoch(&proc_name))) { > + OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, > "%s routed:binomial find children > 
proc out of date - returning parent %d", > ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), > parent)); > return parent; > } > - OPAL_OUTPUT_VERBOSE((3, orte_routed_base_output, > + OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, > "%s routed:binomial find children > returning found value %d", > ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), > found)); > return found; > @@ -1029,8 +1038,7 @@ > ORTE_PROC_MY_PARENT->vpid = binomial_tree(0, 0, ORTE_PROC_MY_NAME->vpid, > orte_process_info.max_procs, > &num_children, &my_children, NULL, true, > jobid); > - ORTE_PROC_MY_PARENT->epoch = ORTE_EPOCH_INVALID; > - ORTE_PROC_MY_PARENT->epoch = > orte_ess.proc_get_epoch(ORTE_PROC_MY_PARENT); > + > ORTE_EPOCH_SET(ORTE_PROC_MY_PARENT->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_PARENT)); > > if (0 < opal_output_get_verbosity(orte_routed_base_output)) { > opal_output(0, "%s: parent %d num_children %d", > ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_PROC_MY_PARENT->vpid, num_children); > > Modified: trunk/orte/mca/routed/cm/routed_cm.c > ============================================================================== > --- trunk/orte/mca/routed/cm/routed_cm.c (original) > +++ trunk/orte/mca/routed/cm/routed_cm.c 2011-08-26 18:16:14 EDT (Fri, > 26 Aug 2011) > @@ -35,6 +35,7 @@ > #include "orte/runtime/orte_globals.h" > #include "orte/runtime/orte_wait.h" > #include "orte/runtime/runtime.h" > +#include "orte/runtime/data_type_support/orte_dt_support.h" > > #include "orte/mca/rml/base/rml_contact.h" > > @@ -139,7 +140,7 @@ > > if (proc->jobid == ORTE_JOBID_INVALID || > proc->vpid == ORTE_VPID_INVALID || > - proc->epoch == ORTE_EPOCH_INVALID) { > + 0 == ORTE_EPOCH_CMP(proc->epoch,ORTE_EPOCH_INVALID)) { > return ORTE_ERR_BAD_PARAM; > } > > @@ -200,7 +201,7 @@ > > if (target->jobid == ORTE_JOBID_INVALID || > target->vpid == ORTE_VPID_INVALID || > - target->epoch == ORTE_EPOCH_INVALID) { > + 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { > return ORTE_ERR_BAD_PARAM; > } > > @@ -257,8 +258,7 @@ > 
ORTE_NAME_PRINT(route))); > jfam->route.jobid = route->jobid; > jfam->route.vpid = route->vpid; > - jfam->route.epoch = ORTE_EPOCH_INVALID; > - jfam->route.epoch = orte_ess.proc_get_epoch(&jfam->route); > + > ORTE_EPOCH_SET(jfam->route.epoch,orte_ess.proc_get_epoch(&jfam->route)); > > return ORTE_SUCCESS; > } > @@ -273,8 +273,7 @@ > jfam->job_family = jfamily; > jfam->route.jobid = route->jobid; > jfam->route.vpid = route->vpid; > - jfam->route.epoch = ORTE_EPOCH_INVALID; > - jfam->route.epoch = orte_ess.proc_get_epoch(&jfam->route); > + > ORTE_EPOCH_SET(jfam->route.epoch,orte_ess.proc_get_epoch(&jfam->route)); > > opal_pointer_array_add(&orte_routed_jobfams, jfam); > return ORTE_SUCCESS; > @@ -299,7 +298,7 @@ > > if (target->jobid == ORTE_JOBID_INVALID || > target->vpid == ORTE_VPID_INVALID || > - target->epoch == ORTE_EPOCH_INVALID) { > + 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { > ret = ORTE_NAME_INVALID; > goto found; > } > @@ -367,8 +366,7 @@ > } > > /* Initialize daemon's epoch, based on its current vpid/jobid */ > - daemon.epoch = ORTE_EPOCH_INVALID; > - daemon.epoch = orte_ess.proc_get_epoch(&daemon); > + ORTE_EPOCH_SET(daemon.epoch,orte_ess.proc_get_epoch(&daemon)); > > /* if the daemon is me, then send direct to the target! 
*/ > if (ORTE_PROC_MY_NAME->vpid == daemon.vpid) { > @@ -814,8 +812,7 @@ > */ > local_lifeline.jobid = proc->jobid; > local_lifeline.vpid = proc->vpid; > - local_lifeline.epoch = ORTE_EPOCH_INVALID; > - local_lifeline.epoch = orte_ess.proc_get_epoch(&local_lifeline); > + > ORTE_EPOCH_SET(local_lifeline.epoch,orte_ess.proc_get_epoch(&local_lifeline)); > > lifeline = &local_lifeline; > > > Modified: trunk/orte/mca/routed/direct/routed_direct.c > ============================================================================== > --- trunk/orte/mca/routed/direct/routed_direct.c (original) > +++ trunk/orte/mca/routed/direct/routed_direct.c 2011-08-26 18:16:14 EDT > (Fri, 26 Aug 2011) > @@ -24,6 +24,7 @@ > #include "orte/util/name_fns.h" > #include "orte/util/proc_info.h" > #include "orte/runtime/orte_globals.h" > +#include "orte/runtime/data_type_support/orte_dt_support.h" > > #include "orte/mca/rml/base/rml_contact.h" > > @@ -135,7 +136,7 @@ > > if (target->jobid == ORTE_JOBID_INVALID || > target->vpid == ORTE_VPID_INVALID || > - target->epoch == ORTE_EPOCH_INVALID) { > + 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { > ret = ORTE_NAME_INVALID; > } else { > /* all routes are direct */ > > Modified: trunk/orte/mca/routed/linear/routed_linear.c > ============================================================================== > --- trunk/orte/mca/routed/linear/routed_linear.c (original) > +++ trunk/orte/mca/routed/linear/routed_linear.c 2011-08-26 18:16:14 EDT > (Fri, 26 Aug 2011) > @@ -31,6 +31,7 @@ > #include "orte/runtime/orte_globals.h" > #include "orte/runtime/orte_wait.h" > #include "orte/runtime/runtime.h" > +#include "orte/runtime/data_type_support/orte_dt_support.h" > > #include "orte/mca/rml/base/rml_contact.h" > > @@ -132,7 +133,7 @@ > > if (proc->jobid == ORTE_JOBID_INVALID || > proc->vpid == ORTE_VPID_INVALID || > - proc->epoch == ORTE_EPOCH_INVALID) { > + 0 == ORTE_EPOCH_CMP(proc->epoch,ORTE_EPOCH_INVALID)) { > return ORTE_ERR_BAD_PARAM; > } > > @@ 
-201,7 +202,7 @@ > > if (target->jobid == ORTE_JOBID_INVALID || > target->vpid == ORTE_VPID_INVALID || > - target->epoch == ORTE_EPOCH_INVALID) { > + 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { > return ORTE_ERR_BAD_PARAM; > } > > @@ -259,7 +260,7 @@ > ORTE_NAME_PRINT(route))); > jfam->route.jobid = route->jobid; > jfam->route.vpid = route->vpid; > - jfam->route.epoch = route->epoch; > + ORTE_EPOCH_SET(jfam->route.epoch,route->epoch); > return ORTE_SUCCESS; > } > } > @@ -273,7 +274,7 @@ > jfam->job_family = jfamily; > jfam->route.jobid = route->jobid; > jfam->route.vpid = route->vpid; > - jfam->route.epoch = route->epoch; > + ORTE_EPOCH_SET(jfam->route.epoch,route->epoch); > opal_pointer_array_add(&orte_routed_jobfams, jfam); > return ORTE_SUCCESS; > } > @@ -373,8 +374,7 @@ > } > > /* Initialize daemon's epoch, based on its current vpid/jobid */ > - daemon.epoch = ORTE_EPOCH_INVALID; > - daemon.epoch = orte_ess.proc_get_epoch(&daemon); > + ORTE_EPOCH_SET(daemon.epoch,orte_ess.proc_get_epoch(&daemon)); > > /* if the daemon is me, then send direct to the target! 
*/ > if (ORTE_PROC_MY_NAME->vpid == daemon.vpid) { > @@ -395,8 +395,7 @@ > /* we are at end of chain - wrap around */ > daemon.vpid = 0; > } > - daemon.epoch = ORTE_EPOCH_INVALID; > - daemon.epoch = orte_ess.proc_get_epoch(&daemon); > + ORTE_EPOCH_SET(daemon.epoch,orte_ess.proc_get_epoch(&daemon)); > ret = &daemon; > } > } > @@ -741,7 +740,7 @@ > */ > local_lifeline.jobid = proc->jobid; > local_lifeline.vpid = proc->vpid; > - local_lifeline.epoch = proc->epoch; > + ORTE_EPOCH_SET(local_lifeline.epoch,proc->epoch); > lifeline = &local_lifeline; > > return ORTE_SUCCESS; > > Modified: trunk/orte/mca/routed/radix/routed_radix.c > ============================================================================== > --- trunk/orte/mca/routed/radix/routed_radix.c (original) > +++ trunk/orte/mca/routed/radix/routed_radix.c 2011-08-26 18:16:14 EDT > (Fri, 26 Aug 2011) > @@ -31,6 +31,7 @@ > #include "orte/runtime/orte_globals.h" > #include "orte/runtime/orte_wait.h" > #include "orte/runtime/runtime.h" > +#include "orte/runtime/data_type_support/orte_dt_support.h" > > #include "orte/mca/rml/base/rml_contact.h" > > @@ -145,7 +146,7 @@ > > if (proc->jobid == ORTE_JOBID_INVALID || > proc->vpid == ORTE_VPID_INVALID || > - proc->epoch == ORTE_EPOCH_INVALID) { > + 0 == ORTE_EPOCH_CMP(proc->epoch,ORTE_EPOCH_INVALID)) { > return ORTE_ERR_BAD_PARAM; > } > > @@ -214,7 +215,7 @@ > > if (target->jobid == ORTE_JOBID_INVALID || > target->vpid == ORTE_VPID_INVALID || > - target->epoch == ORTE_EPOCH_INVALID) { > + 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { > return ORTE_ERR_BAD_PARAM; > } > > @@ -272,7 +273,7 @@ > ORTE_NAME_PRINT(route))); > jfam->route.jobid = route->jobid; > jfam->route.vpid = route->vpid; > - jfam->route.epoch = route->epoch; > + ORTE_EPOCH_SET(jfam->route.epoch,route->epoch); > return ORTE_SUCCESS; > } > } > @@ -286,7 +287,7 @@ > jfam->job_family = jfamily; > jfam->route.jobid = route->jobid; > jfam->route.vpid = route->vpid; > - jfam->route.epoch = 
route->epoch; > + ORTE_EPOCH_SET(jfam->route.epoch,route->epoch); > opal_pointer_array_add(&orte_routed_jobfams, jfam); > return ORTE_SUCCESS; > } > @@ -310,7 +311,7 @@ > > if (target->jobid == ORTE_JOBID_INVALID || > target->vpid == ORTE_VPID_INVALID || > - target->epoch == ORTE_EPOCH_INVALID) { > + 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { > ret = ORTE_NAME_INVALID; > goto found; > } > @@ -413,8 +414,7 @@ > if (opal_bitmap_is_set_bit(&child->relatives, daemon.vpid)) { > /* yep - we need to step through this child */ > daemon.vpid = child->vpid; > - daemon.epoch = ORTE_EPOCH_INVALID; > - daemon.epoch = orte_ess.proc_get_epoch(&daemon); > + > ORTE_EPOCH_SET(daemon.epoch,orte_ess.proc_get_epoch(&daemon)); > ret = &daemon; > goto found; > } > @@ -425,8 +425,7 @@ > * any of our children, so we have to step up through our parent > */ > daemon.vpid = ORTE_PROC_MY_PARENT->vpid; > - daemon.epoch = ORTE_EPOCH_INVALID; > - daemon.epoch = orte_ess.proc_get_epoch(&daemon); > + ORTE_EPOCH_SET(daemon.epoch,orte_ess.proc_get_epoch(&daemon)); > > ret = &daemon; > > @@ -788,7 +787,7 @@ > */ > local_lifeline.jobid = proc->jobid; > local_lifeline.vpid = proc->vpid; > - local_lifeline.epoch = proc->epoch; > + ORTE_EPOCH_SET(local_lifeline.epoch,proc->epoch); > lifeline = &local_lifeline; > > return ORTE_SUCCESS; > @@ -881,8 +880,7 @@ > ORTE_PROC_MY_PARENT->vpid = (Ii-Sum) % NInPrevLevel; > ORTE_PROC_MY_PARENT->vpid += (Sum - NInPrevLevel); > } > - ORTE_PROC_MY_PARENT->epoch = ORTE_EPOCH_INVALID; > - ORTE_PROC_MY_PARENT->epoch = > orte_ess.proc_get_epoch(ORTE_PROC_MY_PARENT); > + > ORTE_EPOCH_SET(ORTE_PROC_MY_PARENT->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_PARENT)); > > /* compute my direct children and the bitmap that shows which vpids > * lie underneath their branch > > Modified: trunk/orte/mca/routed/slave/routed_slave.c > ============================================================================== > --- trunk/orte/mca/routed/slave/routed_slave.c (original) > 
+++ trunk/orte/mca/routed/slave/routed_slave.c 2011-08-26 18:16:14 EDT > (Fri, 26 Aug 2011) > @@ -26,6 +26,7 @@ > #include "orte/runtime/orte_globals.h" > #include "orte/runtime/orte_wait.h" > #include "orte/runtime/runtime.h" > +#include "orte/runtime/data_type_support/orte_dt_support.h" > > #include "orte/mca/rml/base/rml_contact.h" > > @@ -134,7 +135,7 @@ > > if (target->jobid == ORTE_JOBID_INVALID || > target->vpid == ORTE_VPID_INVALID || > - target->epoch == ORTE_EPOCH_INVALID) { > + 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { > ret = ORTE_NAME_INVALID; > } else { > /* a slave must always route via its parent daemon */ > @@ -275,8 +276,7 @@ > */ > local_lifeline.jobid = proc->jobid; > local_lifeline.vpid = proc->vpid; > - local_lifeline.epoch = ORTE_EPOCH_INVALID; > - local_lifeline.epoch = orte_ess.proc_get_epoch(&local_lifeline); > + > ORTE_EPOCH_SET(local_lifeline.epoch,orte_ess.proc_get_epoch(&local_lifeline)); > > lifeline = &local_lifeline; > > > Modified: trunk/orte/mca/sensor/file/sensor_file.c > ============================================================================== > --- trunk/orte/mca/sensor/file/sensor_file.c (original) > +++ trunk/orte/mca/sensor/file/sensor_file.c 2011-08-26 18:16:14 EDT (Fri, > 26 Aug 2011) > @@ -70,7 +70,9 @@ > opal_list_item_t super; > orte_jobid_t jobid; > orte_vpid_t vpid; > +#if ORTE_ENABLE_EPOCH > orte_epoch_t epoch; > +#endif > char *file; > int tick; > bool check_size; > > Modified: trunk/orte/mca/snapc/base/snapc_base_fns.c > ============================================================================== > --- trunk/orte/mca/snapc/base/snapc_base_fns.c (original) > +++ trunk/orte/mca/snapc/base/snapc_base_fns.c 2011-08-26 18:16:14 EDT > (Fri, 26 Aug 2011) > @@ -81,7 +81,7 @@ > { > snapshot->process_name.jobid = 0; > snapshot->process_name.vpid = 0; > - snapshot->process_name.epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(snapshot->process_name.epoch,ORTE_EPOCH_MIN); > > snapshot->state = 
ORTE_SNAPC_CKPT_STATE_NONE; > > @@ -92,7 +92,7 @@ > { > snapshot->process_name.jobid = 0; > snapshot->process_name.vpid = 0; > - snapshot->process_name.epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(snapshot->process_name.epoch,ORTE_EPOCH_MIN); > > snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE; > > > Modified: trunk/orte/mca/snapc/full/snapc_full_global.c > ============================================================================== > --- trunk/orte/mca/snapc/full/snapc_full_global.c (original) > +++ trunk/orte/mca/snapc/full/snapc_full_global.c 2011-08-26 18:16:14 EDT > (Fri, 26 Aug 2011) > @@ -427,7 +427,7 @@ > new_proc = OBJ_NEW(orte_proc_t); > new_proc->name.jobid = proc->name.jobid; > new_proc->name.vpid = proc->name.vpid; > - new_proc->name.epoch = proc->name.epoch; > + ORTE_EPOCH_SET(new_proc->name.epoch,proc->name.epoch); > new_proc->node = OBJ_NEW(orte_node_t); > new_proc->node->name = proc->node->name; > opal_list_append(migrating_procs, &new_proc->super); > @@ -618,7 +618,7 @@ > > orted_snapshot->process_name.jobid = cur_node->daemon->name.jobid; > orted_snapshot->process_name.vpid = cur_node->daemon->name.vpid; > - orted_snapshot->process_name.epoch = cur_node->daemon->name.epoch; > + > ORTE_EPOCH_SET(orted_snapshot->process_name.epoch,cur_node->daemon->name.epoch); > > mask = ORTE_NS_CMP_JOBID; > > @@ -636,7 +636,7 @@ > > app_snapshot->process_name.jobid = procs[p]->name.jobid; > app_snapshot->process_name.vpid = procs[p]->name.vpid; > - app_snapshot->process_name.epoch = procs[p]->name.epoch; > + > ORTE_EPOCH_SET(app_snapshot->process_name.epoch,procs[p]->name.epoch); > > opal_list_append(&(orted_snapshot->super.local_snapshots), > &(app_snapshot->super)); > } > @@ -800,7 +800,7 @@ > > app_snapshot->process_name.jobid = procs[p]->name.jobid; > app_snapshot->process_name.vpid = procs[p]->name.vpid; > - app_snapshot->process_name.epoch = procs[p]->name.epoch; > + > ORTE_EPOCH_SET(app_snapshot->process_name.epoch,procs[p]->name.epoch); > > 
opal_list_append(&(orted_snapshot->super.local_snapshots), > &(app_snapshot->super)); > } > @@ -816,7 +816,7 @@ > > orted_snapshot->process_name.jobid = cur_node->daemon->name.jobid; > orted_snapshot->process_name.vpid = cur_node->daemon->name.vpid; > - orted_snapshot->process_name.epoch = cur_node->daemon->name.epoch; > + > ORTE_EPOCH_SET(orted_snapshot->process_name.epoch,cur_node->daemon->name.epoch); > > mask = ORTE_NS_CMP_ALL; > > @@ -837,7 +837,7 @@ > > app_snapshot->process_name.jobid = procs[p]->name.jobid; > app_snapshot->process_name.vpid = procs[p]->name.vpid; > - app_snapshot->process_name.epoch = procs[p]->name.epoch; > + > ORTE_EPOCH_SET(app_snapshot->process_name.epoch,procs[p]->name.epoch); > > opal_list_append(&(orted_snapshot->super.local_snapshots), > &(app_snapshot->super)); > } > > Modified: trunk/orte/mca/snapc/full/snapc_full_local.c > ============================================================================== > --- trunk/orte/mca/snapc/full/snapc_full_local.c (original) > +++ trunk/orte/mca/snapc/full/snapc_full_local.c 2011-08-26 18:16:14 EDT > (Fri, 26 Aug 2011) > @@ -2033,7 +2033,7 @@ > vpid_snapshot->process_pid = child->pid; > vpid_snapshot->super.process_name.jobid = child->name->jobid; > vpid_snapshot->super.process_name.vpid = child->name->vpid; > - vpid_snapshot->super.process_name.epoch = child->name->epoch; > + > ORTE_EPOCH_SET(vpid_snapshot->super.process_name.epoch,child->name->epoch); > } > } > > @@ -2095,7 +2095,7 @@ > vpid_snapshot->process_pid = child->pid; > vpid_snapshot->super.process_name.jobid = child->name->jobid; > vpid_snapshot->super.process_name.vpid = child->name->vpid; > - vpid_snapshot->super.process_name.epoch = child->name->epoch; > + > ORTE_EPOCH_SET(vpid_snapshot->super.process_name.epoch,child->name->epoch); > /*vpid_snapshot->migrating = true;*/ > > opal_list_append(&(local_global_snapshot.local_snapshots), > &(vpid_snapshot->super.super)); > @@ -2111,7 +2111,7 @@ > vpid_snapshot->process_pid = 
child->pid; > vpid_snapshot->super.process_name.jobid = child->name->jobid; > vpid_snapshot->super.process_name.vpid = child->name->vpid; > - vpid_snapshot->super.process_name.epoch = child->name->epoch; > + > ORTE_EPOCH_SET(vpid_snapshot->super.process_name.epoch,child->name->epoch); > } > } > > > Modified: trunk/orte/mca/snapc/full/snapc_full_module.c > ============================================================================== > --- trunk/orte/mca/snapc/full/snapc_full_module.c (original) > +++ trunk/orte/mca/snapc/full/snapc_full_module.c 2011-08-26 18:16:14 EDT > (Fri, 26 Aug 2011) > @@ -83,7 +83,7 @@ > void orte_snapc_full_orted_construct(orte_snapc_full_orted_snapshot_t > *snapshot) { > snapshot->process_name.jobid = 0; > snapshot->process_name.vpid = 0; > - snapshot->process_name.epoch = 0; > + ORTE_EPOCH_SET(snapshot->process_name.epoch,0); > > snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE; > } > @@ -91,7 +91,7 @@ > void orte_snapc_full_orted_destruct( orte_snapc_full_orted_snapshot_t > *snapshot) { > snapshot->process_name.jobid = 0; > snapshot->process_name.vpid = 0; > - snapshot->process_name.epoch = 0; > + ORTE_EPOCH_SET(snapshot->process_name.epoch,0); > > snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE; > } > > Modified: trunk/orte/mca/sstore/base/sstore_base_fns.c > ============================================================================== > --- trunk/orte/mca/sstore/base/sstore_base_fns.c (original) > +++ trunk/orte/mca/sstore/base/sstore_base_fns.c 2011-08-26 18:16:14 EDT > (Fri, 26 Aug 2011) > @@ -62,7 +62,7 @@ > { > snapshot->process_name.jobid = 0; > snapshot->process_name.vpid = 0; > - snapshot->process_name.epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(snapshot->process_name.epoch,ORTE_EPOCH_MIN); > > snapshot->crs_comp = NULL; > snapshot->compress_comp = NULL; > @@ -76,7 +76,7 @@ > { > snapshot->process_name.jobid = 0; > snapshot->process_name.vpid = 0; > - snapshot->process_name.epoch = ORTE_EPOCH_MIN; > + 
ORTE_EPOCH_SET(snapshot->process_name.epoch,ORTE_EPOCH_MIN); > > if( NULL != snapshot->crs_comp ) { > free(snapshot->crs_comp); > @@ -637,7 +637,7 @@ > > vpid_snapshot->process_name.jobid = proc.jobid; > vpid_snapshot->process_name.vpid = proc.vpid; > - vpid_snapshot->process_name.epoch = proc.epoch; > + ORTE_EPOCH_SET(vpid_snapshot->process_name.epoch,proc.epoch); > } > else if(0 == strncmp(token, SSTORE_METADATA_LOCAL_CRS_COMP_STR, > strlen(SSTORE_METADATA_LOCAL_CRS_COMP_STR))) { > vpid_snapshot->crs_comp = strdup(value); > > Modified: trunk/orte/mca/sstore/central/sstore_central_global.c > ============================================================================== > --- trunk/orte/mca/sstore/central/sstore_central_global.c (original) > +++ trunk/orte/mca/sstore/central/sstore_central_global.c 2011-08-26 > 18:16:14 EDT (Fri, 26 Aug 2011) > @@ -1216,8 +1216,7 @@ > > vpid_snapshot->process_name.jobid = handle_info->jobid; > vpid_snapshot->process_name.vpid = i; > - vpid_snapshot->process_name.epoch = ORTE_EPOCH_INVALID; > - vpid_snapshot->process_name.epoch = > orte_ess.proc_get_epoch(&vpid_snapshot->process_name); > + > ORTE_EPOCH_SET(vpid_snapshot->process_name.epoch,orte_ess.proc_get_epoch(&vpid_snapshot->process_name)); > > vpid_snapshot->crs_comp = NULL; > global_snapshot->start_time = NULL; > > Modified: trunk/orte/mca/sstore/central/sstore_central_local.c > ============================================================================== > --- trunk/orte/mca/sstore/central/sstore_central_local.c (original) > +++ trunk/orte/mca/sstore/central/sstore_central_local.c 2011-08-26 > 18:16:14 EDT (Fri, 26 Aug 2011) > @@ -210,7 +210,7 @@ > { > info->name.jobid = ORTE_JOBID_INVALID; > info->name.vpid = ORTE_VPID_INVALID; > - info->name.epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(info->name.epoch,ORTE_EPOCH_MIN); > > info->local_location = NULL; > info->metadata_filename = NULL; > @@ -222,7 +222,7 @@ > { > info->name.jobid = ORTE_JOBID_INVALID; > info->name.vpid = 
ORTE_VPID_INVALID; > - info->name.epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(info->name.epoch,ORTE_EPOCH_MIN); > > if( NULL != info->local_location ) { > free(info->local_location); > @@ -535,7 +535,7 @@ > > app_info->name.jobid = name->jobid; > app_info->name.vpid = name->vpid; > - app_info->name.epoch = name->epoch; > + ORTE_EPOCH_SET(app_info->name.epoch,name->epoch); > > opal_list_append(handle_info->app_info_handle, &(app_info->super)); > > > Modified: trunk/orte/mca/sstore/stage/sstore_stage_global.c > ============================================================================== > --- trunk/orte/mca/sstore/stage/sstore_stage_global.c (original) > +++ trunk/orte/mca/sstore/stage/sstore_stage_global.c 2011-08-26 18:16:14 EDT > (Fri, 26 Aug 2011) > @@ -1218,10 +1218,10 @@ > p_set = OBJ_NEW(orte_filem_base_process_set_t); > p_set->source.jobid = peer->jobid; > p_set->source.vpid = peer->vpid; > - p_set->source.epoch = peer->epoch; > + ORTE_EPOCH_SET(p_set->source.epoch,peer->epoch); > p_set->sink.jobid = ORTE_PROC_MY_NAME->jobid; > p_set->sink.vpid = ORTE_PROC_MY_NAME->vpid; > - p_set->sink.epoch = ORTE_PROC_MY_NAME->epoch; > + ORTE_EPOCH_SET(p_set->sink.epoch,ORTE_PROC_MY_NAME->epoch); > opal_list_append(&(filem_request->process_sets), &(p_set->super) ); > } > > @@ -1706,8 +1706,7 @@ > > vpid_snapshot->process_name.jobid = handle_info->jobid; > vpid_snapshot->process_name.vpid = i; > - vpid_snapshot->process_name.epoch = ORTE_EPOCH_INVALID; > - vpid_snapshot->process_name.epoch = > orte_ess.proc_get_epoch(&vpid_snapshot->process_name); > + > ORTE_EPOCH_SET(vpid_snapshot->process_name.epoch,orte_ess.proc_get_epoch(&vpid_snapshot->process_name)); > > /* JJH: Currently we do not have this information since we do not save > * individual vpid info in the Global SStore. 
It is in the metadata > > Modified: trunk/orte/mca/sstore/stage/sstore_stage_local.c > ============================================================================== > --- trunk/orte/mca/sstore/stage/sstore_stage_local.c (original) > +++ trunk/orte/mca/sstore/stage/sstore_stage_local.c 2011-08-26 18:16:14 EDT > (Fri, 26 Aug 2011) > @@ -287,7 +287,7 @@ > { > info->name.jobid = ORTE_JOBID_INVALID; > info->name.vpid = ORTE_VPID_INVALID; > - info->name.epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(info->name.epoch,ORTE_EPOCH_MIN); > > info->local_location = NULL; > info->compressed_local_location = NULL; > @@ -302,7 +302,7 @@ > { > info->name.jobid = ORTE_JOBID_INVALID; > info->name.vpid = ORTE_VPID_INVALID; > - info->name.epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(info->name.epoch,ORTE_EPOCH_MIN); > > if( NULL != info->local_location ) { > free(info->local_location); > @@ -1014,7 +1014,7 @@ > > app_info->name.jobid = name->jobid; > app_info->name.vpid = name->vpid; > - app_info->name.epoch = name->epoch; > + ORTE_EPOCH_SET(app_info->name.epoch,name->epoch); > > opal_list_append(handle_info->app_info_handle, &(app_info->super)); > > @@ -2057,17 +2057,17 @@ > /* if I am the HNP, then use me as the source */ > p_set->source.jobid = ORTE_PROC_MY_NAME->jobid; > p_set->source.vpid = ORTE_PROC_MY_NAME->vpid; > - p_set->source.epoch = ORTE_PROC_MY_NAME->epoch; > + ORTE_EPOCH_SET(p_set->source.epoch,ORTE_PROC_MY_NAME->epoch); > } > else { > /* otherwise, set the HNP as the source */ > p_set->source.jobid = ORTE_PROC_MY_HNP->jobid; > p_set->source.vpid = ORTE_PROC_MY_HNP->vpid; > - p_set->source.epoch = ORTE_PROC_MY_HNP->epoch; > + ORTE_EPOCH_SET(p_set->source.epoch,ORTE_PROC_MY_HNP->epoch); > } > p_set->sink.jobid = ORTE_PROC_MY_NAME->jobid; > p_set->sink.vpid = ORTE_PROC_MY_NAME->vpid; > - p_set->sink.epoch = ORTE_PROC_MY_NAME->epoch; > + ORTE_EPOCH_SET(p_set->sink.epoch,ORTE_PROC_MY_NAME->epoch); > opal_list_append(&(filem_request->process_sets), &(p_set->super) ); > > /* 
Define the file set */ > > Modified: trunk/orte/orted/orted_comm.c > ============================================================================== > --- trunk/orte/orted/orted_comm.c (original) > +++ trunk/orte/orted/orted_comm.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug > 2011) > @@ -123,18 +123,13 @@ > nm = (orte_routed_tree_t*)item; > > target.vpid = nm->vpid; > - target.epoch = orte_util_lookup_epoch(&target); > + ORTE_EPOCH_SET(target.epoch,orte_ess.proc_get_epoch(&target)); > > - if (!orte_util_proc_is_running(&target)) { > + if (!PROC_IS_RUNNING(&target)) { > continue; > } > > - target.epoch = ORTE_EPOCH_INVALID; > - if (ORTE_NODE_RANK_INVALID == (target.epoch = > orte_ess.proc_get_epoch(&target))) { > - /* If we are trying to send to a previously failed process it's > - * better to fail silently. */ > - continue; > - } > + ORTE_EPOCH_SET(target.epoch,orte_ess.proc_get_epoch(&target)); > > OPAL_OUTPUT_VERBOSE((1, orte_debug_output, > "%s orte:daemon:send_relay sending relay msg to > %s", > @@ -422,7 +417,8 @@ > proct = OBJ_NEW(orte_proc_t); > proct->name.jobid = proc.jobid; > proct->name.vpid = proc.vpid; > - proct->name.epoch = proc.epoch; > + ORTE_EPOCH_SET(proct->name.epoch,proc.epoch); > + > opal_pointer_array_add(&procarray, proct); > num_replies++; > } > @@ -1059,7 +1055,9 @@ > orte_job_t *jdata; > orte_proc_t *proc; > orte_vpid_t vpid; > +#if ORTE_ENABLE_EPOCH > orte_epoch_t epoch; > +#endif > int32_t i, num_procs; > > /* setup the answer */ > @@ -1086,12 +1084,14 @@ > goto CLEANUP; > } > > +#if ORTE_ENABLE_EPOCH > /* unpack the epoch */ > n = 1; > if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &epoch, &n, > ORTE_EPOCH))) { > ORTE_ERROR_LOG(ret); > goto CLEANUP; > } > +#endif > > /* if they asked for a specific proc, then just get that info */ > if (ORTE_VPID_WILDCARD != vpid) { > @@ -1201,7 +1201,7 @@ > /* loop across all daemons */ > proc2.jobid = ORTE_PROC_MY_NAME->jobid; > for (proc2.vpid=1; proc2.vpid < > orte_process_info.num_procs; 
proc2.vpid++) { > - proc2.epoch = orte_util_lookup_epoch(&proc2); > + > ORTE_EPOCH_SET(proc2.epoch,orte_util_lookup_epoch(&proc2)); > > /* setup the cmd */ > relay_msg = OBJ_NEW(opal_buffer_t); > > Modified: trunk/orte/orted/orted_main.c > ============================================================================== > --- trunk/orte/orted/orted_main.c (original) > +++ trunk/orte/orted/orted_main.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug > 2011) > @@ -388,14 +388,14 @@ > orte_process_info.my_daemon_uri = orte_rml.get_contact_info(); > ORTE_PROC_MY_DAEMON->jobid = ORTE_PROC_MY_NAME->jobid; > ORTE_PROC_MY_DAEMON->vpid = ORTE_PROC_MY_NAME->vpid; > - ORTE_PROC_MY_DAEMON->epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(ORTE_PROC_MY_DAEMON->epoch,ORTE_EPOCH_MIN); > > /* if I am also the hnp, then update that contact info field too */ > if (ORTE_PROC_IS_HNP) { > orte_process_info.my_hnp_uri = orte_rml.get_contact_info(); > ORTE_PROC_MY_HNP->jobid = ORTE_PROC_MY_NAME->jobid; > ORTE_PROC_MY_HNP->vpid = ORTE_PROC_MY_NAME->vpid; > - ORTE_PROC_MY_HNP->epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(ORTE_PROC_MY_HNP->epoch,ORTE_EPOCH_MIN); > } > > /* setup the primary daemon command receive function */ > @@ -495,7 +495,8 @@ > proc = OBJ_NEW(orte_proc_t); > proc->name.jobid = jdata->jobid; > proc->name.vpid = 0; > - proc->name.epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN); > + > proc->state = ORTE_PROC_STATE_RUNNING; > proc->app_idx = 0; > proc->node = nodes[0]; /* hnp node must be there */ > > Modified: trunk/orte/runtime/data_type_support/orte_dt_compare_fns.c > ============================================================================== > --- trunk/orte/runtime/data_type_support/orte_dt_compare_fns.c > (original) > +++ trunk/orte/runtime/data_type_support/orte_dt_compare_fns.c > 2011-08-26 18:16:14 EDT (Fri, 26 Aug 2011) > @@ -76,6 +76,7 @@ > } > } > > +#if ORTE_ENABLE_EPOCH > /** check the epochs - if one of them is WILDCARD, then ignore > * this field 
since anything is okay > */ > @@ -87,6 +88,7 @@ > return OPAL_VALUE1_GREATER; > } > } > +#endif > > /** only way to get here is if all fields are equal or WILDCARD */ > return OPAL_EQUAL; > @@ -122,6 +124,7 @@ > return OPAL_EQUAL; > } > > +#if ORTE_ENABLE_EPOCH > int orte_dt_compare_epoch(orte_epoch_t *value1, > orte_epoch_t *value2, > opal_data_type_t type) > @@ -136,6 +139,7 @@ > > return OPAL_EQUAL; > } > +#endif > > #if !ORTE_DISABLE_FULL_SUPPORT > /** > > Modified: trunk/orte/runtime/data_type_support/orte_dt_copy_fns.c > ============================================================================== > --- trunk/orte/runtime/data_type_support/orte_dt_copy_fns.c (original) > +++ trunk/orte/runtime/data_type_support/orte_dt_copy_fns.c 2011-08-26 > 18:16:14 EDT (Fri, 26 Aug 2011) > @@ -61,7 +61,7 @@ > > val->jobid = src->jobid; > val->vpid = src->vpid; > - val->epoch = src->epoch; > + ORTE_EPOCH_SET(val->epoch,src->epoch); > > *dest = val; > return ORTE_SUCCESS; > @@ -105,6 +105,7 @@ > return ORTE_SUCCESS; > } > > +#if ORTE_ENABLE_EPOCH > /* > * EPOCH > */ > @@ -123,6 +124,7 @@ > > return ORTE_SUCCESS; > } > +#endif > > #if !ORTE_DISABLE_FULL_SUPPORT > > > Modified: trunk/orte/runtime/data_type_support/orte_dt_packing_fns.c > ============================================================================== > --- trunk/orte/runtime/data_type_support/orte_dt_packing_fns.c > (original) > +++ trunk/orte/runtime/data_type_support/orte_dt_packing_fns.c > 2011-08-26 18:16:14 EDT (Fri, 26 Aug 2011) > @@ -58,7 +58,9 @@ > orte_process_name_t* proc; > orte_jobid_t *jobid; > orte_vpid_t *vpid; > +#if ORTE_ENABLE_EPOCH > orte_epoch_t *epoch; > +#endif > > /* collect all the jobids in a contiguous array */ > jobid = (orte_jobid_t*)malloc(num_vals * sizeof(orte_jobid_t)); > @@ -100,6 +102,7 @@ > } > free(vpid); > > +#if ORTE_ENABLE_EPOCH > /* Collect all the epochs in a contiguous array */ > epoch = (orte_epoch_t *) malloc(num_vals * sizeof(orte_epoch_t)); > if (NULL == epoch) { > 
@@ -118,6 +121,7 @@ > return rc; > } > free(epoch); > +#endif > > return ORTE_SUCCESS; > } > @@ -156,6 +160,7 @@ > return ret; > } > > +#if ORTE_ENABLE_EPOCH > /* > * EPOCH > */ > @@ -171,6 +176,7 @@ > > return ret; > } > +#endif > > #if !ORTE_DISABLE_FULL_SUPPORT > /* > > Modified: trunk/orte/runtime/data_type_support/orte_dt_print_fns.c > ============================================================================== > --- trunk/orte/runtime/data_type_support/orte_dt_print_fns.c (original) > +++ trunk/orte/runtime/data_type_support/orte_dt_print_fns.c 2011-08-26 > 18:16:14 EDT (Fri, 26 Aug 2011) > @@ -125,8 +125,10 @@ > orte_dt_quick_print(output, "ORTE_STD_CNTR", prefix, src, > ORTE_STD_CNTR_T); > break; > > +#if ORTE_ENABLE_EPOCH > case ORTE_EPOCH: > orte_dt_quick_print(output, "ORTE_EPOCH", prefix, src, > ORTE_EPOCH_T); > +#endif > > case ORTE_VPID: > orte_dt_quick_print(output, "ORTE_VPID", prefix, src, > ORTE_VPID_T); > @@ -478,11 +480,21 @@ > if (orte_xml_output) { > /* need to create the output in XML format */ > if (0 == src->pid) { > +#if ORTE_ENABLE_EPOCH > asprintf(output, "%s<process rank=\"%s\" status=\"%s\" > epoch=\"%s\"/>\n", pfx2, > ORTE_VPID_PRINT(src->name.vpid), > orte_proc_state_to_str(src->state), ORTE_EPOCH_PRINT(src->name.epoch)); > +#else > + asprintf(output, "%s<process rank=\"%s\" status=\"%s\"/>\n", > pfx2, > + ORTE_VPID_PRINT(src->name.vpid), > orte_proc_state_to_str(src->state)); > +#endif > } else { > +#if ORTE_ENABLE_EPOCH > asprintf(output, "%s<process rank=\"%s\" pid=\"%d\" status=\"%s\" > epoch=\"%s\"/>\n", pfx2, > ORTE_VPID_PRINT(src->name.vpid), (int)src->pid, > orte_proc_state_to_str(src->state), ORTE_EPOCH_PRINT(src->name.epoch)); > +#else > + asprintf(output, "%s<process rank=\"%s\" pid=\"%d\" > status=\"%s\"/>\n", pfx2, > + ORTE_VPID_PRINT(src->name.vpid), (int)src->pid, > orte_proc_state_to_str(src->state)); > +#endif > } > free(pfx2); > return ORTE_SUCCESS; > @@ -490,10 +502,17 @@ > > if (!orte_devel_level_output) { > /* 
just print a very simple output for users */ > +#if ORTE_ENABLE_EPOCH > asprintf(&tmp, "\n%sProcess OMPI jobid: %s Process rank: %s Epoch: > %s", pfx2, > ORTE_JOBID_PRINT(src->name.jobid), > ORTE_VPID_PRINT(src->name.vpid), > ORTE_EPOCH_PRINT(src->name.epoch)); > +#else > + asprintf(&tmp, "\n%sProcess OMPI jobid: %s Process rank: %s Epoch: > %s", pfx2, > + ORTE_JOBID_PRINT(src->name.jobid), > + ORTE_VPID_PRINT(src->name.vpid)); > +#endif > + > /* set the return */ > *output = tmp; > free(pfx2); > > Modified: trunk/orte/runtime/data_type_support/orte_dt_size_fns.c > ============================================================================== > --- trunk/orte/runtime/data_type_support/orte_dt_size_fns.c (original) > +++ trunk/orte/runtime/data_type_support/orte_dt_size_fns.c 2011-08-26 > 18:16:14 EDT (Fri, 26 Aug 2011) > @@ -45,9 +45,11 @@ > *size = sizeof(orte_std_cntr_t); > break; > > +#if ORTE_ENABLE_EPOCH > case ORTE_EPOCH: > *size = sizeof(orte_epoch_t); > break; > +#endif > > case ORTE_VPID: > *size = sizeof(orte_vpid_t); > > Modified: trunk/orte/runtime/data_type_support/orte_dt_support.h > ============================================================================== > --- trunk/orte/runtime/data_type_support/orte_dt_support.h (original) > +++ trunk/orte/runtime/data_type_support/orte_dt_support.h 2011-08-26 > 18:16:14 EDT (Fri, 26 Aug 2011) > @@ -52,9 +52,14 @@ > int orte_dt_compare_vpid(orte_vpid_t *value1, > orte_vpid_t *value2, > opal_data_type_t type); > +#if ORTE_ENABLE_EPOCH > int orte_dt_compare_epoch(orte_epoch_t *value1, > orte_epoch_t *value2, > opal_data_type_t type); > +#define ORTE_EPOCH_CMP(n,m) ( (m) - (n) ) > +#else > +#define ORTE_EPOCH_CMP(n,m) ( 0 ) > +#endif > #if !ORTE_DISABLE_FULL_SUPPORT > int orte_dt_compare_job(orte_job_t *value1, orte_job_t *value2, > opal_data_type_t type); > int orte_dt_compare_node(orte_node_t *value1, orte_node_t *value2, > opal_data_type_t type); > @@ -86,7 +91,9 @@ > int orte_dt_copy_name(orte_process_name_t 
**dest, orte_process_name_t *src, > opal_data_type_t type); > int orte_dt_copy_jobid(orte_jobid_t **dest, orte_jobid_t *src, > opal_data_type_t type); > int orte_dt_copy_vpid(orte_vpid_t **dest, orte_vpid_t *src, opal_data_type_t > type); > +#if ORTE_ENABLE_EPOCH > int orte_dt_copy_epoch(orte_epoch_t **dest, orte_epoch_t *src, > opal_data_type_t type); > +#endif > #if !ORTE_DISABLE_FULL_SUPPORT > int orte_dt_copy_job(orte_job_t **dest, orte_job_t *src, opal_data_type_t > type); > int orte_dt_copy_node(orte_node_t **dest, orte_node_t *src, opal_data_type_t > type); > @@ -116,8 +123,10 @@ > int32_t num_vals, opal_data_type_t type); > int orte_dt_pack_vpid(opal_buffer_t *buffer, const void *src, > int32_t num_vals, opal_data_type_t type); > +#if ORTE_ENABLE_EPOCH > int orte_dt_pack_epoch(opal_buffer_t *buffer, const void *src, > int32_t num_vals, opal_data_type_t type); > +#endif > #if !ORTE_DISABLE_FULL_SUPPORT > int orte_dt_pack_job(opal_buffer_t *buffer, const void *src, > int32_t num_vals, opal_data_type_t type); > @@ -185,8 +194,10 @@ > int32_t *num_vals, opal_data_type_t type); > int orte_dt_unpack_vpid(opal_buffer_t *buffer, void *dest, > int32_t *num_vals, opal_data_type_t type); > +#if ORTE_ENABLE_EPOCH > int orte_dt_unpack_epoch(opal_buffer_t *buffer, void *dest, > int32_t *num_vals, opal_data_type_t type); > +#endif > #if !ORTE_DISABLE_FULL_SUPPORT > int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest, > int32_t *num_vals, opal_data_type_t type); > > Modified: trunk/orte/runtime/data_type_support/orte_dt_unpacking_fns.c > ============================================================================== > --- trunk/orte/runtime/data_type_support/orte_dt_unpacking_fns.c > (original) > +++ trunk/orte/runtime/data_type_support/orte_dt_unpacking_fns.c > 2011-08-26 18:16:14 EDT (Fri, 26 Aug 2011) > @@ -54,7 +54,9 @@ > orte_process_name_t* proc; > orte_jobid_t *jobid; > orte_vpid_t *vpid; > +#if ORTE_ENABLE_EPOCH > orte_epoch_t *epoch; > +#endif > > num = 
*num_vals; > > @@ -92,6 +94,7 @@ > return rc; > } > > +#if ORTE_ENABLE_EPOCH > /* collect all the epochs in a contiguous array */ > epoch= (orte_epoch_t*)malloc(num * sizeof(orte_epoch_t)); > if (NULL == epoch) { > @@ -109,18 +112,21 @@ > free(jobid); > return rc; > } > +#endif > > /* build the names from the jobid/vpid/epoch arrays */ > proc = (orte_process_name_t*)dest; > for (i=0; i < num; i++) { > proc->jobid = jobid[i]; > proc->vpid = vpid[i]; > - proc->epoch = epoch[i]; > + ORTE_EPOCH_SET(proc->epoch,epoch[i]); > proc++; > } > > /* cleanup */ > +#if ORTE_ENABLE_EPOCH > free(epoch); > +#endif > free(vpid); > free(jobid); > > @@ -159,6 +165,7 @@ > return ret; > } > > +#if ORTE_ENABLE_EPOCH > /* > * EPOCH > */ > @@ -174,6 +181,7 @@ > > return ret; > } > +#endif > > #if !ORTE_DISABLE_FULL_SUPPORT > /* > > Modified: trunk/orte/runtime/orte_data_server.c > ============================================================================== > --- trunk/orte/runtime/orte_data_server.c (original) > +++ trunk/orte/runtime/orte_data_server.c 2011-08-26 18:16:14 EDT (Fri, > 26 Aug 2011) > @@ -220,7 +220,7 @@ > data->port = port_name; > data->owner.jobid = sender->jobid; > data->owner.vpid = sender->vpid; > - data->owner.epoch = sender->epoch; > + ORTE_EPOCH_SET(data->owner.epoch,sender->epoch); > > /* store the data */ > data->index = opal_pointer_array_add(orte_data_server_store, > data); > > Modified: trunk/orte/runtime/orte_globals.c > ============================================================================== > --- trunk/orte/runtime/orte_globals.c (original) > +++ trunk/orte/runtime/orte_globals.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug > 2011) > @@ -277,6 +277,7 @@ > return rc; > } > > +#if ORTE_ENABLE_EPOCH > tmp = ORTE_EPOCH; > if (ORTE_SUCCESS != (rc = opal_dss.register_type(orte_dt_pack_epoch, > orte_dt_unpack_epoch, > @@ -290,6 +291,7 @@ > ORTE_ERROR_LOG(rc); > return rc; > } > +#endif > > #if !ORTE_DISABLE_FULL_SUPPORT > tmp = ORTE_JOB; > @@ -933,7 +935,7 @@ > 
proc->beat = 0; > OBJ_CONSTRUCT(&proc->stats, opal_ring_buffer_t); > opal_ring_buffer_init(&proc->stats, orte_stat_history_size); > - proc->name.epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN); > #if OPAL_ENABLE_FT_CR == 1 > proc->ckpt_state = 0; > proc->ckpt_snapshot_ref = NULL; > > Modified: trunk/orte/runtime/orte_init.c > ============================================================================== > --- trunk/orte/runtime/orte_init.c (original) > +++ trunk/orte/runtime/orte_init.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug > 2011) > @@ -57,8 +57,17 @@ > char *orte_prohibited_session_dirs = NULL; > bool orte_create_session_dirs = true; > > +#if ORTE_ENABLE_EPOCH > +orte_process_name_t orte_name_wildcard = {ORTE_JOBID_WILDCARD, > ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD}; > +#else > orte_process_name_t orte_name_wildcard = {ORTE_JOBID_WILDCARD, > ORTE_VPID_WILDCARD}; > +#endif > + > +#if ORTE_ENABLE_EPOCH > +orte_process_name_t orte_name_invalid = {ORTE_JOBID_INVALID, > ORTE_VPID_INVALID, ORTE_EPOCH_INVALID}; > +#else > orte_process_name_t orte_name_invalid = {ORTE_JOBID_INVALID, > ORTE_VPID_INVALID}; > +#endif > > > #if OPAL_CC_USE_PRAGMA_IDENT > > Modified: trunk/orte/runtime/orte_wait.h > ============================================================================== > --- trunk/orte/runtime/orte_wait.h (original) > +++ trunk/orte/runtime/orte_wait.h 2011-08-26 18:16:14 EDT (Fri, 26 Aug > 2011) > @@ -204,7 +204,7 @@ > mev = OBJ_NEW(orte_message_event_t); \ > mev->sender.jobid = (sndr)->jobid; \ > mev->sender.vpid = (sndr)->vpid; \ > - mev->sender.epoch = (sndr)->epoch; \ > + ORTE_EPOCH_SET(mev->sender.epoch,(sndr)->epoch); \ > opal_dss.copy_payload(mev->buffer, (buf)); \ > mev->tag = (tg); \ > mev->file = strdup((buf)->parent.cls_init_file_name); \ > @@ -228,7 +228,7 @@ > mev = OBJ_NEW(orte_message_event_t); \ > mev->sender.jobid = (sndr)->jobid; \ > mev->sender.vpid = (sndr)->vpid; \ > - mev->sender.epoch = (sndr)->epoch; \ > + 
ORTE_EPOCH_SET(mev->sender.epoch,(sndr)->epoch); \ > opal_dss.copy_payload(mev->buffer, (buf)); \ > mev->tag = (tg); \ > opal_event_evtimer_set(opal_event_base, \ > @@ -258,7 +258,7 @@ > tmp = OBJ_NEW(orte_notify_event_t); \ > tmp->proc.jobid = (data)->jobid; \ > tmp->proc.vpid = (data)->vpid; \ > - tmp->proc.epoch = (data)->epoch; \ > + ORTE_EPOCH_SET(tmp->proc.epoch,(data)->epoch); \ > opal_event.evtimer_set(opal_event_base, \ > tmp->ev, (cbfunc), tmp); \ > now.tv_sec = 0; \ > > Modified: trunk/orte/test/system/oob_stress.c > ============================================================================== > --- trunk/orte/test/system/oob_stress.c (original) > +++ trunk/orte/test/system/oob_stress.c 2011-08-26 18:16:14 EDT (Fri, > 26 Aug 2011) > @@ -74,8 +74,7 @@ > > for (j=1; j < count+1; j++) { > peer.vpid = (ORTE_PROC_MY_NAME->vpid + j) % > orte_process_info.num_procs; > - peer.epoch = ORTE_EPOCH_INVALID; > - peer.epoch = orte_ess.proc_get_epoch(&peer); > + ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); > > /* rank0 starts ring */ > if (ORTE_PROC_MY_NAME->vpid == 0) { > > Modified: trunk/orte/test/system/orte_ring.c > ============================================================================== > --- trunk/orte/test/system/orte_ring.c (original) > +++ trunk/orte/test/system/orte_ring.c 2011-08-26 18:16:14 EDT (Fri, > 26 Aug 2011) > @@ -41,16 +41,14 @@ > if( right_peer_orte_name.vpid >= num_peers ) { > right_peer_orte_name.vpid = 0; > } > - right_peer_orte_name.epoch = ORTE_EPOCH_INVALID; > - right_peer_orte_name.epoch = > orte_ess.proc_get_epoch(&right_peer_orte_name); > + > ORTE_EPOCH_SET(right_peer_orte_name.epoch,orte_ess.proc_get_epoch(&right_peer_orte_name)); > > left_peer_orte_name.jobid = ORTE_PROC_MY_NAME->jobid; > left_peer_orte_name.vpid = ORTE_PROC_MY_NAME->vpid - 1; > if( ORTE_PROC_MY_NAME->vpid == 0 ) { > left_peer_orte_name.vpid = num_peers - 1; > } > - left_peer_orte_name.epoch = ORTE_EPOCH_INVALID; > - left_peer_orte_name.epoch = > 
orte_ess.proc_get_epoch(&left_peer_orte_name); > + > ORTE_EPOCH_SET(left_peer_orte_name.epoch,orte_ess.proc_get_epoch(&left_peer_orte_name)); > > printf("My name is: %s -- PID %d\tMy Left Peer is %s\tMy Right Peer is > %s\n", > ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), getpid(), > > Modified: trunk/orte/test/system/orte_spawn.c > ============================================================================== > --- trunk/orte/test/system/orte_spawn.c (original) > +++ trunk/orte/test/system/orte_spawn.c 2011-08-26 18:16:14 EDT (Fri, > 26 Aug 2011) > @@ -74,8 +74,8 @@ > for (i=0; i < app->num_procs; i++) { > name.vpid = i; > > - name.epoch = ORTE_EPOCH_INVALID; > - name.epoch = orte_ess.proc_get_epoch(&name); > + ORTE_EPOCH_SET(name.epoch,orte_ess.proc_get_epoch(&name)); > + > fprintf(stderr, "Parent: sending message to child %s\n", > ORTE_NAME_PRINT(&name)); > if (0 > (rc = orte_rml.send(&name, &msg, 1, MY_TAG, 0))) { > ORTE_ERROR_LOG(rc); > > Modified: trunk/orte/tools/orte-ps/orte-ps.c > ============================================================================== > --- trunk/orte/tools/orte-ps/orte-ps.c (original) > +++ trunk/orte/tools/orte-ps/orte-ps.c 2011-08-26 18:16:14 EDT (Fri, > 26 Aug 2011) > @@ -869,8 +869,14 @@ > } > > /* query the HNP for info on the procs in this job */ > - if (ORTE_SUCCESS != (ret = > orte_util_comm_query_proc_info(&(hnpinfo->hnp->name), job->jobid, > - > ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD, &cnt, &procs))) { > + if (ORTE_SUCCESS != (ret = > orte_util_comm_query_proc_info(&(hnpinfo->hnp->name), > + job->jobid, > + > ORTE_VPID_WILDCARD, > +#if ORTE_ENABLE_EPOCH > + > ORTE_EPOCH_WILDCARD, > +#endif > + &cnt, > + &procs))) { > ORTE_ERROR_LOG(ret); > } > job->procs->addr = (void**)procs; > > Modified: trunk/orte/tools/orte-top/orte-top.c > ============================================================================== > --- trunk/orte/tools/orte-top/orte-top.c (original) > +++ trunk/orte/tools/orte-top/orte-top.c 2011-08-26 18:16:14 EDT 
(Fri, > 26 Aug 2011) > @@ -471,7 +471,7 @@ > if (NULL == ranks) { > /* take all ranks */ > proc.vpid = ORTE_VPID_WILDCARD; > - proc.epoch = ORTE_EPOCH_WILDCARD; > + ORTE_EPOCH_SET(proc.epoch,ORTE_EPOCH_WILDCARD); > if (ORTE_SUCCESS != (ret = opal_dss.pack(&cmdbuf, &proc, 1, > ORTE_NAME))) { > ORTE_ERROR_LOG(ret); > goto cleanup; > > Modified: trunk/orte/util/comm/comm.c > ============================================================================== > --- trunk/orte/util/comm/comm.c (original) > +++ trunk/orte/util/comm/comm.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug > 2011) > @@ -433,8 +433,13 @@ > return ORTE_SUCCESS; > } > > +#if ORTE_ENABLE_EPOCH > int orte_util_comm_query_proc_info(const orte_process_name_t *hnp, > orte_jobid_t job, orte_vpid_t vpid, > orte_epoch_t epoch, int *num_procs, > orte_proc_t ***proc_info_array) > +#else > +int orte_util_comm_query_proc_info(const orte_process_name_t *hnp, > orte_jobid_t job, orte_vpid_t vpid, > + int *num_procs, orte_proc_t > ***proc_info_array) > +#endif > { > int ret; > int32_t cnt, cnt_procs, n; > @@ -463,11 +468,13 @@ > OBJ_RELEASE(cmd); > return ret; > } > +#if ORTE_ENABLE_EPOCH > if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &epoch, 1, ORTE_EPOCH))) { > ORTE_ERROR_LOG(ret); > OBJ_RELEASE(cmd); > return ret; > } > +#endif > /* define a max time to wait for send to complete */ > timer_fired = false; > error_exit = ORTE_SUCCESS; > > Modified: trunk/orte/util/comm/comm.h > ============================================================================== > --- trunk/orte/util/comm/comm.h (original) > +++ trunk/orte/util/comm/comm.h 2011-08-26 18:16:14 EDT (Fri, 26 Aug > 2011) > @@ -52,7 +52,10 @@ > int *num_nodes, orte_node_t > ***node_info_array); > > ORTE_DECLSPEC int orte_util_comm_query_proc_info(const orte_process_name_t > *hnp, orte_jobid_t job, orte_vpid_t vpid, > - orte_epoch_t epoch, int > *num_procs, orte_proc_t ***proc_info_array); > +#if ORTE_ENABLE_EPOCH > + orte_epoch_t epoch, > +#endif > + int *num_procs, 
orte_proc_t > ***proc_info_array); > > ORTE_DECLSPEC int orte_util_comm_spawn_job(const orte_process_name_t *hnp, > orte_job_t *jdata); > > > Modified: trunk/orte/util/hnp_contact.c > ============================================================================== > --- trunk/orte/util/hnp_contact.c (original) > +++ trunk/orte/util/hnp_contact.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug > 2011) > @@ -55,7 +55,8 @@ > { > ptr->name.jobid = ORTE_JOBID_INVALID; > ptr->name.vpid = ORTE_VPID_INVALID; > - ptr->name.epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(ptr->name.epoch,ORTE_EPOCH_MIN); > + > ptr->rml_uri = NULL; > } > static void orte_hnp_contact_destruct(orte_hnp_contact_t *ptr) > > Modified: trunk/orte/util/name_fns.c > ============================================================================== > --- trunk/orte/util/name_fns.c (original) > +++ trunk/orte/util/name_fns.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug > 2011) > @@ -46,7 +46,7 @@ > { > list->name.jobid = ORTE_JOBID_INVALID; > list->name.vpid = ORTE_VPID_INVALID; > - list->name.epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(list->name.epoch,ORTE_EPOCH_MIN); > } > > /* destructor - used to free any resources held by instance */ > @@ -116,7 +116,10 @@ > char* orte_util_print_name_args(const orte_process_name_t *name) > { > orte_print_args_buffers_t *ptr; > - char *job, *vpid, *epoch; > + char *job, *vpid; > +#if ORTE_ENABLE_EPOCH > + char *epoch; > +#endif > > /* protect against NULL names */ > if (NULL == name) { > @@ -141,7 +144,7 @@ > */ > job = orte_util_print_jobids(name->jobid); > vpid = orte_util_print_vpids(name->vpid); > - epoch = orte_util_print_epoch(name->epoch); > + ORTE_EPOCH_SET(epoch,orte_util_print_epoch(name->epoch)); > > /* get the next buffer */ > ptr = get_print_name_buffer(); > @@ -156,9 +159,15 @@ > ptr->cntr = 0; > } > > +#if ORTE_ENABLE_EPOCH > snprintf(ptr->buffers[ptr->cntr++], > ORTE_PRINT_NAME_ARGS_MAX_SIZE, > "[%s,%s,%s]", job, vpid, epoch); > +#else > + snprintf(ptr->buffers[ptr->cntr++], > + 
ORTE_PRINT_NAME_ARGS_MAX_SIZE, > + "[%s,%s]", job, vpid); > +#endif > > return ptr->buffers[ptr->cntr-1]; > } > @@ -282,6 +291,7 @@ > return ptr->buffers[ptr->cntr-1]; > } > > +#if ORTE_ENABLE_EPOCH > char* orte_util_print_epoch(const orte_epoch_t epoch) > { > orte_print_args_buffers_t *ptr; > @@ -309,6 +319,7 @@ > } > return ptr->buffers[ptr->cntr-1]; > } > +#endif > > > > @@ -403,6 +414,7 @@ > return ORTE_SUCCESS; > } > > +#if ORTE_ENABLE_EPOCH > int orte_util_convert_epoch_to_string(char **epoch_string, const orte_epoch_t > epoch) > { > /* check for wildcard value - handle appropriately */ > @@ -425,7 +437,6 @@ > return ORTE_SUCCESS; > } > > - > int orte_util_convert_string_to_epoch(orte_epoch_t *epoch, const char* > epoch_string) > { > if (NULL == epoch_string) { /* got an error */ > @@ -450,6 +461,7 @@ > > return ORTE_SUCCESS; > } > +#endif > > int orte_util_convert_string_to_process_name(orte_process_name_t *name, > const char* name_string) > @@ -457,13 +469,15 @@ > char *temp, *token; > orte_jobid_t job; > orte_vpid_t vpid; > +#if ORTE_ENABLE_EPOCH > orte_epoch_t epoch; > +#endif > int return_code=ORTE_SUCCESS; > - > + > /* set default */ > name->jobid = ORTE_JOBID_INVALID; > name->vpid = ORTE_VPID_INVALID; > - name->epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(name->epoch,ORTE_EPOCH_MIN); > > /* check for NULL string - error */ > if (NULL == name_string) { > @@ -510,6 +524,7 @@ > vpid = strtoul(token, NULL, 10); > } > > +#if ORTE_ENABLE_EPOCH > token = strtok(NULL, ORTE_SCHEMA_DELIMITER_STRING); /** get next field > -> epoch*/ > > /* check for error */ > @@ -528,10 +543,11 @@ > } else { > epoch = strtoul(token, NULL, 10); > } > +#endif > > name->jobid = job; > name->vpid = vpid; > - name->epoch = epoch; > + ORTE_EPOCH_SET(name->epoch,epoch); > > free(temp); > > @@ -568,6 +584,7 @@ > asprintf(&tmp2, "%s%c%lu", tmp, ORTE_SCHEMA_DELIMITER_CHAR, (unsigned > long)name->vpid); > } > > +#if ORTE_ENABLE_EPOCH > if (ORTE_EPOCH_WILDCARD == name->epoch) { > 
asprintf(name_string, "%s%c%s", tmp2, ORTE_SCHEMA_DELIMITER_CHAR, > ORTE_SCHEMA_WILDCARD_STRING); > } else if (ORTE_EPOCH_INVALID == name->epoch) { > @@ -575,6 +592,10 @@ > } else { > asprintf(name_string, "%s%c%lu", tmp2, ORTE_SCHEMA_DELIMITER_CHAR, > (unsigned long)name->epoch); > } > +#else > + asprintf(name_string, "%s", tmp2); > +#endif > + > free(tmp); > free(tmp2); > > @@ -585,8 +606,11 @@ > /**** CREATE PROCESS NAME ****/ > int orte_util_create_process_name(orte_process_name_t **name, > orte_jobid_t job, > - orte_vpid_t vpid, > - orte_epoch_t epoch) > + orte_vpid_t vpid > +#if ORTE_ENABLE_EPOCH > + ,orte_epoch_t epoch > +#endif > + ) > { > *name = NULL; > > @@ -598,7 +622,8 @@ > > (*name)->jobid = job; > (*name)->vpid = vpid; > - (*name)->epoch = epoch; > + ORTE_EPOCH_SET((*name)->epoch,epoch); > + > return ORTE_SUCCESS; > } > > @@ -655,6 +680,7 @@ > } > } > > +#if ORTE_ENABLE_EPOCH > /* Get here if jobid's and vpid's are equal, or not being checked. > * Now check epoch. > */ > @@ -666,6 +692,7 @@ > return OPAL_VALUE1_GREATER; > } > } > +#endif > > /* only way to get here is if all fields are being checked and are equal, > * or jobid not checked, but vpid equal, > > Modified: trunk/orte/util/name_fns.h > ============================================================================== > --- trunk/orte/util/name_fns.h (original) > +++ trunk/orte/util/name_fns.h 2011-08-26 18:16:14 EDT (Fri, 26 Aug > 2011) > @@ -61,9 +61,13 @@ > #define ORTE_VPID_PRINT(n) \ > orte_util_print_vpids(n) > > +#if ORTE_ENABLE_EPOCH > ORTE_DECLSPEC char* orte_util_print_epoch(const orte_epoch_t epoch); > #define ORTE_EPOCH_PRINT(n) \ > orte_util_print_epoch(n) > +#else > +#define ORTE_EPOCH_PRINT(n) > +#endif > > ORTE_DECLSPEC char* orte_util_print_job_family(const orte_jobid_t job); > #define ORTE_JOB_FAMILY_PRINT(n) \ > @@ -104,6 +108,24 @@ > #define ORTE_JOBID_IS_DAEMON(n) \ > !((n) & 0x0000ffff) > > +/* Macro for getting the epoch out of the process name */ > +#if 
ORTE_ENABLE_EPOCH > +#define ORTE_EPOCH_GET(n) \ > + ((n)->epoch) > +#else > +#define ORTE_EPOCH_GET(n) > +#endif > + > +/* Macro for setting the epoch in the process name */ > +#if ORTE_ENABLE_EPOCH > +#define ORTE_EPOCH_SET(n,m) \ > + ( (n) = (m) ) > +#else > +#define ORTE_EPOCH_SET(n,m) \ > + do { \ > + } while(0); > +#endif > + > /* List of names for general use */ > struct orte_namelist_t { > opal_list_item_t item; /**< Allows this item to be placed on a list > */ > @@ -117,16 +139,24 @@ > ORTE_DECLSPEC int orte_util_convert_string_to_jobid(orte_jobid_t *jobid, > const char* jobidstring); > ORTE_DECLSPEC int orte_util_convert_vpid_to_string(char **vpid_string, const > orte_vpid_t vpid); > ORTE_DECLSPEC int orte_util_convert_string_to_vpid(orte_vpid_t *vpid, const > char* vpidstring); > +#if ORTE_ENABLE_EPOCH > ORTE_DECLSPEC int orte_util_convert_epoch_to_string(char **epoch_string, > const orte_epoch_t epoch); > ORTE_DECLSPEC int orte_util_convert_string_to_epoch(orte_vpid_t *epoch, const > char* epochstring); > +#endif > ORTE_DECLSPEC int > orte_util_convert_string_to_process_name(orte_process_name_t *name, > const char* name_string); > ORTE_DECLSPEC int orte_util_convert_process_name_to_string(char** name_string, > const orte_process_name_t *name); > +#if ORTE_ENABLE_EPOCH > ORTE_DECLSPEC int orte_util_create_process_name(orte_process_name_t **name, > orte_jobid_t job, > orte_vpid_t vpid, > orte_epoch_t epoch); > +#else > +ORTE_DECLSPEC int orte_util_create_process_name(orte_process_name_t **name, > + orte_jobid_t job, > + orte_vpid_t vpid); > +#endif > ORTE_DECLSPEC int orte_util_compare_name_fields(orte_ns_cmp_bitmask_t fields, > const orte_process_name_t* name1, > const orte_process_name_t* name2); > > Modified: trunk/orte/util/nidmap.c > ============================================================================== > --- trunk/orte/util/nidmap.c (original) > +++ trunk/orte/util/nidmap.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug 2011) > @@ -249,7 +249,7 @@ > 
*/ > /* construct the URI */ > proc.vpid = node->daemon; > - proc.epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(proc.epoch,ORTE_EPOCH_MIN); > > orte_util_convert_process_name_to_string(&proc_name, &proc); > asprintf(&uri, "%s;tcp://%s:%d", proc_name, addr, > (int)orte_process_info.my_port); > @@ -1001,6 +1001,7 @@ > } > #endif > > +#if ORTE_ENABLE_EPOCH > /* Look up the current epoch value that we have stored locally. > * > * Note that this will not ping the HNP to get the most up to date epoch > stored > @@ -1023,7 +1024,9 @@ > /*print_orte_job_data();*/ > return e; > } > +#endif > > +#if ORTE_RESIL_ORTE > bool orte_util_proc_is_running(orte_process_name_t *proc) { > int i; > unsigned int j; > @@ -1078,7 +1081,9 @@ > > return ORTE_ERROR; > } > +#endif > > +#if ORTE_ENABLE_EPOCH > /* > * This function performs both the get and set operations on the epoch for a > * sepcific process name. If the epoch passed into the function is > @@ -1091,6 +1096,11 @@ > orte_job_t *jdata; > orte_proc_t *pdata; > > + if (ORTE_JOBID_INVALID == proc->jobid || > + ORTE_VPID_INVALID == proc->vpid) { > + return ORTE_EPOCH_INVALID; > + } > + > /* Sanity check just to make sure we don't overwrite our existing > * orte_job_data. 
> */ > @@ -1165,4 +1175,5 @@ > return ORTE_EPOCH_MIN; > } > } > +#endif > > > Modified: trunk/orte/util/nidmap.h > ============================================================================== > --- trunk/orte/util/nidmap.h (original) > +++ trunk/orte/util/nidmap.h 2011-08-26 18:16:14 EDT (Fri, 26 Aug 2011) > @@ -48,11 +48,19 @@ > ORTE_DECLSPEC orte_pmap_t* orte_util_lookup_pmap(orte_process_name_t *proc); > ORTE_DECLSPEC orte_nid_t* orte_util_lookup_nid(orte_process_name_t *proc); > > +#if ORTE_ENABLE_EPOCH > ORTE_DECLSPEC orte_epoch_t orte_util_lookup_epoch(orte_process_name_t *proc); > ORTE_DECLSPEC orte_epoch_t orte_util_set_epoch(orte_process_name_t *proc, > orte_epoch_t epoch); > +#endif > > ORTE_DECLSPEC int orte_util_set_proc_state(orte_process_name_t *proc, > orte_proc_state_t state); > + > +#if ORTE_RESIL_ORTE > +#define PROC_IS_RUNNING(n) orte_util_proc_is_running(n) > ORTE_DECLSPEC bool orte_util_proc_is_running(orte_process_name_t *proc); > +#else > +#define PROC_IS_RUNNING(n) ( true ) > +#endif > > ORTE_DECLSPEC int orte_util_encode_nodemap(opal_byte_object_t *boptr); > ORTE_DECLSPEC int orte_util_decode_nodemap(opal_byte_object_t *boptr); > @@ -72,5 +80,8 @@ > END_C_DECLS > > /* Local functions */ > +#if ORTE_ENABLE_EPOCH > orte_epoch_t get_epoch_from_orte_job_data(orte_process_name_t *proc, > orte_epoch_t epoch); > #endif > + > +#endif > > Modified: trunk/orte/util/proc_info.c > ============================================================================== > --- trunk/orte/util/proc_info.c (original) > +++ trunk/orte/util/proc_info.c 2011-08-26 18:16:14 EDT (Fri, 26 Aug > 2011) > @@ -36,13 +36,19 @@ > > #include "orte/util/proc_info.h" > > +#if ORTE_ENABLE_EPOCH > +#define ORTE_NAME_INVALID {ORTE_JOBID_INVALID, ORTE_VPID_INVALID, > ORTE_EPOCH_MIN} > +#else > +#define ORTE_NAME_INVALID {ORTE_JOBID_INVALID, ORTE_VPID_INVALID} > +#endif > + > ORTE_DECLSPEC orte_proc_info_t orte_process_info = { > - /* .my_name = */ {ORTE_JOBID_INVALID, 
ORTE_VPID_INVALID, > ORTE_EPOCH_MIN}, > - /* .my_daemon = */ {ORTE_JOBID_INVALID, ORTE_VPID_INVALID, > ORTE_EPOCH_MIN}, > + /* .my_name = */ ORTE_NAME_INVALID, > + /* .my_daemon = */ ORTE_NAME_INVALID, > /* .my_daemon_uri = */ NULL, > - /* .my_hnp = */ {ORTE_JOBID_INVALID, ORTE_VPID_INVALID, > ORTE_EPOCH_MIN}, > + /* .my_hnp = */ ORTE_NAME_INVALID, > /* .my_hnp_uri = */ NULL, > - /* .my_parent = */ {ORTE_JOBID_INVALID, ORTE_VPID_INVALID, > ORTE_EPOCH_MIN}, > + /* .my_parent = */ ORTE_NAME_INVALID, > /* .hnp_pid = */ 0, > /* .app_num = */ 0, > /* .num_procs = */ 1, > > Modified: trunk/test/util/orte_session_dir.c > ============================================================================== > --- trunk/test/util/orte_session_dir.c (original) > +++ trunk/test/util/orte_session_dir.c 2011-08-26 18:16:14 EDT (Fri, > 26 Aug 2011) > @@ -57,7 +57,7 @@ > orte_process_info.my_name->cellid = 0; > orte_process_info.my_name->jobid = 0; > orte_process_info.my_name->vpid = 0; > - orte_process_info.my_name->epoch = ORTE_EPOCH_MIN; > + ORTE_EPOCH_SET(orte_process_info.my_name->epoch,ORTE_EPOCH_MIN); > > test_init("orte_session_dir_t"); > test_out = fopen( "test_session_dir_out", "w+" ); > _______________________________________________ > svn-full mailing list > svn-f...@open-mpi.org > http://www.open-mpi.org/mailman/listinfo.cgi/svn-full