Nathan, here is a revision of the previously attached patch, and that supports graph and dist graph.
Cheers, Gilles On 2014/09/30 0:05, Nathan Hjelm wrote: > An equivalent change would need to be made for graph and dist graph as > well. That will take a little more work. Also, I was avoiding changing > anything in topo for 1.8. > > -Nathan > > On Mon, Sep 29, 2014 at 08:02:41PM +0900, Gilles Gouaillardet wrote: >> Nathan, >> >> why not just make the topology information available at that point as you >> described it ? >> >> the attached patch does this, could you please review it ? >> >> Cheers, >> >> Gilles >> >> On 2014/09/26 2:50, Nathan Hjelm wrote: >> >> On Tue, Aug 26, 2014 at 07:03:24PM +0300, Lisandro Dalcin wrote: >> >> I finally managed to track down some issues in mpi4py's test suite >> using Open MPI 1.8+. The code below should be enough to reproduce the >> problem. Run it under valgrind to make sense of my following >> diagnostics. >> >> In this code I'm creating a 2D, periodic Cartesian topology out of >> COMM_SELF. In this case, the process in COMM_SELF has 4 logical in/out >> links to itself. So we have size=1 but indegree=outdegree=4. However, >> in ompi/mca/coll/basic/coll_basic_module.c, "size * 2" request are >> being allocated to manage communication: >> >> if (OMPI_COMM_IS_INTER(comm)) { >> size = ompi_comm_remote_size(comm); >> } else { >> size = ompi_comm_size(comm); >> } >> basic_module->mccb_num_reqs = size * 2; >> basic_module->mccb_reqs = (ompi_request_t**) >> malloc(sizeof(ompi_request_t *) * basic_module->mccb_num_reqs); >> >> I guess you have to also special-case for topologies and allocate >> indegree+outdegree requests (not sure about this number, just >> guessing). >> >> >> I wish this was possible but the topology information is not available >> at that point. We may be able to change that but I don't see the work >> completing anytime soon. I committed an alternative fix as r32796 and >> CMR'd it to 1.8.3. I can confirm that the attached reproducer no longer >> produces a SEGV. Let me know if you run into any more issues. >> >> >> -Nathan >> >> _______________________________________________ >> devel mailing list >> de...@open-mpi.org >> Subscription: http://www.open-mpi.org/mailman/listinfo.cgi/devel >> Link to this post: >> http://www.open-mpi.org/community/lists/devel/2014/09/15915.php >> Index: ompi/mca/topo/base/topo_base_cart_create.c >> =================================================================== >> --- ompi/mca/topo/base/topo_base_cart_create.c (revision 32807) >> +++ ompi/mca/topo/base/topo_base_cart_create.c (working copy) >> @@ -163,10 +163,18 @@ >> return MPI_ERR_INTERN; >> } >> >> + assert(NULL == new_comm->c_topo); >> + assert(!(new_comm->c_flags & OMPI_COMM_CART)); >> + new_comm->c_topo = topo; >> + new_comm->c_topo->mtc.cart = cart; >> + new_comm->c_topo->reorder = reorder; >> + new_comm->c_flags |= OMPI_COMM_CART; >> ret = ompi_comm_enable(old_comm, new_comm, >> new_rank, num_procs, topo_procs); >> if (OMPI_SUCCESS != ret) { >> /* something wrong happened during setting the communicator */ >> + new_comm->c_topo = NULL; >> + new_comm->c_flags &= ~OMPI_COMM_CART; >> ompi_comm_free (&new_comm); >> free(topo_procs); >> if(NULL != cart->periods) free(cart->periods); >> @@ -176,10 +184,6 @@ >> return ret; >> } >> >> - new_comm->c_topo = topo; >> - new_comm->c_topo->mtc.cart = cart; >> - new_comm->c_topo->reorder = reorder; >> - new_comm->c_flags |= OMPI_COMM_CART; >> *comm_topo = new_comm; >> >> if( MPI_UNDEFINED == new_rank ) { >> Index: ompi/mca/coll/basic/coll_basic_module.c >> =================================================================== >> --- ompi/mca/coll/basic/coll_basic_module.c (revision 32807) >> +++ ompi/mca/coll/basic/coll_basic_module.c (working copy) >> @@ -13,6 +13,8 @@ >> * Copyright (c) 2012 Sandia National Laboratories. All rights >> reserved. >> * Copyright (c) 2013 Los Alamos National Security, LLC. All rights >> * reserved. >> + * Copyright (c) 2014 Research Organization for Information Science >> + * and Technology (RIST). All rights reserved. >> * $COPYRIGHT$ >> * >> * Additional copyrights may follow >> @@ -28,6 +30,7 @@ >> #include "mpi.h" >> #include "ompi/mca/coll/coll.h" >> #include "ompi/mca/coll/base/base.h" >> +#include "ompi/mca/topo/topo.h" >> #include "coll_basic.h" >> >> >> @@ -70,6 +73,15 @@ >> } else { >> size = ompi_comm_size(comm); >> } >> + if (comm->c_flags & OMPI_COMM_CART) { >> + int cart_size; >> + assert (NULL != comm->c_topo); >> + comm->c_topo->topo.cart.cartdim_get(comm, &cart_size); >> + cart_size *= 2; >> + if (cart_size > size) { >> + size = cart_size; >> + } >> + } >> basic_module->mccb_num_reqs = size * 2; >> basic_module->mccb_reqs = (ompi_request_t**) >> malloc(sizeof(ompi_request_t *) * basic_module->mccb_num_reqs); >> _______________________________________________ >> devel mailing list >> de...@open-mpi.org >> Subscription: http://www.open-mpi.org/mailman/listinfo.cgi/devel >> Link to this post: >> http://www.open-mpi.org/community/lists/devel/2014/09/15929.php > > > _______________________________________________ > devel mailing list > de...@open-mpi.org > Subscription: http://www.open-mpi.org/mailman/listinfo.cgi/devel > Link to this post: > http://www.open-mpi.org/community/lists/devel/2014/09/15930.php
Index: ompi/mca/topo/base/topo_base_cart_create.c =================================================================== --- ompi/mca/topo/base/topo_base_cart_create.c (revision 32815) +++ ompi/mca/topo/base/topo_base_cart_create.c (working copy) @@ -163,10 +163,18 @@ return MPI_ERR_INTERN; } + assert(NULL == new_comm->c_topo); + assert(!(new_comm->c_flags & OMPI_COMM_CART)); + new_comm->c_topo = topo; + new_comm->c_topo->mtc.cart = cart; + new_comm->c_topo->reorder = reorder; + new_comm->c_flags |= OMPI_COMM_CART; ret = ompi_comm_enable(old_comm, new_comm, new_rank, num_procs, topo_procs); if (OMPI_SUCCESS != ret) { /* something wrong happened during setting the communicator */ + new_comm->c_topo = NULL; + new_comm->c_flags &= ~OMPI_COMM_CART; ompi_comm_free (&new_comm); free(topo_procs); if(NULL != cart->periods) free(cart->periods); @@ -176,10 +184,6 @@ return ret; } - new_comm->c_topo = topo; - new_comm->c_topo->mtc.cart = cart; - new_comm->c_topo->reorder = reorder; - new_comm->c_flags |= OMPI_COMM_CART; *comm_topo = new_comm; if( MPI_UNDEFINED == new_rank ) { Index: ompi/mca/coll/basic/coll_basic_module.c =================================================================== --- ompi/mca/coll/basic/coll_basic_module.c (revision 32815) +++ ompi/mca/coll/basic/coll_basic_module.c (working copy) @@ -13,6 +13,8 @@ * Copyright (c) 2012 Sandia National Laboratories. All rights reserved. * Copyright (c) 2013 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2014 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -28,6 +30,8 @@ #include "mpi.h" #include "ompi/mca/coll/coll.h" #include "ompi/mca/coll/base/base.h" +#include "ompi/mca/topo/topo.h" +#include "ompi/mca/topo/base/base.h" #include "coll_basic.h" @@ -70,7 +74,36 @@ } else { size = ompi_comm_size(comm); } - basic_module->mccb_num_reqs = size * 2; + size *= 2; + if (OMPI_COMM_IS_CART(comm)) { + int cart_size; + mca_topo_base_comm_cart_2_1_0_t *cart; + assert (NULL != comm->c_topo); + cart = comm->c_topo->mtc.cart; + cart_size = cart->ndims * 4; + if (cart_size > size) { + size = cart_size; + } + } else if (OMPI_COMM_IS_GRAPH(comm)) { + int rank, degree; + assert (NULL != comm->c_topo); + rank = ompi_comm_rank (comm); + mca_topo_base_graph_neighbors_count (comm, rank, °ree); + degree *= 2; + if (degree > size) { + size = degree; + } + } else if (OMPI_COMM_IS_DIST_GRAPH(comm)) { + int dist_graph_size; + mca_topo_base_comm_dist_graph_2_1_0_t *dist_graph; + assert (NULL != comm->c_topo); + dist_graph = comm->c_topo->mtc.dist_graph; + dist_graph_size = dist_graph->indegree + dist_graph->outdegree; + if (dist_graph_size > size) { + size = dist_graph_size; + } + } + basic_module->mccb_num_reqs = size; basic_module->mccb_reqs = (ompi_request_t**) malloc(sizeof(ompi_request_t *) * basic_module->mccb_num_reqs);