Not quite right. There still is no topology information at collective selection time for either graph or dist graph.
-Nathan
On Tue, Sep 30, 2014 at 02:03:27PM +0900, Gilles Gouaillardet wrote:
> Nathan,
>
> here is a revision of the previously attached patch, and that supports
> graph and dist graph.
>
> Cheers,
>
> Gilles
>
> On 2014/09/30 0:05, Nathan Hjelm wrote:
>
> An equivalent change would need to be made for graph and dist graph as
> well. That will take a little more work. Also, I was avoiding changing
> anything in topo for 1.8.
>
> -Nathan
>
> On Mon, Sep 29, 2014 at 08:02:41PM +0900, Gilles Gouaillardet wrote:
>
> Nathan,
>
> why not just make the topology information available at that point as you
> described it ?
>
> the attached patch does this, could you please review it ?
>
> Cheers,
>
> Gilles
>
> On 2014/09/26 2:50, Nathan Hjelm wrote:
>
> On Tue, Aug 26, 2014 at 07:03:24PM +0300, Lisandro Dalcin wrote:
>
> I finally managed to track down some issues in mpi4py's test suite
> using Open MPI 1.8+. The code below should be enough to reproduce the
> problem. Run it under valgrind to make sense of my following
> diagnostics.
>
> In this code I'm creating a 2D, periodic Cartesian topology out of
> COMM_SELF. In this case, the process in COMM_SELF has 4 logical in/out
> links to itself. So we have size=1 but indegree=outdegree=4. However,
> in ompi/mca/coll/basic/coll_basic_module.c, "size * 2" request are
> being allocated to manage communication:
>
> if (OMPI_COMM_IS_INTER(comm)) {
> size = ompi_comm_remote_size(comm);
> } else {
> size = ompi_comm_size(comm);
> }
> basic_module->mccb_num_reqs = size * 2;
> basic_module->mccb_reqs = (ompi_request_t**)
> malloc(sizeof(ompi_request_t *) * basic_module->mccb_num_reqs);
>
> I guess you have to also special-case for topologies and allocate
> indegree+outdegree requests (not sure about this number, just
> guessing).
>
>
> I wish this was possible but the topology information is not available
> at that point. We may be able to change that but I don't see the work
> completing anytime soon. I committed an alternative fix as r32796 and
> CMR'd it to 1.8.3. I can confirm that the attached reproducer no longer
> produces a SEGV. Let me know if you run into any more issues.
>
>
> -Nathan
>
> _______________________________________________
> devel mailing list
> [email protected]
> Subscription: http://www.open-mpi.org/mailman/listinfo.cgi/devel
> Link to this post:
> http://www.open-mpi.org/community/lists/devel/2014/09/15915.php
>
> Index: ompi/mca/topo/base/topo_base_cart_create.c
> ===================================================================
> --- ompi/mca/topo/base/topo_base_cart_create.c (revision 32807)
> +++ ompi/mca/topo/base/topo_base_cart_create.c (working copy)
> @@ -163,10 +163,18 @@
> return MPI_ERR_INTERN;
> }
>
> + assert(NULL == new_comm->c_topo);
> + assert(!(new_comm->c_flags & OMPI_COMM_CART));
> + new_comm->c_topo = topo;
> + new_comm->c_topo->mtc.cart = cart;
> + new_comm->c_topo->reorder = reorder;
> + new_comm->c_flags |= OMPI_COMM_CART;
> ret = ompi_comm_enable(old_comm, new_comm,
> new_rank, num_procs, topo_procs);
> if (OMPI_SUCCESS != ret) {
> /* something wrong happened during setting the communicator */
> + new_comm->c_topo = NULL;
> + new_comm->c_flags &= ~OMPI_COMM_CART;
> ompi_comm_free (&new_comm);
> free(topo_procs);
> if(NULL != cart->periods) free(cart->periods);
> @@ -176,10 +184,6 @@
> return ret;
> }
>
> - new_comm->c_topo = topo;
> - new_comm->c_topo->mtc.cart = cart;
> - new_comm->c_topo->reorder = reorder;
> - new_comm->c_flags |= OMPI_COMM_CART;
> *comm_topo = new_comm;
>
> if( MPI_UNDEFINED == new_rank ) {
> Index: ompi/mca/coll/basic/coll_basic_module.c
> ===================================================================
> --- ompi/mca/coll/basic/coll_basic_module.c (revision 32807)
> +++ ompi/mca/coll/basic/coll_basic_module.c (working copy)
> @@ -13,6 +13,8 @@
> * Copyright (c) 2012 Sandia National Laboratories. All rights
> reserved.
> * Copyright (c) 2013 Los Alamos National Security, LLC. All rights
> * reserved.
> + * Copyright (c) 2014 Research Organization for Information Science
> + * and Technology (RIST). All rights reserved.
> * $COPYRIGHT$
> *
> * Additional copyrights may follow
> @@ -28,6 +30,7 @@
> #include "mpi.h"
> #include "ompi/mca/coll/coll.h"
> #include "ompi/mca/coll/base/base.h"
> +#include "ompi/mca/topo/topo.h"
> #include "coll_basic.h"
>
>
> @@ -70,6 +73,15 @@
> } else {
> size = ompi_comm_size(comm);
> }
> + if (comm->c_flags & OMPI_COMM_CART) {
> + int cart_size;
> + assert (NULL != comm->c_topo);
> + comm->c_topo->topo.cart.cartdim_get(comm, &cart_size);
> + cart_size *= 2;
> + if (cart_size > size) {
> + size = cart_size;
> + }
> + }
> basic_module->mccb_num_reqs = size * 2;
> basic_module->mccb_reqs = (ompi_request_t**)
> malloc(sizeof(ompi_request_t *) * basic_module->mccb_num_reqs);
>
> _______________________________________________
> devel mailing list
> [email protected]
> Subscription: http://www.open-mpi.org/mailman/listinfo.cgi/devel
> Link to this post:
> http://www.open-mpi.org/community/lists/devel/2014/09/15929.php
>
> _______________________________________________
> devel mailing list
> [email protected]
> Subscription: http://www.open-mpi.org/mailman/listinfo.cgi/devel
> Link to this post:
> http://www.open-mpi.org/community/lists/devel/2014/09/15930.php
> Index: ompi/mca/topo/base/topo_base_cart_create.c
> ===================================================================
> --- ompi/mca/topo/base/topo_base_cart_create.c (revision 32815)
> +++ ompi/mca/topo/base/topo_base_cart_create.c (working copy)
> @@ -163,10 +163,18 @@
> return MPI_ERR_INTERN;
> }
>
> + assert(NULL == new_comm->c_topo);
> + assert(!(new_comm->c_flags & OMPI_COMM_CART));
> + new_comm->c_topo = topo;
> + new_comm->c_topo->mtc.cart = cart;
> + new_comm->c_topo->reorder = reorder;
> + new_comm->c_flags |= OMPI_COMM_CART;
> ret = ompi_comm_enable(old_comm, new_comm,
> new_rank, num_procs, topo_procs);
> if (OMPI_SUCCESS != ret) {
> /* something wrong happened during setting the communicator */
> + new_comm->c_topo = NULL;
> + new_comm->c_flags &= ~OMPI_COMM_CART;
> ompi_comm_free (&new_comm);
> free(topo_procs);
> if(NULL != cart->periods) free(cart->periods);
> @@ -176,10 +184,6 @@
> return ret;
> }
>
> - new_comm->c_topo = topo;
> - new_comm->c_topo->mtc.cart = cart;
> - new_comm->c_topo->reorder = reorder;
> - new_comm->c_flags |= OMPI_COMM_CART;
> *comm_topo = new_comm;
>
> if( MPI_UNDEFINED == new_rank ) {
> Index: ompi/mca/coll/basic/coll_basic_module.c
> ===================================================================
> --- ompi/mca/coll/basic/coll_basic_module.c (revision 32815)
> +++ ompi/mca/coll/basic/coll_basic_module.c (working copy)
> @@ -13,6 +13,8 @@
> * Copyright (c) 2012 Sandia National Laboratories. All rights reserved.
> * Copyright (c) 2013 Los Alamos National Security, LLC. All rights
> * reserved.
> + * Copyright (c) 2014 Research Organization for Information Science
> + * and Technology (RIST). All rights reserved.
> * $COPYRIGHT$
> *
> * Additional copyrights may follow
> @@ -28,6 +30,8 @@
> #include "mpi.h"
> #include "ompi/mca/coll/coll.h"
> #include "ompi/mca/coll/base/base.h"
> +#include "ompi/mca/topo/topo.h"
> +#include "ompi/mca/topo/base/base.h"
> #include "coll_basic.h"
>
>
> @@ -70,7 +74,36 @@
> } else {
> size = ompi_comm_size(comm);
> }
> - basic_module->mccb_num_reqs = size * 2;
> + size *= 2;
> + if (OMPI_COMM_IS_CART(comm)) {
> + int cart_size;
> + mca_topo_base_comm_cart_2_1_0_t *cart;
> + assert (NULL != comm->c_topo);
> + cart = comm->c_topo->mtc.cart;
> + cart_size = cart->ndims * 4;
> + if (cart_size > size) {
> + size = cart_size;
> + }
> + } else if (OMPI_COMM_IS_GRAPH(comm)) {
> + int rank, degree;
> + assert (NULL != comm->c_topo);
> + rank = ompi_comm_rank (comm);
> + mca_topo_base_graph_neighbors_count (comm, rank, °ree);
> + degree *= 2;
> + if (degree > size) {
> + size = degree;
> + }
> + } else if (OMPI_COMM_IS_DIST_GRAPH(comm)) {
> + int dist_graph_size;
> + mca_topo_base_comm_dist_graph_2_1_0_t *dist_graph;
> + assert (NULL != comm->c_topo);
> + dist_graph = comm->c_topo->mtc.dist_graph;
> + dist_graph_size = dist_graph->indegree + dist_graph->outdegree;
> + if (dist_graph_size > size) {
> + size = dist_graph_size;
> + }
> + }
> + basic_module->mccb_num_reqs = size;
> basic_module->mccb_reqs = (ompi_request_t**)
> malloc(sizeof(ompi_request_t *) * basic_module->mccb_num_reqs);
>
> _______________________________________________
> devel mailing list
> [email protected]
> Subscription: http://www.open-mpi.org/mailman/listinfo.cgi/devel
> Link to this post:
> http://www.open-mpi.org/community/lists/devel/2014/09/15941.php
pgp_NxQZWFh88.pgp
Description: PGP signature
