Nathan,

here is a revision of the previously attached patch, and that supports
graph and dist graph.

Cheers,

Gilles

On 2014/09/30 0:05, Nathan Hjelm wrote:
> An equivalent change would need to be made for graph and dist graph as
> well. That will take a little more work. Also, I was avoiding changing
> anything in topo for 1.8.
>
> -Nathan
>
> On Mon, Sep 29, 2014 at 08:02:41PM +0900, Gilles Gouaillardet wrote:
>>    Nathan,
>>
>>    why not just make the topology information available at that point as you
>>    described it ?
>>
>>    the attached patch does this, could you please review it ?
>>
>>    Cheers,
>>
>>    Gilles
>>
>>    On 2014/09/26 2:50, Nathan Hjelm wrote:
>>
>>  On Tue, Aug 26, 2014 at 07:03:24PM +0300, Lisandro Dalcin wrote:
>>
>>  I finally managed to track down some issues in mpi4py's test suite
>>  using Open MPI 1.8+. The code below should be enough to reproduce the
>>  problem. Run it under valgrind to make sense of my following
>>  diagnostics.
>>
>>  In this code I'm creating a 2D, periodic Cartesian topology out of
>>  COMM_SELF. In this case, the process in COMM_SELF has 4 logical in/out
>>  links to itself. So we have size=1 but indegree=outdegree=4. However,
>>  in ompi/mca/coll/basic/coll_basic_module.c, "size * 2" request are
>>  being allocated to manage communication:
>>
>>      if (OMPI_COMM_IS_INTER(comm)) {
>>          size = ompi_comm_remote_size(comm);
>>      } else {
>>          size = ompi_comm_size(comm);
>>      }
>>      basic_module->mccb_num_reqs = size * 2;
>>      basic_module->mccb_reqs = (ompi_request_t**)
>>          malloc(sizeof(ompi_request_t *) * basic_module->mccb_num_reqs);
>>
>>  I guess you have to also special-case for topologies and allocate
>>  indegree+outdegree requests (not sure about this number, just
>>  guessing).
>>
>>
>>  I wish this was possible but the topology information is not available
>>  at that point. We may be able to change that but I don't see the work
>>  completing anytime soon. I committed an alternative fix as r32796 and
>>  CMR'd it to 1.8.3. I can confirm that the attached reproducer no longer
>>  produces a SEGV. Let me know if you run into any more issues.
>>
>>
>>  -Nathan
>>
>>  _______________________________________________
>>  devel mailing list
>>  de...@open-mpi.org
>>  Subscription: http://www.open-mpi.org/mailman/listinfo.cgi/devel
>>  Link to this post: 
>> http://www.open-mpi.org/community/lists/devel/2014/09/15915.php
>> Index: ompi/mca/topo/base/topo_base_cart_create.c
>> ===================================================================
>> --- ompi/mca/topo/base/topo_base_cart_create.c       (revision 32807)
>> +++ ompi/mca/topo/base/topo_base_cart_create.c       (working copy)
>> @@ -163,10 +163,18 @@
>>          return MPI_ERR_INTERN;
>>      }
>>  
>> +    assert(NULL == new_comm->c_topo);
>> +    assert(!(new_comm->c_flags & OMPI_COMM_CART));
>> +    new_comm->c_topo           = topo;
>> +    new_comm->c_topo->mtc.cart = cart;
>> +    new_comm->c_topo->reorder  = reorder;
>> +    new_comm->c_flags         |= OMPI_COMM_CART;
>>      ret = ompi_comm_enable(old_comm, new_comm,
>>                             new_rank, num_procs, topo_procs);
>>      if (OMPI_SUCCESS != ret) {
>>          /* something wrong happened during setting the communicator */
>> +        new_comm->c_topo = NULL;
>> +        new_comm->c_flags &= ~OMPI_COMM_CART;
>>          ompi_comm_free (&new_comm);
>>          free(topo_procs);
>>          if(NULL != cart->periods) free(cart->periods);
>> @@ -176,10 +184,6 @@
>>          return ret;
>>      }
>>  
>> -    new_comm->c_topo           = topo;
>> -    new_comm->c_topo->mtc.cart = cart;
>> -    new_comm->c_topo->reorder  = reorder;
>> -    new_comm->c_flags         |= OMPI_COMM_CART;
>>      *comm_topo = new_comm;
>>  
>>      if( MPI_UNDEFINED == new_rank ) {
>> Index: ompi/mca/coll/basic/coll_basic_module.c
>> ===================================================================
>> --- ompi/mca/coll/basic/coll_basic_module.c  (revision 32807)
>> +++ ompi/mca/coll/basic/coll_basic_module.c  (working copy)
>> @@ -13,6 +13,8 @@
>>   * Copyright (c) 2012      Sandia National Laboratories. All rights 
>> reserved.
>>   * Copyright (c) 2013      Los Alamos National Security, LLC. All rights
>>   *                         reserved.
>> + * Copyright (c) 2014      Research Organization for Information Science
>> + *                         and Technology (RIST). All rights reserved.
>>   * $COPYRIGHT$
>>   * 
>>   * Additional copyrights may follow
>> @@ -28,6 +30,7 @@
>>  #include "mpi.h"
>>  #include "ompi/mca/coll/coll.h"
>>  #include "ompi/mca/coll/base/base.h"
>> +#include "ompi/mca/topo/topo.h"
>>  #include "coll_basic.h"
>>  
>>  
>> @@ -70,6 +73,15 @@
>>      } else {
>>          size = ompi_comm_size(comm);
>>      }
>> +    if (comm->c_flags & OMPI_COMM_CART) {
>> +        int cart_size;
>> +        assert (NULL != comm->c_topo);
>> +        comm->c_topo->topo.cart.cartdim_get(comm, &cart_size);
>> +        cart_size *= 2;
>> +        if (cart_size > size) {
>> +            size = cart_size;
>> +        }
>> +    }
>>      basic_module->mccb_num_reqs = size * 2;
>>      basic_module->mccb_reqs = (ompi_request_t**) 
>>          malloc(sizeof(ompi_request_t *) * basic_module->mccb_num_reqs);
>> _______________________________________________
>> devel mailing list
>> de...@open-mpi.org
>> Subscription: http://www.open-mpi.org/mailman/listinfo.cgi/devel
>> Link to this post: 
>> http://www.open-mpi.org/community/lists/devel/2014/09/15929.php
>
>
> _______________________________________________
> devel mailing list
> de...@open-mpi.org
> Subscription: http://www.open-mpi.org/mailman/listinfo.cgi/devel
> Link to this post: 
> http://www.open-mpi.org/community/lists/devel/2014/09/15930.php

Index: ompi/mca/topo/base/topo_base_cart_create.c
===================================================================
--- ompi/mca/topo/base/topo_base_cart_create.c  (revision 32815)
+++ ompi/mca/topo/base/topo_base_cart_create.c  (working copy)
@@ -163,10 +163,18 @@
         return MPI_ERR_INTERN;
     }

+    assert(NULL == new_comm->c_topo);
+    assert(!(new_comm->c_flags & OMPI_COMM_CART));
+    new_comm->c_topo           = topo;
+    new_comm->c_topo->mtc.cart = cart;
+    new_comm->c_topo->reorder  = reorder;
+    new_comm->c_flags         |= OMPI_COMM_CART;
     ret = ompi_comm_enable(old_comm, new_comm,
                            new_rank, num_procs, topo_procs);
     if (OMPI_SUCCESS != ret) {
         /* something wrong happened during setting the communicator */
+        new_comm->c_topo = NULL;
+        new_comm->c_flags &= ~OMPI_COMM_CART;
         ompi_comm_free (&new_comm);
         free(topo_procs);
         if(NULL != cart->periods) free(cart->periods);
@@ -176,10 +184,6 @@
         return ret;
     }

-    new_comm->c_topo           = topo;
-    new_comm->c_topo->mtc.cart = cart;
-    new_comm->c_topo->reorder  = reorder;
-    new_comm->c_flags         |= OMPI_COMM_CART;
     *comm_topo = new_comm;

     if( MPI_UNDEFINED == new_rank ) {
Index: ompi/mca/coll/basic/coll_basic_module.c
===================================================================
--- ompi/mca/coll/basic/coll_basic_module.c     (revision 32815)
+++ ompi/mca/coll/basic/coll_basic_module.c     (working copy)
@@ -13,6 +13,8 @@
  * Copyright (c) 2012      Sandia National Laboratories. All rights reserved.
  * Copyright (c) 2013      Los Alamos National Security, LLC. All rights
  *                         reserved.
+ * Copyright (c) 2014      Research Organization for Information Science
+ *                         and Technology (RIST). All rights reserved.
  * $COPYRIGHT$
  * 
  * Additional copyrights may follow
@@ -28,6 +30,8 @@
 #include "mpi.h"
 #include "ompi/mca/coll/coll.h"
 #include "ompi/mca/coll/base/base.h"
+#include "ompi/mca/topo/topo.h"
+#include "ompi/mca/topo/base/base.h"
 #include "coll_basic.h"


@@ -70,7 +74,36 @@
     } else {
         size = ompi_comm_size(comm);
     }
-    basic_module->mccb_num_reqs = size * 2;
+    size *= 2;
+    if (OMPI_COMM_IS_CART(comm)) {
+        int cart_size;
+        mca_topo_base_comm_cart_2_1_0_t *cart;
+        assert (NULL != comm->c_topo);
+        cart = comm->c_topo->mtc.cart;
+        cart_size = cart->ndims * 4;
+        if (cart_size > size) {
+            size = cart_size;
+        }
+    } else if (OMPI_COMM_IS_GRAPH(comm)) {
+        int rank, degree;
+        assert (NULL != comm->c_topo);
+        rank = ompi_comm_rank (comm);
+        mca_topo_base_graph_neighbors_count (comm, rank, &degree);
+        degree *= 2;
+        if (degree > size) {
+            size = degree;
+        }
+    } else if (OMPI_COMM_IS_DIST_GRAPH(comm)) {
+        int dist_graph_size;
+        mca_topo_base_comm_dist_graph_2_1_0_t *dist_graph;
+        assert (NULL != comm->c_topo);
+        dist_graph = comm->c_topo->mtc.dist_graph;
+        dist_graph_size = dist_graph->indegree + dist_graph->outdegree;
+        if (dist_graph_size > size) {
+            size = dist_graph_size;
+        }
+    }
+    basic_module->mccb_num_reqs = size;
     basic_module->mccb_reqs = (ompi_request_t**) 
         malloc(sizeof(ompi_request_t *) * basic_module->mccb_num_reqs);

Reply via email to