Nathan,
why not just make the topology information available at that point as
you described it ?
the attached patch does this, could you please review it ?
Cheers,
Gilles
On 2014/09/26 2:50, Nathan Hjelm wrote:
> On Tue, Aug 26, 2014 at 07:03:24PM +0300, Lisandro Dalcin wrote:
>> I finally managed to track down some issues in mpi4py's test suite
>> using Open MPI 1.8+. The code below should be enough to reproduce the
>> problem. Run it under valgrind to make sense of my following
>> diagnostics.
>>
>> In this code I'm creating a 2D, periodic Cartesian topology out of
>> COMM_SELF. In this case, the process in COMM_SELF has 4 logical in/out
>> links to itself. So we have size=1 but indegree=outdegree=4. However,
>> in ompi/mca/coll/basic/coll_basic_module.c, "size * 2" request are
>> being allocated to manage communication:
>>
>> if (OMPI_COMM_IS_INTER(comm)) {
>> size = ompi_comm_remote_size(comm);
>> } else {
>> size = ompi_comm_size(comm);
>> }
>> basic_module->mccb_num_reqs = size * 2;
>> basic_module->mccb_reqs = (ompi_request_t**)
>> malloc(sizeof(ompi_request_t *) * basic_module->mccb_num_reqs);
>>
>> I guess you have to also special-case for topologies and allocate
>> indegree+outdegree requests (not sure about this number, just
>> guessing).
>>
> I wish this was possible but the topology information is not available
> at that point. We may be able to change that but I don't see the work
> completing anytime soon. I committed an alternative fix as r32796 and
> CMR'd it to 1.8.3. I can confirm that the attached reproducer no longer
> produces a SEGV. Let me know if you run into any more issues.
>
>
> -Nathan
>
>
> _______________________________________________
> devel mailing list
> [email protected]
> Subscription: http://www.open-mpi.org/mailman/listinfo.cgi/devel
> Link to this post:
> http://www.open-mpi.org/community/lists/devel/2014/09/15915.php
Index: ompi/mca/topo/base/topo_base_cart_create.c
===================================================================
--- ompi/mca/topo/base/topo_base_cart_create.c (revision 32807)
+++ ompi/mca/topo/base/topo_base_cart_create.c (working copy)
@@ -163,10 +163,18 @@
return MPI_ERR_INTERN;
}
+ assert(NULL == new_comm->c_topo);
+ assert(!(new_comm->c_flags & OMPI_COMM_CART));
+ new_comm->c_topo = topo;
+ new_comm->c_topo->mtc.cart = cart;
+ new_comm->c_topo->reorder = reorder;
+ new_comm->c_flags |= OMPI_COMM_CART;
ret = ompi_comm_enable(old_comm, new_comm,
new_rank, num_procs, topo_procs);
if (OMPI_SUCCESS != ret) {
/* something wrong happened during setting the communicator */
+ new_comm->c_topo = NULL;
+ new_comm->c_flags &= ~OMPI_COMM_CART;
ompi_comm_free (&new_comm);
free(topo_procs);
if(NULL != cart->periods) free(cart->periods);
@@ -176,10 +184,6 @@
return ret;
}
- new_comm->c_topo = topo;
- new_comm->c_topo->mtc.cart = cart;
- new_comm->c_topo->reorder = reorder;
- new_comm->c_flags |= OMPI_COMM_CART;
*comm_topo = new_comm;
if( MPI_UNDEFINED == new_rank ) {
Index: ompi/mca/coll/basic/coll_basic_module.c
===================================================================
--- ompi/mca/coll/basic/coll_basic_module.c (revision 32807)
+++ ompi/mca/coll/basic/coll_basic_module.c (working copy)
@@ -13,6 +13,8 @@
* Copyright (c) 2012 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
* reserved.
+ * Copyright (c) 2014 Research Organization for Information Science
+ * and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -28,6 +30,7 @@
#include "mpi.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/base.h"
+#include "ompi/mca/topo/topo.h"
#include "coll_basic.h"
@@ -70,6 +73,15 @@
} else {
size = ompi_comm_size(comm);
}
+ if (comm->c_flags & OMPI_COMM_CART) {
+ int cart_size;
+ assert (NULL != comm->c_topo);
+ comm->c_topo->topo.cart.cartdim_get(comm, &cart_size);
+ cart_size *= 2;
+ if (cart_size > size) {
+ size = cart_size;
+ }
+ }
basic_module->mccb_num_reqs = size * 2;
basic_module->mccb_reqs = (ompi_request_t**)
malloc(sizeof(ompi_request_t *) * basic_module->mccb_num_reqs);