Ralph,

Here is some more detailed information.


In orte_ess_base_app_finalize(), orte_rml_base_close() is invoked first (via mca_base_framework_close(&orte_rml_base_framework)), and it does:

    while (NULL != (item = opal_list_remove_first(&orte_rml_base.posted_recvs))) {
        OBJ_RELEASE(item);
    }

Only then is opal_stop_progress_thread() invoked.

That means that when orte_rml_base_close() is invoked, the progress thread is still up and running, and can concurrently invoke orte_rml_base_post_recv(), which does:

    if (req->cancel) {
        OPAL_LIST_FOREACH(recv, &orte_rml_base.posted_recvs, orte_rml_posted_recv_t) {
            if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &post->peer, &recv->peer) &&
                post->tag == recv->tag) {
                opal_output_verbose(5, orte_rml_base_framework.framework_output,
                                    "%s canceling recv %d for peer %s",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                    post->tag, ORTE_NAME_PRINT(&recv->peer));
                /* got a match - remove it */
                opal_list_remove_item(&orte_rml_base.posted_recvs, &recv->super);
                OBJ_RELEASE(recv);
                break;
            }
        }
        OBJ_RELEASE(req);
        return;
    }

This is where the assert fails.

Since there is no lock protecting orte_rml_base.posted_recvs, there is room for a race condition between the main thread and the progress thread.
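
To make the race easier to see, here is a minimal standalone sketch of the pattern (plain pthreads, not the actual Open MPI code; all names are illustrative): one thread drains an unlocked shared list while another thread may still be unlinking items from it.

#include <pthread.h>
#include <stdlib.h>

struct item { struct item *next; };
static struct item *posted_recvs;        /* shared list head, no lock */

/* mimics orte_rml_base_post_recv() on the progress thread:
 * unlink and free one item */
static void *progress_thread(void *arg)
{
    (void)arg;
    struct item *victim = posted_recvs;
    if (victim != NULL) {
        posted_recvs = victim->next;     /* races with the drain below */
        free(victim);
    }
    return NULL;
}

/* mimics orte_rml_base_close() on the main thread:
 * drain and free everything */
static void rml_base_close(void)
{
    struct item *it;
    while ((it = posted_recvs) != NULL) {
        posted_recvs = it->next;         /* may read a freed node ...  */
        free(it);                        /* ... or free it twice       */
    }
}

int main(void)
{
    pthread_t t;
    for (int i = 0; i < 4; i++) {        /* populate the list */
        struct item *n = malloc(sizeof(*n));
        n->next = posted_recvs;
        posted_recvs = n;
    }
    pthread_create(&t, NULL, progress_thread, NULL);
    rml_base_close();                    /* no ordering guarantee */
    pthread_join(t, NULL);
    return 0;
}

If both threads run at the same time, the same node can be freed twice, or the drain can dereference a just-freed next pointer, which matches the assert/sigsegv observed at finalize.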


Before all that happens, and still in orte_ess_base_app_finalize(),
mca_base_framework_close(&orte_grpcomm_base_framework) invokes the finalize function from grpcomm_rcd.c, which calls
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER_RCD); that is
orte_rml_oob_recv_cancel, which ends up doing opal_event_set(..., orte_rml_base_post_recv).
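
In other words, the cancel is not executed inline: it is packaged as an event for the progress thread's event base, so the actual list manipulation happens later, on the other thread. Here is a hedged sketch of that deferral pattern, written against plain libevent rather than the opal_event wrappers (struct recv_request and the function names are hypothetical stand-ins):

#include <stdlib.h>
#include <event2/event.h>

struct recv_request { int tag; int cancel; };

/* runs later, on whichever thread drives the event base */
static void post_recv_cb(evutil_socket_t sd, short args, void *cbdata)
{
    struct recv_request *req = (struct recv_request *)cbdata;
    (void)sd; (void)args;
    /* ... walk posted_recvs and remove the matching item ... */
    free(req);
}

static void recv_cancel(struct event_base *base, int tag)
{
    struct recv_request *req = malloc(sizeof(*req));
    struct timeval now = { 0, 0 };
    req->tag = tag;
    req->cancel = 1;
    /* queue the callback; it fires whenever the progress thread next
     * runs its event loop - possibly in the middle of finalize */
    event_base_once(base, -1, EV_TIMEOUT, post_recv_cb, req, &now);
}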


My first and naive attempt was to stop the opal_async progress thread before closing the rml_base framework:

diff --git a/orte/mca/ess/base/ess_base_std_app.c b/orte/mca/ess/base/ess_base_std_app.c
index d3bc6e6..4e09b47 100644
--- a/orte/mca/ess/base/ess_base_std_app.c
+++ b/orte/mca/ess/base/ess_base_std_app.c
@@ -353,18 +353,18 @@ int orte_ess_base_app_finalize(void)
     (void) mca_base_framework_close(&orte_dfs_base_framework);
     (void) mca_base_framework_close(&orte_routed_base_framework);

-    (void) mca_base_framework_close(&orte_rml_base_framework);
-    (void) mca_base_framework_close(&orte_oob_base_framework);
-    (void) mca_base_framework_close(&orte_state_base_framework);
-
-    orte_session_dir_finalize(ORTE_PROC_MY_NAME);
-
     /* release the event base */
     if (progress_thread_running) {
         opal_stop_progress_thread("opal_async", true);
         progress_thread_running = false;
     }

+    (void) mca_base_framework_close(&orte_rml_base_framework);
+    (void) mca_base_framework_close(&orte_oob_base_framework);
+    (void) mca_base_framework_close(&orte_state_base_framework);
+
+    orte_session_dir_finalize(ORTE_PROC_MY_NAME);
+
     return ORTE_SUCCESS;
 }

That did not work: the opal_async progress thread is also used by pmix, so at this stage, invoking opal_stop_progress_thread() only decrements a refcount (i.e. there is no pthread_join()).
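
To illustrate the refcounting (a sketch based on the behavior described above; the names are hypothetical, not the actual opal progress-thread API):

#include <pthread.h>

struct progress_thread {
    pthread_t    tid;
    int          refcount;     /* ess and pmix each hold a reference */
    volatile int done;
};

static int stop_progress_thread(struct progress_thread *pt)
{
    if (--pt->refcount > 0) {
        /* still in use elsewhere: no shutdown, no pthread_join() */
        return 0;
    }
    pt->done = 1;                   /* tell the loop to exit ...   */
    pthread_join(pt->tid, NULL);    /* ... and wait until it does  */
    return 1;
}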

My second (and admittedly dumb) attempt was to finalize pmix before ess_base_app, and that did not work either (crash).

I ran out of ideas on how to fix this issue properly, but I found a simple workaround: adding a short sleep (10 ms) in orte_rml_base_close() seems to be enough to avoid the race condition.

diff --git a/orte/mca/rml/base/rml_base_frame.c b/orte/mca/rml/base/rml_base_frame.c
index 54dc454..050154c 100644
--- a/orte/mca/rml/base/rml_base_frame.c
+++ b/orte/mca/rml/base/rml_base_frame.c
@@ -17,6 +17,7 @@

 #include "orte_config.h"

+#include <sys/poll.h>
 #include <string.h>

 #include "opal/dss/dss.h"
@@ -78,6 +79,7 @@ static int orte_rml_base_close(void)
 {
     opal_list_item_t *item;

+    poll(NULL,0,10);
     while (NULL != (item = opal_list_remove_first(&orte_rml_base.posted_recvs))) {
         OBJ_RELEASE(item);
     }

Incidentally, I found two OPAL_LIST_FOREACH loops in which opal_list_remove_item() is invoked on the item being iterated. Per a comment in opal_list.h, this is unsafe, and OPAL_LIST_FOREACH_SAFE should be used instead (see the sketch after the diff):

diff --git a/orte/mca/rml/base/rml_base_msg_handlers.c b/orte/mca/rml/base/rml_base_msg_handlers.c
index 758bf91..22c7601 100644
--- a/orte/mca/rml/base/rml_base_msg_handlers.c
+++ b/orte/mca/rml/base/rml_base_msg_handlers.c
@@ -12,7 +12,9 @@
  *                         All rights reserved.
  * Copyright (c) 2007-2013 Los Alamos National Security, LLC.  All rights
  *                         reserved.
- * Copyright (c) 2015 Intel, Inc. All rights reserved.
+ * Copyright (c) 2015      Intel, Inc. All rights reserved.
+ * Copyright (c) 2015      Research Organization for Information Science
+ *                         and Technology (RIST). All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -55,7 +57,7 @@ static void msg_match_recv(orte_rml_posted_recv_t *rcv, bool get_all);
 void orte_rml_base_post_recv(int sd, short args, void *cbdata)
 {
     orte_rml_recv_request_t *req = (orte_rml_recv_request_t*)cbdata;
-    orte_rml_posted_recv_t *post, *recv;
+    orte_rml_posted_recv_t *post, *recv, *next;
     orte_ns_cmp_bitmask_t mask = ORTE_NS_CMP_ALL | ORTE_NS_CMP_WILD;

     opal_output_verbose(5, orte_rml_base_framework.framework_output,
@@ -75,7 +77,7 @@ void orte_rml_base_post_recv(int sd, short args, void *cbdata)
      * and remove it from our list
      */
     if (req->cancel) {
-        OPAL_LIST_FOREACH(recv, &orte_rml_base.posted_recvs, orte_rml_posted_recv_t) {
+        OPAL_LIST_FOREACH_SAFE(recv, next, &orte_rml_base.posted_recvs, orte_rml_posted_recv_t) {
             if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &post->peer, &recv->peer) &&
                 post->tag == recv->tag) {
                 opal_output_verbose(5, orte_rml_base_framework.framework_output,
@@ -120,12 +122,12 @@ void orte_rml_base_post_recv(int sd, short args, void *cbdata)

 void orte_rml_base_complete_recv_msg (orte_rml_recv_t **recv_msg)
 {
-    orte_rml_posted_recv_t *post;
+    orte_rml_posted_recv_t *post, *next;
     orte_ns_cmp_bitmask_t mask = ORTE_NS_CMP_ALL | ORTE_NS_CMP_WILD;
     opal_buffer_t buf;
     orte_rml_recv_t *msg = *recv_msg;
     /* see if we have a waiting recv for this message */
-    OPAL_LIST_FOREACH(post, &orte_rml_base.posted_recvs, orte_rml_posted_recv_t) {
+    OPAL_LIST_FOREACH_SAFE(post, next, &orte_rml_base.posted_recvs, orte_rml_posted_recv_t) {
         /* since names could include wildcards, must use
          * the more generalized comparison function
          */
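
For reference, here is a simplified sketch of why removal inside the plain loop is unsafe (hypothetical list helpers, not the actual opal_list.h macros): the plain loop's update step evaluates e->next after the body freed e, while the _SAFE variant caches the successor first.

#include <stdlib.h>

struct elem { struct elem *prev, *next; int tag; };

static void list_remove(struct elem **head, struct elem *e)
{
    if (e->prev) e->prev->next = e->next; else *head = e->next;
    if (e->next) e->next->prev = e->prev;
}

/* unsafe: mirrors OPAL_LIST_FOREACH with removal in the body */
static void drop_unsafe(struct elem **head, int tag)
{
    for (struct elem *e = *head; e != NULL; e = e->next) {
        if (e->tag == tag) {
            list_remove(head, e);
            free(e);         /* e dangles, yet e->next is read next */
        }
    }
}

/* safe: mirrors OPAL_LIST_FOREACH_SAFE */
static void drop_safe(struct elem **head, int tag)
{
    for (struct elem *e = *head, *next; e != NULL; e = next) {
        next = e->next;      /* saved before the body can free e */
        if (e->tag == tag) {
            list_remove(head, e);
            free(e);
        }
    }
}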

I hope this helps,

Gilles

On 7/17/2015 11:04 PM, Ralph Castain wrote:
It’s probably a race condition caused by uniting the ORTE and OPAL async threads, though I can’t confirm that yet.

On Jul 17, 2015, at 3:11 AM, Gilles Gouaillardet <gilles.gouaillar...@gmail.com> wrote:

Folks,

I noticed several errors such as http://mtt.open-mpi.org/index.php?do_redir=2244 that did not make any sense to me (at first glance).

I was able to attach to one process when the issue occurred.
The SIGSEGV occurs in thread 2, while thread 1 is invoking ompi_rte_finalize.

All I can think of is a scenario in which the progress thread (aka thread 2) is still dealing with some memory that was just freed/unmapped/corrupted by the main thread.

I empirically noticed the error is more likely to occur when there are many tasks on one node
e.g. mpirun --oversubscribe -np 32 a.out

Cheers,

Gilles
