Agreed. goto's just make me grumpy.

-Nathan

On Wed, Apr 22, 2015 at 01:17:11PM -0600, Howard Pritchard wrote:
>    Hi Rafael,
>    I give you an A+ for effort.   We always appreciate patches.
>    Howard
>    2015-04-22 12:43 GMT-06:00 Nathan Hjelm <hje...@lanl.gov>:
> 
>      Umm, why are you cleaning up this way. The allocated resources *should*
>      be freed by the udcm_module_finalize call. If there is a bug in that
>      path it should be fixed there NOT by adding a bunch of gotos (ick).
> 
>      I will take a look now and apply the appropriate fix.
> 
>      -Nathan
> 
>      On Wed, Apr 22, 2015 at 04:55:57PM +0200, Raphael Fouassier wrote:
>      > We are experiencing a bug in OpenMPI in 1.8.4 which happens also on
>      > master: if locked memory limits are too low, a segfault happens
>      > in openib/udcm because some memory is not correctly deallocated.
>      >
>      > To reproduce it, modify /etc/security/limits.conf with:
>      > * soft memlock 64
>      > * hard memlock 64
>      > and launch with mpirun (not in a slurm allocation).
>      >
>      >
>      > I propose 2 patches for 1.8.4 and master (because of the btl move to
>      > opal) which:
>      > - free all allocated ressources
>      > - print the limits error
>      >
> 
>      > diff --git a/ompi/mca/btl/openib/connect/btl_openib_connect_udcm.c
>      b/ompi/mca/btl/openib/connect/btl_openib_connect_udcm.c
>      > index 19753a9..b222274 100644
>      > --- a/ompi/mca/btl/openib/connect/btl_openib_connect_udcm.c
>      > +++ b/ompi/mca/btl/openib/connect/btl_openib_connect_udcm.c
>      > @@ -5,6 +5,7 @@
>      >   * Copyright (c) 2009      IBM Corporation.  All rights reserved.
>      >   * Copyright (c) 2011-2014 Los Alamos National Security, LLC.  All
>      rights
>      >   *                         reserved.
>      > + * Copyright (c) 2015      Bull SAS. All rights reserved.
>      >   *
>      >   * $COPYRIGHT$
>      >   *
>      > @@ -460,6 +461,8 @@ static int
>      udcm_component_query(mca_btl_openib_module_t *btl,
>      >
>      >          rc = udcm_module_init (m, btl);
>      >          if (OMPI_SUCCESS != rc) {
>      > +            free(m);
>      > +            m = NULL;
>      >              break;
>      >          }
>      >
>      > @@ -536,9 +539,19 @@ static int udcm_endpoint_finalize(struct
>      mca_btl_base_endpoint_t *lcl_ep)
>      >      return OMPI_SUCCESS;
>      >  }
>      >
>      > +static void *udcm_unmonitor(int fd, int flags, void *context)
>      > +{
>      > +    volatile int *barrier = (volatile int *)context;
>      > +
>      > +    *barrier = 1;
>      > +
>      > +    return NULL;
>      > +}
>      > +
>      >  static int udcm_module_init (udcm_module_t *m,
>      mca_btl_openib_module_t *btl)
>      >  {
>      >      int rc = OMPI_ERR_NOT_SUPPORTED;
>      > +    volatile int barrier = 0;
>      >
>      >      BTL_VERBOSE(("created cpc module %p for btl %p",
>      >                   (void*)m, (void*)btl));
>      > @@ -549,7 +562,7 @@ static int udcm_module_init (udcm_module_t *m,
>      mca_btl_openib_module_t *btl)
>      >      m->cm_channel = ibv_create_comp_channel
>      (btl->device->ib_dev_context);
>      >      if (NULL == m->cm_channel) {
>      >          BTL_VERBOSE(("error creating ud completion channel"));
>      > -        return OMPI_ERR_NOT_SUPPORTED;
>      > +        goto out;
>      >      }
>      >
>      >      /* Create completion queues */
>      > @@ -558,29 +571,33 @@ static int udcm_module_init (udcm_module_t *m,
>      mca_btl_openib_module_t *btl)
>      >                                     m->cm_channel, 0);
>      >      if (NULL == m->cm_recv_cq) {
>      >          BTL_VERBOSE(("error creating ud recv completion queue"));
>      > -        return OMPI_ERR_NOT_SUPPORTED;
>      > +        mca_btl_openib_show_init_error(__FILE__, __LINE__,
>      "ibv_create_cq",
>      > +                                     
>       ibv_get_device_name(btl->device->ib_dev));
>      > +        goto out1;
>      >      }
>      >
>      >      m->cm_send_cq = ibv_create_cq (btl->device->ib_dev_context,
>      >                                     UDCM_SEND_CQ_SIZE, NULL, NULL, 0);
>      >      if (NULL == m->cm_send_cq) {
>      >          BTL_VERBOSE(("error creating ud send completion queue"));
>      > -        return OMPI_ERR_NOT_SUPPORTED;
>      > +        mca_btl_openib_show_init_error(__FILE__, __LINE__,
>      "ibv_create_cq",
>      > +                                     
>       ibv_get_device_name(btl->device->ib_dev));
>      > +        goto out2;
>      >      }
>      >
>      >      if (0 != (rc = udcm_module_allocate_buffers (m))) {
>      >          BTL_VERBOSE(("error allocating cm buffers"));
>      > -        return rc;
>      > +        goto out3;
>      >      }
>      >
>      >      if (0 != (rc = udcm_module_create_listen_qp (m))) {
>      >          BTL_VERBOSE(("error creating UD QP"));
>      > -        return rc;
>      > +        goto out4;
>      >      }
>      >
>      >      if (0 != (rc = udcm_module_post_all_recvs (m))) {
>      >          BTL_VERBOSE(("error posting receives"));
>      > -        return rc;
>      > +        goto out5;
>      >      }
>      >
>      >      /* UD CM initialized properly.  So fill in the rest of the CPC
>      > @@ -633,12 +650,41 @@ static int udcm_module_init (udcm_module_t *m,
>      mca_btl_openib_module_t *btl)
>      >      /* Finally, request CQ notification */
>      >      if (0 != ibv_req_notify_cq (m->cm_recv_cq, 0)) {
>      >          BTL_VERBOSE(("error requesting recv completions"));
>      > -        return OMPI_ERROR;
>      > +        rc = OMPI_ERROR;
>      > +        goto out6;
>      >      }
>      >
>      >      /* Ready to use */
>      >
>      >      return OMPI_SUCCESS;
>      > +
>      > +    out6:
>      > +        OBJ_DESTRUCT(&m->cm_timeout_lock);
>      > +        OBJ_DESTRUCT(&m->flying_messages);
>      > +        OBJ_DESTRUCT(&m->cm_recv_msg_queue_lock);
>      > +        OBJ_DESTRUCT(&m->cm_recv_msg_queue);
>      > +        OBJ_DESTRUCT(&m->cm_send_lock);
>      > +        OBJ_DESTRUCT(&m->cm_lock);
>      > +
>      > +        m->channel_monitored = false;
>      > +
>      > +        ompi_btl_openib_fd_unmonitor(m->cm_channel->fd,
>      > +                                     udcm_unmonitor, (void
>      *)&barrier);
>      > +        while (0 == barrier) {
>      > +            sched_yield();
>      > +        }
>      > +    out5:
>      > +        udcm_module_destroy_listen_qp (m);
>      > +    out4:
>      > +        udcm_module_destroy_buffers (m);
>      > +    out3:
>      > +        ibv_destroy_cq (m->cm_send_cq);
>      > +    out2:
>      > +        ibv_destroy_cq (m->cm_recv_cq);
>      > +    out1:
>      > +        ibv_destroy_comp_channel (m->cm_channel);
>      > +    out:
>      > +        return rc;
>      >  }
>      >
>      >  static int
>      > @@ -691,15 +737,6 @@
>      udcm_module_start_connect(ompi_btl_openib_connect_base_module_t *cpc,
>      >      return rc;
>      >  }
>      >
>      > -static void *udcm_unmonitor(int fd, int flags, void *context)
>      > -{
>      > -    volatile int *barrier = (volatile int *)context;
>      > -
>      > -    *barrier = 1;
>      > -
>      > -    return NULL;
>      > -}
>      > -
>      >  static int udcm_module_finalize(mca_btl_openib_module_t *btl,
>      >                                  ompi_btl_openib_connect_base_module_t
>      *cpc)
>      >  {
> 
>      > diff --git a/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c
>      b/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c
>      > index e0b8e80..eb4785a 100644
>      > --- a/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c
>      > +++ b/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c
>      > @@ -8,7 +8,7 @@
>      >   * Copyright (c) 2014-2015 Research Organization for Information
>      Science
>      >   *                         and Technology (RIST). All rights
>      reserved.
>      >   * Copyright (c) 2014      Intel, Inc. All rights reserved.
>      > - * Copyright (c) 2014      Bull SAS.  All rights reserved.
>      > + * Copyright (c) 2014-2015 Bull SAS.  All rights reserved.
>      >   *
>      >   * $COPYRIGHT$
>      >   *
>      > @@ -470,6 +470,8 @@ static int
>      udcm_component_query(mca_btl_openib_module_t *btl,
>      >
>      >          rc = udcm_module_init (m, btl);
>      >          if (OPAL_SUCCESS != rc) {
>      > +            free(m);
>      > +            m = NULL;
>      >              break;
>      >          }
>      >
>      > @@ -646,9 +648,19 @@ static int udcm_endpoint_finalize(struct
>      mca_btl_base_endpoint_t *lcl_ep)
>      >      return OPAL_SUCCESS;
>      >  }
>      >
>      > +static void *udcm_unmonitor(int fd, int flags, void *context)
>      > +{
>      > +    volatile int *barrier = (volatile int *)context;
>      > +
>      > +    *barrier = 1;
>      > +
>      > +    return NULL;
>      > +}
>      > +
>      >  static int udcm_module_init (udcm_module_t *m,
>      mca_btl_openib_module_t *btl)
>      >  {
>      >      int rc = OPAL_ERR_NOT_SUPPORTED;
>      > +    volatile int barrier = 0;
>      >
>      >      BTL_VERBOSE(("created cpc module %p for btl %p",
>      >                   (void*)m, (void*)btl));
>      > @@ -659,7 +671,7 @@ static int udcm_module_init (udcm_module_t *m,
>      mca_btl_openib_module_t *btl)
>      >      m->cm_channel = ibv_create_comp_channel
>      (btl->device->ib_dev_context);
>      >      if (NULL == m->cm_channel) {
>      >          BTL_VERBOSE(("error creating ud completion channel"));
>      > -        return OPAL_ERR_NOT_SUPPORTED;
>      > +        goto out;
>      >      }
>      >
>      >      /* Create completion queues */
>      > @@ -668,29 +680,33 @@ static int udcm_module_init (udcm_module_t *m,
>      mca_btl_openib_module_t *btl)
>      >                                     m->cm_channel, 0);
>      >      if (NULL == m->cm_recv_cq) {
>      >          BTL_VERBOSE(("error creating ud recv completion queue"));
>      > -        return OPAL_ERR_NOT_SUPPORTED;
>      > +        mca_btl_openib_show_init_error(__FILE__, __LINE__,
>      "ibv_create_cq",
>      > +                                     
>       ibv_get_device_name(btl->device->ib_dev));
>      > +        goto out1;
>      >      }
>      >
>      >      m->cm_send_cq = ibv_create_cq (btl->device->ib_dev_context,
>      >                                     UDCM_SEND_CQ_SIZE, NULL, NULL, 0);
>      >      if (NULL == m->cm_send_cq) {
>      >          BTL_VERBOSE(("error creating ud send completion queue"));
>      > -        return OPAL_ERR_NOT_SUPPORTED;
>      > +        mca_btl_openib_show_init_error(__FILE__, __LINE__,
>      "ibv_create_cq",
>      > +                                     
>       ibv_get_device_name(btl->device->ib_dev));
>      > +        goto out2;
>      >      }
>      >
>      >      if (0 != (rc = udcm_module_allocate_buffers (m))) {
>      >          BTL_VERBOSE(("error allocating cm buffers"));
>      > -        return rc;
>      > +        goto out3;
>      >      }
>      >
>      >      if (0 != (rc = udcm_module_create_listen_qp (m))) {
>      >          BTL_VERBOSE(("error creating UD QP"));
>      > -        return rc;
>      > +        goto out4;
>      >      }
>      >
>      >      if (0 != (rc = udcm_module_post_all_recvs (m))) {
>      >          BTL_VERBOSE(("error posting receives"));
>      > -        return rc;
>      > +        goto out5;
>      >      }
>      >
>      >      /* UD CM initialized properly.  So fill in the rest of the CPC
>      > @@ -743,12 +759,40 @@ static int udcm_module_init (udcm_module_t *m,
>      mca_btl_openib_module_t *btl)
>      >      /* Finally, request CQ notification */
>      >      if (0 != ibv_req_notify_cq (m->cm_recv_cq, 0)) {
>      >          BTL_VERBOSE(("error requesting recv completions"));
>      > -        return OPAL_ERROR;
>      > +        rc = OPAL_ERROR;
>      > +        goto out6;
>      >      }
>      >
>      >      /* Ready to use */
>      >
>      >      return OPAL_SUCCESS;
>      > +
>      > +    out6:
>      > +        OBJ_DESTRUCT(&m->cm_timeout_lock);
>      > +        OBJ_DESTRUCT(&m->flying_messages);
>      > +        OBJ_DESTRUCT(&m->cm_recv_msg_queue_lock);
>      > +        OBJ_DESTRUCT(&m->cm_recv_msg_queue);
>      > +        OBJ_DESTRUCT(&m->cm_send_lock);
>      > +        OBJ_DESTRUCT(&m->cm_lock);
>      > +
>      > +        m->channel_monitored = false;
>      > +
>      > +        opal_btl_openib_fd_unmonitor(m->cm_channel->fd,
>      udcm_unmonitor, (void *)&barrier);
>      > +        while (0 == barrier) {
>      > +            sched_yield();
>      > +        }
>      > +    out5:
>      > +        udcm_module_destroy_listen_qp (m);
>      > +    out4:
>      > +        udcm_module_destroy_buffers (m);
>      > +    out3:
>      > +        ibv_destroy_cq (m->cm_send_cq);
>      > +    out2:
>      > +        ibv_destroy_cq (m->cm_recv_cq);
>      > +    out1:
>      > +        ibv_destroy_comp_channel (m->cm_channel);
>      > +    out:
>      > +        return rc;
>      >  }
>      >
>      >  static int
>      > @@ -801,15 +845,6 @@
>      udcm_module_start_connect(opal_btl_openib_connect_base_module_t *cpc,
>      >      return rc;
>      >  }
>      >
>      > -static void *udcm_unmonitor(int fd, int flags, void *context)
>      > -{
>      > -    volatile int *barrier = (volatile int *)context;
>      > -
>      > -    *barrier = 1;
>      > -
>      > -    return NULL;
>      > -}
>      > -
>      >  static int udcm_module_finalize(mca_btl_openib_module_t *btl,
>      >                                  opal_btl_openib_connect_base_module_t
>      *cpc)
>      >  {
> 
>      > _______________________________________________
>      > devel mailing list
>      > de...@open-mpi.org
>      > Subscription: http://www.open-mpi.org/mailman/listinfo.cgi/devel
>      > Link to this post:
>      http://www.open-mpi.org/community/lists/devel/2015/04/17305.php
> 
>      _______________________________________________
>      devel mailing list
>      de...@open-mpi.org
>      Subscription: http://www.open-mpi.org/mailman/listinfo.cgi/devel
>      Link to this post:
>      http://www.open-mpi.org/community/lists/devel/2015/04/17311.php

> _______________________________________________
> devel mailing list
> de...@open-mpi.org
> Subscription: http://www.open-mpi.org/mailman/listinfo.cgi/devel
> Link to this post: 
> http://www.open-mpi.org/community/lists/devel/2015/04/17314.php

Attachment: pgpQifcFBxibz.pgp
Description: PGP signature

Reply via email to