I will commit the log messages with my fix. Will PR the fix for 1.8.5.

-Nathan

On Wed, Apr 22, 2015 at 12:43:56PM -0600, Nathan Hjelm wrote:
> 
> Umm, why are you cleaning up this way. The allocated resources *should*
> be freed by the udcm_module_finalize call. If there is a bug in that
> path it should be fixed there NOT by adding a bunch of gotos (ick).
> 
> I will take a look now and apply the appropriate fix.
> 
> -Nathan
> 
> On Wed, Apr 22, 2015 at 04:55:57PM +0200, Raphaël Fouassier wrote:
> > We are experiencing a bug in OpenMPI in 1.8.4 which happens also on
> > master: if locked memory limits are too low, a segfault happens
> > in openib/udcm because some memory is not correctly deallocated.
> > 
> > To reproduce it, modify /etc/security/limits.conf with:
> > * soft memlock 64
> > * hard memlock 64
> > and launch with mpirun (not in a slurm allocation).
> > 
> > 
> > I propose 2 patches for 1.8.4 and master (because of the btl move to
> > opal) which:
> > - free all allocated ressources
> > - print the limits error
> > 
> 
> > diff --git a/ompi/mca/btl/openib/connect/btl_openib_connect_udcm.c 
> > b/ompi/mca/btl/openib/connect/btl_openib_connect_udcm.c
> > index 19753a9..b222274 100644
> > --- a/ompi/mca/btl/openib/connect/btl_openib_connect_udcm.c
> > +++ b/ompi/mca/btl/openib/connect/btl_openib_connect_udcm.c
> > @@ -5,6 +5,7 @@
> >   * Copyright (c) 2009      IBM Corporation.  All rights reserved.
> >   * Copyright (c) 2011-2014 Los Alamos National Security, LLC.  All rights
> >   *                         reserved.
> > + * Copyright (c) 2015      Bull SAS. All rights reserved.
> >   *
> >   * $COPYRIGHT$
> >   * 
> > @@ -460,6 +461,8 @@ static int udcm_component_query(mca_btl_openib_module_t 
> > *btl,
> >  
> >          rc = udcm_module_init (m, btl);
> >          if (OMPI_SUCCESS != rc) {
> > +            free(m);
> > +            m = NULL;
> >              break;
> >          }
> >  
> > @@ -536,9 +539,19 @@ static int udcm_endpoint_finalize(struct 
> > mca_btl_base_endpoint_t *lcl_ep)
> >      return OMPI_SUCCESS;
> >  }
> >  
> > +static void *udcm_unmonitor(int fd, int flags, void *context)
> > +{
> > +    volatile int *barrier = (volatile int *)context;
> > +
> > +    *barrier = 1;
> > +
> > +    return NULL;
> > +}
> > +
> >  static int udcm_module_init (udcm_module_t *m, mca_btl_openib_module_t 
> > *btl)
> >  {
> >      int rc = OMPI_ERR_NOT_SUPPORTED;
> > +    volatile int barrier = 0;
> >  
> >      BTL_VERBOSE(("created cpc module %p for btl %p",
> >                   (void*)m, (void*)btl));
> > @@ -549,7 +562,7 @@ static int udcm_module_init (udcm_module_t *m, 
> > mca_btl_openib_module_t *btl)
> >      m->cm_channel = ibv_create_comp_channel (btl->device->ib_dev_context);
> >      if (NULL == m->cm_channel) {
> >          BTL_VERBOSE(("error creating ud completion channel"));
> > -        return OMPI_ERR_NOT_SUPPORTED;
> > +        goto out;
> >      }
> >  
> >      /* Create completion queues */
> > @@ -558,29 +571,33 @@ static int udcm_module_init (udcm_module_t *m, 
> > mca_btl_openib_module_t *btl)
> >                                     m->cm_channel, 0);
> >      if (NULL == m->cm_recv_cq) {
> >          BTL_VERBOSE(("error creating ud recv completion queue"));
> > -        return OMPI_ERR_NOT_SUPPORTED;
> > +        mca_btl_openib_show_init_error(__FILE__, __LINE__, "ibv_create_cq",
> > +                                       
> > ibv_get_device_name(btl->device->ib_dev));
> > +        goto out1;
> >      }
> >  
> >      m->cm_send_cq = ibv_create_cq (btl->device->ib_dev_context,
> >                                     UDCM_SEND_CQ_SIZE, NULL, NULL, 0);
> >      if (NULL == m->cm_send_cq) {
> >          BTL_VERBOSE(("error creating ud send completion queue"));
> > -        return OMPI_ERR_NOT_SUPPORTED;
> > +        mca_btl_openib_show_init_error(__FILE__, __LINE__, "ibv_create_cq",
> > +                                       
> > ibv_get_device_name(btl->device->ib_dev));
> > +        goto out2;
> >      }
> >  
> >      if (0 != (rc = udcm_module_allocate_buffers (m))) {
> >          BTL_VERBOSE(("error allocating cm buffers"));
> > -        return rc;
> > +        goto out3;
> >      }
> >  
> >      if (0 != (rc = udcm_module_create_listen_qp (m))) {
> >          BTL_VERBOSE(("error creating UD QP"));
> > -        return rc;
> > +        goto out4;
> >      }
> >  
> >      if (0 != (rc = udcm_module_post_all_recvs (m))) {
> >          BTL_VERBOSE(("error posting receives"));
> > -        return rc;
> > +        goto out5;
> >      }
> >  
> >      /* UD CM initialized properly.  So fill in the rest of the CPC
> > @@ -633,12 +650,41 @@ static int udcm_module_init (udcm_module_t *m, 
> > mca_btl_openib_module_t *btl)
> >      /* Finally, request CQ notification */
> >      if (0 != ibv_req_notify_cq (m->cm_recv_cq, 0)) {
> >          BTL_VERBOSE(("error requesting recv completions"));
> > -        return OMPI_ERROR;
> > +        rc = OMPI_ERROR;
> > +        goto out6;
> >      }
> >  
> >      /* Ready to use */
> >  
> >      return OMPI_SUCCESS;
> > +
> > +    out6:
> > +        OBJ_DESTRUCT(&m->cm_timeout_lock);
> > +        OBJ_DESTRUCT(&m->flying_messages);
> > +        OBJ_DESTRUCT(&m->cm_recv_msg_queue_lock);
> > +        OBJ_DESTRUCT(&m->cm_recv_msg_queue);
> > +        OBJ_DESTRUCT(&m->cm_send_lock);
> > +        OBJ_DESTRUCT(&m->cm_lock);
> > +
> > +        m->channel_monitored = false;
> > +
> > +        ompi_btl_openib_fd_unmonitor(m->cm_channel->fd,
> > +                                     udcm_unmonitor, (void *)&barrier);
> > +        while (0 == barrier) {
> > +            sched_yield();
> > +        }
> > +    out5:
> > +        udcm_module_destroy_listen_qp (m);
> > +    out4:
> > +        udcm_module_destroy_buffers (m);
> > +    out3:
> > +        ibv_destroy_cq (m->cm_send_cq);
> > +    out2:
> > +        ibv_destroy_cq (m->cm_recv_cq);
> > +    out1:
> > +        ibv_destroy_comp_channel (m->cm_channel);
> > +    out:
> > +        return rc;
> >  }
> >  
> >  static int
> > @@ -691,15 +737,6 @@ 
> > udcm_module_start_connect(ompi_btl_openib_connect_base_module_t *cpc,
> >      return rc;
> >  }
> >  
> > -static void *udcm_unmonitor(int fd, int flags, void *context)
> > -{
> > -    volatile int *barrier = (volatile int *)context;
> > -
> > -    *barrier = 1;
> > -
> > -    return NULL;
> > -}
> > -
> >  static int udcm_module_finalize(mca_btl_openib_module_t *btl,
> >                                  ompi_btl_openib_connect_base_module_t *cpc)
> >  {
> 
> > diff --git a/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c 
> > b/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c
> > index e0b8e80..eb4785a 100644
> > --- a/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c
> > +++ b/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c
> > @@ -8,7 +8,7 @@
> >   * Copyright (c) 2014-2015 Research Organization for Information Science
> >   *                         and Technology (RIST). All rights reserved.
> >   * Copyright (c) 2014      Intel, Inc. All rights reserved.
> > - * Copyright (c) 2014      Bull SAS.  All rights reserved.
> > + * Copyright (c) 2014-2015 Bull SAS.  All rights reserved.
> >   *
> >   * $COPYRIGHT$
> >   * 
> > @@ -470,6 +470,8 @@ static int udcm_component_query(mca_btl_openib_module_t 
> > *btl,
> >  
> >          rc = udcm_module_init (m, btl);
> >          if (OPAL_SUCCESS != rc) {
> > +            free(m);
> > +            m = NULL;
> >              break;
> >          }
> >  
> > @@ -646,9 +648,19 @@ static int udcm_endpoint_finalize(struct 
> > mca_btl_base_endpoint_t *lcl_ep)
> >      return OPAL_SUCCESS;
> >  }
> >  
> > +static void *udcm_unmonitor(int fd, int flags, void *context)
> > +{
> > +    volatile int *barrier = (volatile int *)context;
> > +
> > +    *barrier = 1;
> > +
> > +    return NULL;
> > +}
> > +
> >  static int udcm_module_init (udcm_module_t *m, mca_btl_openib_module_t 
> > *btl)
> >  {
> >      int rc = OPAL_ERR_NOT_SUPPORTED;
> > +    volatile int barrier = 0;
> >  
> >      BTL_VERBOSE(("created cpc module %p for btl %p",
> >                   (void*)m, (void*)btl));
> > @@ -659,7 +671,7 @@ static int udcm_module_init (udcm_module_t *m, 
> > mca_btl_openib_module_t *btl)
> >      m->cm_channel = ibv_create_comp_channel (btl->device->ib_dev_context);
> >      if (NULL == m->cm_channel) {
> >          BTL_VERBOSE(("error creating ud completion channel"));
> > -        return OPAL_ERR_NOT_SUPPORTED;
> > +        goto out;
> >      }
> >  
> >      /* Create completion queues */
> > @@ -668,29 +680,33 @@ static int udcm_module_init (udcm_module_t *m, 
> > mca_btl_openib_module_t *btl)
> >                                     m->cm_channel, 0);
> >      if (NULL == m->cm_recv_cq) {
> >          BTL_VERBOSE(("error creating ud recv completion queue"));
> > -        return OPAL_ERR_NOT_SUPPORTED;
> > +        mca_btl_openib_show_init_error(__FILE__, __LINE__, "ibv_create_cq",
> > +                                       
> > ibv_get_device_name(btl->device->ib_dev));
> > +        goto out1;
> >      }
> >  
> >      m->cm_send_cq = ibv_create_cq (btl->device->ib_dev_context,
> >                                     UDCM_SEND_CQ_SIZE, NULL, NULL, 0);
> >      if (NULL == m->cm_send_cq) {
> >          BTL_VERBOSE(("error creating ud send completion queue"));
> > -        return OPAL_ERR_NOT_SUPPORTED;
> > +        mca_btl_openib_show_init_error(__FILE__, __LINE__, "ibv_create_cq",
> > +                                       
> > ibv_get_device_name(btl->device->ib_dev));
> > +        goto out2;
> >      }
> >  
> >      if (0 != (rc = udcm_module_allocate_buffers (m))) {
> >          BTL_VERBOSE(("error allocating cm buffers"));
> > -        return rc;
> > +        goto out3;
> >      }
> >  
> >      if (0 != (rc = udcm_module_create_listen_qp (m))) {
> >          BTL_VERBOSE(("error creating UD QP"));
> > -        return rc;
> > +        goto out4;
> >      }
> >  
> >      if (0 != (rc = udcm_module_post_all_recvs (m))) {
> >          BTL_VERBOSE(("error posting receives"));
> > -        return rc;
> > +        goto out5;
> >      }
> >  
> >      /* UD CM initialized properly.  So fill in the rest of the CPC
> > @@ -743,12 +759,40 @@ static int udcm_module_init (udcm_module_t *m, 
> > mca_btl_openib_module_t *btl)
> >      /* Finally, request CQ notification */
> >      if (0 != ibv_req_notify_cq (m->cm_recv_cq, 0)) {
> >          BTL_VERBOSE(("error requesting recv completions"));
> > -        return OPAL_ERROR;
> > +        rc = OPAL_ERROR;
> > +        goto out6;
> >      }
> >  
> >      /* Ready to use */
> >  
> >      return OPAL_SUCCESS;
> > +
> > +    out6:
> > +        OBJ_DESTRUCT(&m->cm_timeout_lock);
> > +        OBJ_DESTRUCT(&m->flying_messages);
> > +        OBJ_DESTRUCT(&m->cm_recv_msg_queue_lock);
> > +        OBJ_DESTRUCT(&m->cm_recv_msg_queue);
> > +        OBJ_DESTRUCT(&m->cm_send_lock);
> > +        OBJ_DESTRUCT(&m->cm_lock);
> > +
> > +        m->channel_monitored = false;
> > +
> > +        opal_btl_openib_fd_unmonitor(m->cm_channel->fd, udcm_unmonitor, 
> > (void *)&barrier);
> > +        while (0 == barrier) {
> > +            sched_yield();
> > +        }
> > +    out5:
> > +        udcm_module_destroy_listen_qp (m);
> > +    out4:
> > +        udcm_module_destroy_buffers (m);
> > +    out3:
> > +        ibv_destroy_cq (m->cm_send_cq);
> > +    out2:
> > +        ibv_destroy_cq (m->cm_recv_cq);
> > +    out1:
> > +        ibv_destroy_comp_channel (m->cm_channel);
> > +    out:
> > +        return rc;
> >  }
> >  
> >  static int
> > @@ -801,15 +845,6 @@ 
> > udcm_module_start_connect(opal_btl_openib_connect_base_module_t *cpc,
> >      return rc;
> >  }
> >  
> > -static void *udcm_unmonitor(int fd, int flags, void *context)
> > -{
> > -    volatile int *barrier = (volatile int *)context;
> > -
> > -    *barrier = 1;
> > -
> > -    return NULL;
> > -}
> > -
> >  static int udcm_module_finalize(mca_btl_openib_module_t *btl,
> >                                  opal_btl_openib_connect_base_module_t *cpc)
> >  {
> 
> > _______________________________________________
> > devel mailing list
> > de...@open-mpi.org
> > Subscription: http://www.open-mpi.org/mailman/listinfo.cgi/devel
> > Link to this post: 
> > http://www.open-mpi.org/community/lists/devel/2015/04/17305.php
> 



> _______________________________________________
> devel mailing list
> de...@open-mpi.org
> Subscription: http://www.open-mpi.org/mailman/listinfo.cgi/devel
> Link to this post: 
> http://www.open-mpi.org/community/lists/devel/2015/04/17311.php

Attachment: pgpztI2XL1Rqx.pgp
Description: PGP signature

Reply via email to