Hello,

I confirm that this PR fixes our issue.

Best regards,
Fouassier Raphaël


Le 22/04/2015 21:29, Nathan Hjelm a écrit :
> PR https://github.com/open-mpi/ompi-release/pull/250. Raphaël, can you
> please confirm this fixes your issue.
>
> -Nathan
>
> On Wed, Apr 22, 2015 at 04:55:57PM +0200, Raphaël Fouassier wrote:
>> We are experiencing a bug in OpenMPI in 1.8.4 which happens also on
>> master: if locked memory limits are too low, a segfault happens
>> in openib/udcm because some memory is not correctly deallocated.
>>
>> To reproduce it, modify /etc/security/limits.conf with:
>> * soft memlock 64
>> * hard memlock 64
>> and launch with mpirun (not in a slurm allocation).
>>
>>
>> I propose 2 patches for 1.8.4 and master (because of the btl move to
>> opal) which:
>> - free all allocated ressources
>> - print the limits error
>>
>> diff --git a/ompi/mca/btl/openib/connect/btl_openib_connect_udcm.c 
>> b/ompi/mca/btl/openib/connect/btl_openib_connect_udcm.c
>> index 19753a9..b222274 100644
>> --- a/ompi/mca/btl/openib/connect/btl_openib_connect_udcm.c
>> +++ b/ompi/mca/btl/openib/connect/btl_openib_connect_udcm.c
>> @@ -5,6 +5,7 @@
>>   * Copyright (c) 2009      IBM Corporation.  All rights reserved.
>>   * Copyright (c) 2011-2014 Los Alamos National Security, LLC.  All rights
>>   *                         reserved.
>> + * Copyright (c) 2015      Bull SAS. All rights reserved.
>>   *
>>   * $COPYRIGHT$
>>   * 
>> @@ -460,6 +461,8 @@ static int udcm_component_query(mca_btl_openib_module_t 
>> *btl,
>>  
>>          rc = udcm_module_init (m, btl);
>>          if (OMPI_SUCCESS != rc) {
>> +            free(m);
>> +            m = NULL;
>>              break;
>>          }
>>  
>> @@ -536,9 +539,19 @@ static int udcm_endpoint_finalize(struct 
>> mca_btl_base_endpoint_t *lcl_ep)
>>      return OMPI_SUCCESS;
>>  }
>>  
>> +static void *udcm_unmonitor(int fd, int flags, void *context)
>> +{
>> +    volatile int *barrier = (volatile int *)context;
>> +
>> +    *barrier = 1;
>> +
>> +    return NULL;
>> +}
>> +
>>  static int udcm_module_init (udcm_module_t *m, mca_btl_openib_module_t *btl)
>>  {
>>      int rc = OMPI_ERR_NOT_SUPPORTED;
>> +    volatile int barrier = 0;
>>  
>>      BTL_VERBOSE(("created cpc module %p for btl %p",
>>                   (void*)m, (void*)btl));
>> @@ -549,7 +562,7 @@ static int udcm_module_init (udcm_module_t *m, 
>> mca_btl_openib_module_t *btl)
>>      m->cm_channel = ibv_create_comp_channel (btl->device->ib_dev_context);
>>      if (NULL == m->cm_channel) {
>>          BTL_VERBOSE(("error creating ud completion channel"));
>> -        return OMPI_ERR_NOT_SUPPORTED;
>> +        goto out;
>>      }
>>  
>>      /* Create completion queues */
>> @@ -558,29 +571,33 @@ static int udcm_module_init (udcm_module_t *m, 
>> mca_btl_openib_module_t *btl)
>>                                     m->cm_channel, 0);
>>      if (NULL == m->cm_recv_cq) {
>>          BTL_VERBOSE(("error creating ud recv completion queue"));
>> -        return OMPI_ERR_NOT_SUPPORTED;
>> +        mca_btl_openib_show_init_error(__FILE__, __LINE__, "ibv_create_cq",
>> +                                       
>> ibv_get_device_name(btl->device->ib_dev));
>> +        goto out1;
>>      }
>>  
>>      m->cm_send_cq = ibv_create_cq (btl->device->ib_dev_context,
>>                                     UDCM_SEND_CQ_SIZE, NULL, NULL, 0);
>>      if (NULL == m->cm_send_cq) {
>>          BTL_VERBOSE(("error creating ud send completion queue"));
>> -        return OMPI_ERR_NOT_SUPPORTED;
>> +        mca_btl_openib_show_init_error(__FILE__, __LINE__, "ibv_create_cq",
>> +                                       
>> ibv_get_device_name(btl->device->ib_dev));
>> +        goto out2;
>>      }
>>  
>>      if (0 != (rc = udcm_module_allocate_buffers (m))) {
>>          BTL_VERBOSE(("error allocating cm buffers"));
>> -        return rc;
>> +        goto out3;
>>      }
>>  
>>      if (0 != (rc = udcm_module_create_listen_qp (m))) {
>>          BTL_VERBOSE(("error creating UD QP"));
>> -        return rc;
>> +        goto out4;
>>      }
>>  
>>      if (0 != (rc = udcm_module_post_all_recvs (m))) {
>>          BTL_VERBOSE(("error posting receives"));
>> -        return rc;
>> +        goto out5;
>>      }
>>  
>>      /* UD CM initialized properly.  So fill in the rest of the CPC
>> @@ -633,12 +650,41 @@ static int udcm_module_init (udcm_module_t *m, 
>> mca_btl_openib_module_t *btl)
>>      /* Finally, request CQ notification */
>>      if (0 != ibv_req_notify_cq (m->cm_recv_cq, 0)) {
>>          BTL_VERBOSE(("error requesting recv completions"));
>> -        return OMPI_ERROR;
>> +        rc = OMPI_ERROR;
>> +        goto out6;
>>      }
>>  
>>      /* Ready to use */
>>  
>>      return OMPI_SUCCESS;
>> +
>> +    out6:
>> +        OBJ_DESTRUCT(&m->cm_timeout_lock);
>> +        OBJ_DESTRUCT(&m->flying_messages);
>> +        OBJ_DESTRUCT(&m->cm_recv_msg_queue_lock);
>> +        OBJ_DESTRUCT(&m->cm_recv_msg_queue);
>> +        OBJ_DESTRUCT(&m->cm_send_lock);
>> +        OBJ_DESTRUCT(&m->cm_lock);
>> +
>> +        m->channel_monitored = false;
>> +
>> +        ompi_btl_openib_fd_unmonitor(m->cm_channel->fd,
>> +                                     udcm_unmonitor, (void *)&barrier);
>> +        while (0 == barrier) {
>> +            sched_yield();
>> +        }
>> +    out5:
>> +        udcm_module_destroy_listen_qp (m);
>> +    out4:
>> +        udcm_module_destroy_buffers (m);
>> +    out3:
>> +        ibv_destroy_cq (m->cm_send_cq);
>> +    out2:
>> +        ibv_destroy_cq (m->cm_recv_cq);
>> +    out1:
>> +        ibv_destroy_comp_channel (m->cm_channel);
>> +    out:
>> +        return rc;
>>  }
>>  
>>  static int
>> @@ -691,15 +737,6 @@ 
>> udcm_module_start_connect(ompi_btl_openib_connect_base_module_t *cpc,
>>      return rc;
>>  }
>>  
>> -static void *udcm_unmonitor(int fd, int flags, void *context)
>> -{
>> -    volatile int *barrier = (volatile int *)context;
>> -
>> -    *barrier = 1;
>> -
>> -    return NULL;
>> -}
>> -
>>  static int udcm_module_finalize(mca_btl_openib_module_t *btl,
>>                                  ompi_btl_openib_connect_base_module_t *cpc)
>>  {
>> diff --git a/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c 
>> b/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c
>> index e0b8e80..eb4785a 100644
>> --- a/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c
>> +++ b/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c
>> @@ -8,7 +8,7 @@
>>   * Copyright (c) 2014-2015 Research Organization for Information Science
>>   *                         and Technology (RIST). All rights reserved.
>>   * Copyright (c) 2014      Intel, Inc. All rights reserved.
>> - * Copyright (c) 2014      Bull SAS.  All rights reserved.
>> + * Copyright (c) 2014-2015 Bull SAS.  All rights reserved.
>>   *
>>   * $COPYRIGHT$
>>   * 
>> @@ -470,6 +470,8 @@ static int udcm_component_query(mca_btl_openib_module_t 
>> *btl,
>>  
>>          rc = udcm_module_init (m, btl);
>>          if (OPAL_SUCCESS != rc) {
>> +            free(m);
>> +            m = NULL;
>>              break;
>>          }
>>  
>> @@ -646,9 +648,19 @@ static int udcm_endpoint_finalize(struct 
>> mca_btl_base_endpoint_t *lcl_ep)
>>      return OPAL_SUCCESS;
>>  }
>>  
>> +static void *udcm_unmonitor(int fd, int flags, void *context)
>> +{
>> +    volatile int *barrier = (volatile int *)context;
>> +
>> +    *barrier = 1;
>> +
>> +    return NULL;
>> +}
>> +
>>  static int udcm_module_init (udcm_module_t *m, mca_btl_openib_module_t *btl)
>>  {
>>      int rc = OPAL_ERR_NOT_SUPPORTED;
>> +    volatile int barrier = 0;
>>  
>>      BTL_VERBOSE(("created cpc module %p for btl %p",
>>                   (void*)m, (void*)btl));
>> @@ -659,7 +671,7 @@ static int udcm_module_init (udcm_module_t *m, 
>> mca_btl_openib_module_t *btl)
>>      m->cm_channel = ibv_create_comp_channel (btl->device->ib_dev_context);
>>      if (NULL == m->cm_channel) {
>>          BTL_VERBOSE(("error creating ud completion channel"));
>> -        return OPAL_ERR_NOT_SUPPORTED;
>> +        goto out;
>>      }
>>  
>>      /* Create completion queues */
>> @@ -668,29 +680,33 @@ static int udcm_module_init (udcm_module_t *m, 
>> mca_btl_openib_module_t *btl)
>>                                     m->cm_channel, 0);
>>      if (NULL == m->cm_recv_cq) {
>>          BTL_VERBOSE(("error creating ud recv completion queue"));
>> -        return OPAL_ERR_NOT_SUPPORTED;
>> +        mca_btl_openib_show_init_error(__FILE__, __LINE__, "ibv_create_cq",
>> +                                       
>> ibv_get_device_name(btl->device->ib_dev));
>> +        goto out1;
>>      }
>>  
>>      m->cm_send_cq = ibv_create_cq (btl->device->ib_dev_context,
>>                                     UDCM_SEND_CQ_SIZE, NULL, NULL, 0);
>>      if (NULL == m->cm_send_cq) {
>>          BTL_VERBOSE(("error creating ud send completion queue"));
>> -        return OPAL_ERR_NOT_SUPPORTED;
>> +        mca_btl_openib_show_init_error(__FILE__, __LINE__, "ibv_create_cq",
>> +                                       
>> ibv_get_device_name(btl->device->ib_dev));
>> +        goto out2;
>>      }
>>  
>>      if (0 != (rc = udcm_module_allocate_buffers (m))) {
>>          BTL_VERBOSE(("error allocating cm buffers"));
>> -        return rc;
>> +        goto out3;
>>      }
>>  
>>      if (0 != (rc = udcm_module_create_listen_qp (m))) {
>>          BTL_VERBOSE(("error creating UD QP"));
>> -        return rc;
>> +        goto out4;
>>      }
>>  
>>      if (0 != (rc = udcm_module_post_all_recvs (m))) {
>>          BTL_VERBOSE(("error posting receives"));
>> -        return rc;
>> +        goto out5;
>>      }
>>  
>>      /* UD CM initialized properly.  So fill in the rest of the CPC
>> @@ -743,12 +759,40 @@ static int udcm_module_init (udcm_module_t *m, 
>> mca_btl_openib_module_t *btl)
>>      /* Finally, request CQ notification */
>>      if (0 != ibv_req_notify_cq (m->cm_recv_cq, 0)) {
>>          BTL_VERBOSE(("error requesting recv completions"));
>> -        return OPAL_ERROR;
>> +        rc = OPAL_ERROR;
>> +        goto out6;
>>      }
>>  
>>      /* Ready to use */
>>  
>>      return OPAL_SUCCESS;
>> +
>> +    out6:
>> +        OBJ_DESTRUCT(&m->cm_timeout_lock);
>> +        OBJ_DESTRUCT(&m->flying_messages);
>> +        OBJ_DESTRUCT(&m->cm_recv_msg_queue_lock);
>> +        OBJ_DESTRUCT(&m->cm_recv_msg_queue);
>> +        OBJ_DESTRUCT(&m->cm_send_lock);
>> +        OBJ_DESTRUCT(&m->cm_lock);
>> +
>> +        m->channel_monitored = false;
>> +
>> +        opal_btl_openib_fd_unmonitor(m->cm_channel->fd, udcm_unmonitor, 
>> (void *)&barrier);
>> +        while (0 == barrier) {
>> +            sched_yield();
>> +        }
>> +    out5:
>> +        udcm_module_destroy_listen_qp (m);
>> +    out4:
>> +        udcm_module_destroy_buffers (m);
>> +    out3:
>> +        ibv_destroy_cq (m->cm_send_cq);
>> +    out2:
>> +        ibv_destroy_cq (m->cm_recv_cq);
>> +    out1:
>> +        ibv_destroy_comp_channel (m->cm_channel);
>> +    out:
>> +        return rc;
>>  }
>>  
>>  static int
>> @@ -801,15 +845,6 @@ 
>> udcm_module_start_connect(opal_btl_openib_connect_base_module_t *cpc,
>>      return rc;
>>  }
>>  
>> -static void *udcm_unmonitor(int fd, int flags, void *context)
>> -{
>> -    volatile int *barrier = (volatile int *)context;
>> -
>> -    *barrier = 1;
>> -
>> -    return NULL;
>> -}
>> -
>>  static int udcm_module_finalize(mca_btl_openib_module_t *btl,
>>                                  opal_btl_openib_connect_base_module_t *cpc)
>>  {
>> _______________________________________________
>> devel mailing list
>> de...@open-mpi.org
>> Subscription: http://www.open-mpi.org/mailman/listinfo.cgi/devel
>> Link to this post: 
>> http://www.open-mpi.org/community/lists/devel/2015/04/17305.php
>
>
> _______________________________________________
> devel mailing list
> de...@open-mpi.org
> Subscription: http://www.open-mpi.org/mailman/listinfo.cgi/devel
> Link to this post: 
> http://www.open-mpi.org/community/lists/devel/2015/04/17318.php

Reply via email to