Found a pretty nasty fragment leak (and a minor one) in ob1 (see commit below). If this fix addresses some of the hangs we are seeing on InfiniBand, LANL might want a 1.4.6 rolled (or a faster rollout for 1.6.0).
-Nathan ---------- Forwarded message ---------- List-Post: devel@lists.open-mpi.org Date: Thu, 1 Mar 2012 08:53:39 -0700 From: hje...@osl.iu.edu Reply-To: de...@open-mpi.org To: s...@open-mpi.org Subject: [OMPI svn] svn:open-mpi r26077 Author: hjelmn List-Post: devel@lists.open-mpi.org Date: 2012-03-01 10:53:39 EST (Thu, 01 Mar 2012) New Revision: 26077 URL: https://svn.open-mpi.org/trac/ompi/changeset/26077 Log: ob1: fix two fragment leaks - MAJOR! get src descriptor leaks if mca_bml_base_send fails - minor. descriptor leaked in mca_pml_send_request_start_copy if the btl returns OMPI_ERR_RESOURCE_BUSY. Text files modified: trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.c | 27 ++++++++++++++++----------- 1 files changed, 16 insertions(+), 11 deletions(-) Modified: trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.c ============================================================================== --- trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.c (original) +++ trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.c 2012-03-01 10:53:39 EST (Thu, 01 Mar 2012) @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology @@ -12,6 +13,8 @@ * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -546,15 +549,14 @@ } return OMPI_SUCCESS; } - switch(OPAL_SOS_GET_ERROR_CODE(rc)) { - case OMPI_ERR_RESOURCE_BUSY: - /* No more resources. Allow the upper level to queue the send */ - rc = OMPI_ERR_OUT_OF_RESOURCE; - break; - default: - mca_bml_base_free(bml_btl, des); - break; + + if (OMPI_ERR_RESOURCE_BUSY == OPAL_SOS_GET_ERROR_CODE(rc)) { + /* No more resources. 
Allow the upper level to queue the send */ + rc = OMPI_ERR_OUT_OF_RESOURCE; } + + mca_bml_base_free (bml_btl, des); + return rc; } @@ -631,7 +633,7 @@ * operation is achieved. */ - mca_btl_base_descriptor_t* des; + mca_btl_base_descriptor_t *des, *src = NULL; mca_btl_base_segment_t* segment; mca_pml_ob1_hdr_t* hdr; bool need_local_cb = false; @@ -640,7 +642,6 @@ bml_btl = sendreq->req_rdma[0].bml_btl; if((sendreq->req_rdma_cnt == 1) && (bml_btl->btl_flags & (MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_CUDA_GET))) { mca_mpool_base_registration_t* reg = sendreq->req_rdma[0].btl_reg; - mca_btl_base_descriptor_t* src; size_t i; size_t old_position = sendreq->req_send.req_base.req_convertor.bConverted; @@ -781,6 +782,10 @@ return OMPI_SUCCESS; } mca_bml_base_free(bml_btl, des); + if (NULL != src) { + mca_bml_base_free (bml_btl, src); + } + return rc; } @@ -1144,7 +1149,7 @@ 0, &frag->rdma_length, MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | - MCA_BTL_DES_FLAGS_PUT, + MCA_BTL_DES_FLAGS_PUT, &des ); if( OPAL_UNLIKELY(NULL == des) ) { _______________________________________________ svn mailing list s...@open-mpi.org http://www.open-mpi.org/mailman/listinfo.cgi/svn