OK. I wanted to post my patch later this week, but you beat me to it, so here it is, attached. My approach, however, is completely different and may coexist with yours.
On Tue, Jun 05, 2007 at 12:03:55PM -0400, George Bosilca wrote:
> The multi-NIC support was broken for a while. This patch correct it

It was always completely broken, as far as I can tell.

> and take it back to the original performances (sum of bandwidths).

Do you get the sum of bandwidths between TCP and IB, without leave_pinned, just by using this patch? I doubt it. The problem with the current code is that if you have a mix of two networks and one of them doesn't need memory registration (like TCP), it hijacks all the traffic unless leave_pinned is in use. The reason is that memory always appears to be registered on TCP, so OB1 never tries to use anything else for RDMA.

> The idea behind is to decide in the beginning how to split the
> message in fragments and their sizes and then only reschedule on the
> BTLs that complete a fragment. So Instead of using a round-robin over
> the BTL when we select a new BTL, we keep trace of the last BTL and
> schedule the new fragment over it.

Are you sure you attached the correct patch? What the patch does doesn't match your description. It schedules a new RDMA fragment upon completion of the previous one instead of blindly doing round-robin, and that is a very good idea, but unfortunately the implementation breaks threaded support (and that is not good, as was decided today). The current assumption is that OB1 schedules a given request on only one CPU at a time. When you call the new mca_pml_ob1_recv_request_schedule_btl_exclusive() function, the schedule loop may run on another CPU.

>
> This way, we get good performance even when the relative difference
> between the characteristics of the BTLs are huge. This patch was on
> my modified versions for a while and it was used on one of our multi-
> NIC clusters by several users for few months.
>

I suppose all the NICs are Ethernet?

My approach is to pre-calculate, before scheduling starts, how much data should be sent over each BTL according to its relative weight. The schedule function then does no further calculation; it just chops the data into rdma_frag_length pieces and sends them.

The current code doesn't balance according to btl_weight at all if rdma_frag_length is much smaller than the message length (it is INT_MAX for TCP, so TCP is special in this regard too). The reason is that each time the schedule loop decides how much data to send, it calculates a fragment size according to btl_weight, then chops it down to rdma_frag_length and loses any information it got from the previous calculation. Just look at the code and do a simulation. You don't see it when all BTLs have the same bandwidth, because no matter what the relative bandwidths are, OB1 will always schedule more or less the same number of bytes on each one.

--
			Gleb.
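P.S. If you want to see the effect without setting up a multi-NIC run, here is a quick stand-alone simulation of what the current schedule loop does. It is not code from either patch; the weights, frag sizes and message size are made up for illustration only:

#include <stdio.h>

int main(void)
{
    /* made-up numbers: two BTLs at 80%/20% of total bandwidth, both with
     * a 1MB rdma_frag_length, moving a 100MB message */
    double weight[2]    = { 0.8, 0.2 };
    size_t frag_size[2] = { 1 << 20, 1 << 20 };
    size_t sent[2]      = { 0, 0 };
    size_t remaining    = (size_t)100 << 20;
    int btl = 0;

    while(remaining > 0) {
        /* what the current loop does: compute a fragment from the weight... */
        size_t size = (size_t)(weight[btl] * remaining);
        /* ...then chop it to rdma_frag_length; the weight info is lost here */
        if(size > frag_size[btl])
            size = frag_size[btl];
        if(size == 0) /* don't stall on the last few bytes */
            size = remaining;
        sent[btl] += size;
        remaining -= size;
        btl = (btl + 1) % 2; /* round-robin over the BTLs */
    }
    printf("btl0: %zuMB btl1: %zuMB\n", sent[0] >> 20, sent[1] >> 20);
    /* prints roughly 50MB/50MB even though the weights are 80/20 */
    return 0;
}

With the pre-calculation in my patch the split is decided once, up front, so the 80/20 ratio survives no matter how small rdma_frag_length is.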
diff --git a/ompi/mca/pml/ob1/pml_ob1_rdma.c b/ompi/mca/pml/ob1/pml_ob1_rdma.c
index c8eec33..d134908 100644
--- a/ompi/mca/pml/ob1/pml_ob1_rdma.c
+++ b/ompi/mca/pml/ob1/pml_ob1_rdma.c
@@ -30,6 +30,44 @@
 #include "pml_ob1.h"
 #include "pml_ob1_rdma.h"
 
+/* Use this registration if no registration needed for a BTL instead of NULL.
+ * This will help other code to distinguish case when memory is not registered
+ * from case when registration is not needed */
+static mca_mpool_base_registration_t pml_ob1_dummy_reg;
+
+/* Calculate what percentage of a message to send through each BTL according to
+ * relative weight */
+static inline void calc_weighted_length(mca_pml_ob1_rdma_btl_t *rdma_btls,
+        int num_btls, size_t size, double weight_total)
+{
+    int i;
+    size_t length_left = size;
+
+    /* shortcut for common case for only one BTL */
+    if(num_btls == 1) {
+        rdma_btls[0].length = size;
+        return;
+    }
+
+    for(i = 0; i < num_btls; i++) {
+        mca_bml_base_btl_t* bml_btl = rdma_btls[i].bml_btl;
+        size_t length = 0;
+        if(length_left != 0) {
+            length = (length_left > bml_btl->btl_eager_limit)?
+                ((size_t)(size * (bml_btl->btl_weight / weight_total))) :
+                length_left;
+
+            if(length > length_left)
+                length = length_left;
+            length_left -= length;
+        }
+        rdma_btls[i].length = length;
+    }
+
+    /* account for rounding errors */
+    rdma_btls[0].length += length_left;
+}
+
 /*
  * Check to see if memory is registered or can be registered. Build a
  * set of registrations on the request.
@@ -42,6 +80,7 @@ size_t mca_pml_ob1_rdma_btls(
     mca_pml_ob1_rdma_btl_t* rdma_btls)
 {
     size_t num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
+    double weight_total = 0;
     size_t num_btls_used = 0;
     size_t n;
 
@@ -59,10 +98,7 @@ size_t mca_pml_ob1_rdma_btls(
         mca_mpool_base_registration_t* reg = NULL;
         mca_mpool_base_module_t *btl_mpool = bml_btl->btl_mpool;
 
-        /* btl is rdma capable and registration is not required */
-        if(NULL == btl_mpool) {
-            reg = NULL;
-        } else {
+        if(NULL != btl_mpool) {
             if(!mca_pml_ob1.leave_pinned) {
                 /* look through existing registrations */
                 btl_mpool->mpool_find(btl_mpool, base, size, &reg);
@@ -73,14 +109,51 @@ size_t mca_pml_ob1_rdma_btls(
 
             if(NULL == reg)
                 bml_btl = NULL; /* skip it */
+        } else {
+            /* if registration is not required use dummy registration */
+            reg = &pml_ob1_dummy_reg;
         }
 
         if(bml_btl != NULL) {
             rdma_btls[num_btls_used].bml_btl = bml_btl;
             rdma_btls[num_btls_used].btl_reg = reg;
+            weight_total += bml_btl->btl_weight;
             num_btls_used++;
         }
     }
+
+    /* if we don't use leave_pinned and all BTLs that already have this memory
+     * registered amount to less then half of available bandwidth - fall back to
+     * pipeline protocol */
+    if(0 == num_btls_used || (!mca_pml_ob1.leave_pinned && weight_total < 0.5))
+        return 0;
+
+    calc_weighted_length(rdma_btls, num_btls_used, size, weight_total);
+
     bml_endpoint->btl_rdma_index = (bml_endpoint->btl_rdma_index + 1) % num_btls;
     return num_btls_used;
 }
+
+size_t mca_pml_ob1_rdma_pipeline_btls(
+    mca_bml_base_endpoint_t* bml_endpoint,
+    size_t size,
+    mca_pml_ob1_rdma_btl_t* rdma_btls)
+{
+    int i, num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
+    double weight_total = 0;
+
+    for(i = 0; i < num_btls && i < MCA_PML_OB1_MAX_RDMA_PER_REQUEST; i++) {
+        rdma_btls[i].bml_btl =
+            mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma);
+        if(rdma_btls[i].bml_btl->btl_mpool != NULL)
+            rdma_btls[i].btl_reg = NULL;
+        else
+            rdma_btls[i].btl_reg = &pml_ob1_dummy_reg;
+
+        weight_total += rdma_btls[i].bml_btl->btl_weight;
+    }
+
+    calc_weighted_length(rdma_btls, i, size, weight_total);
+
+    return i;
+}
diff --git a/ompi/mca/pml/ob1/pml_ob1_rdma.h b/ompi/mca/pml/ob1/pml_ob1_rdma.h
index d2c983c..e135123 100644
--- a/ompi/mca/pml/ob1/pml_ob1_rdma.h
+++ b/ompi/mca/pml/ob1/pml_ob1_rdma.h
@@ -31,6 +31,7 @@ struct mca_bml_base_endpoint_t;
 struct mca_pml_ob1_rdma_btl_t {
     struct mca_bml_base_btl_t* bml_btl;
     struct mca_mpool_base_registration_t* btl_reg;
+    size_t length;
 };
 typedef struct mca_pml_ob1_rdma_btl_t mca_pml_ob1_rdma_btl_t;
 
@@ -46,5 +47,10 @@ typedef struct mca_pml_ob1_rdma_btl_t mca_pml_ob1_rdma_btl_t;
 size_t mca_pml_ob1_rdma_btls(struct mca_bml_base_endpoint_t* endpoint,
     unsigned char* base, size_t size, struct mca_pml_ob1_rdma_btl_t* btls);
 
+/* Choose RDMA BTLs to use for sending of a request by pipeline protocol.
+ * Calculate number of bytes to send through each BTL according to available
+ * bandwidth */
+size_t mca_pml_ob1_rdma_pipeline_btls(struct mca_bml_base_endpoint_t* endpoint,
+        size_t size, mca_pml_ob1_rdma_btl_t* rdma_btls);
 
 #endif
diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c
index 9cf5661..b211b56 100644
--- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c
+++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c
@@ -240,13 +240,15 @@ static int mca_pml_ob1_recv_request_ack(
     /* by default copy everything */
     recvreq->req_send_offset = bytes_received;
     if(hdr->hdr_msg_length > bytes_received) {
+        size_t rdma_num = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
 
         /*
         * lookup request buffer to determine if memory is already
         * registered.
         */
 
         if(ompi_convertor_need_buffers(&recvreq->req_recv.req_convertor) == 0 &&
-           hdr->hdr_match.hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_CONTIG) {
+           hdr->hdr_match.hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_CONTIG &&
+           rdma_num != 0) {
             unsigned char *base;
             ompi_convertor_get_current_pointer( &recvreq->req_recv.req_convertor, (void**)&(base) );
@@ -261,18 +263,25 @@ static int mca_pml_ob1_recv_request_ack(
                 recvreq->req_send_offset = hdr->hdr_msg_length;
 
         /* are rdma devices available for long rdma protocol */
-        } else if (bml_endpoint->btl_send_limit < hdr->hdr_msg_length &&
-                  bml_endpoint->btl_rdma_offset < hdr->hdr_msg_length &&
-                  mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma)) {
+        } else {
+            if(bml_endpoint->btl_send_limit < hdr->hdr_msg_length &&
+                    bml_endpoint->btl_rdma_offset < hdr->hdr_msg_length) {
 
-            /* use convertor to figure out the rdma offset for this request */
-            recvreq->req_send_offset = hdr->hdr_msg_length -
-                bml_endpoint->btl_rdma_offset;
-            if(recvreq->req_send_offset < bytes_received) {
-                recvreq->req_send_offset = bytes_received;
+                /* use converter to figure out the rdma offset for this
+                 * request */
+                recvreq->req_send_offset = hdr->hdr_msg_length -
+                    bml_endpoint->btl_rdma_offset;
+                if(recvreq->req_send_offset < bytes_received)
+                    recvreq->req_send_offset = bytes_received;
+                ompi_convertor_set_position(
+                        &recvreq->req_recv.req_convertor,
+                        &recvreq->req_send_offset);
+
+                recvreq->req_rdma_cnt =
+                    mca_pml_ob1_rdma_pipeline_btls(bml_endpoint,
+                        recvreq->req_send_offset - bytes_received,
+                        recvreq->req_rdma);
             }
-            ompi_convertor_set_position( &recvreq->req_recv.req_convertor,
-                                         &recvreq->req_send_offset );
         }
     }
     /* nothing to send by copy in/out - no need to ack */
@@ -591,14 +600,8 @@ void mca_pml_ob1_recv_request_matched_probe(
 int mca_pml_ob1_recv_request_schedule_exclusive( mca_pml_ob1_recv_request_t* recvreq )
 {
     ompi_proc_t* proc = recvreq->req_recv.req_base.req_proc;
-    mca_bml_base_endpoint_t* bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_bml;
     mca_bml_base_btl_t* bml_btl;
-    int num_btl_avail =
-        mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
-    int num_tries = num_btl_avail;
-
-    if(recvreq->req_rdma_cnt)
-        num_tries = recvreq->req_rdma_cnt;
+    int num_tries = recvreq->req_rdma_cnt;
 
     do {
         size_t bytes_remaining = recvreq->req_send_offset -
@@ -614,8 +617,8 @@ int mca_pml_ob1_recv_request_schedule_exclusive( mca_pml_ob1_recv_request_t* rec
             mca_btl_base_descriptor_t* dst;
             mca_btl_base_descriptor_t* ctl;
             mca_mpool_base_registration_t * reg = NULL;
-            int rc;
-
+            int rc, rdma_idx;
+
             if(prev_bytes_remaining == bytes_remaining) {
                 if( ++num_fail == num_tries ) {
                     OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
@@ -635,47 +638,18 @@ int mca_pml_ob1_recv_request_schedule_exclusive( mca_pml_ob1_recv_request_t* rec
                 ompi_convertor_set_position(&recvreq->req_recv.req_convertor,
                         &recvreq->req_rdma_offset);
 
-            if(recvreq->req_rdma_cnt) {
-                /*
-                 * Select the next btl out of the list w/ preregistered
-                 * memory.
-                 */
-                bml_btl = recvreq->req_rdma[recvreq->req_rdma_idx].bml_btl;
-                num_btl_avail = recvreq->req_rdma_cnt - recvreq->req_rdma_idx;
-                reg = recvreq->req_rdma[recvreq->req_rdma_idx].btl_reg;
-
-                if(++recvreq->req_rdma_idx >= recvreq->req_rdma_cnt)
+            do {
+                rdma_idx = recvreq->req_rdma_idx;
+                bml_btl = recvreq->req_rdma[rdma_idx].bml_btl;
+                reg = recvreq->req_rdma[rdma_idx].btl_reg;
+                size = recvreq->req_rdma[rdma_idx].length;
+                if(++recvreq->req_rdma_idx >= recvreq->req_rdma_cnt)
                     recvreq->req_rdma_idx = 0;
-            } else {
-                /*
-                 * Otherwise, schedule round-robin across the
-                 * available RDMA nics dynamically registering/deregister
-                 * as required.
-                 */
-                bml_btl =
-                    mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma);
-            }
+            } while(!size);
 
-            /*
-             * If more than one NIC is available - try to use both for
-             * anything larger than the eager limit
-             */
-            if( num_btl_avail == 1 ||
-                    bytes_remaining < bml_btl->btl_eager_limit ) {
-                size = bytes_remaining;
-            } else {
-                /*
-                 * otherwise attempt to give the BTL a percentage of
-                 * the message based on a weighting factor. for
-                 * simplicity calculate this as a percentage of the
-                 * overall message length (regardless of amount
-                 * previously assigned)
-                 */
-                size = (size_t)(bml_btl->btl_weight * bytes_remaining);
-            }
-
             /* makes sure that we don't exceed BTL max rdma size
              * if memory is not pinned already */
-            if(0 == recvreq->req_rdma_cnt &&
+            if(NULL == reg &&
                     bml_btl->btl_rdma_pipeline_frag_size != 0 &&
                     size > bml_btl->btl_rdma_pipeline_frag_size) {
                 size = bml_btl->btl_rdma_pipeline_frag_size;
@@ -683,8 +657,8 @@ int mca_pml_ob1_recv_request_schedule_exclusive( mca_pml_ob1_recv_request_t* rec
 
             /* prepare a descriptor for RDMA */
             mca_bml_base_prepare_dst(bml_btl, reg,
-                    &recvreq->req_recv.req_convertor, MCA_BTL_NO_ORDER,
-                    0, &size, &dst);
+                    &recvreq->req_recv.req_convertor,
+                    MCA_BTL_NO_ORDER, 0, &size, &dst);
             if(dst == NULL) {
                 continue;
             }
@@ -750,6 +724,7 @@ int mca_pml_ob1_recv_request_schedule_exclusive( mca_pml_ob1_recv_request_t* rec
             /* update request state */
             recvreq->req_rdma_offset += size;
             OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth,1);
+            recvreq->req_rdma[rdma_idx].length -= size;
             bytes_remaining -= size;
         } else {
             mca_bml_base_free(bml_btl,ctl);
diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.h b/ompi/mca/pml/ob1/pml_ob1_recvreq.h
index 0928e02..14d1574 100644
--- a/ompi/mca/pml/ob1/pml_ob1_recvreq.h
+++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.h
@@ -135,7 +135,7 @@
 do {                                                                    \
     for( r = 0; r < recvreq->req_rdma_cnt; r++ ) {                      \
         mca_mpool_base_registration_t* btl_reg = recvreq->req_rdma[r].btl_reg; \
-        if( NULL != btl_reg ) {                                         \
+        if( NULL != btl_reg && btl_reg->mpool != NULL) {                \
            btl_reg->mpool->mpool_deregister( btl_reg->mpool, btl_reg ); \
         }                                                               \
     }                                                                   \
diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.h b/ompi/mca/pml/ob1/pml_ob1_sendreq.h
index 75b7afc..16553df 100644
--- a/ompi/mca/pml/ob1/pml_ob1_sendreq.h
+++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.h
@@ -122,7 +122,7 @@ static inline void mca_pml_ob1_free_rdma_resources(mca_pml_ob1_send_request_t* s
     /* return mpool resources */
     for(r = 0; r < sendreq->req_rdma_cnt; r++) {
         mca_mpool_base_registration_t* reg = sendreq->req_rdma[r].btl_reg;
-        if( NULL != reg ) {
+        if( NULL != reg && reg->mpool != NULL ) {
             reg->mpool->mpool_deregister(reg->mpool, reg);
         }
     }
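
For comparison, here is the same made-up 100MB, 80/20 scenario pushed through the pre-calculation path: a reduced stand-alone version of the calc_weighted_length() logic from the patch above, with a simplified BTL structure (not the real mca_pml_ob1_rdma_btl_t):

#include <stdio.h>

struct btl { double weight; size_t eager_limit; size_t length; };

/* split `size` between the BTLs once, up front, according to the
 * weights normalized by weight_total */
static void calc_weighted_length(struct btl *btls, int n, size_t size,
                                 double weight_total)
{
    size_t length_left = size;
    int i;

    for(i = 0; i < n; i++) {
        size_t length = 0;
        if(length_left != 0) {
            length = (length_left > btls[i].eager_limit) ?
                (size_t)(size * (btls[i].weight / weight_total)) :
                length_left;
            if(length > length_left)
                length = length_left;
            length_left -= length;
        }
        btls[i].length = length;
    }
    btls[0].length += length_left; /* account for rounding errors */
}

int main(void)
{
    /* same made-up 80/20 weights and 100MB message as in the P.S. above */
    struct btl btls[2] = { { 0.8, 4096, 0 }, { 0.2, 4096, 0 } };

    calc_weighted_length(btls, 2, (size_t)100 << 20, 1.0);
    printf("btl0: %zuMB btl1: %zuMB\n",
           btls[0].length >> 20, btls[1].length >> 20);
    /* prints 80MB/20MB; the schedule loop afterwards only chops each
     * share into rdma_frag_length pieces, so the ratio is preserved */
    return 0;
}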