Re: [openib-general] [PATCH for-2.6.21] IPoIB/cm: improve small message bandwidth
nope, doesn't seem to make a difference. ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [openib-general] [PATCH for-2.6.21] IPoIB/cm: improve small message bandwidth
> Quoting Roland Dreier <[EMAIL PROTECTED]>: > Subject: Re: [openib-general] [PATCH for-2.6.21] IPoIB/cm: improve small > message bandwidth > > OK, I applied the following patch (I had to change one line of your > patch to get it to apply because the small-message changed the context > so one chunk didn't apply). > > Anyway I don't see any difference in small message latency or large > message throughput. (Actually latency seems slightly worse but I > think the change is within my normal variability so I'm don't think > the difference is significant) OK. I wonder whether unrolling the loop in skb_put_frags might be helpful. Could you please try the following? Does this affect latency for you? (I don't see any difference in between UD and CM either with or without this patch). Try to improve small message latency some more by unrolling more loops. Signed-off-by: Michael S. Tsirkin <[EMAIL PROTECTED]> --- diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index a389854..a8895b4 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -311,38 +311,6 @@ static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id, return 0; } } -/* Adjust length of skb with fragments to match received data */ -static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space, - unsigned int length, struct sk_buff *toskb) -{ - int i, num_frags; - unsigned int size; - - /* put header into skb */ - size = min(length, hdr_space); - skb->tail += size; - skb->len += size; - length -= size; - - num_frags = skb_shinfo(skb)->nr_frags; - for (i = 0; i < num_frags; i++) { - skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - - if (length == 0) { - /* don't need this page */ - skb_fill_page_desc(toskb, i, frag->page, 0, PAGE_SIZE); - --skb_shinfo(skb)->nr_frags; - } else { - size = min(length, (unsigned) PAGE_SIZE); - - frag->size = size; - skb->data_len += size; - skb->truesize += size; - skb->len += size; - length 
-= size; - } - } -} void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) { @@ -352,7 +320,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) struct ipoib_cm_rx *p; unsigned long flags; u64 mapping[IPOIB_CM_RX_SG]; - int frags; + unsigned head_size, frag_size, frags; ipoib_dbg_data(priv, "cm recv completion: id %d, op %d, status: %d\n", wr_id, wc->opcode, wc->status); @@ -388,8 +356,9 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) } } - frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len, - (unsigned)IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE; + head_size = min(wc->byte_len, (unsigned)IPOIB_CM_HEAD_SIZE); + frag_size = wc->byte_len - head_size; + frags = PAGE_ALIGN(frag_size) / PAGE_SIZE; newskb = ipoib_cm_alloc_rx_skb(dev, wr_id, frags, mapping); if (unlikely(!newskb)) { @@ -408,7 +377,18 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n", wc->byte_len, wc->slid); - skb_put_frags(skb, IPOIB_CM_HEAD_SIZE, wc->byte_len, newskb); + memcpy(&skb_shinfo(newskb)->frags[frags], &skb_shinfo(skb)->frags[frags], + (IPOIB_CM_RX_SG - 1 - frags) * sizeof(skb_frag_t)); + skb_shinfo(newskb)->nr_frags = IPOIB_CM_RX_SG - 1; + + skb_shinfo(skb)->nr_frags = frags; + skb->tail += head_size; + skb->len += wc->byte_len; + skb->data_len += frag_size; + skb->truesize += frag_size; + if (frags) + skb_shinfo(skb)->frags[frags - 1].size = + (frag_size - 1) % PAGE_SIZE + 1; skb->protocol = ((struct ipoib_header *) skb->data)->proto; skb->mac.raw = skb->data; -- MST ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [openib-general] [PATCH for-2.6.21] IPoIB/cm: improve small message bandwidth
> Quoting Roland Dreier <[EMAIL PROTECTED]>: > Subject: Re: [PATCH for-2.6.21] IPoIB/cm: improve small message bandwidth > > OK, I applied the following patch (I had to change one line of your > patch to get it to apply because the small-message patch changed the context > so one chunk didn't apply). > > Anyway I don't see any difference in small message latency or large > message throughput. (Actually latency seems slightly worse but I > think the change is within my normal variability so I don't think > the difference is significant) OK, thanks for testing this. I need to spend more time on reproducing this issue, and profiling. I'll add this to my todo list. -- MST ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [openib-general] [PATCH for-2.6.21] IPoIB/cm: improve small message bandwidth
OK, I applied the following patch (I had to change one line of your patch to get it to apply because the small-message changed the context so one chunk didn't apply). Anyway I don't see any difference in small message latency or large message throughput. (Actually latency seems slightly worse but I think the change is within my normal variability so I'm don't think the difference is significant) diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 2594db2..20d7ad4 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -98,9 +98,9 @@ enum { #defineIPOIB_OP_RECV (1ul << 31) #ifdef CONFIG_INFINIBAND_IPOIB_CM -#defineIPOIB_CM_OP_SRQ (1ul << 30) +#defineIPOIB_OP_CM (1ul << 30) #else -#defineIPOIB_CM_OP_SRQ (0) +#defineIPOIB_OP_CM (0) #endif /* structs */ @@ -143,7 +143,6 @@ struct ipoib_cm_rx { struct ipoib_cm_tx { struct ib_cm_id *id; - struct ib_cq*cq; struct ib_qp*qp; struct list_head list; struct net_device *dev; @@ -232,6 +231,7 @@ struct ipoib_dev_priv { unsigned tx_tail; struct ib_sgetx_sge; struct ib_send_wrtx_wr; + unsigned tx_outstanding; struct ib_wc ibwc[IPOIB_NUM_WC]; @@ -438,6 +438,7 @@ void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx); void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb, unsigned int mtu); void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc); +void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc); #else struct ipoib_cm_tx; @@ -526,6 +527,9 @@ static inline void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *w { } +static inline void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) +{ +} #endif #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 3484e8b..9515ef6 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -82,7 +82,7 @@ static int 
ipoib_cm_post_receive(struct net_device *dev, int id) struct ib_recv_wr *bad_wr; int i, ret; - priv->cm.rx_wr.wr_id = id | IPOIB_CM_OP_SRQ; + priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV; for (i = 0; i < IPOIB_CM_RX_SG; ++i) priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i]; @@ -344,7 +344,7 @@ static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space, void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) { struct ipoib_dev_priv *priv = netdev_priv(dev); - unsigned int wr_id = wc->wr_id & ~IPOIB_CM_OP_SRQ; + unsigned int wr_id = wc->wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV); struct sk_buff *skb, *newskb; struct ipoib_cm_rx *p; unsigned long flags; @@ -436,7 +436,7 @@ static inline int post_send(struct ipoib_dev_priv *priv, priv->tx_sge.addr = addr; priv->tx_sge.length = len; - priv->tx_wr.wr_id = wr_id; + priv->tx_wr.wr_id = wr_id | IPOIB_OP_CM; return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr); } @@ -487,20 +487,19 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ dev->trans_start = jiffies; ++tx->tx_head; - if (tx->tx_head - tx->tx_tail == ipoib_sendq_size) { + if (++priv->tx_outstanding == ipoib_sendq_size) { ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n", tx->qp->qp_num); netif_stop_queue(dev); - set_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags); } } } -static void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ipoib_cm_tx *tx, - struct ib_wc *wc) +void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) { struct ipoib_dev_priv *priv = netdev_priv(dev); - unsigned int wr_id = wc->wr_id; + struct ipoib_cm_tx *tx = wc->qp->qp_context; + unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM; struct ipoib_tx_buf *tx_req; unsigned long flags; @@ -525,11 +524,10 @@ static void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ipoib_cm_tx *tx spin_lock_irqsave(&priv->tx_lock, flags); ++tx->tx_tail; - if (unlikely(test_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags)) && 
- tx->tx_head - tx->tx_tail <= ipoib_sendq_size >> 1) { - clear_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags); + if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) && + netif_queue_stopped(dev) && + test_bit(IPOIB
Re: [openib-general] [PATCH for-2.6.21] IPoIB/cm: improve small message bandwidth
> 1. Is there something special you do when you run the benchmark (msi, > taskset, ...)? Yes, I am using MSI-X, and I pin the interrupt handler to one CPU (CPU#0 in my particular case). Then I use taskset to pin the NPtcp process to a CPU in a different package (CPU#2 in my system). BTW with these same systems, I am getting up to ~1150 MB/sec of throughput with DDR mem-free Arbel, as measured with NPtcp. > 2. On a wild guess that the issue here is higher interrupt rate with CM, >is there a chance you could test the following patch posted by me earlier? >http://www.mail-archive.com/openib-general@openib.org/msg29290.html OK, I'll try that when I get a chance. - R. ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [openib-general] [PATCH for-2.6.21] IPoIB/cm: improve small message bandwidth
> Quoting Roland Dreier <[EMAIL PROTECTED]>: > Subject: Re: [PATCH for-2.6.21] IPoIB/cm: improve small message bandwidth > > Thanks, queued for 2.6.21. With this patch I see small-packet latency > down almost all the way back to what datagram mode gives -- on a pair > of fast woodcrest systems I see latencies for netpipe tcp 1 byte > messages like > > datagram 13.xx > original CM 17.xx > patched CM 14.xx > > so there is still a measurable difference but it is much less now. Hmm. An old system I tried here has a much higher latency, but does not seem to exhibit latency difference between datagram and CM. 1. Is there something special you do when you run the benchmark (msi, taskset, ...)? 2. On a wild guess that the issue here is higher interrupt rate with CM, is there a chance you could test the following patch posted by me earlier? http://www.mail-archive.com/openib-general@openib.org/msg29290.html Thanks, -- MST ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [openib-general] [PATCH for-2.6.21] IPoIB/cm: improve small message bandwidth
> Quoting r. Or Gerlitz <[EMAIL PROTECTED]>: > Subject: Re: [openib-general] [PATCH for-2.6.21] IPoIB/cm: improve small > message bandwidth > > Michael S. Tsirkin wrote: > > Avoid overhead of freeing/reallocating and mapping/unmapping for dma > > for pages that have not been written to by hardware. > > > diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c > > b/drivers/infiniband/ulp/ipoib/ipoib_cm.c > > index 8ee6f06..a23c8e3 100644 > > --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c > > +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c > > @@ -68,14 +68,14 @@ struct ipoib_cm_id { > > static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, > >struct ib_cm_event *event); > > > > -static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, > > +static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags, > > u64 mapping[IPOIB_CM_RX_SG]) > > { > > int i; > > > > ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, > > DMA_FROM_DEVICE); > > > > - for (i = 0; i < IPOIB_CM_RX_SG - 1; ++i) > > + for (i = 0; i < frags; ++i) > > ib_dma_unmap_single(priv->ca, mapping[i + 1], PAGE_SIZE, > > DMA_FROM_DEVICE); > > } > > I understand that in ipoib_cm_alloc_rx_skb you call dma_map_page on > IPOIB_CM_RX_SG pages where here you call dma_unmap_single only $frags > times, correct? No. > does this means you are trashing the IOMMU etc etc of > the system? I don't think so. -- MST ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [openib-general] [PATCH for-2.6.21] IPoIB/cm: improve small message bandwidth
Michael S. Tsirkin wrote: > Avoid overhead of freeing/reallocating and mapping/unmapping for dma > for pages that have not been written to by hardware. > diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c > b/drivers/infiniband/ulp/ipoib/ipoib_cm.c > index 8ee6f06..a23c8e3 100644 > --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c > +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c > @@ -68,14 +68,14 @@ struct ipoib_cm_id { > static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, > struct ib_cm_event *event); > > -static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, > +static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags, > u64 mapping[IPOIB_CM_RX_SG]) > { > int i; > > ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, > DMA_FROM_DEVICE); > > - for (i = 0; i < IPOIB_CM_RX_SG - 1; ++i) > + for (i = 0; i < frags; ++i) > ib_dma_unmap_single(priv->ca, mapping[i + 1], PAGE_SIZE, > DMA_FROM_DEVICE); > } I understand that in ipoib_cm_alloc_rx_skb you call dma_map_page on IPOIB_CM_RX_SG pages where here you call dma_unmap_single only $frags times, correct? does this means you are trashing the IOMMU etc etc of the system? Or. ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [openib-general] [PATCH for-2.6.21] IPoIB/cm: improve small message bandwidth
Thanks, queued for 2.6.21. With this patch I see small-packet latency down almost all the way back to what datagram mode gives -- on a pair of fast woodcrest systems I see latencies for netpipe tcp 1 byte messages like datagram 13.xx original CM 17.xx patched CM 14.xx so there is still a measurable difference but it is much less now. - R. ___ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[openib-general] [PATCH for-2.6.21] IPoIB/cm: improve small message bandwidth
Avoid overhead of freeing/reallocating and mapping/unmapping for dma for pages that have not been written to by hardware. Signed-off-by: Michael S. Tsirkin <[EMAIL PROTECTED]> --- This gives >10% boost in BW for message sizes up to 32K. Please queue for 2.6.21. before: # ./netperf-2.4.2/src/netperf -f M -H 11.4.3.68 -c -C -- -m 32000 TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 11.4.3.68 (11.4.3.68) port 0 AF_INET : demo Recv SendSend Utilization Service Demand Socket Socket Message Elapsed Send Recv SendRecv Size SizeSize Time Throughput localremote local remote bytes bytes bytessecs.MBytes /s % S % S us/KB us/KB 87380 16384 3200010.00 716.23 26.2223.941.430 1.306 after: # ./netperf-2.4.2/src/netperf -f M -H 11.4.3.68 -c -C -- -m 32000 TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 11.4.3.68 (11.4.3.68) port 0 AF_INET : demo Recv SendSend Utilization Service Demand Socket Socket Message Elapsed Send Recv SendRecv Size SizeSize Time Throughput localremote local remote bytes bytes bytessecs.MBytes /s % S % S us/KB us/KB 87380 16384 3200010.00 888.67 24.1325.081.061 1.102 diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 8ee6f06..a23c8e3 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -68,14 +68,14 @@ struct ipoib_cm_id { static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event); -static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, +static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags, u64 mapping[IPOIB_CM_RX_SG]) { int i; ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE); - for (i = 0; i < IPOIB_CM_RX_SG - 1; ++i) + for (i = 0; i < frags; ++i) ib_dma_unmap_single(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE); } @@ -93,7 +93,8 @@ static int ipoib_cm_post_receive(struct net_device *dev, int id) ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, 
&bad_wr); if (unlikely(ret)) { ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret); - ipoib_cm_dma_unmap_rx(priv, priv->cm.srq_ring[id].mapping); + ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1, + priv->cm.srq_ring[id].mapping); dev_kfree_skb_any(priv->cm.srq_ring[id].skb); priv->cm.srq_ring[id].skb = NULL; } @@ -101,8 +102,8 @@ static int ipoib_cm_post_receive(struct net_device *dev, int id) return ret; } -static int ipoib_cm_alloc_rx_skb(struct net_device *dev, int id, -u64 mapping[IPOIB_CM_RX_SG]) +static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, int id, int frags, +u64 mapping[IPOIB_CM_RX_SG]) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct sk_buff *skb; @@ -110,7 +111,7 @@ static int ipoib_cm_alloc_rx_skb(struct net_device *dev, int id, skb = dev_alloc_skb(IPOIB_CM_HEAD_SIZE + 12); if (unlikely(!skb)) - return -ENOMEM; + return NULL; /* * IPoIB adds a 4 byte header. So we need 12 more bytes to align the @@ -122,10 +123,10 @@ static int ipoib_cm_alloc_rx_skb(struct net_device *dev, int id, DMA_FROM_DEVICE); if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) { dev_kfree_skb_any(skb); - return -EIO; + return NULL; } - for (i = 0; i < IPOIB_CM_RX_SG - 1; i++) { + for (i = 0; i < frags; i++) { struct page *page = alloc_page(GFP_ATOMIC); if (!page) @@ -139,7 +140,7 @@ static int ipoib_cm_alloc_rx_skb(struct net_device *dev, int id, } priv->cm.srq_ring[id].skb = skb; - return 0; + return skb; partial_error: @@ -148,8 +149,8 @@ partial_error: for (; i >= 0; --i) ib_dma_unmap_single(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE); - dev_kfree_skb_any(skb); - return -ENOMEM; + dev_kfree_skb_any(skb); + return NULL; } static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev, @@ -312,7 +313,7 @@ static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id, } /* Adjust length of skb with fragments to match received data */ static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space, - unsigned int length) +