[ath9k-devel] [PATCH] ath9k: Implement rx copy-break.

2011-01-08 Thread greearb
From: Ben Greear 

This saves us constantly allocating large, multi-page
skbs.  It should fix the order-1 allocation errors reported,
and in a 60-vif scenario, this significantly decreases CPU
utilization, and latency, and increases bandwidth.

Signed-off-by: Ben Greear 
---
:100644 100644 b2497b8... ea2f67c... M  drivers/net/wireless/ath/ath9k/recv.c
 drivers/net/wireless/ath/ath9k/recv.c |   92 ++---
 1 files changed, 61 insertions(+), 31 deletions(-)

diff --git a/drivers/net/wireless/ath/ath9k/recv.c 
b/drivers/net/wireless/ath/ath9k/recv.c
index b2497b8..ea2f67c 100644
--- a/drivers/net/wireless/ath/ath9k/recv.c
+++ b/drivers/net/wireless/ath/ath9k/recv.c
@@ -16,6 +16,7 @@
 
 #include "ath9k.h"
 #include "ar9003_mac.h"
+#include 
 
 #define SKB_CB_ATHBUF(__skb)   (*((struct ath_buf **)__skb->cb))
 
@@ -1623,7 +1624,7 @@ div_comb_done:
 int ath_rx_tasklet(struct ath_softc *sc, int flush, bool hp)
 {
struct ath_buf *bf;
-   struct sk_buff *skb = NULL, *requeue_skb;
+   struct sk_buff *skb = NULL, *requeue_skb = NULL;
struct ieee80211_rx_status *rxs;
struct ath_hw *ah = sc->sc_ah;
struct ath_common *common = ath9k_hw_common(ah);
@@ -1634,7 +1635,8 @@ int ath_rx_tasklet(struct ath_softc *sc, int flush, bool 
hp)
 */
struct ieee80211_hw *hw = NULL;
struct ieee80211_hdr *hdr;
-   int retval;
+   int retval, len;
+   bool use_copybreak = true;
bool decrypt_error = false;
struct ath_rx_status rs;
enum ath9k_rx_qtype qtype;
@@ -1702,42 +1704,70 @@ int ath_rx_tasklet(struct ath_softc *sc, int flush, 
bool hp)
unlikely(tsf_lower - rs.rs_tstamp > 0x1000))
rxs->mactime += 0x1ULL;
 
-   /* Ensure we always have an skb to requeue once we are done
-* processing the current buffer's skb */
-   requeue_skb = ath_rxbuf_alloc(common, common->rx_bufsize, 
GFP_ATOMIC);
-
-   /* If there is no memory we ignore the current RX'd frame,
-* tell hardware it can give us a new frame using the old
-* skb and put it at the tail of the sc->rx.rxbuf list for
-* processing. */
-   if (!requeue_skb)
-   goto requeue;
-
-   /* Unmap the frame */
-   dma_unmap_single(sc->dev, bf->bf_buf_addr,
-common->rx_bufsize,
-dma_type);
+   len = rs.rs_datalen + ah->caps.rx_status_len;
+   if (use_copybreak) {
+   skb = netdev_alloc_skb(NULL, len);
+   if (!skb) {
+   skb = bf->bf_mpdu;
+   use_copybreak = false;
+   goto non_copybreak;
+   }
+   } else {
+non_copybreak:
+   /* Ensure we always have an skb to requeue once we are
+* done processing the current buffer's skb */
+   requeue_skb = ath_rxbuf_alloc(common,
+ common->rx_bufsize,
+ GFP_ATOMIC);
+
+   /* If there is no memory we ignore the current RX'd
+* frame, tell hardware it can give us a new frame
+* using the old skb and put it at the tail of the
+* sc->rx.rxbuf list for processing. */
+   if (!requeue_skb)
+   goto requeue;
+
+   /* Unmap the frame */
+   dma_unmap_single(sc->dev, bf->bf_buf_addr,
+common->rx_bufsize,
+dma_type);
+   }
 
-   skb_put(skb, rs.rs_datalen + ah->caps.rx_status_len);
+   skb_put(skb, len);
if (ah->caps.rx_status_len)
skb_pull(skb, ah->caps.rx_status_len);
 
+   if (use_copybreak) {
+   struct pci_dev *pdev = to_pci_dev(sc->dev);
+   pci_dma_sync_single_for_cpu(pdev, bf->bf_buf_addr,
+   len, PCI_DMA_FROMDEVICE);
+   skb_copy_from_linear_data(bf->bf_mpdu, skb->data, len);
+   pci_dma_sync_single_for_device(pdev, bf->bf_buf_addr,
+  len, PCI_DMA_FROMDEVICE);
+   memcpy(skb->cb, bf->bf_mpdu->cb, sizeof(skb->cb));
+   rxs =  IEEE80211_SKB_RXCB(skb);
+   }
+
ath9k_rx_skb_postprocess(common, skb, &rs,
 rxs, decrypt_error);
 
-   /* We will now give hardware our shiny new allocated skb */
-  

Re: [ath9k-devel] [PATCH] ath9k: Implement rx copy-break.

2011-01-08 Thread Felix Fietkau
On 2011-01-08 8:33 AM, gree...@candelatech.com wrote:
> From: Ben Greear
>
> This saves us constantly allocating large, multi-page
> skbs.  It should fix the order-1 allocation errors reported,
> and in a 60-vif scenario, this significantly decreases CPU
> utilization, and latency, and increases bandwidth.
>
> Signed-off-by: Ben Greear
> ---
> :100644 100644 b2497b8... ea2f67c... M
> drivers/net/wireless/ath/ath9k/recv.c
>   drivers/net/wireless/ath/ath9k/recv.c |   92 
> ++---
>   1 files changed, 61 insertions(+), 31 deletions(-)
>
> diff --git a/drivers/net/wireless/ath/ath9k/recv.c 
> b/drivers/net/wireless/ath/ath9k/recv.c
> index b2497b8..ea2f67c 100644
> --- a/drivers/net/wireless/ath/ath9k/recv.c
> +++ b/drivers/net/wireless/ath/ath9k/recv.c
> @@ -1702,42 +1704,70 @@ int ath_rx_tasklet(struct ath_softc *sc, int flush, 
> bool hp)
>   unlikely(tsf_lower - rs.rs_tstamp>  0x1000))
>   rxs->mactime += 0x1ULL;
>
> - /* Ensure we always have an skb to requeue once we are done
> -  * processing the current buffer's skb */
> - requeue_skb = ath_rxbuf_alloc(common, common->rx_bufsize, 
> GFP_ATOMIC);
> -
> - /* If there is no memory we ignore the current RX'd frame,
> -  * tell hardware it can give us a new frame using the old
> -  * skb and put it at the tail of the sc->rx.rxbuf list for
> -  * processing. */
> - if (!requeue_skb)
> - goto requeue;
> -
> - /* Unmap the frame */
> - dma_unmap_single(sc->dev, bf->bf_buf_addr,
> -  common->rx_bufsize,
> -  dma_type);
> + len = rs.rs_datalen + ah->caps.rx_status_len;
> + if (use_copybreak) {
> + skb = netdev_alloc_skb(NULL, len);
> + if (!skb) {
> + skb = bf->bf_mpdu;
> + use_copybreak = false;
> + goto non_copybreak;
> + }
> + } else {
I think this should be dependent on packet size, maybe even based on the 
architecture. Especially on embedded hardware, copying large frames is 
probably quite a bit more expensive than allocating large buffers. Cache 
sizes are small, memory access takes several cycles, especially during 
concurrent DMA.
Once I'm back home, I could try a few packet size threshold to find a 
sweet spot for the typical MIPS hardware that I'm playing with. I expect 
a visible performance regression from this patch when applied as-is.

- Felix
___
ath9k-devel mailing list
ath9k-devel@lists.ath9k.org
https://lists.ath9k.org/mailman/listinfo/ath9k-devel


Re: [ath9k-devel] [PATCH] ath9k: Implement rx copy-break.

2011-01-08 Thread Ben Greear
On 01/08/2011 04:20 PM, Felix Fietkau wrote:
> On 2011-01-08 8:33 AM, gree...@candelatech.com wrote:
>> From: Ben Greear
>>
>> This saves us constantly allocating large, multi-page
>> skbs. It should fix the order-1 allocation errors reported,
>> and in a 60-vif scenario, this significantly decreases CPU
>> utilization, and latency, and increases bandwidth.
>>
>> Signed-off-by: Ben Greear
>> ---
>> :100644 100644 b2497b8... ea2f67c... M drivers/net/wireless/ath/ath9k/recv.c
>> drivers/net/wireless/ath/ath9k/recv.c | 92 ++---
>> 1 files changed, 61 insertions(+), 31 deletions(-)
>>
>> diff --git a/drivers/net/wireless/ath/ath9k/recv.c 
>> b/drivers/net/wireless/ath/ath9k/recv.c
>> index b2497b8..ea2f67c 100644
>> --- a/drivers/net/wireless/ath/ath9k/recv.c
>> +++ b/drivers/net/wireless/ath/ath9k/recv.c
>> @@ -1702,42 +1704,70 @@ int ath_rx_tasklet(struct ath_softc *sc, int flush, 
>> bool hp)
>> unlikely(tsf_lower - rs.rs_tstamp> 0x1000))
>> rxs->mactime += 0x1ULL;
>>
>> - /* Ensure we always have an skb to requeue once we are done
>> - * processing the current buffer's skb */
>> - requeue_skb = ath_rxbuf_alloc(common, common->rx_bufsize, GFP_ATOMIC);
>> -
>> - /* If there is no memory we ignore the current RX'd frame,
>> - * tell hardware it can give us a new frame using the old
>> - * skb and put it at the tail of the sc->rx.rxbuf list for
>> - * processing. */
>> - if (!requeue_skb)
>> - goto requeue;
>> -
>> - /* Unmap the frame */
>> - dma_unmap_single(sc->dev, bf->bf_buf_addr,
>> - common->rx_bufsize,
>> - dma_type);
>> + len = rs.rs_datalen + ah->caps.rx_status_len;
>> + if (use_copybreak) {
>> + skb = netdev_alloc_skb(NULL, len);
>> + if (!skb) {
>> + skb = bf->bf_mpdu;
>> + use_copybreak = false;
>> + goto non_copybreak;
>> + }
>> + } else {
> I think this should be dependent on packet size, maybe even based on the 
> architecture. Especially on embedded hardware, copying large frames is 
> probably quite a
> bit more expensive than allocating large buffers. Cache sizes are small, 
> memory access takes several cycles, especially during concurrent DMA.
> Once I'm back home, I could try a few packet size threshold to find a sweet 
> spot for the typical MIPS hardware that I'm playing with. I expect a visible
> performance regression from this patch when applied as-is.

I see a serious performance improvement with this patch.  My current test is 
sending 1024 byte UDP
payloads to/from each of 60 stations at 128kbps.  Please do try it out on your 
system and see how
it performs there.  I'm guessing that any time you have more than 1 VIF this 
will be a good
improvement since mac80211 does skb_copy (and you would typically be copying a 
much smaller
packet with this patch).

If we do see performance differences on different platforms, this could perhaps 
be
something we could tune at run-time.

Thanks,
Ben

>
> - Felix


-- 
Ben Greear 
Candela Technologies Inc  http://www.candelatech.com
___
ath9k-devel mailing list
ath9k-devel@lists.ath9k.org
https://lists.ath9k.org/mailman/listinfo/ath9k-devel


Re: [ath9k-devel] [PATCH] ath9k: Implement rx copy-break.

2011-01-08 Thread Felix Fietkau
On 2011-01-08 5:36 PM, Ben Greear wrote:
> On 01/08/2011 04:20 PM, Felix Fietkau wrote:
>>  I think this should be dependent on packet size, maybe even based on the 
>> architecture. Especially on embedded hardware, copying large frames is 
>> probably quite a
>>  bit more expensive than allocating large buffers. Cache sizes are small, 
>> memory access takes several cycles, especially during concurrent DMA.
>>  Once I'm back home, I could try a few packet size threshold to find a sweet 
>> spot for the typical MIPS hardware that I'm playing with. I expect a visible
>>  performance regression from this patch when applied as-is.
>
> I see a serious performance improvement with this patch.  My current test is 
> sending 1024 byte UDP
> payloads to/from each of 60 stations at 128kbps.  Please do try it out on 
> your system and see how
> it performs there.  I'm guessing that any time you have more than 1 VIF this 
> will be a good
> improvement since mac80211 does skb_copy (and you would typically be copying 
> a much smaller
> packet with this patch).
>
> If we do see performance differences on different platforms, this could 
> perhaps be
> something we could tune at run-time.
What kind of system are you testing on? If it's a PC, then the 
performance characteristics will be completely different compared to 
embedded hardware. I've had to remove a few copybreak-like 
implementations from various ethernet drivers on similar hardware, 
because even taking the hit of unaligned load/store exceptions (which 
are already *very* expensive on MIPS) was less than copying the full 
packet data, even with packet sizes less than what you're using.

I don't have suitable test hardware with me right now, but I'll do some 
tests in a week or so.

- Felix
___
ath9k-devel mailing list
ath9k-devel@lists.ath9k.org
https://lists.ath9k.org/mailman/listinfo/ath9k-devel


Re: [ath9k-devel] [PATCH] ath9k: Implement rx copy-break.

2011-01-08 Thread Ben Greear
On 01/08/2011 04:41 PM, Felix Fietkau wrote:
> On 2011-01-08 5:36 PM, Ben Greear wrote:
>> On 01/08/2011 04:20 PM, Felix Fietkau wrote:
>>> I think this should be dependent on packet size, maybe even based on the 
>>> architecture. Especially on embedded hardware, copying large frames is 
>>> probably quite a
>>> bit more expensive than allocating large buffers. Cache sizes are small, 
>>> memory access takes several cycles, especially during concurrent DMA.
>>> Once I'm back home, I could try a few packet size threshold to find a sweet 
>>> spot for the typical MIPS hardware that I'm playing with. I expect a visible
>>> performance regression from this patch when applied as-is.
>>
>> I see a serious performance improvement with this patch. My current test is 
>> sending 1024 byte UDP
>> payloads to/from each of 60 stations at 128kbps. Please do try it out on 
>> your system and see how
>> it performs there. I'm guessing that any time you have more than 1 VIF this 
>> will be a good
>> improvement since mac80211 does skb_copy (and you would typically be copying 
>> a much smaller
>> packet with this patch).
>>
>> If we do see performance differences on different platforms, this could 
>> perhaps be
>> something we could tune at run-time.
> What kind of system are you testing on? If it's a PC, then the performance 
> characteristics will be completely different compared to embedded hardware. 
> I've had
> to remove a few copybreak-like implementations from various ethernet drivers 
> on similar hardware, because even taking the hit of unaligned load/store 
> exceptions
> (which are already *very* expensive on MIPS) was less than copying the full 
> packet data, even with packet sizes less than what you're using.
>
> I don't have suitable test hardware with me right now, but I'll do some tests 
> in a week or so.

I'm on a dual-core Atom processor.  I'm interested in your MIPs results when 
you get them...

Thanks,
Ben

>
> - Felix


-- 
Ben Greear 
Candela Technologies Inc  http://www.candelatech.com
___
ath9k-devel mailing list
ath9k-devel@lists.ath9k.org
https://lists.ath9k.org/mailman/listinfo/ath9k-devel


Re: [ath9k-devel] [PATCH] ath9k: Implement rx copy-break.

2011-01-09 Thread Gabor Juhos
Hi Ben,

> From: Ben Greear 
> 
> This saves us constantly allocating large, multi-page
> skbs.  It should fix the order-1 allocation errors reported,
> and in a 60-vif scenario, this significantly decreases CPU
> utilization, and latency, and increases bandwidth.
> 
> Signed-off-by: Ben Greear 
> ---
> :100644 100644 b2497b8... ea2f67c... M
> drivers/net/wireless/ath/ath9k/recv.c
>  drivers/net/wireless/ath/ath9k/recv.c |   92 
> ++---
>  1 files changed, 61 insertions(+), 31 deletions(-)

<...>

> + if (use_copybreak) {
> + struct pci_dev *pdev = to_pci_dev(sc->dev);

This would cause undefined behaviour with ath9k devices sitting on an AHB bus.

> + pci_dma_sync_single_for_cpu(pdev, bf->bf_buf_addr,
> + len, PCI_DMA_FROMDEVICE);
> + skb_copy_from_linear_data(bf->bf_mpdu, skb->data, len);
> + pci_dma_sync_single_for_device(pdev, bf->bf_buf_addr,
> +len, PCI_DMA_FROMDEVICE);

Please use the bus agnostic equivalents of these DMA functions.

- Gabor
___
ath9k-devel mailing list
ath9k-devel@lists.ath9k.org
https://lists.ath9k.org/mailman/listinfo/ath9k-devel


Re: [ath9k-devel] [PATCH] ath9k: Implement rx copy-break.

2011-01-09 Thread Björn Smedman
On Sun, Jan 9, 2011 at 2:06 AM, Ben Greear  wrote:
> On 01/08/2011 04:41 PM, Felix Fietkau wrote:
>>
>> On 2011-01-08 5:36 PM, Ben Greear wrote:
>>>
>>> On 01/08/2011 04:20 PM, Felix Fietkau wrote:

 I think this should be dependent on packet size, maybe even based on the
 architecture. Especially on embedded hardware, copying large frames is
 probably quite a
 bit more expensive than allocating large buffers. Cache sizes are small,
 memory access takes several cycles, especially during concurrent DMA.
 Once I'm back home, I could try a few packet size threshold to find a
 sweet spot for the typical MIPS hardware that I'm playing with. I expect a
 visible
 performance regression from this patch when applied as-is.
>>>
>>> I see a serious performance improvement with this patch. My current test
>>> is sending 1024 byte UDP
>>> payloads to/from each of 60 stations at 128kbps. Please do try it out on
>>> your system and see how
>>> it performs there. I'm guessing that any time you have more than 1 VIF
>>> this will be a good
>>> improvement since mac80211 does skb_copy (and you would typically be
>>> copying a much smaller
>>> packet with this patch).
>>>
>>> If we do see performance differences on different platforms, this could
>>> perhaps be
>>> something we could tune at run-time.
>>
>> What kind of system are you testing on? If it's a PC, then the performance
>> characteristics will be completely different compared to embedded hardware.
>> I've had
>> to remove a few copybreak-like implementations from various ethernet
>> drivers on similar hardware, because even taking the hit of unaligned
>> load/store exceptions
>> (which are already *very* expensive on MIPS) was less than copying the
>> full packet data, even with packet sizes less than what you're using.
>>
>> I don't have suitable test hardware with me right now, but I'll do some
>> tests in a week or so.
>
> I'm on a dual-core Atom processor.  I'm interested in your MIPs results when
> you get them...

I think we should also consider the added stability/saneness with this
patch. I for one would be willing to live with some extra cpu load if
that means the unstoppable rx dma problem can be contained (all the
time).

Perhaps make it a configuration option?

/Björn
___
ath9k-devel mailing list
ath9k-devel@lists.ath9k.org
https://lists.ath9k.org/mailman/listinfo/ath9k-devel


Re: [ath9k-devel] [PATCH] ath9k: Implement rx copy-break.

2011-01-09 Thread Felix Fietkau
On 2011-01-09 7:15 AM, Björn Smedman wrote:
> I think we should also consider the added stability/saneness with this
> patch. I for one would be willing to live with some extra cpu load if
> that means the unstoppable rx dma problem can be contained (all the
> time).
I don't think this patch has anything to do with the rx dma stop issue.

- Felix
___
ath9k-devel mailing list
ath9k-devel@lists.ath9k.org
https://lists.ath9k.org/mailman/listinfo/ath9k-devel


Re: [ath9k-devel] [PATCH] ath9k: Implement rx copy-break.

2011-01-09 Thread Björn Smedman
2011/1/9 Felix Fietkau :
> On 2011-01-09 7:15 AM, Björn Smedman wrote:
>>
>> I think we should also consider the added stability/saneness with this
>> patch. I for one would be willing to live with some extra cpu load if
>> that means the unstoppable rx dma problem can be contained (all the
>> time).
>
> I don't think this patch has anything to do with the rx dma stop issue.

Ok, so if you have a machine gun that can go off at any time and is
impossible to stop you don't think it makes a difference where it's
pointed? ;) Just kidding, and call me superstitious, but I would feel
much better knowing ath9k is pointing that half-broken dma engine at
it's own data only, at least until it's 100% stable.

/Björn
___
ath9k-devel mailing list
ath9k-devel@lists.ath9k.org
https://lists.ath9k.org/mailman/listinfo/ath9k-devel


Re: [ath9k-devel] [PATCH] ath9k: Implement rx copy-break.

2011-01-09 Thread Ben Greear
On 01/09/2011 12:00 AM, Gabor Juhos wrote:
> Hi Ben,
>
>> From: Ben Greear
>>
>> This saves us constantly allocating large, multi-page
>> skbs.  It should fix the order-1 allocation errors reported,
>> and in a 60-vif scenario, this significantly decreases CPU
>> utilization, and latency, and increases bandwidth.
>>
>> Signed-off-by: Ben Greear
>> ---
>> :100644 100644 b2497b8... ea2f67c... M   
>> drivers/net/wireless/ath/ath9k/recv.c
>>   drivers/net/wireless/ath/ath9k/recv.c |   92 
>> ++---
>>   1 files changed, 61 insertions(+), 31 deletions(-)
>
> <...>
>
>> +if (use_copybreak) {
>> +struct pci_dev *pdev = to_pci_dev(sc->dev);
>
> This would cause undefined behaviour with ath9k devices sitting on an AHB bus.
>
>> +pci_dma_sync_single_for_cpu(pdev, bf->bf_buf_addr,
>> +len, PCI_DMA_FROMDEVICE);
>> +skb_copy_from_linear_data(bf->bf_mpdu, skb->data, len);
>> +pci_dma_sync_single_for_device(pdev, bf->bf_buf_addr,
>> +   len, PCI_DMA_FROMDEVICE);
>
> Please use the bus agnostic equivalents of these DMA functions.

Any idea what that might be?

Should we just disable copybreak for things on AHB bus?

Thanks,
Ben

-- 
Ben Greear 
Candela Technologies Inc  http://www.candelatech.com
___
ath9k-devel mailing list
ath9k-devel@lists.ath9k.org
https://lists.ath9k.org/mailman/listinfo/ath9k-devel


Re: [ath9k-devel] [PATCH] ath9k: Implement rx copy-break.

2011-01-09 Thread Jouni Malinen
On Sat, Jan 08, 2011 at 04:36:23PM -0800, Ben Greear wrote:
> On 01/08/2011 04:20 PM, Felix Fietkau wrote:
> >On 2011-01-08 8:33 AM, gree...@candelatech.com wrote:
> >>From: Ben Greear
> >>This saves us constantly allocating large, multi-page
> >>skbs. It should fix the order-1 allocation errors reported,
> >>and in a 60-vif scenario, this significantly decreases CPU
> >>utilization, and latency, and increases bandwidth.

As far as CPU use is concerned, 60 VIF scenario should not be the one to
use for checking what is most efficient.. This really needs to be tested
on something that uses a single VIF on an embedded (low-power CPU)..

For the order-1 allocation issues, it would be interesting to see if
someone could take a look at using paged skbs or multiple RX descriptors
with shorter skbs (and copying only for the case where a long frame is
received so that only the A-MSDU RX case would suffer from extra
copying).

> I see a serious performance improvement with this patch.  My current test is 
> sending 1024 byte UDP
> payloads to/from each of 60 stations at 128kbps.  Please do try it out on 
> your system and see how
> it performs there.  I'm guessing that any time you have more than 1 VIF this 
> will be a good
> improvement since mac80211 does skb_copy (and you would typically be copying 
> a much smaller
> packet with this patch).

How would this patch change the number of bytes copied by skb_copy?

> If we do see performance differences on different platforms, this could 
> perhaps be
> something we could tune at run-time.

I guess that could be looked at, but as long as that is not the case,
the test setup you used is not exactly the most common case for ath9k in
the upstream kernel and should not be used to figure out default
behavior.
 
-- 
Jouni MalinenPGP id EFC895FA
___
ath9k-devel mailing list
ath9k-devel@lists.ath9k.org
https://lists.ath9k.org/mailman/listinfo/ath9k-devel


Re: [ath9k-devel] [PATCH] ath9k: Implement rx copy-break.

2011-01-09 Thread Christian Lamparter
On Sunday 09 January 2011 19:13:04 Jouni Malinen wrote:
> On Sat, Jan 08, 2011 at 04:36:23PM -0800, Ben Greear wrote:
> > On 01/08/2011 04:20 PM, Felix Fietkau wrote:
> > >On 2011-01-08 8:33 AM, gree...@candelatech.com wrote:
> > >>From: Ben Greear
> > >>This saves us constantly allocating large, multi-page
> > >>skbs. It should fix the order-1 allocation errors reported,
> > >>and in a 60-vif scenario, this significantly decreases CPU
> > >>utilization, and latency, and increases bandwidth.
> 
> As far as CPU use is concerned, 60 VIF scenario should not be the one to
> use for checking what is most efficient.. This really needs to be tested
> on something that uses a single VIF on an embedded (low-power CPU)..
> 
> For the order-1 allocation issues, it would be interesting to see if
> someone could take a look at using paged skbs or multiple RX descriptors
> with shorter skbs (and copying only for the case where a long frame is
> received so that only the A-MSDU RX case would suffer from extra
> copying).

Well, here's a shot at paged rx. It'll only work when PAGE_SIZE > buf_size
(which will be true for most system, I guess)

No idea how to handle EDMA... In fact I don't have any ath9k* solution
at the moment to test ;), so you better backup your data! 
---
diff --git a/drivers/net/wireless/ath/ath9k/ath9k.h 
b/drivers/net/wireless/ath/ath9k/ath9k.h
index 3681caf5..b113f44 100644
--- a/drivers/net/wireless/ath/ath9k/ath9k.h
+++ b/drivers/net/wireless/ath/ath9k/ath9k.h
@@ -227,6 +227,7 @@ struct ath_buf {
   an aggregate) */
struct ath_buf *bf_next;/* next subframe in the aggregate */
struct sk_buff *bf_mpdu;/* enclosing frame structure */
+   struct page *page;
void *bf_desc;  /* virtual addr of desc */
dma_addr_t bf_daddr;/* physical addr of desc */
dma_addr_t bf_buf_addr; /* physical addr of data buffer, for DMA */
diff --git a/drivers/net/wireless/ath/ath9k/recv.c 
b/drivers/net/wireless/ath/ath9k/recv.c
index b2497b8..acdf6ae 100644
--- a/drivers/net/wireless/ath/ath9k/recv.c
+++ b/drivers/net/wireless/ath/ath9k/recv.c
@@ -313,10 +313,13 @@ static void ath_edma_stop_recv(struct ath_softc *sc)
 int ath_rx_init(struct ath_softc *sc, int nbufs)
 {
struct ath_common *common = ath9k_hw_common(sc->sc_ah);
-   struct sk_buff *skb;
+   struct page *page;
struct ath_buf *bf;
int error = 0;
 
+   if (WARN_ON(common->rx_bufsize > PAGE_SIZE))
+   return -EIO;
+
spin_lock_init(&sc->sc_pcu_lock);
sc->sc_flags &= ~SC_OP_RXFLUSH;
spin_lock_init(&sc->rx.rxbuflock);
@@ -342,27 +345,26 @@ int ath_rx_init(struct ath_softc *sc, int nbufs)
}
 
list_for_each_entry(bf, &sc->rx.rxbuf, list) {
-   skb = ath_rxbuf_alloc(common, common->rx_bufsize,
- GFP_KERNEL);
-   if (skb == NULL) {
+   page = alloc_pages(GFP_KERNEL | __GFP_ZERO, 0);
+   if (page == NULL) {
error = -ENOMEM;
goto err;
}
 
-   bf->bf_mpdu = skb;
-   bf->bf_buf_addr = dma_map_single(sc->dev, skb->data,
-   common->rx_bufsize,
-   DMA_FROM_DEVICE);
+   bf->bf_mpdu = NULL;
+   bf->bf_buf_addr = dma_map_page(sc->dev, page, 0,
+  PAGE_SIZE,
+  DMA_FROM_DEVICE);
if (unlikely(dma_mapping_error(sc->dev,
bf->bf_buf_addr))) {
-   dev_kfree_skb_any(skb);
-   bf->bf_mpdu = NULL;
+   __free_pages(page, 0);
bf->bf_buf_addr = 0;
ath_err(common,
"dma_mapping_error() on RX init\n");
error = -ENOMEM;
goto err;
}
+   bf->page = page;
}
sc->rx.rxlink = NULL;
}
@@ -378,7 +380,7 @@ void ath_rx_cleanup(struct ath_softc *sc)
 {
struct ath_hw *ah = sc->sc_ah;
struct ath_common *common = ath9k_hw_common(ah);
-   struct sk_buff *skb;
+   struct page *page;
struct ath_buf *bf;
 
if (sc->sc_ah->caps.hw_caps & ATH9K_HW_CAP_EDMA) {
@@ -386,14 +388,15 @@ void ath_rx_cleanup(struct ath_softc *sc)
return;
} else {
list_for_each_entry(bf, &sc->rx.rxbuf, list) {
-   skb = bf->bf_mpdu;
-  

Re: [ath9k-devel] [PATCH] ath9k: Implement rx copy-break.

2011-01-09 Thread Felix Fietkau
On 2011-01-09 1:14 PM, Christian Lamparter wrote:
> On Sunday 09 January 2011 19:13:04 Jouni Malinen wrote:
>>  On Sat, Jan 08, 2011 at 04:36:23PM -0800, Ben Greear wrote:
>>  >  On 01/08/2011 04:20 PM, Felix Fietkau wrote:
>>  >  >On 2011-01-08 8:33 AM, gree...@candelatech.com wrote:
>>  >  >>From: Ben Greear
>>  >  >>This saves us constantly allocating large, multi-page
>>  >  >>skbs. It should fix the order-1 allocation errors reported,
>>  >  >>and in a 60-vif scenario, this significantly decreases CPU
>>  >  >>utilization, and latency, and increases bandwidth.
>>
>>  As far as CPU use is concerned, 60 VIF scenario should not be the one to
>>  use for checking what is most efficient.. This really needs to be tested
>>  on something that uses a single VIF on an embedded (low-power CPU)..
>>
>>  For the order-1 allocation issues, it would be interesting to see if
>>  someone could take a look at using paged skbs or multiple RX descriptors
>>  with shorter skbs (and copying only for the case where a long frame is
>>  received so that only the A-MSDU RX case would suffer from extra
>>  copying).
>
> Well, here's a shot at paged rx. It'll only work when PAGE_SIZE>  buf_size
> (which will be true for most system, I guess)
>
> No idea how to handle EDMA... In fact I don't have any ath9k* solution
> at the moment to test ;), so you better backup your data!
I think paged rx would be quite problematic for performance on embedded 
hardware that way. If I read the networking stack correctly, this would 
trigger skb_linearize() for network drivers/devices that cannot do 
scatter/gather IO when packets are forwarded between interfaces (the 
most common use case for OpenWrt).
It might be possible to avoid this by changing the network stack to 
bypass the linearize if there is only data in the page attached to the 
skb, but I don't know how easy that is to add, nor what corner cases I'd 
need to take care of.

- Felix
___
ath9k-devel mailing list
ath9k-devel@lists.ath9k.org
https://lists.ath9k.org/mailman/listinfo/ath9k-devel


Re: [ath9k-devel] [PATCH] ath9k: Implement rx copy-break.

2011-01-09 Thread Ben Greear
On 01/09/2011 10:13 AM, Jouni Malinen wrote:
> On Sat, Jan 08, 2011 at 04:36:23PM -0800, Ben Greear wrote:
>> On 01/08/2011 04:20 PM, Felix Fietkau wrote:
>>> On 2011-01-08 8:33 AM, gree...@candelatech.com wrote:
 From: Ben Greear
 This saves us constantly allocating large, multi-page
 skbs. It should fix the order-1 allocation errors reported,
 and in a 60-vif scenario, this significantly decreases CPU
 utilization, and latency, and increases bandwidth.
>
> As far as CPU use is concerned, 60 VIF scenario should not be the one to
> use for checking what is most efficient.. This really needs to be tested
> on something that uses a single VIF on an embedded (low-power CPU)..
>
> For the order-1 allocation issues, it would be interesting to see if
> someone could take a look at using paged skbs or multiple RX descriptors
> with shorter skbs (and copying only for the case where a long frame is
> received so that only the A-MSDU RX case would suffer from extra
> copying).

>
>> I see a serious performance improvement with this patch.  My current test is 
>> sending 1024 byte UDP
>> payloads to/from each of 60 stations at 128kbps.  Please do try it out on 
>> your system and see how
>> it performs there.  I'm guessing that any time you have more than 1 VIF this 
>> will be a good
>> improvement since mac80211 does skb_copy (and you would typically be copying 
>> a much smaller
>> packet with this patch).
>
> How would this patch change the number of bytes copied by skb_copy?

It seems that if you allocate a 2-page SKB, as upstream ath9k does, pass that
up the stack, then if/when anything calls 'skb_copy' it allocates a new skb with
2 pages even if the actual data-length is much smaller.

This copy wouldn't be so bad for single VIF scenarios (which means probably no 
copying),
but you still end up exhausting the order-1 memory buffer pool with lots of big 
skbs
floating around the system.  Note that the original bug was not filed by me
and happened on some embedded device, though I also see memory exhaustion in my
tests with upstream code.

>
>> If we do see performance differences on different platforms, this could 
>> perhaps be
>> something we could tune at run-time.
>
> I guess that could be looked at, but as long as that is not the case,
> the test setup you used is not exactly the most common case for ath9k in
> the upstream kernel and should not be used to figure out default
> behavior.

True, but I also like the protection this should offer against stray
DMA that this chipset/driver seems capable of.

I'm curious if anyone has any stats at all as far as ath9k performance goes?

Thanks,
Ben

-- 
Ben Greear 
Candela Technologies Inc  http://www.candelatech.com
___
ath9k-devel mailing list
ath9k-devel@lists.ath9k.org
https://lists.ath9k.org/mailman/listinfo/ath9k-devel


Re: [ath9k-devel] [PATCH] ath9k: Implement rx copy-break.

2011-01-09 Thread Gabor Juhos
2011.01.09. 18:49 keltezéssel, Ben Greear írta:
> On 01/09/2011 12:00 AM, Gabor Juhos wrote:
>> Hi Ben,
>>
>>> From: Ben Greear
>>>
>>> This saves us constantly allocating large, multi-page
>>> skbs.  It should fix the order-1 allocation errors reported,
>>> and in a 60-vif scenario, this significantly decreases CPU
>>> utilization, and latency, and increases bandwidth.
>>>
>>> Signed-off-by: Ben Greear
>>> ---
>>> :100644 100644 b2497b8... ea2f67c... M
>>> drivers/net/wireless/ath/ath9k/recv.c
>>>   drivers/net/wireless/ath/ath9k/recv.c |   92 
>>> ++---
>>>   1 files changed, 61 insertions(+), 31 deletions(-)
>>
>> <...>
>>
>>> +if (use_copybreak) {
>>> +struct pci_dev *pdev = to_pci_dev(sc->dev);
>>
>> This would cause undefined behaviour with ath9k devices sitting on an AHB 
>> bus.
>>
>>> +pci_dma_sync_single_for_cpu(pdev, bf->bf_buf_addr,
>>> +len, PCI_DMA_FROMDEVICE);
>>> +skb_copy_from_linear_data(bf->bf_mpdu, skb->data, len);
>>> +pci_dma_sync_single_for_device(pdev, bf->bf_buf_addr,
>>> +   len, PCI_DMA_FROMDEVICE);
>>
>> Please use the bus agnostic equivalents of these DMA functions.
> 
> Any idea what that might be?

Invalid/null pointer dereference probably. The problem is that sc->dev is
pointing to a device structure inside a platform_device structure when it is not
PCI device. Converting sc->dev to 'struct *pci_dev' and using the result as a
parameter for a PCI specific function is not correct in this case.

> Should we just disable copybreak for things on AHB bus?

We should not disable it, order-1 allocation failures are present there as well.

-Gabor
___
ath9k-devel mailing list
ath9k-devel@lists.ath9k.org
https://lists.ath9k.org/mailman/listinfo/ath9k-devel


Re: [ath9k-devel] [PATCH] ath9k: Implement rx copy-break.

2011-01-10 Thread Jouni Malinen
On Sun, Jan 09, 2011 at 09:14:53PM +0100, Christian Lamparter wrote:
> On Sunday 09 January 2011 19:13:04 Jouni Malinen wrote:
> > For the order-1 allocation issues, it would be interesting to see if
> > someone could take a look at using paged skbs or multiple RX descriptors
> > with shorter skbs (and copying only for the case where a long frame is
> > received so that only the A-MSDU RX case would suffer from extra
> > copying).
> 
> Well, here's a shot at paged rx. It'll only work when PAGE_SIZE > buf_size
> (which will be true for most system, I guess)

Thanks!

For the multiple RX descriptors (fragmented buffer for DMA) part, here's
a very preliminary patch to show how it could be done in ath9k for
anyone who might want to experiment more in this area. This version is
obviously only helping with large allocations (it uses half the buffer
size). The extra copying is there for large A-MSDU case. Though, I did
add some concept code to use skb frag_list to allow such a frame be
delivered to mac80211 without needing any extra allocation/copying. That
is commented out for now, if mac80211 can be made to handle A-MSDU RX
processing with fragmented data (which should be relatively simple
change, I'd hope), there should be no need for doing the extra copy in
ath9k. (And likewise, I was too lazy to handle the EDMA case for now..)

The version here is limited to at maximum two fragments. If desired,
this could be further extended to handle multiple fragments to get the
default RX skb allocation shorter than 2k (i.e., to cover the normal
1500 MTU). I'm not sure whether that would result in noticeable benefit,
but it could help in saving some memory when there are multiple skbs
queued in various places in the networking stack. Likewise, the maximum
A-MSDU receive length could be extended with this approach without
having to use even longer individual buffers for RX.

---
 drivers/net/wireless/ath/ath9k/hw.h   |2 
 drivers/net/wireless/ath/ath9k/main.c |5 +
 drivers/net/wireless/ath/ath9k/recv.c |  102 +-
 3 files changed, 95 insertions(+), 14 deletions(-)

--- wireless-testing.orig/drivers/net/wireless/ath/ath9k/hw.h   2011-01-10 
13:03:16.0 +0200
+++ wireless-testing/drivers/net/wireless/ath/ath9k/hw.h2011-01-10 
13:15:39.0 +0200
@@ -852,6 +852,8 @@ struct ath_hw {
 
/* Enterprise mode cap */
u32 ent_mode;
+
+   struct sk_buff *rx_frag;
 };
 
 static inline struct ath_common *ath9k_hw_common(struct ath_hw *ah)
--- wireless-testing.orig/drivers/net/wireless/ath/ath9k/main.c 2011-01-10 
13:14:26.0 +0200
+++ wireless-testing/drivers/net/wireless/ath/ath9k/main.c  2011-01-10 
13:15:05.0 +0200
@@ -1320,6 +1320,11 @@ static void ath9k_stop(struct ieee80211_
} else
sc->rx.rxlink = NULL;
 
+   if (ah->rx_frag) {
+   dev_kfree_skb_any(ah->rx_frag);
+   ah->rx_frag = NULL;
+   }
+
/* disable HAL and put h/w to sleep */
ath9k_hw_disable(ah);
ath9k_hw_configpcipowersave(ah, 1, 1);
--- wireless-testing.orig/drivers/net/wireless/ath/ath9k/recv.c 2011-01-10 
12:24:08.0 +0200
+++ wireless-testing/drivers/net/wireless/ath/ath9k/recv.c  2011-01-10 
14:29:29.0 +0200
@@ -324,8 +324,19 @@ int ath_rx_init(struct ath_softc *sc, in
if (sc->sc_ah->caps.hw_caps & ATH9K_HW_CAP_EDMA) {
return ath_rx_edma_init(sc, nbufs);
} else {
-   common->rx_bufsize = roundup(IEEE80211_MAX_MPDU_LEN,
+   /*
+* Use a buffer that allows a full frame to be received in at
+* most two buffers with scattering DMA. This is needed for
+* A-MSDU; other cases will fit in a single buffer. This will
+* allow the skbs to fit in a single page to avoid issues with
+* higher order allocation.
+*/
+   common->rx_bufsize = roundup(IEEE80211_MAX_MPDU_LEN / 2,
min(common->cachelsz, (u16)64));
+#if 0 /* TESTING - force two fragments even without A-MSDU */
+   common->rx_bufsize = roundup(2800 / 2,
+   min(common->cachelsz, (u16)64));
+#endif
 
ath_dbg(common, ATH_DBG_CONFIG, "cachelsz %u rxbufsize %u\n",
common->cachelsz, common->rx_bufsize);
@@ -863,16 +874,6 @@ static bool ath9k_rx_accept(struct ath_c
return false;
 
/*
-* rs_more indicates chained descriptors which can be used
-* to link buffers together for a sort of scatter-gather
-* operation.
-* reject the frame, we don't support scatter-gather yet and
-* the frame is probably corrupt anyway
-*/
-   if (rx_stats->rs_more)
-   return false;
-
-   /*
 * The rx_stats->rs_status will not be set until the end of the
 * chained descriptors so it ca