Re: regression in ath10k dma allocation

2019-08-20 Thread Tobias Klausmann



On 20.08.19 09:12, Christoph Hellwig wrote:

On Tue, Aug 20, 2019 at 02:58:33PM +0800, Hillf Danton wrote:

On Tue, 20 Aug 2019 05:05:14 +0200 Christoph Hellwig wrote:

Tobias, please try this patch:


New version below:

---
 From b8a805e93be5a5662323b8ac61fe686df839c4ac Mon Sep 17 00:00:00 2001
From: Christoph Hellwig 
Date: Tue, 20 Aug 2019 11:45:49 +0900
Subject: dma-direct: fix zone selection after an unaddressable CMA allocation

The new dma_alloc_contiguous hides if we allocate CMA or regular
pages, and thus fails to retry a ZONE_NORMAL allocation if the CMA
allocation succeeds but isn't addressable.  That means we either fail
outright or dip into a small zone that might not succeed either.

Thanks to Hillf Danton for debugging this issue.

Fixes: b1d2dc009dec ("dma-contiguous: add dma_{alloc,free}_contiguous() 
helpers")
Reported-by: Tobias Klausmann 
Signed-off-by: Christoph Hellwig 
---
  drivers/iommu/dma-iommu.c  |  3 +++
  include/linux/dma-contiguous.h |  5 +
  kernel/dma/contiguous.c|  9 +++--
  kernel/dma/direct.c| 10 +-
  4 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index d991d40f797f..f68a62c3c32b 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -965,10 +965,13 @@ static void *iommu_dma_alloc_pages(struct device *dev, 
size_t size,
  {
bool coherent = dev_is_dma_coherent(dev);
size_t alloc_size = PAGE_ALIGN(size);
+   int node = dev_to_node(dev);
struct page *page = NULL;
void *cpu_addr;
  
  	page = dma_alloc_contiguous(dev, alloc_size, gfp);

+   if (!page)
+   page = alloc_pages_node(node, gfp, get_order(alloc_size));
if (!page)
return NULL;
  
diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h

index c05d4e661489..03f8e98e3bcc 100644
--- a/include/linux/dma-contiguous.h
+++ b/include/linux/dma-contiguous.h
@@ -160,10 +160,7 @@ bool dma_release_from_contiguous(struct device *dev, 
struct page *pages,
  static inline struct page *dma_alloc_contiguous(struct device *dev, size_t 
size,
gfp_t gfp)
  {
-   int node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
-   size_t align = get_order(PAGE_ALIGN(size));
-
-   return alloc_pages_node(node, gfp, align);
+   return NULL;
  }
  
  static inline void dma_free_contiguous(struct device *dev, struct page *page,

diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 2bd410f934b3..e6b450fdbeb6 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -230,9 +230,7 @@ bool dma_release_from_contiguous(struct device *dev, struct 
page *pages,
   */
  struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
  {
-   int node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
-   size_t count = PAGE_ALIGN(size) >> PAGE_SHIFT;
-   size_t align = get_order(PAGE_ALIGN(size));
+   size_t count = size >> PAGE_SHIFT;
struct page *page = NULL;
struct cma *cma = NULL;
  
@@ -243,14 +241,12 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
  
  	/* CMA can be used only in the context which permits sleeping */

if (cma && gfpflags_allow_blocking(gfp)) {
+   size_t align = get_order(size);
size_t cma_align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT);
  
  		page = cma_alloc(cma, count, cma_align, gfp & __GFP_NOWARN);

}
  
-	/* Fallback allocation of normal pages */

-   if (!page)
-   page = alloc_pages_node(node, gfp, align);
return page;
  }
  
@@ -258,6 +254,7 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)

   * dma_free_contiguous() - release allocated pages
   * @dev:   Pointer to device for which the pages were allocated.
   * @page:  Pointer to the allocated pages.
+   int node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
   * @size:  Size of allocated pages.
   *
   * This function releases memory allocated by dma_alloc_contiguous(). As the
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 795c9b095d75..706113c6bebc 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -85,6 +85,8 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t 
phys, size_t size)
  struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
  {
+   size_t alloc_size = PAGE_ALIGN(size);
+   int node = dev_to_node(dev);
struct page *page = NULL;
u64 phys_mask;
  
@@ -95,8 +97,14 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,

gfp &= ~__GFP_ZERO;
gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
     &phys_mask);
+   page = dma_alloc_contiguous(dev, alloc_size, gfp);
+   if (page && !dma_coherent_ok(dev, 
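
The direct.c hunk above is truncated in the archive. As a rough sketch only
(assuming the usual __dma_direct_alloc_pages() structure, not the verbatim
hunk), the retry pattern the commit message describes looks like this:

	page = dma_alloc_contiguous(dev, alloc_size, gfp);
	if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
		/* CMA handed back memory the device cannot address */
		dma_free_contiguous(dev, page, alloc_size);
		page = NULL;
	}
	if (!page)
		page = alloc_pages_node(node, gfp, get_order(alloc_size));
	if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
		/* still unaddressable: free it and retry in a smaller zone
		 * (GFP_DMA32, then GFP_DMA), as the code already did
		 * before this change
		 */
		__free_pages(page, get_order(alloc_size));
		page = NULL;
	}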

Re: [PATCH 4/7] ath10k: disable TX complete indication of htt for sdio

2019-08-20 Thread Toke Høiland-Jørgensen
Wen Gong  writes:

> TX complete messages from firmware cost SDIO bus bandwidth, and bus
> bandwidth is the bottleneck for throughput, so they affect the bandwidth
> available for TX and RX data packets.
>
> This patch disables the TX complete indication from firmware for HTT
> data packets, which results in a significant performance improvement on
> the TX path.

Wait, how does that work? Am I understanding it correctly that this
replaces a per-packet TX completion with a periodic one sent out of
band?

And could you explain what the credits thing is for, please? :)

-Toke


Re: [PATCH 2/7] ath10k: change max RX bundle size from 8 to 32 for sdio

2019-08-20 Thread Toke Høiland-Jørgensen
Wen Gong  writes:

> The maximum bundle size supported by the firmware is 32; changing it
> from 8 to 32 helps performance. This results in a significant
> performance improvement on the RX path.

What happens when the hardware doesn't have enough data to fill a
bundle? Does it send a smaller one, or does it wait until it can fill
it?

-Toke


[PATCH 5/7] ath10k: add htt TX bundle for sdio

2019-08-20 Thread Wen Gong
The transmission utilization ratio of the sdio bus is low for small
packets, because the per-transfer space and time cost on the sdio bus is
the same for large and small packets. As a result, the effective data
rate for large packets is much higher than for small ones.

Test result of different length of data:

data packet(byte)   cost time(us)   calculated rate(Mbps)
  256                 28                  73
  512                 33                 124
 1024                 35                 234
 1792                 45                 318
14336                168                 682
28672                333                 688
57344                660                 695
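
For reference, the rate column follows directly from the other two
columns (bits transferred divided by transfer time); a small helper, not
part of the patch, that reproduces it:

	/* rate in Mbps = bytes * 8 / time in microseconds,
	 * e.g. 1792 * 8 / 45 = 318, 57344 * 8 / 660 = 695
	 */
	static unsigned int sdio_xfer_rate_mbps(unsigned int bytes,
						unsigned int usecs)
	{
		return bytes * 8 / usecs;
	}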

This patch changes the TX path from sending a single packet per transfer
to sending a bundled packet containing up to 32 packets, which results in
a significant performance improvement on the TX path.

This patch only affects sdio chips; it does not affect PCI, SNOC, etc.
Bundling is only enabled for the sdio chip.

Tested with QCA6174 SDIO with firmware
WLAN.RMH.4.4.1-7-QCARMSWP-1.

Signed-off-by: Wen Gong 
---
 drivers/net/wireless/ath/ath10k/core.c   |  14 +-
 drivers/net/wireless/ath/ath10k/core.h   |   4 +-
 drivers/net/wireless/ath/ath10k/htc.c| 353 ---
 drivers/net/wireless/ath/ath10k/htc.h|  21 +-
 drivers/net/wireless/ath/ath10k/htt.c|   8 +
 drivers/net/wireless/ath/ath10k/htt.h|   5 +
 drivers/net/wireless/ath/ath10k/htt_rx.c |   1 +
 drivers/net/wireless/ath/ath10k/htt_tx.c |   9 +-
 8 files changed, 377 insertions(+), 38 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/core.c 
b/drivers/net/wireless/ath/ath10k/core.c
index 762bba0..351f4ed 100644
--- a/drivers/net/wireless/ath/ath10k/core.c
+++ b/drivers/net/wireless/ath/ath10k/core.c
@@ -3194,6 +3194,11 @@ struct ath10k *ath10k_core_create(size_t priv_size, 
struct device *dev,
if (!ar->workqueue_aux)
goto err_free_wq;
 
+   ar->workqueue_tx_complete =
+   create_singlethread_workqueue("ath10k_tx_complete_wq");
+   if (!ar->workqueue_tx_complete)
+   goto err_free_aux_wq;
+
	mutex_init(&ar->conf_mutex);
	mutex_init(&ar->dump_mutex);
	spin_lock_init(&ar->data_lock);
@@ -3219,7 +3224,7 @@ struct ath10k *ath10k_core_create(size_t priv_size, 
struct device *dev,
 
ret = ath10k_coredump_create(ar);
if (ret)
-   goto err_free_aux_wq;
+   goto err_free_tx_complete;
 
ret = ath10k_debug_create(ar);
if (ret)
@@ -3229,12 +3234,12 @@ struct ath10k *ath10k_core_create(size_t priv_size, 
struct device *dev,
 
 err_free_coredump:
ath10k_coredump_destroy(ar);
-
+err_free_tx_complete:
+   destroy_workqueue(ar->workqueue_tx_complete);
 err_free_aux_wq:
destroy_workqueue(ar->workqueue_aux);
 err_free_wq:
destroy_workqueue(ar->workqueue);
-
 err_free_mac:
ath10k_mac_destroy(ar);
 
@@ -3250,6 +3255,9 @@ void ath10k_core_destroy(struct ath10k *ar)
flush_workqueue(ar->workqueue_aux);
destroy_workqueue(ar->workqueue_aux);
 
+   flush_workqueue(ar->workqueue_tx_complete);
+   destroy_workqueue(ar->workqueue_tx_complete);
+
ath10k_debug_destroy(ar);
ath10k_coredump_destroy(ar);
	ath10k_htt_tx_destroy(&ar->htt);
diff --git a/drivers/net/wireless/ath/ath10k/core.h 
b/drivers/net/wireless/ath/ath10k/core.h
index 4d7db07..be9eb37 100644
--- a/drivers/net/wireless/ath/ath10k/core.h
+++ b/drivers/net/wireless/ath/ath10k/core.h
@@ -1079,7 +1079,7 @@ struct ath10k {
struct workqueue_struct *workqueue;
/* Auxiliary workqueue */
struct workqueue_struct *workqueue_aux;
-
+   struct workqueue_struct *workqueue_tx_complete;
/* prevents concurrent FW reconfiguration */
struct mutex conf_mutex;
 
@@ -1120,6 +1120,8 @@ struct ath10k {
 
struct work_struct register_work;
struct work_struct restart_work;
+   struct work_struct bundle_tx_work;
+   struct work_struct tx_complete_work;
 
/* cycle count is reported twice for each visited channel during scan.
 * access protected by data_lock
diff --git a/drivers/net/wireless/ath/ath10k/htc.c 
b/drivers/net/wireless/ath/ath10k/htc.c
index 7357a5a..96b620f 100644
--- a/drivers/net/wireless/ath/ath10k/htc.c
+++ b/drivers/net/wireless/ath/ath10k/htc.c
@@ -51,10 +51,12 @@ void ath10k_htc_notify_tx_completion(struct ath10k_htc_ep 
*ep,
 struct sk_buff *skb)
 {
struct ath10k *ar = ep->htc->ar;
+   struct ath10k_htc_hdr *hdr;
 
ath10k_dbg(ar, ATH10K_DBG_HTC, "%s: ep %d skb %pK\n", __func__,
   ep->eid, skb);
 
+   hdr = (struct ath10k_htc_hdr *)skb->data;
ath10k_htc_restore_tx_skb(ep->htc, skb);
 
if (!ep->ep_ops.ep_tx_complete) {
@@ -63,6 +65,11 @@ void ath10k_htc_notify_tx_completion(struct ath10k_htc_ep 
*ep,
return;
}
 
+   if (hdr->flags & ATH10K_HTC_FLAG_SEND_BUNDLE) {
+

[PATCH 7/7] ath10k: enable napi on RX path for sdio

2019-08-20 Thread Wen Gong
For TCP RX, the number of TCP ACKs sent to the remote end is about half
the number of TCP data segments received from it, so the TX path of the
sdio bus carries many small packets, which reduces the achievable TCP RX
bandwidth.

This patch enables NAPI on the RX path. RX packets are then no longer fed
to the TCP stack immediately from mac80211; since GRO is enabled by
default, they are fed to the TCP stack when NAPI completes. If RX
bundling is enabled, the stack is fed once per RX bundle. For example,
with an RX bundle size of 32, the TCP stack receives one large aggregated
packet of nearly 1500 * 32 bytes and sends a single TCP ACK for it,
reducing the ACK ratio from 1/2 to 1/32. This results in a significant
performance improvement for TCP RX.
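
A minimal sketch of the NAPI poll handler this implies (shape assumed,
not copied from the sdio.c hunk, which is not shown in full here):

	static int ath10k_sdio_napi_poll(struct napi_struct *ctx, int budget)
	{
		struct ath10k *ar = container_of(ctx, struct ath10k, napi);
		int done;

		/* deliver queued RX indications; GRO aggregates the frames */
		done = ath10k_htt_rx_hl_indication(ar, budget);

		/* completing NAPI flushes the GRO lists up to the stack */
		if (done < budget)
			napi_complete_done(ctx, done);

		return done;
	}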

TCP RX throughput is 240 Mbps without this patch and reaches 390 Mbps
with it. CPU usage shows no obvious difference with and without NAPI.

call stack for each RX packet on GRO path:
(skb length is about 1500 bytes)
  skb_gro_receive ([kernel.kallsyms])
  tcp4_gro_receive ([kernel.kallsyms])
  inet_gro_receive ([kernel.kallsyms])
  dev_gro_receive ([kernel.kallsyms])
  napi_gro_receive ([kernel.kallsyms])
  ieee80211_deliver_skb ([mac80211])
  ieee80211_rx_handlers ([mac80211])
  ieee80211_prepare_and_rx_handle ([mac80211])
  ieee80211_rx_napi ([mac80211])
  ath10k_htt_rx_proc_rx_ind_hl ([ath10k_core])
  ath10k_htt_rx_pktlog_completion_handler ([ath10k_core])
  ath10k_sdio_napi_poll ([ath10k_sdio])
  net_rx_action ([kernel.kallsyms])
  __softirqentry_text_start ([kernel.kallsyms])
  do_softirq ([kernel.kallsyms])

call stack for napi complete and send tcp ack from tcp stack:
(skb length is about 1500*32 bytes)
 __tcp_ack_snd_check ([kernel.kallsyms])
 tcp_v4_do_rcv ([kernel.kallsyms])
 tcp_v4_rcv ([kernel.kallsyms])
 ip_local_deliver_finish ([kernel.kallsyms])
 ip_local_deliver ([kernel.kallsyms])
 ip_rcv_finish ([kernel.kallsyms])
 ip_rcv ([kernel.kallsyms])
 netif_receive_skb_core ([kernel.kallsyms])
 netif_receive_skb_one_core([kernel.kallsyms])
 netif_receive_skb ([kernel.kallsyms])
 netif_receive_skb_internal ([kernel.kallsyms])
 napi_gro_complete ([kernel.kallsyms])
 napi_gro_flush ([kernel.kallsyms])
 napi_complete_done ([kernel.kallsyms])
 ath10k_sdio_napi_poll ([ath10k_sdio])
 net_rx_action ([kernel.kallsyms])
 __softirqentry_text_start ([kernel.kallsyms])
 do_softirq ([kernel.kallsyms])

Tested with QCA6174 SDIO with firmware
WLAN.RMH.4.4.1-7-QCARMSWP-1.

Signed-off-by: Wen Gong 
---
 drivers/net/wireless/ath/ath10k/htt.c|  2 ++
 drivers/net/wireless/ath/ath10k/htt.h|  3 +++
 drivers/net/wireless/ath/ath10k/htt_rx.c | 46 ++--
 drivers/net/wireless/ath/ath10k/sdio.c   | 33 +++
 4 files changed, 76 insertions(+), 8 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/htt.c 
b/drivers/net/wireless/ath/ath10k/htt.c
index 38b5141..04ca13c 100644
--- a/drivers/net/wireless/ath/ath10k/htt.c
+++ b/drivers/net/wireless/ath/ath10k/htt.c
@@ -157,6 +157,8 @@ int ath10k_htt_connect(struct ath10k_htt *htt)
 
htt->eid = conn_resp.eid;
 
+   skb_queue_head_init(&htt->rx_indication_head);
+
if (ar->bus_param.dev_type == ATH10K_DEV_TYPE_HL) {
ep = &ar->htc.endpoint[htt->eid];
ath10k_htc_setup_tx_req(ep);
diff --git a/drivers/net/wireless/ath/ath10k/htt.h 
b/drivers/net/wireless/ath/ath10k/htt.h
index d2e5ea6..321bd2c 100644
--- a/drivers/net/wireless/ath/ath10k/htt.h
+++ b/drivers/net/wireless/ath/ath10k/htt.h
@@ -1879,6 +1879,8 @@ struct ath10k_htt {
struct ath10k *ar;
enum ath10k_htc_ep_id eid;
 
+   struct sk_buff_head rx_indication_head;
+
u8 target_version_major;
u8 target_version_minor;
struct completion target_version_received;
@@ -2299,6 +2301,7 @@ int ath10k_htt_tx_mgmt_inc_pending(struct ath10k_htt 
*htt, bool is_mgmt,
 void ath10k_htt_rx_pktlog_completion_handler(struct ath10k *ar,
 struct sk_buff *skb);
 int ath10k_htt_txrx_compl_task(struct ath10k *ar, int budget);
+int ath10k_htt_rx_hl_indication(struct ath10k *ar, int budget);
 void ath10k_htt_set_tx_ops(struct ath10k_htt *htt);
 void ath10k_htt_set_rx_ops(struct ath10k_htt *htt);
 #endif
diff --git a/drivers/net/wireless/ath/ath10k/htt_rx.c 
b/drivers/net/wireless/ath/ath10k/htt_rx.c
index 8651a3b..7a01d12 100644
--- a/drivers/net/wireless/ath/ath10k/htt_rx.c
+++ b/drivers/net/wireless/ath/ath10k/htt_rx.c
@@ -2263,7 +2263,7 @@ static bool ath10k_htt_rx_proc_rx_ind_hl(struct 
ath10k_htt *htt,
if (mpdu_ranges->mpdu_range_status == 
HTT_RX_IND_MPDU_STATUS_TKIP_MIC_ERR)
rx_status->flag |= RX_FLAG_MMIC_ERROR;
 
-   ieee80211_rx_ni(ar->hw, skb);
+   ieee80211_rx_napi(ar->hw, NULL, skb, &ar->napi);
 
/* We have delivered the skb to the upper layers (mac80211) so we
 * must not free it.
@@ -3664,14 +3664,12 @@ bool 

[PATCH 6/7] ath10k: enable alt data of TX path for sdio

2019-08-20 Thread Wen Gong
The default credit size is 1792 bytes but the IP MTU is 1500 bytes, so
about 290 bytes of each credit are wasted for every data packet on the
sdio transfer path when TX bundling is used, which reduces the
transmission utilization ratio for data packets.

This patch enables the small credit size in firmware; the firmware then
uses a new credit size of 1556 bytes, which increases the transmission
utilization ratio for data packets on the TX path. This results in a
significant performance improvement on the TX path.
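
As a quick check of the numbers: a 1500 byte MTU frame in a 1792 byte
credit leaves 1792 - 1500 = 292 bytes of the credit unused (only about
84% of each credit carries payload); with a 1556 byte credit the unused
part drops to 56 bytes and the payload share rises to roughly 96%.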

This patch only affects sdio chips; it does not affect PCI, SNOC, etc.

Tested with QCA6174 SDIO with firmware
WLAN.RMH.4.4.1-00017-QCARMSWP-1.

Signed-off-by: Wen Gong 
---
 drivers/net/wireless/ath/ath10k/core.c | 16 
 drivers/net/wireless/ath/ath10k/htc.c  | 11 +--
 drivers/net/wireless/ath/ath10k/htc.h  | 11 +--
 3 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/core.c 
b/drivers/net/wireless/ath/ath10k/core.c
index 351f4ed..7593d19 100644
--- a/drivers/net/wireless/ath/ath10k/core.c
+++ b/drivers/net/wireless/ath/ath10k/core.c
@@ -31,6 +31,7 @@
 static unsigned int ath10k_cryptmode_param;
 static bool uart_print;
 static bool disable_tx_comp = true;
+static bool alt_data = true;
 static bool skip_otp;
 static bool rawmode;
 static bool fw_diag_log;
@@ -45,6 +46,15 @@
 
 /* If upper layer need the TX complete status, it can enable tx complete */
 module_param(disable_tx_comp, bool, 0644);
+
+/* alt_data is only used for the sdio chip. For previous firmware versions
+ * the alt data size is 1544, which is not enough for native wifi, so
+ * alt_data needs to stay disabled for that firmware.
+ * If the firmware has changed the alt data size to 1556, then alt_data
+ * can be enabled for it.
+ * alt_data does not affect PCI, SNOC, etc.
+ */
+module_param(alt_data, bool, 0644);
 module_param(skip_otp, bool, 0644);
 module_param(rawmode, bool, 0644);
 module_param(fw_diag_log, bool, 0644);
@@ -701,6 +711,12 @@ static void ath10k_init_sdio(struct ath10k *ar, enum 
ath10k_firmware_mode mode)
 */
param &= ~HI_ACS_FLAGS_ALT_DATA_CREDIT_SIZE;
 
+   /* The alternate credit size of 1556 used by the SDIO firmware is
+* big enough for mac80211 / native wifi frames, so enable it.
+*/
+   if (alt_data && mode == ATH10K_FIRMWARE_MODE_NORMAL)
+   param |= HI_ACS_FLAGS_ALT_DATA_CREDIT_SIZE;
+
if (mode == ATH10K_FIRMWARE_MODE_UTF)
param &= ~HI_ACS_FLAGS_SDIO_SWAP_MAILBOX_SET;
else
diff --git a/drivers/net/wireless/ath/ath10k/htc.c 
b/drivers/net/wireless/ath/ath10k/htc.c
index 96b620f..2db7f1a 100644
--- a/drivers/net/wireless/ath/ath10k/htc.c
+++ b/drivers/net/wireless/ath/ath10k/htc.c
@@ -938,12 +938,15 @@ int ath10k_htc_wait_target(struct ath10k_htc *htc)
 */
if (htc->control_resp_len >=
sizeof(msg->hdr) + sizeof(msg->ready_ext)) {
+   htc->alt_data_credit_size =
+   __le16_to_cpu(msg->ready_ext.reserved) & 0x0fff;
htc->max_msgs_per_htc_bundle =
min_t(u8, msg->ready_ext.max_msgs_per_htc_bundle,
  HTC_HOST_MAX_MSG_PER_RX_BUNDLE);
ath10k_dbg(ar, ATH10K_DBG_HTC,
-  "Extended ready message. RX bundle size: %d\n",
-  htc->max_msgs_per_htc_bundle);
+  "Extended ready message. RX bundle size: %d, alt 
size:%d\n",
+  htc->max_msgs_per_htc_bundle,
+  htc->alt_data_credit_size);
}
 
	INIT_WORK(&ar->bundle_tx_work, ath10k_htc_bundle_tx_work);
@@ -1096,6 +1099,10 @@ int ath10k_htc_connect_service(struct ath10k_htc *htc,
ep->tx_credits = tx_alloc;
ep->tx_credit_size = htc->target_credit_size;
 
+   if (conn_req->service_id == ATH10K_HTC_SVC_ID_HTT_DATA_MSG &&
+   htc->alt_data_credit_size != 0)
+   ep->tx_credit_size = htc->alt_data_credit_size;
+
/* copy all the callbacks */
ep->ep_ops = conn_req->ep_ops;
 
diff --git a/drivers/net/wireless/ath/ath10k/htc.h 
b/drivers/net/wireless/ath/ath10k/htc.h
index 621ac89..f965b5b 100644
--- a/drivers/net/wireless/ath/ath10k/htc.h
+++ b/drivers/net/wireless/ath/ath10k/htc.h
@@ -135,8 +135,14 @@ struct ath10k_htc_ready_extended {
struct ath10k_htc_ready base;
u8 htc_version; /* @enum ath10k_htc_version */
u8 max_msgs_per_htc_bundle;
-   u8 pad0;
-   u8 pad1;
+   union {
+   __le16 reserved;
+   struct {
+   u8 pad0;
+   u8 pad1;
+   } __packed;
+   } __packed;
+
 } __packed;
 
 struct ath10k_htc_conn_svc {
@@ -373,6 +379,7 @@ struct ath10k_htc {
int total_transmit_credits;
int target_credit_size;
u8 max_msgs_per_htc_bundle;
+   int alt_data_credit_size;
 };
 
 int ath10k_htc_init(struct ath10k *ar);
-- 
1.9.1



[PATCH 1/7] ath10k: enable RX bundle receive for sdio

2019-08-20 Thread Wen Gong
From: Alagu Sankar 

The existing implementation, which initiates multiple sdio transfers for
receive bundling, slows down the receive speed. Combining the transfers
into a single bundled transfer is preferable.

The transmission utilization ratio of the sdio bus is low for small
packets, because the per-transfer space and time cost on the sdio bus is
the same for large and small packets. As a result, the effective data
rate for large packets is much higher than for small ones.
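
A rough sketch of the bundling idea (vsg_buffer, alloc_len and
ath10k_sdio_readsb() are assumed names here, and the loop is illustrative
only): instead of issuing one sdio read per ~1.5 KB packet, the whole
bundle is fetched in a single transfer and split afterwards:

	/* one large read into the staging buffer ... */
	ret = ath10k_sdio_readsb(ar, ar_sdio->mbox_info.htc_addr,
				 ar_sdio->vsg_buffer, bundle_len);
	if (ret)
		return ret;

	/* ... then split it into the per-packet skbs */
	for (i = 0; i < pkt_cnt; i++) {
		struct ath10k_sdio_rx_data *pkt = &ar_sdio->rx_pkts[i];

		skb_put_data(pkt->skb, ar_sdio->vsg_buffer + offset,
			     pkt->alloc_len);
		offset += pkt->alloc_len;
	}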

Test result of different length of data:
data packet(byte)   cost time(us)   calculated rate(Mbps)
  256                 28                  73
  512                 33                 124
 1024                 35                 234
 1792                 45                 318
14336                168                 682
28672                333                 688
57344                660                 695

Tested with QCA6174 SDIO with firmware
WLAN.RMH.4.4.1-7-QCARMSWP-1.

Signed-off-by: Alagu Sankar 
Signed-off-by: Wen Gong 
---
 drivers/net/wireless/ath/ath10k/sdio.c | 101 -
 drivers/net/wireless/ath/ath10k/sdio.h |   7 ++-
 2 files changed, 79 insertions(+), 29 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/sdio.c 
b/drivers/net/wireless/ath/ath10k/sdio.c
index 8ed4fbd..d9395f0 100644
--- a/drivers/net/wireless/ath/ath10k/sdio.c
+++ b/drivers/net/wireless/ath/ath10k/sdio.c
@@ -24,6 +24,9 @@
 #include "trace.h"
 #include "sdio.h"
 
+#define ATH10K_SDIO_DMA_BUF_SIZE   (32 * 1024)
+#define ATH10K_SDIO_VSG_BUF_SIZE   (32 * 1024)
+
 /* inlined helper functions */
 
 static inline int ath10k_sdio_calc_txrx_padded_len(struct ath10k_sdio *ar_sdio,
@@ -489,11 +492,11 @@ static int ath10k_sdio_mbox_rx_process_packets(struct 
ath10k *ar,
return ret;
 }
 
-static int ath10k_sdio_mbox_alloc_pkt_bundle(struct ath10k *ar,
-struct ath10k_sdio_rx_data 
*rx_pkts,
-struct ath10k_htc_hdr *htc_hdr,
-size_t full_len, size_t act_len,
-size_t *bndl_cnt)
+static int ath10k_sdio_mbox_alloc_bundle(struct ath10k *ar,
+struct ath10k_sdio_rx_data *rx_pkts,
+struct ath10k_htc_hdr *htc_hdr,
+size_t full_len, size_t act_len,
+size_t *bndl_cnt)
 {
int ret, i;
 
@@ -534,6 +537,7 @@ static int ath10k_sdio_mbox_rx_alloc(struct ath10k *ar,
size_t full_len, act_len;
bool last_in_bundle;
int ret, i;
+   int pkt_cnt = 0;
 
if (n_lookaheads > ATH10K_SDIO_MAX_RX_MSGS) {
ath10k_warn(ar,
@@ -577,20 +581,22 @@ static int ath10k_sdio_mbox_rx_alloc(struct ath10k *ar,
 */
size_t bndl_cnt;
 
-   ret = ath10k_sdio_mbox_alloc_pkt_bundle(ar,
-   &ar_sdio->rx_pkts[i],
-   htc_hdr,
-   full_len,
-   act_len,
-   &bndl_cnt);
+   struct ath10k_sdio_rx_data *rx_pkts =
+   &ar_sdio->rx_pkts[pkt_cnt];
+
+   ret = ath10k_sdio_mbox_alloc_bundle(ar,
+   rx_pkts,
+   htc_hdr,
+   full_len,
+   act_len,
+   &bndl_cnt);
 
if (ret) {
ath10k_warn(ar, "alloc_bundle error %d\n", ret);
goto err;
}
 
-   n_lookaheads += bndl_cnt;
-   i += bndl_cnt;
+   pkt_cnt += bndl_cnt;
/*Next buffer will be the last in the bundle */
last_in_bundle = true;
}
@@ -602,7 +608,7 @@ static int ath10k_sdio_mbox_rx_alloc(struct ath10k *ar,
if (htc_hdr->flags & ATH10K_HTC_FLAGS_RECV_1MORE_BLOCK)
full_len += ATH10K_HIF_MBOX_BLOCK_SIZE;
 
-   ret = ath10k_sdio_mbox_alloc_rx_pkt(&ar_sdio->rx_pkts[i],
+   ret = ath10k_sdio_mbox_alloc_rx_pkt(&ar_sdio->rx_pkts[pkt_cnt],
act_len,
full_len,
last_in_bundle,
@@ -611,9 +617,10 @@ static int 

[PATCH 4/7] ath10k: disable TX complete indication of htt for sdio

2019-08-20 Thread Wen Gong
TX complete messages from firmware cost SDIO bus bandwidth, and bus
bandwidth is the bottleneck for throughput, so they affect the bandwidth
available for TX and RX data packets.

This patch disables the TX complete indication from firmware for HTT data
packets, which results in a significant performance improvement on the TX
path.

The downside of this patch is that ath10k will not know the TX status of
data packets in poor-signal situations. The upper network stack or
application layer has its own retry mechanisms, but those retries happen
later than ath10k would have learned of a TX failure if TX complete were
not disabled.

This patch only affects sdio chips; it does not affect PCI, SNOC, etc.
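
As a purely hypothetical illustration of what "no TX complete" means for
the host (not code from this series): once an msdu has been handed to
HTC, the driver can only report it back to mac80211 as if it had been
acked:

	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(msdu);

	ieee80211_tx_info_clear_status(info);
	info->flags |= IEEE80211_TX_STAT_ACK;	/* success is assumed */
	ieee80211_tx_status(ar->hw, msdu);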

Tested with QCA6174 SDIO with firmware
WLAN.RMH.4.4.1-7-QCARMSWP-1.

Signed-off-by: Wen Gong 
---
 drivers/net/wireless/ath/ath10k/core.c   |  6 +
 drivers/net/wireless/ath/ath10k/hif.h|  9 
 drivers/net/wireless/ath/ath10k/htc.c| 11 +
 drivers/net/wireless/ath/ath10k/htc.h|  3 +++
 drivers/net/wireless/ath/ath10k/htt.c|  5 +
 drivers/net/wireless/ath/ath10k/htt.h| 13 ++-
 drivers/net/wireless/ath/ath10k/htt_rx.c | 38 +++-
 drivers/net/wireless/ath/ath10k/htt_tx.c | 30 +
 drivers/net/wireless/ath/ath10k/hw.h |  2 +-
 drivers/net/wireless/ath/ath10k/sdio.c   | 28 +++
 10 files changed, 142 insertions(+), 3 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/core.c 
b/drivers/net/wireless/ath/ath10k/core.c
index dc45d16..762bba0 100644
--- a/drivers/net/wireless/ath/ath10k/core.c
+++ b/drivers/net/wireless/ath/ath10k/core.c
@@ -30,6 +30,7 @@
 
 static unsigned int ath10k_cryptmode_param;
 static bool uart_print;
+static bool disable_tx_comp = true;
 static bool skip_otp;
 static bool rawmode;
 static bool fw_diag_log;
@@ -41,6 +42,9 @@
 module_param_named(debug_mask, ath10k_debug_mask, uint, 0644);
 module_param_named(cryptmode, ath10k_cryptmode_param, uint, 0644);
 module_param(uart_print, bool, 0644);
+
+/* If upper layer need the TX complete status, it can enable tx complete */
+module_param(disable_tx_comp, bool, 0644);
 module_param(skip_otp, bool, 0644);
 module_param(rawmode, bool, 0644);
 module_param(fw_diag_log, bool, 0644);
@@ -689,6 +693,8 @@ static void ath10k_init_sdio(struct ath10k *ar, enum 
ath10k_firmware_mode mode)
 * is used for SDIO. disable it until fixed
 */
param &= ~HI_ACS_FLAGS_SDIO_REDUCE_TX_COMPL_SET;
+   if (disable_tx_comp)
+   param |= HI_ACS_FLAGS_SDIO_REDUCE_TX_COMPL_SET;
 
/* Alternate credit size of 1544 as used by SDIO firmware is
 * not big enough for mac80211 / native wifi frames. disable it
diff --git a/drivers/net/wireless/ath/ath10k/hif.h 
b/drivers/net/wireless/ath/ath10k/hif.h
index 496ee34..0dd8973 100644
--- a/drivers/net/wireless/ath/ath10k/hif.h
+++ b/drivers/net/wireless/ath/ath10k/hif.h
@@ -56,6 +56,8 @@ struct ath10k_hif_ops {
 
int (*swap_mailbox)(struct ath10k *ar);
 
+   int (*get_htt_tx_complete)(struct ath10k *ar);
+
int (*map_service_to_pipe)(struct ath10k *ar, u16 service_id,
   u8 *ul_pipe, u8 *dl_pipe);
 
@@ -144,6 +146,13 @@ static inline int ath10k_hif_swap_mailbox(struct ath10k 
*ar)
return 0;
 }
 
+static inline int ath10k_hif_get_htt_tx_complete(struct ath10k *ar)
+{
+   if (ar->hif.ops->get_htt_tx_complete)
+   return ar->hif.ops->get_htt_tx_complete(ar);
+   return 0;
+}
+
 static inline int ath10k_hif_map_service_to_pipe(struct ath10k *ar,
 u16 service_id,
 u8 *ul_pipe, u8 *dl_pipe)
diff --git a/drivers/net/wireless/ath/ath10k/htc.c 
b/drivers/net/wireless/ath/ath10k/htc.c
index 1d4d1a1..7357a5a 100644
--- a/drivers/net/wireless/ath/ath10k/htc.c
+++ b/drivers/net/wireless/ath/ath10k/htc.c
@@ -660,6 +660,17 @@ int ath10k_htc_wait_target(struct ath10k_htc *htc)
return 0;
 }
 
+void ath10k_htc_change_tx_credit_flow(struct ath10k_htc *htc,
+ enum ath10k_htc_ep_id eid,
+ bool enable)
+{
+   struct ath10k *ar = htc->ar;
+   struct ath10k_htc_ep *ep;
+
+   ep = &ar->htc.endpoint[eid];
+   ep->tx_credit_flow_enabled = enable;
+}
+
 int ath10k_htc_connect_service(struct ath10k_htc *htc,
   struct ath10k_htc_svc_conn_req *conn_req,
   struct ath10k_htc_svc_conn_resp *conn_resp)
diff --git a/drivers/net/wireless/ath/ath10k/htc.h 
b/drivers/net/wireless/ath/ath10k/htc.h
index 8c79b9e..78bc3ae 100644
--- a/drivers/net/wireless/ath/ath10k/htc.h
+++ b/drivers/net/wireless/ath/ath10k/htc.h
@@ -367,6 +367,9 @@ struct ath10k_htc {
 int ath10k_htc_connect_service(struct ath10k_htc *htc,
   struct ath10k_htc_svc_conn_req  *conn_req,
   struct 

[PATCH 2/7] ath10k: change max RX bundle size from 8 to 32 for sdio

2019-08-20 Thread Wen Gong
The maximum bundle size supported by the firmware is 32; changing it from
8 to 32 helps performance. This results in a significant performance
improvement on the RX path.
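
For reference, a sketch of how the RX bundle count appears to be decoded
from the HTC header flags after this change (bit layout inferred from the
sdio.c hunk below, so treat it as an assumption):

	/* bits 7..4 (ATH10K_HTC_FLAG_BUNDLE_MASK) carry the low 4 bits of
	 * the bundle count and bits 3..2 the high 2 bits, so counts up to
	 * 63 can be expressed and the new limit of 32 fits.
	 */
	cnt  = FIELD_GET(ATH10K_HTC_FLAG_BUNDLE_MASK, htc_hdr->flags);
	cnt += FIELD_GET(GENMASK(3, 2), htc_hdr->flags) << 4;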

Tested with QCA6174 SDIO with firmware
WLAN.RMH.4.4.1-7-QCARMSWP-1.

Signed-off-by: Wen Gong 
---
 drivers/net/wireless/ath/ath10k/htc.h  | 2 +-
 drivers/net/wireless/ath/ath10k/sdio.c | 5 +++--
 drivers/net/wireless/ath/ath10k/sdio.h | 4 ++--
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/htc.h 
b/drivers/net/wireless/ath/ath10k/htc.h
index f55d3ca..8c79b9e 100644
--- a/drivers/net/wireless/ath/ath10k/htc.h
+++ b/drivers/net/wireless/ath/ath10k/htc.h
@@ -39,7 +39,7 @@
  * 4-byte aligned.
  */
 
-#define HTC_HOST_MAX_MSG_PER_RX_BUNDLE8
+#define HTC_HOST_MAX_MSG_PER_RX_BUNDLE32
 
 enum ath10k_htc_tx_flags {
ATH10K_HTC_FLAG_NEED_CREDIT_UPDATE = 0x01,
diff --git a/drivers/net/wireless/ath/ath10k/sdio.c 
b/drivers/net/wireless/ath/ath10k/sdio.c
index d9395f0..baa6051 100644
--- a/drivers/net/wireless/ath/ath10k/sdio.c
+++ b/drivers/net/wireless/ath/ath10k/sdio.c
@@ -24,8 +24,8 @@
 #include "trace.h"
 #include "sdio.h"
 
-#define ATH10K_SDIO_DMA_BUF_SIZE   (32 * 1024)
-#define ATH10K_SDIO_VSG_BUF_SIZE   (32 * 1024)
+#define ATH10K_SDIO_DMA_BUF_SIZE   (64 * 1024)
+#define ATH10K_SDIO_VSG_BUF_SIZE   (64 * 1024)
 
 /* inlined helper functions */
 
@@ -501,6 +501,7 @@ static int ath10k_sdio_mbox_alloc_bundle(struct ath10k *ar,
int ret, i;
 
*bndl_cnt = FIELD_GET(ATH10K_HTC_FLAG_BUNDLE_MASK, htc_hdr->flags);
+   *bndl_cnt += (FIELD_GET(GENMASK(3, 2), htc_hdr->flags) << 4);
 
if (*bndl_cnt > HTC_HOST_MAX_MSG_PER_RX_BUNDLE) {
ath10k_warn(ar,
diff --git a/drivers/net/wireless/ath/ath10k/sdio.h 
b/drivers/net/wireless/ath/ath10k/sdio.h
index 4896eca..3ca76c7 100644
--- a/drivers/net/wireless/ath/ath10k/sdio.h
+++ b/drivers/net/wireless/ath/ath10k/sdio.h
@@ -89,10 +89,10 @@
  * to the maximum value (HTC_HOST_MAX_MSG_PER_RX_BUNDLE).
  *
  * in this case the driver must allocate
- * (HTC_HOST_MAX_MSG_PER_RX_BUNDLE * HTC_HOST_MAX_MSG_PER_RX_BUNDLE) skb's.
+ * (HTC_HOST_MAX_MSG_PER_RX_BUNDLE * 2) skb's.
  */
 #define ATH10K_SDIO_MAX_RX_MSGS \
-   (HTC_HOST_MAX_MSG_PER_RX_BUNDLE * HTC_HOST_MAX_MSG_PER_RX_BUNDLE)
+   (HTC_HOST_MAX_MSG_PER_RX_BUNDLE * 2)
 
 #define ATH10K_FIFO_TIMEOUT_AND_CHIP_CONTROL   0x0868u
 #define ATH10K_FIFO_TIMEOUT_AND_CHIP_CONTROL_DISABLE_SLEEP_OFF 0xFFFE
-- 
1.9.1



[PATCH 0/7] ath10k: improve throughput of tcp/udp TX/RX of sdio

2019-08-20 Thread Wen Gong
The throughput bottleneck on sdio chips is the bus bandwidth, so these
patches all aim to increase the utilization ratio of the sdio bus.

                       udp-rx   udp-tx   tcp-rx   tcp-tx
without patches(Mbps)    320      180      170      151
with patches(Mbps)       450      410      400      320
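
In relative terms that is roughly +40% for UDP RX and better than a
factor of two for UDP TX, TCP RX and TCP TX.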

These patches only affect sdio chips; the details are explained in each
patch's commit log.

Alagu Sankar (1):
  ath10k: enable RX bundle receive for sdio

Wen Gong (6):
  ath10k: change max RX bundle size from 8 to 32 for sdio
  ath10k: add workqueue for RX path of sdio
  ath10k: disable TX complete indication of htt for sdio
  ath10k: add htt TX bundle for sdio
  ath10k: enable alt data of TX path for sdio
  ath10k: enable napi on RX path for sdio

 drivers/net/wireless/ath/ath10k/core.c   |  36 ++-
 drivers/net/wireless/ath/ath10k/core.h   |   4 +-
 drivers/net/wireless/ath/ath10k/hif.h|   9 +
 drivers/net/wireless/ath/ath10k/htc.c| 375 ---
 drivers/net/wireless/ath/ath10k/htc.h|  37 ++-
 drivers/net/wireless/ath/ath10k/htt.c|  15 ++
 drivers/net/wireless/ath/ath10k/htt.h|  21 +-
 drivers/net/wireless/ath/ath10k/htt_rx.c |  85 ++-
 drivers/net/wireless/ath/ath10k/htt_tx.c |  37 ++-
 drivers/net/wireless/ath/ath10k/hw.h |   2 +-
 drivers/net/wireless/ath/ath10k/sdio.c   | 281 ---
 drivers/net/wireless/ath/ath10k/sdio.h   |  31 ++-
 12 files changed, 846 insertions(+), 87 deletions(-)

-- 
1.9.1



[PATCH 3/7] ath10k: add workqueue for RX path of sdio

2019-08-20 Thread Wen Gong
The reading of RX messages from firmware over the sdio bus is synchronous,
and much time is spent processing the rest of each RX message, which
includes indicating the RX packet to the upper network stack. Moving that
work out of the read path reduces the time spent reading from sdio.

This patch moves the indication to a workqueue, which results in a
significant performance improvement on the RX path.
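
The deferred-indication pattern, sketched (the request structure lives in
sdio.h, which is not shown here, so the exact layout is assumed from the
code below):

	struct ath10k_sdio_rx_request {
		struct list_head list;	/* on rx_req_freeq or wr_asyncq_rx */
		struct sk_buff *skb;	/* received HTC message */
		struct ath10k_htc_ep *ep;	/* endpoint to indicate it to */
	};

	/* irq path: queue the skb and kick the worker;
	 * worker: drain the list and call ep->ep_ops.ep_rx_complete()
	 * outside the sdio read path.
	 */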

Tested with QCA6174 SDIO with firmware
WLAN.RMH.4.4.1-7-QCARMSWP-1.

Signed-off-by: Wen Gong 
---
 drivers/net/wireless/ath/ath10k/sdio.c | 117 -
 drivers/net/wireless/ath/ath10k/sdio.h |  20 ++
 2 files changed, 134 insertions(+), 3 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/sdio.c 
b/drivers/net/wireless/ath/ath10k/sdio.c
index baa6051..f42aca6 100644
--- a/drivers/net/wireless/ath/ath10k/sdio.c
+++ b/drivers/net/wireless/ath/ath10k/sdio.c
@@ -417,6 +417,67 @@ static int ath10k_sdio_mbox_rx_process_packet(struct 
ath10k *ar,
return 0;
 }
 
+static struct ath10k_sdio_rx_request
+*ath10k_sdio_alloc_rx_req(struct ath10k *ar)
+{
+   struct ath10k_sdio *ar_sdio = ath10k_sdio_priv(ar);
+   struct ath10k_sdio_rx_request *rx_req;
+
+   spin_lock_bh(&ar_sdio->rx_lock);
+
+   if (list_empty(&ar_sdio->rx_req_freeq)) {
+   rx_req = NULL;
+   ath10k_dbg(ar, ATH10K_DBG_SDIO, "rx_req alloc fail\n");
+   goto out;
+   }
+
+   rx_req = list_first_entry(&ar_sdio->rx_req_freeq,
+ struct ath10k_sdio_rx_request, list);
+   list_del(&rx_req->list);
+
+out:
+   spin_unlock_bh(&ar_sdio->rx_lock);
+   return rx_req;
+}
+
+static void ath10k_sdio_free_rx_req(struct ath10k *ar,
+   struct ath10k_sdio_rx_request *rx_req)
+{
+   struct ath10k_sdio *ar_sdio = ath10k_sdio_priv(ar);
+
+   memset(rx_req, 0, sizeof(*rx_req));
+
+   spin_lock_bh(&ar_sdio->rx_lock);
+   list_add_tail(&rx_req->list, &ar_sdio->rx_req_freeq);
+   spin_unlock_bh(&ar_sdio->rx_lock);
+}
+
+static int ath10k_sdio_prep_async_rx_req(struct ath10k *ar,
+struct sk_buff *skb,
+struct ath10k_htc_ep *ep)
+{
+   struct ath10k_sdio *ar_sdio = ath10k_sdio_priv(ar);
+   struct ath10k_sdio_rx_request *rx_req;
+
+   /* Allocate a rx request for the message and queue it on the
+* SDIO rx workqueue.
+*/
+   rx_req = ath10k_sdio_alloc_rx_req(ar);
+   if (!rx_req) {
+   ath10k_warn(ar, "unable to allocate rx request for async 
request\n");
+   return -ENOMEM;
+   }
+
+   rx_req->skb = skb;
+   rx_req->ep = ep;
+
+   spin_lock_bh(&ar_sdio->wr_async_lock_rx);
+   list_add_tail(&rx_req->list, &ar_sdio->wr_asyncq_rx);
+   spin_unlock_bh(&ar_sdio->wr_async_lock_rx);
+
+   return 0;
+}
+
 static int ath10k_sdio_mbox_rx_process_packets(struct ath10k *ar,
   u32 lookaheads[],
   int *n_lookahead)
@@ -470,10 +531,16 @@ static int ath10k_sdio_mbox_rx_process_packets(struct 
ath10k *ar,
if (ret)
goto out;
 
-   if (!pkt->trailer_only)
-   ep->ep_ops.ep_rx_complete(ar_sdio->ar, pkt->skb);
-   else
+   if (!pkt->trailer_only) {
+   ret = ath10k_sdio_prep_async_rx_req(ar, pkt->skb, ep);
+   if (ret)
+   kfree_skb(pkt->skb);
+   else
+   queue_work(ar->workqueue_aux,
+  &ar_sdio->wr_async_work_rx);
+   } else {
kfree_skb(pkt->skb);
+   }
 
/* The RX complete handler now owns the skb...*/
pkt->skb = NULL;
@@ -1322,6 +1389,26 @@ static void __ath10k_sdio_write_async(struct ath10k *ar,
ath10k_sdio_free_bus_req(ar, req);
 }
 
+static void ath10k_rx_indication_async_work(struct work_struct *work)
+{
+   struct ath10k_sdio *ar_sdio = container_of(work, struct ath10k_sdio,
+  wr_async_work_rx);
+   struct ath10k *ar = ar_sdio->ar;
+   struct ath10k_sdio_rx_request *req, *tmp_req;
+
+   spin_lock_bh(&ar_sdio->wr_async_lock_rx);
+
+   list_for_each_entry_safe(req, tmp_req, &ar_sdio->wr_asyncq_rx, list) {
+   list_del(&req->list);
+   spin_unlock_bh(&ar_sdio->wr_async_lock_rx);
+   req->ep->ep_ops.ep_rx_complete(ar, req->skb);
+   ath10k_sdio_free_rx_req(ar, req);
+   spin_lock_bh(&ar_sdio->wr_async_lock_rx);
+   }
+
+   spin_unlock_bh(&ar_sdio->wr_async_lock_rx);
+}
+
 static void ath10k_sdio_write_async_work(struct work_struct *work)
 {
struct ath10k_sdio *ar_sdio = container_of(work, struct ath10k_sdio,
@@ -1810,10 +1897,24 @@ static void ath10k_sdio_irq_disable(struct ath10k 

Re: regression in ath10k dma allocation

2019-08-20 Thread Christoph Hellwig
On Tue, Aug 20, 2019 at 02:58:33PM +0800, Hillf Danton wrote:
> 
> On Tue, 20 Aug 2019 05:05:14 +0200 Christoph Hellwig wrote:
> > 
> > Tobias, please try this patch:
> > 

New version below:

---
>From b8a805e93be5a5662323b8ac61fe686df839c4ac Mon Sep 17 00:00:00 2001
From: Christoph Hellwig 
Date: Tue, 20 Aug 2019 11:45:49 +0900
Subject: dma-direct: fix zone selection after an unaddressable CMA allocation

The new dma_alloc_contiguous hides if we allocate CMA or regular
pages, and thus fails to retry a ZONE_NORMAL allocation if the CMA
allocation succeeds but isn't addressable.  That means we either fail
outright or dip into a small zone that might not succeed either.

Thanks to Hillf Danton for debugging this issue.

Fixes: b1d2dc009dec ("dma-contiguous: add dma_{alloc,free}_contiguous() 
helpers")
Reported-by: Tobias Klausmann 
Signed-off-by: Christoph Hellwig 
---
 drivers/iommu/dma-iommu.c  |  3 +++
 include/linux/dma-contiguous.h |  5 +
 kernel/dma/contiguous.c|  9 +++--
 kernel/dma/direct.c| 10 +-
 4 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index d991d40f797f..f68a62c3c32b 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -965,10 +965,13 @@ static void *iommu_dma_alloc_pages(struct device *dev, 
size_t size,
 {
bool coherent = dev_is_dma_coherent(dev);
size_t alloc_size = PAGE_ALIGN(size);
+   int node = dev_to_node(dev);
struct page *page = NULL;
void *cpu_addr;
 
page = dma_alloc_contiguous(dev, alloc_size, gfp);
+   if (!page)
+   page = alloc_pages_node(node, gfp, get_order(alloc_size));
if (!page)
return NULL;
 
diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h
index c05d4e661489..03f8e98e3bcc 100644
--- a/include/linux/dma-contiguous.h
+++ b/include/linux/dma-contiguous.h
@@ -160,10 +160,7 @@ bool dma_release_from_contiguous(struct device *dev, 
struct page *pages,
 static inline struct page *dma_alloc_contiguous(struct device *dev, size_t 
size,
gfp_t gfp)
 {
-   int node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
-   size_t align = get_order(PAGE_ALIGN(size));
-
-   return alloc_pages_node(node, gfp, align);
+   return NULL;
 }
 
 static inline void dma_free_contiguous(struct device *dev, struct page *page,
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 2bd410f934b3..e6b450fdbeb6 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -230,9 +230,7 @@ bool dma_release_from_contiguous(struct device *dev, struct 
page *pages,
  */
 struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
 {
-   int node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
-   size_t count = PAGE_ALIGN(size) >> PAGE_SHIFT;
-   size_t align = get_order(PAGE_ALIGN(size));
+   size_t count = size >> PAGE_SHIFT;
struct page *page = NULL;
struct cma *cma = NULL;
 
@@ -243,14 +241,12 @@ struct page *dma_alloc_contiguous(struct device *dev, 
size_t size, gfp_t gfp)
 
/* CMA can be used only in the context which permits sleeping */
if (cma && gfpflags_allow_blocking(gfp)) {
+   size_t align = get_order(size);
size_t cma_align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT);
 
page = cma_alloc(cma, count, cma_align, gfp & __GFP_NOWARN);
}
 
-   /* Fallback allocation of normal pages */
-   if (!page)
-   page = alloc_pages_node(node, gfp, align);
return page;
 }
 
@@ -258,6 +254,7 @@ struct page *dma_alloc_contiguous(struct device *dev, 
size_t size, gfp_t gfp)
  * dma_free_contiguous() - release allocated pages
  * @dev:   Pointer to device for which the pages were allocated.
  * @page:  Pointer to the allocated pages.
+   int node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
  * @size:  Size of allocated pages.
  *
  * This function releases memory allocated by dma_alloc_contiguous(). As the
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 795c9b095d75..706113c6bebc 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -85,6 +85,8 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t 
phys, size_t size)
 struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
 {
+   size_t alloc_size = PAGE_ALIGN(size);
+   int node = dev_to_node(dev);
struct page *page = NULL;
u64 phys_mask;
 
@@ -95,8 +97,14 @@ struct page *__dma_direct_alloc_pages(struct device *dev, 
size_t size,
gfp &= ~__GFP_ZERO;
gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
     &phys_mask);
+   page = dma_alloc_contiguous(dev, alloc_size, gfp);
+   if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
+

Re: regression in ath10k dma allocation

2019-08-20 Thread Hillf Danton


On Tue, 20 Aug 2019 05:05:14 +0200 Christoph Hellwig wrote:
> 
> Tobias, please try this patch:
> 
A minute!

> --
> >From 88c590a2ecafc8279388f25bfbe1ead8ea3507a6 Mon Sep 17 00:00:00 2001
> From: Christoph Hellwig 
> Date: Tue, 20 Aug 2019 11:45:49 +0900
> Subject: dma-direct: fix zone selection after an unaddressable CMA allocation
> 
> The new dma_alloc_contiguous hides if we allocate CMA or regular
> pages, and thus fails to retry a ZONE_NORMAL allocation if the CMA
> allocation succeeds but isn't addressable.  That means we either fail
> outright or dip into a small zone that might not succeed either.
> 
> Thanks to Hillf Danton for debugging this issue.
> 
> Fixes: b1d2dc009dec ("dma-contiguous: add dma_{alloc,free}_contiguous() 
> helpers")
> Reported-by: Tobias Klausmann 
> Signed-off-by: Christoph Hellwig 
> ---
>  drivers/iommu/dma-iommu.c  | 3 +++
>  include/linux/dma-contiguous.h | 5 +
>  kernel/dma/contiguous.c| 9 +++--
>  kernel/dma/direct.c| 7 ++-
>  4 files changed, 13 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> index d991d40f797f..f68a62c3c32b 100644
> --- a/drivers/iommu/dma-iommu.c
> +++ b/drivers/iommu/dma-iommu.c
> @@ -965,10 +965,13 @@ static void *iommu_dma_alloc_pages(struct device *dev, 
> size_t size,
>  {
>   bool coherent = dev_is_dma_coherent(dev);
>   size_t alloc_size = PAGE_ALIGN(size);
> + int node = dev_to_node(dev);
>   struct page *page = NULL;
>   void *cpu_addr;
>  
>   page = dma_alloc_contiguous(dev, alloc_size, gfp);
> + if (!page)
> + page = alloc_pages_node(node, gfp, get_order(alloc_size));
>   if (!page)
>   return NULL;
>  
> diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h
> index c05d4e661489..03f8e98e3bcc 100644
> --- a/include/linux/dma-contiguous.h
> +++ b/include/linux/dma-contiguous.h
> @@ -160,10 +160,7 @@ bool dma_release_from_contiguous(struct device *dev, 
> struct page *pages,
>  static inline struct page *dma_alloc_contiguous(struct device *dev, size_t 
> size,
>   gfp_t gfp)
>  {
> - int node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
> - size_t align = get_order(PAGE_ALIGN(size));
> -
> - return alloc_pages_node(node, gfp, align);
> + return NULL;
>  }
>  
>  static inline void dma_free_contiguous(struct device *dev, struct page *page,
> diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
> index 2bd410f934b3..e6b450fdbeb6 100644
> --- a/kernel/dma/contiguous.c
> +++ b/kernel/dma/contiguous.c
> @@ -230,9 +230,7 @@ bool dma_release_from_contiguous(struct device *dev, 
> struct page *pages,
>   */
>  struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
>  {
> - int node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
> - size_t count = PAGE_ALIGN(size) >> PAGE_SHIFT;
> - size_t align = get_order(PAGE_ALIGN(size));
> + size_t count = size >> PAGE_SHIFT;
>   struct page *page = NULL;
>   struct cma *cma = NULL;
>  
> @@ -243,14 +241,12 @@ struct page *dma_alloc_contiguous(struct device *dev, 
> size_t size, gfp_t gfp)
>  
>   /* CMA can be used only in the context which permits sleeping */
>   if (cma && gfpflags_allow_blocking(gfp)) {
> + size_t align = get_order(size);
>   size_t cma_align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT);
>  
>   page = cma_alloc(cma, count, cma_align, gfp & __GFP_NOWARN);
>   }
>  
> - /* Fallback allocation of normal pages */
> - if (!page)
> - page = alloc_pages_node(node, gfp, align);
>   return page;
>  }
>  
> @@ -258,6 +254,7 @@ struct page *dma_alloc_contiguous(struct device *dev, 
> size_t size, gfp_t gfp)
>   * dma_free_contiguous() - release allocated pages
>   * @dev:   Pointer to device for which the pages were allocated.
>   * @page:  Pointer to the allocated pages.
> + int node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
>   * @size:  Size of allocated pages.
>   *
>   * This function releases memory allocated by dma_alloc_contiguous(). As the
> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
> index 795c9b095d75..d82d184463ce 100644
> --- a/kernel/dma/direct.c
> +++ b/kernel/dma/direct.c
> @@ -85,6 +85,8 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t 
> phys, size_t size)
>  struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
>   dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
>  {
> + size_t alloc_size = PAGE_ALIGN(size);
> + int node = dev_to_node(dev);
>   struct page *page = NULL;
>   u64 phys_mask;
>  
> @@ -95,8 +97,11 @@ struct page *__dma_direct_alloc_pages(struct device *dev, 
> size_t size,
>   gfp &= ~__GFP_ZERO;
>   gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
>      &phys_mask);
> + page = dma_alloc_contiguous(dev, alloc_size, gfp);
> +