Hi Gavin,
     Thanks for your comments.
        
+#if defined(__ARM64_NEON__)

No NEON intrinsics are used here, so RTE_ARCH_ARM64 may be the better guard.
In the following line, __rte_always_inline is what is commonly used in DPDK;
the effect is the same.
/Gavin

For this patch we don't use NEON intrinsics, but in the tx & rx paths we use
NEON intrinsics for wqebb big-endian conversion on the Arm platform. So, to
keep things consistent, all intrinsics-based optimizations on Arm are guarded
by the __ARM64_NEON__ definition.

I understand your intention is that the read of the status is observed before
the following reads.
This can be achieved by __atomic_load_n(...) with __ATOMIC_ACQUIRE semantics.
This C11-style approach applies to all architectures, so you don't need to
differentiate between them.
/Gavin

Thanks, I have changed it to __atomic_load_n(...) with __ATOMIC_ACQUIRE
semantics and sent a new patch, v3.

Best regards
Xiaoyun Wang
在 2019/9/27 10:08, Gavin Hu (Arm Technology China) 写道:
Hi Xiaoyun,
        
-----Original Message-----
From: dev <dev-boun...@dpdk.org> On Behalf Of Xiaoyun wang
Sent: Wednesday, September 25, 2019 10:31 PM
To: ferruh.yi...@intel.com
Cc: dev@dpdk.org; xuanziya...@huawei.com; shahar.bel...@huawei.com;
luoxian...@huawei.com; tanya.brokh...@huawei.com;
zhouguoy...@huawei.com; wuli...@huawei.com; Xiaoyun wang
<cloud.wangxiao...@huawei.com>
Subject: [dpdk-dev] [PATCH v2 17/17] net/hinic: optimize tx&rx
performance

This patch optimizes packet receive performance
on the Arm platform.

Signed-off-by: Xiaoyun wang <cloud.wangxiao...@huawei.com>
---
  drivers/net/hinic/hinic_pmd_rx.c | 17 +++++++++++++++++
  drivers/net/hinic/hinic_pmd_rx.h | 11 +++++++++++
  2 files changed, 28 insertions(+)

diff --git a/drivers/net/hinic/hinic_pmd_rx.c
b/drivers/net/hinic/hinic_pmd_rx.c
index 37b4f5c..94071ee 100644
--- a/drivers/net/hinic/hinic_pmd_rx.c
+++ b/drivers/net/hinic/hinic_pmd_rx.c
@@ -950,6 +950,19 @@ void hinic_rx_alloc_pkts(struct hinic_rxq *rxq)
        }
  }

+#if defined(__ARM64_NEON__)
No NEON intrinsics are used here, so RTE_ARCH_ARM64 may be the better guard.
In the following line, __rte_always_inline is what is commonly used in DPDK;
the effect is the same.
/Gavin

+static inline uint32_t __attribute__((always_inline))
+hinic_read_cqe_status(uintptr_t addr)
+{
+       uint32_t val;
+
+       asm volatile("ldar %x[val], [%x[addr]]"
+               : [val] "=r" (val)
+               : [addr] "r" (addr));
+       return val;
+}
+#endif
I understand your intention is that the read of the status is observed before
the following reads.
This can be achieved by __atomic_load_n(...) with __ATOMIC_ACQUIRE semantics.
This C11-style approach applies to all architectures, so you don't need to
differentiate between them.
/Gavin
+
  u16 hinic_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, u16
nb_pkts)
  {
        struct rte_mbuf *rxm;
@@ -972,7 +985,11 @@ u16 hinic_recv_pkts(void *rx_queue, struct
rte_mbuf **rx_pkts, u16 nb_pkts)
        while (pkts < nb_pkts) {
                 /* 2. current ci is done */
                rx_cqe = &rxq->rx_cqe[sw_ci];
+#if defined(__X86_64_SSE__)
                status = rx_cqe->status;
+#elif defined(__ARM64_NEON__)
+               status = hinic_read_cqe_status((uintptr_t)&rxq->rx_cqe[sw_ci]);
+#endif
                if (!HINIC_GET_RX_DONE_BE(status))
                        break;

diff --git a/drivers/net/hinic/hinic_pmd_rx.h
b/drivers/net/hinic/hinic_pmd_rx.h
index fe2735b..fa27e91 100644
--- a/drivers/net/hinic/hinic_pmd_rx.h
+++ b/drivers/net/hinic/hinic_pmd_rx.h
@@ -28,6 +28,7 @@ struct hinic_rq_ctrl {
        u32     ctrl_fmt;
  };

+#if defined(__X86_64_SSE__)
  struct hinic_rq_cqe {
        u32 status;
        u32 vlan_len;
@@ -36,6 +37,16 @@ struct hinic_rq_cqe {

        u32 rsvd[4];
  };
+#elif defined(__ARM64_NEON__)
+struct hinic_rq_cqe {
+       u32 status;
+       u32 vlan_len;
+       u32 offload_type;
+       u32 rss_hash;
+
+       u32 rsvd[4];
+} __rte_cache_aligned;
+#endif

  struct hinic_rq_cqe_sect {
        struct hinic_sge        sge;
--
1.8.3.1

Reply via email to