Re: [PATCH] arm64/lib: add accelerated do_csum for arm64

2019-01-03 Thread Christoph Hellwig
> diff --git a/arch/arm64/lib/checksum.c b/arch/arm64/lib/checksum.c
> new file mode 100644
> index ..6931ef13ef87
> --- /dev/null
> +++ b/arch/arm64/lib/checksum.c
> @@ -0,0 +1,144 @@
> +/*
> + * arch/arm64/lib/checksum.c
> + *

No need to mention the file name.  On the other hand it really should
have a SPDX tag, and preferably a copyright notice.
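
For illustration, a minimal sketch of such a header, assuming a GPL-2.0 license and
leaving the copyright holder as a placeholder (neither is stated in the patch):

// SPDX-License-Identifier: GPL-2.0
/*
 * Optimized checksum routines for arm64.
 *
 * Copyright (C) 2018 <copyright holder>
 */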


Re: [PATCH] arm64/lib: add accelerated do_csum for arm64

2018-12-28 Thread kbuild test robot
Hi huhai,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on arm64/for-next/core]
[also build test WARNING on v4.20 next-20181224]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/huhai/arm64-lib-add-accelerated-do_csum-for-arm64/20181228-155335
base:   https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git for-next/core
config: arm64-allmodconfig (attached as .config)
compiler: aarch64-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        GCC_VERSION=7.2.0 make.cross ARCH=arm64

Note: it may well be a FALSE warning. FWIW you are at least aware of it now.
http://gcc.gnu.org/wiki/Better_Uninitialized_Warnings

All warnings (new ones prefixed by >>):

   arch/arm64/lib/checksum.c: In function 'do_csum':
>> arch/arm64/lib/checksum.c:85:5: warning: 'tmp1' may be used uninitialized in 
>> this function [-Wmaybe-uninitialized]
__asm__ __volatile__(
^~~

vim +/tmp1 +85 arch/arm64/lib/checksum.c

    41  
    42  /*
    43   * Do a 64-bit checksum on an arbitrary memory area.
    44   * Returns a 16bit checksum.
    45   */
    46  unsigned int do_csum(const unsigned char *buff, unsigned len)
    47  {
    48  	unsigned odd, count;
    49  	unsigned long result = 0;
    50  
    51  	if (unlikely(len == 0))
    52  		return result;
    53  	odd = 1 & (unsigned long) buff;
    54  	if (odd) {
    55  		result = *buff << 8;
    56  		len--;
    57  		buff++;
    58  	}
    59  	count = len >> 1;		/* nr of 16-bit words.. */
    60  	if (count) {
    61  		if (2 & (unsigned long) buff) {
    62  			result += *(unsigned short *)buff;
    63  			count--;
    64  			len -= 2;
    65  			buff += 2;
    66  		}
    67  		count >>= 1;	/* nr of 32-bit words.. */
    68  		if (count) {
    69  			unsigned long zero;
    70  			unsigned long tmp1;
    71  			unsigned count64;
    72  
    73  			if (4 & (unsigned long) buff) {
    74  				result += *(unsigned int *) buff;
    75  				count--;
    76  				len -= 4;
    77  				buff += 4;
    78  			}
    79  			count >>= 1;	/* nr of 64-bit words.. */
    80  
    81  			/* main loop using 64byte blocks */
    82  			zero = 0;
    83  			count64 = count >> 3;
    84  			while (count64) {
  > 85  				__asm__ __volatile__(
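
GCC judges inline asm only by its operand constraints, not by the instruction
template, so if tmp1 is passed as a read-write operand it is considered read while
still uninitialized. A minimal sketch of one conventional remedy, assuming tmp1
serves purely as a scratch register (the patch's actual constraint list is not shown
above): declare it as an output-only, early-clobber operand so the compiler knows
the asm defines it before any read.

/* Sketch only, not the patch's actual asm: an output-only "=&r"
 * operand tells GCC that tmp1 is written by the asm itself, which
 * avoids -Wmaybe-uninitialized without a dummy initializer.
 */
static unsigned long load_one_word(const unsigned long *p)
{
	unsigned long tmp1;

	asm("ldr %0, [%1]"
	    : "=&r" (tmp1)	/* scratch: defined inside the asm */
	    : "r" (p)
	    : "memory");	/* the load reads *p behind the compiler's back */

	return tmp1;
}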

---
0-DAY kernel test infrastructure            Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip


[PATCH] arm64/lib: add accelerated do_csum for arm64

2018-12-27 Thread huhai
do_csum() in lib/checksum.c is too slow on arm64, so accelerate it
with assembly and a better algorithm.

Signed-off-by: huhai 
---
 arch/arm64/include/asm/checksum.h |   3 +
 arch/arm64/lib/Makefile   |   2 +-
 arch/arm64/lib/checksum.c | 144 ++
 3 files changed, 148 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm64/lib/checksum.c

diff --git a/arch/arm64/include/asm/checksum.h 
b/arch/arm64/include/asm/checksum.h
index 0b6f5a7d4027..0d7b80fb300e 100644
--- a/arch/arm64/include/asm/checksum.h
+++ b/arch/arm64/include/asm/checksum.h
@@ -26,6 +26,9 @@ static inline __sum16 csum_fold(__wsum csum)
 }
 #define csum_fold csum_fold
 
+#define do_csum do_csum
+unsigned int do_csum(const unsigned char *buff, unsigned int len);
+
 static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
 {
__uint128_t tmp;
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 69ff9887f724..4134730a121b 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-lib-y  := clear_user.o delay.o copy_from_user.o\
+lib-y  := checksum.o clear_user.o delay.o copy_from_user.o \
   copy_to_user.o copy_in_user.o copy_page.o\
   clear_page.o memchr.o memcpy.o memmove.o memset.o\
   memcmp.o strcmp.o strncmp.o strlen.o strnlen.o   \
diff --git a/arch/arm64/lib/checksum.c b/arch/arm64/lib/checksum.c
new file mode 100644
index ..6931ef13ef87
--- /dev/null
+++ b/arch/arm64/lib/checksum.c
@@ -0,0 +1,144 @@
+/*
+ * arch/arm64/lib/checksum.c
+ *
+ * This file contains network checksum routines that are better done
+ * in an architecture-specific manner due to speed.
+ *
+ * Acknowledgements:
+ * This file is based on arch/x86/lib/csum-partial_64.c and
+ * arch/alpha/lib/checksum.c, which were written by Thomas Gleixner
+ * and Rick Gorton, respectively.
+ */
+
+#include 
+#include 
+#include 
+
+static inline unsigned short from64to16(unsigned long x)
+{
+	/* Using extract instructions is a bit more efficient
+	 * than the original shift/bitmask version.
+	 */
+
+	union {
+		unsigned long	ul;
+		unsigned int	ui[2];
+		unsigned short	us[4];
+	} in_v, tmp_v, out_v;
+
+	in_v.ul = x;
+	tmp_v.ul = (unsigned long) in_v.ui[0] + (unsigned long) in_v.ui[1];
+
+	/* Since the bits of tmp_v.us[3] are always going to be zero,
+	 * we don't have to bother to add that in.
+	 */
+	out_v.ul = (unsigned long) tmp_v.us[0] + (unsigned long) tmp_v.us[1]
+			+ (unsigned long) tmp_v.us[2];
+
+	/* Similarly, out_v.us[2] is always zero for the final add.  */
+	return out_v.us[0] + out_v.us[1];
+}
+
+/*
+ * Do a 64-bit checksum on an arbitrary memory area.
+ * Returns a 16bit checksum.
+ */
+unsigned int do_csum(const unsigned char *buff, unsigned len)
+{
+	unsigned odd, count;
+	unsigned long result = 0;
+
+	if (unlikely(len == 0))
+		return result;
+	odd = 1 & (unsigned long) buff;
+	if (odd) {
+		result = *buff << 8;
+		len--;
+		buff++;
+	}
+	count = len >> 1;		/* nr of 16-bit words.. */
+	if (count) {
+		if (2 & (unsigned long) buff) {
+			result += *(unsigned short *)buff;
+			count--;
+			len -= 2;
+			buff += 2;
+		}
+		count >>= 1;	/* nr of 32-bit words.. */
+		if (count) {
+			unsigned long zero;
+			unsigned long tmp1;
+			unsigned count64;
+
+			if (4 & (unsigned long) buff) {
+				result += *(unsigned int *) buff;
+				count--;
+				len -= 4;
+				buff += 4;
+			}
+			count >>= 1;	/* nr of 64-bit words.. */
+
+			/* main loop using 64byte blocks */
+			zero = 0;
+			count64 = count >> 3;
+			while (count64) {
+				__asm__ __volatile__(
+				"ldr %x3, [%x1, #0]\n"
+				"adds %x0, %x0, %x3\n"
+				"ldr %x3, [%x1, #8]\n"
+				"adcs %x0, %x0, %x3\n"
+				"ldr %x3, [%x1, #16]\n"
+				"adcs %x0, %x0, %x3\n"
+				"ldr %x3, [%x1, #24]\n"
+				"adcs %x0, %x0, %x3\n"
+				"ldr