This patch implements a performant csum_partial for x86_64. The intent
is to speed up checksum calculation, particularly for small lengths
such as those seen when doing skb_postpull_rcsum after getting
CHECKSUM_COMPLETE from the device, or after a CHECKSUM_UNNECESSARY
conversion.
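For context, the dominant small-length caller is skb_postpull_rcsum().
Simplified from include/linux/skbuff.h (the real helper also handles
the CHECKSUM_PARTIAL case), its CHECKSUM_COMPLETE path is roughly:

static inline void skb_postpull_rcsum(struct sk_buff *skb,
				      const void *start, unsigned int len)
{
	if (skb->ip_summed == CHECKSUM_COMPLETE)
		skb->csum = csum_sub(skb->csum,
				     csum_partial(start, len, 0));
}

Here "len" is typically a header length (14, 20, or 40 bytes), which is
why the short-buffer cases in the table below matter most.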
v4:
  - Went back to C code with inline assembly for critical routines
  - Implemented suggestion from Linus to deal with lengths < 8

Testing:

Correctness:

Verified correctness by testing arbitrary-length buffers filled with
random data. For each buffer I compared the checksum computed by the
new code against the one computed by the original algorithm, for each
possible alignment (0-7 bytes).

Performance:

Isolating old and new implementation for some common cases:

             Old     New     %
    Len/Aln  nsecs   nsecs   Improv
    --------+-------+-------+-------
    1400/0   195.6   181.7    7%    (Big packet)
    40/0      11.4     6.2   45%    (Ipv6 hdr cmn case)
    8/4        7.9     3.2   59%    (UDP, VXLAN in IPv4)
    14/0       8.9     5.9   33%    (Eth hdr)
    14/4       9.2     5.9   35%    (Eth hdr in IPv4)
    14/3       9.6     5.9   38%    (Eth with odd align)
    20/0       9.0     6.2   31%    (IP hdr without options)
    7/1        8.9     4.2   52%    (buffer in one quad)
    100/0     17.4    13.9   20%    (medium-sized pkt)
    100/2     17.8    14.2   20%    (medium-sized pkt w/ alignment)

Results from: Intel(R) Xeon(R) CPU X5650 @ 2.67GHz

Also tested on these with similar results:

    Intel(R) Xeon(R) CPU E5-2660 v2 @ 2.20GHz
    Intel(R) Xeon(R) CPU E5-2680 v2 @ 2.80GHz

Branch prediction:

To test the effects of poor branch prediction in the jump tables I
tested checksum performance with runs for two combinations of length
and alignment. As the baseline I performed the test by doing half of
the calls with the first combination, followed by the second
combination for the second half. In the test case, I interleave the
two combinations so that in every call the length and alignment are
different, to defeat the effects of branch prediction. Running several
cases, I did not see any material performance difference between the
two scenarios (perf stat output is below); nor does either case show a
significant number of branch misses.

Interleave lengths case:

perf stat --repeat 10 -e '{instructions, branches, branch-misses}' \
    ./csum -M new-thrash -I -l 100 -S 24 -a 1 -c 100000000

 Performance counter stats for './csum -M new-thrash -I -l 100 -S 24 -a 1 -c 100000000' (10 runs):

     9,556,693,202      instructions               ( +-  0.00% )
     1,176,208,640      branches                   ( +-  0.00% )
            19,487      branch-misses   # 0.00% of all branches   ( +-  6.07% )

       2.049732539 seconds time elapsed

Non-interleave case:

perf stat --repeat 10 -e '{instructions, branches, branch-misses}' \
    ./csum -M new-thrash -l 100 -S 24 -a 1 -c 100000000

 Performance counter stats for './csum -M new-thrash -l 100 -S 24 -a 1 -c 100000000' (10 runs):

     9,782,188,310      instructions               ( +-  0.00% )
     1,251,286,958      branches                   ( +-  0.01% )
            18,950      branch-misses   # 0.00% of all branches   ( +- 12.74% )

       2.271789046 seconds time elapsed

Signed-off-by: Tom Herbert <t...@herbertland.com>
---
 arch/x86/include/asm/checksum_64.h |  21 ++++
 arch/x86/lib/csum-partial_64.c     | 225 ++++++++++++++++++++-----------------
 2 files changed, 143 insertions(+), 103 deletions(-)

diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h
index cd00e17..e20c35b 100644
--- a/arch/x86/include/asm/checksum_64.h
+++ b/arch/x86/include/asm/checksum_64.h
@@ -188,6 +188,27 @@ static inline unsigned add32_with_carry(unsigned a, unsigned b)
 	return a;
 }
 
+static inline unsigned long add64_with_carry(unsigned long a, unsigned long b)
+{
+	asm("addq %2,%0\n\t"
+	    "adcq $0,%0"
+	    : "=r" (a)
+	    : "0" (a), "rm" (b));
+	return a;
+}
+
+static inline unsigned int add32_with_carry3(unsigned int a, unsigned int b,
+					     unsigned int c)
+{
+	asm("addl %2,%0\n\t"
+	    "adcl %3,%0\n\t"
+	    "adcl $0,%0"
+	    : "=r" (a)
+	    : "0" (a), "rm" (b), "rm" (c));
+
+	return a;
+}
+
 #define HAVE_ARCH_CSUM_ADD
 static inline __wsum csum_add(__wsum csum, __wsum addend)
 {
diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index 9845371..df82c9b 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -8,114 +8,66 @@
 #include <linux/compiler.h>
 #include <linux/module.h>
 #include <asm/checksum.h>
+#include <asm/word-at-a-time.h>
 
-static inline unsigned short from32to16(unsigned a)
+/*
+ * Rotating a one's complement sum right by one byte swaps its byte lanes,
+ * compensating for input that started at an odd address.
+ */
+static inline unsigned long rotate_by8_if_odd(unsigned long sum, bool aligned)
 {
-	unsigned short b = a >> 16;
-	asm("addw %w2,%w0\n\t"
-	    "adcw $0,%w0\n"
-	    : "=r" (b)
-	    : "0" (b), "r" (a));
-	return b;
+	if (unlikely(aligned))
+		asm("rorq $0x8,%0"
+		    : "=r" (sum)
+		    : "0" (sum));
+	return sum;
 }
 
-/*
- * Do a 64-bit checksum on an arbitrary memory area.
- * Returns a 32bit checksum.
- *
- * This isn't as time critical as it used to be because many NICs
- * do hardware checksumming these days.
- *
- * Things tried and found to not make it faster:
- *		Manual Prefetching
- *		Unrolling to an 128 bytes inner loop.
- *		Using interleaving with more registers to break the carry chains.
- */
-static unsigned do_csum(const unsigned char *buff, unsigned len)
+/* Extract the first "len" bytes of quad "val" (its high order bytes in nbo). */
+static inline unsigned long csum_partial_lt8_head(unsigned long val, int len)
 {
-	unsigned odd, count;
-	unsigned long result = 0;
+	return val & ((1ul << len * 8) - 1);
+}
 
-	if (unlikely(len == 0))
-		return result;
-	odd = 1 & (unsigned long) buff;
-	if (unlikely(odd)) {
-		result = *buff << 8;
-		len--;
-		buff++;
-	}
-	count = len >> 1;		/* nr of 16-bit words.. */
-	if (count) {
-		if (2 & (unsigned long) buff) {
-			result += *(unsigned short *)buff;
-			count--;
-			len -= 2;
-			buff += 2;
-		}
-		count >>= 1;		/* nr of 32-bit words.. */
-		if (count) {
-			unsigned long zero;
-			unsigned count64;
-			if (4 & (unsigned long) buff) {
-				result += *(unsigned int *) buff;
-				count--;
-				len -= 4;
-				buff += 4;
-			}
-			count >>= 1;	/* nr of 64-bit words.. */
-
-			/* main loop using 64byte blocks */
-			zero = 0;
-			count64 = count >> 3;
-			while (count64) {
-				asm("addq 0*8(%[src]),%[res]\n\t"
-				    "adcq 1*8(%[src]),%[res]\n\t"
-				    "adcq 2*8(%[src]),%[res]\n\t"
-				    "adcq 3*8(%[src]),%[res]\n\t"
-				    "adcq 4*8(%[src]),%[res]\n\t"
-				    "adcq 5*8(%[src]),%[res]\n\t"
-				    "adcq 6*8(%[src]),%[res]\n\t"
-				    "adcq 7*8(%[src]),%[res]\n\t"
-				    "adcq %[zero],%[res]"
-				    : [res] "=r" (result)
-				    : [src] "r" (buff), [zero] "r" (zero),
-				    "[res]" (result));
-				buff += 64;
-				count64--;
-			}
-
-			/* last up to 7 8byte blocks */
-			count %= 8;
-			while (count) {
-				asm("addq %1,%0\n\t"
-				    "adcq %2,%0\n"
-				    : "=r" (result)
-				    : "m" (*(unsigned long *)buff),
-				    "r" (zero), "0" (result));
-				--count;
-				buff += 8;
-			}
-			result = add32_with_carry(result>>32,
-						  result&0xffffffff);
-
-			if (len & 4) {
-				result += *(unsigned int *) buff;
-				buff += 4;
-			}
-		}
-		if (len & 2) {
-			result += *(unsigned short *) buff;
-			buff += 2;
-		}
-	}
-	if (len & 1)
-		result += *buff;
-	result = add32_with_carry(result>>32, result & 0xffffffff);
-	if (unlikely(odd)) {
-		result = from32to16(result);
-		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
+/* Extract the last "len" bytes of quad "val" (its low order bytes in nbo). */
+static inline unsigned long csum_partial_lt8_tail(unsigned long val, int len)
+{
+	return val >> (8 * (8 - len));
+}
+
+/* Sum over buffer up to 64 bytes. */
+static unsigned long csum_partial_le64(const void *buff, unsigned int len,
+				       unsigned long sum)
+{
+	/*
+	 * Jump into a chain of eight adcq instructions so that exactly
+	 * len/8 quads are summed. Each entry in the chain is four bytes
+	 * long (the nop pads the shorter encoding of the final adcq), so
+	 * the entry point can be computed with the lea. clc clears the
+	 * carry flag before the first adcq; the adcq at label 40 folds
+	 * in the final carry.
+	 */
+	asm("lea 40f(, %[slen], 4), %%r11\n\t"
+	    "clc\n\t"
+	    "jmpq *%%r11\n\t"
+	    "adcq 7*8(%[src]),%[res]\n\t"
+	    "adcq 6*8(%[src]),%[res]\n\t"
+	    "adcq 5*8(%[src]),%[res]\n\t"
+	    "adcq 4*8(%[src]),%[res]\n\t"
+	    "adcq 3*8(%[src]),%[res]\n\t"
+	    "adcq 2*8(%[src]),%[res]\n\t"
+	    "adcq 1*8(%[src]),%[res]\n\t"
+	    "adcq 0*8(%[src]),%[res]\n\t"
+	    "nop\n\t"
+	    "40: adcq $0,%[res]"
+	    : [res] "=r" (sum)
+	    : [src] "r" (buff),
+	      [slen] "r" (-((unsigned long)(len >> 3))), "[res]" (sum)
+	    : "r11");
+
+	if (len & 0x7) {
+		unsigned long val;
+		/*
+		 * At least eight valid bytes end at buff + len here, so we
+		 * can backtrack in the buffer to load the outstanding bytes
+		 * into the low order bytes of a quad and sum those in
+		 * csum_partial_lt8_tail. By doing this we avoid additional
+		 * calls to load_unaligned_zeropad.
+		 */
+
+		val = csum_partial_lt8_tail(*(unsigned long *)(buff + len - 8),
+					    len & 0x7);
+		sum = add64_with_carry(val, sum);
 	}
-	return result;
+
+	return sum;
 }
 
 /*
@@ -130,10 +82,77 @@ static unsigned do_csum(const unsigned char *buff, unsigned len)
  *
  * it's best to have buff aligned on a 64-bit boundary
  */
+static unsigned int do_csum(const void *buff, int len, unsigned int sum)
+{
+	bool aligned = false;
+	unsigned long result = 0;
+
+	/*
+	 * For "small" lengths don't bother with alignment, x86 handles
+	 * this pretty well.
+	 */
+	if (len <= 8) {
+		unsigned long val;
+
+		/* Optimize trivial cases */
+		if (len == 8)
+			return add32_with_carry3(sum,
+						 *(unsigned int *)buff,
+						 *(unsigned int *)(buff + 4));
+		else if (len == 0)
+			return sum;
+
+		val = load_unaligned_zeropad(buff);
+		result = csum_partial_lt8_head(val, len);
+
+		return add32_with_carry3(sum, result >> 32,
+					 result & 0xffffffff);
+	} else if (len <= 64) {
+		result = csum_partial_le64(buff, len, result);
+
+		return add32_with_carry3(sum, result >> 32,
+					 result & 0xffffffff);
+	}
+
+	/*
+	 * Length is greater than 64. Sum to eight byte alignment before
+	 * proceeding with main loop.
+	 */
+	aligned = !!((unsigned long)buff & 0x1);
+	if (aligned) {
+		unsigned int align = 7 & -(unsigned long)buff;
+
+		result = csum_partial_lt8_head(*(unsigned long *)buff, align);
+		buff += align;
+		len -= align;
+		result = rotate_by8_if_odd(result, align);
+	}
+
+	/* main loop using 64byte blocks */
+	for (; len >= 64; len -= 64, buff += 64) {
+		asm("addq 0*8(%[src]),%[res]\n\t"
+		    "adcq 1*8(%[src]),%[res]\n\t"
+		    "adcq 2*8(%[src]),%[res]\n\t"
+		    "adcq 3*8(%[src]),%[res]\n\t"
+		    "adcq 4*8(%[src]),%[res]\n\t"
+		    "adcq 5*8(%[src]),%[res]\n\t"
+		    "adcq 6*8(%[src]),%[res]\n\t"
+		    "adcq 7*8(%[src]),%[res]\n\t"
+		    "adcq $0,%[res]"
+		    : [res] "=r" (result)
+		    : [src] "r" (buff),
+		      "[res]" (result));
+	}
+
+	result = csum_partial_le64(buff, len, result);
+	result = rotate_by8_if_odd(result, aligned);
+
+	return add32_with_carry3(sum, result >> 32, result & 0xffffffff);
+}
+
 __wsum csum_partial(const void *buff, int len, __wsum sum)
 {
-	return (__force __wsum)add32_with_carry(do_csum(buff, len),
-						(__force u32)sum);
+	return (__force __wsum)do_csum(buff, len, (__force u32)sum);
 }
 
 /*
-- 
2.6.5
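P.S. For anyone who wants to reproduce the correctness methodology
described above, here is a minimal user-space sketch. It is not the
./csum harness used for the numbers in this posting; csum_words() is
only a stand-in for the routine under test so the sketch is
self-contained, and all names are illustrative. Checksums are compared
after folding to 16 bits, for every length and every alignment within
a quad (0-7 bytes):

/* csum_check.c: sketch of the random-buffer correctness test. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Fold a sum to 16 bits with end-around carry. */
static uint16_t fold16(uint64_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

/* Reference: sum 16-bit little-endian words, one byte pair at a time. */
static uint64_t csum_ref(const uint8_t *p, size_t len)
{
	uint64_t sum = 0;
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += (uint32_t)p[i] | ((uint32_t)p[i + 1] << 8);
	if (len & 1)
		sum += p[len - 1];
	return sum;
}

/* Stand-in for the routine under test: 32 bits at a time. */
static uint64_t csum_words(const uint8_t *p, size_t len)
{
	uint64_t sum = 0;
	uint32_t w;
	uint16_t h;

	for (; len >= 4; p += 4, len -= 4) {
		memcpy(&w, p, 4);	/* alignment-safe load */
		sum += w;
	}
	if (len >= 2) {
		memcpy(&h, p, 2);
		sum += h;
		p += 2;
		len -= 2;
	}
	if (len)
		sum += *p;
	return sum;
}

int main(void)
{
	uint8_t buf[512 + 8];
	size_t len, i;
	int align;

	srand(1);
	for (i = 0; i < sizeof(buf); i++)
		buf[i] = (uint8_t)rand();

	for (len = 0; len <= 512; len++) {
		for (align = 0; align < 8; align++) {
			uint16_t a = fold16(csum_ref(buf + align, len));
			uint16_t b = fold16(csum_words(buf + align, len));

			if (a != b) {
				fprintf(stderr, "mismatch: len=%zu align=%d\n",
					len, align);
				return 1;
			}
		}
	}
	printf("all lengths/alignments match\n");
	return 0;
}

Build with something like: gcc -O2 -o csum_check csum_check.c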