On Tue, Jan 25, 2022 at 08:23:18PM +0000, Miod Vallat wrote:
> in4_cksum(), used to compute packet checksums for the legacy internet
> protocol, has been hand-optimized for speed on most elderly platforms,
> with the most recent pieces of silicon using a portable C
> implementation.
>
> Most of these implementations, in a not-so-uncommon case, need to
> checksum an extra (``overlay'') header, and invoke memset or bzero on
> such an overlay, prior to initializing it and checksumming it.
>
> However, except for one byte, the zeroed parts are useless since they
> will not change the checksum, so that memset/bzero call can be removed,
> and the sum can omit the first 8 bytes which will always be zero.
>
> The following diff implements that idea. Plus on sparc64 you get one
> useless assembly instruction removed, for free, isn't that awesome?
>
> Affected platforms: alpha, amd64, arm64, hppa, landisk, luna88k, macppc,
> octeon, powerpc64, riscv64, sparc64. No need to test on armv7 and i386.
>
> Index: sys/arch/alpha/alpha/in_cksum.c
> ===================================================================
> RCS file: /OpenBSD/src/sys/arch/alpha/alpha/in_cksum.c,v
> retrieving revision 1.9
> diff -u -p -r1.9 in_cksum.c
> --- sys/arch/alpha/alpha/in_cksum.c 21 Aug 2014 14:24:08 -0000 1.9
> +++ sys/arch/alpha/alpha/in_cksum.c 25 Jan 2022 20:21:06 -0000
> @@ -212,14 +212,13 @@ in4_cksum(struct mbuf *m, u_int8_t nxt,
> panic("in4_cksum: bad mbuf chain");
> #endif
>
> - memset(&ipov, 0, sizeof(ipov));
> -
> - ipov.ih_len = htons(len);
> + ipov.ih_x1[8] = 0;
> ipov.ih_pr = nxt;
> + ipov.ih_len = htons(len);
> ipov.ih_src = mtod(m, struct ip *)->ip_src;
> ipov.ih_dst = mtod(m, struct ip *)->ip_dst;
>
> - sum += in_cksumdata((caddr_t) &ipov, sizeof(ipov));
> + sum += in_cksumdata((caddr_t) &ipov + 8, sizeof(ipov) - 8);
I think this would be clearer with a comment.
/* assumes first 8 bytes are zeroes */
sum += in_cksumdata((caddr_t) &ipov + 8, sizeof(ipov) - 8);
> }
>
> /* skip over unnecessary part */
> Index: sys/arch/m88k/m88k/in_cksum.c
> ===================================================================
> RCS file: /OpenBSD/src/sys/arch/m88k/m88k/in_cksum.c,v
> retrieving revision 1.4
> diff -u -p -r1.4 in_cksum.c
> --- sys/arch/m88k/m88k/in_cksum.c 21 Aug 2014 14:24:08 -0000 1.4
> +++ sys/arch/m88k/m88k/in_cksum.c 25 Jan 2022 20:21:07 -0000
> @@ -95,19 +95,22 @@ in4_cksum(struct mbuf *m, uint8_t nxt, i
> {
> u_int16_t *w;
> u_int sum = 0;
> - struct ipovly ipov;
> + union {
> + struct ipovly ipov;
> + u_int16_t w[10];
> + } u;
>
> if (nxt != 0) {
> /* pseudo header */
> - bzero(&ipov, sizeof(ipov));
> - ipov.ih_len = htons(len);
> - ipov.ih_pr = nxt;
> - ipov.ih_src = mtod(m, struct ip *)->ip_src;
> - ipov.ih_dst = mtod(m, struct ip *)->ip_dst;
> - w = (u_int16_t *)&ipov;
> - /* assumes sizeof(ipov) == 20 */
> - sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; sum += w[4];
> - sum += w[5]; sum += w[6]; sum += w[7]; sum += w[8]; sum += w[9];
> + u.ipov.ih_x1[8] = 0;
> + u.ipov.ih_pr = nxt;
> + u.ipov.ih_len = htons(len);
> + u.ipov.ih_src = mtod(m, struct ip *)->ip_src;
> + u.ipov.ih_dst = mtod(m, struct ip *)->ip_dst;
> + w = u.w;
> + /* assumes sizeof(ipov) == 20 and first 8 bytes are zeroes */
> + sum += w[4]; sum += w[5]; sum += w[6];
> + sum += w[7]; sum += w[8]; sum += w[9];
Please remove the trailing space that some of the changed lines have.
> }
>
> /* skip unnecessary part */
> Index: sys/arch/powerpc/powerpc/in_cksum.c
> ===================================================================
> RCS file: /OpenBSD/src/sys/arch/powerpc/powerpc/in_cksum.c,v
> retrieving revision 1.10
> diff -u -p -r1.10 in_cksum.c
> --- sys/arch/powerpc/powerpc/in_cksum.c 22 Jul 2014 10:35:35 -0000
> 1.10
> +++ sys/arch/powerpc/powerpc/in_cksum.c 25 Jan 2022 20:21:07 -0000
> @@ -254,15 +254,15 @@ in4_cksum(struct mbuf *m, uint8_t nxt, i
>
> if (nxt != 0) {
> /* pseudo header */
> - memset(&u.ipov, 0, sizeof(u.ipov));
> - u.ipov.ih_len = htons(len);
> + u.ipov.ih_x1[8] = 0;
> u.ipov.ih_pr = nxt;
> + u.ipov.ih_len = htons(len);
> u.ipov.ih_src = mtod(m, struct ip *)->ip_src;
> u.ipov.ih_dst = mtod(m, struct ip *)->ip_dst;
> w = u.w;
> - /* assumes sizeof(ipov) == 20 */
> - sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; sum += w[4];
> - sum += w[5]; sum += w[6]; sum += w[7]; sum += w[8]; sum += w[9];
> + /* assumes sizeof(ipov) == 20 and first 8 bytes are zeroes */
> + sum += w[4]; sum += w[5]; sum += w[6];
> + sum += w[7]; sum += w[8]; sum += w[9];
> }
>
> /* skip unnecessary part */
> Index: sys/arch/sparc64/sparc64/in4_cksum.c
> ===================================================================
> RCS file: /OpenBSD/src/sys/arch/sparc64/sparc64/in4_cksum.c,v
> retrieving revision 1.7
> diff -u -p -r1.7 in4_cksum.c
> --- sys/arch/sparc64/sparc64/in4_cksum.c 24 Aug 2014 20:06:57 -0000
> 1.7
> +++ sys/arch/sparc64/sparc64/in4_cksum.c 25 Jan 2022 20:21:07 -0000
> @@ -93,7 +93,6 @@ extern int in_cksum_internal(struct mbuf
> int
> in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len)
> {
> - u_char *w;
> u_int sum = 0;
> struct ipovly ipov;
>
> @@ -106,24 +105,19 @@ in4_cksum(struct mbuf *m, u_int8_t nxt,
>
> if (nxt != 0) {
> /* pseudo header */
> - memset(&ipov, 0, sizeof(ipov));
> - ipov.ih_len = htons(len);
> - ipov.ih_pr = nxt;
> ipov.ih_src = mtod(m, struct ip *)->ip_src;
> ipov.ih_dst = mtod(m, struct ip *)->ip_dst;
> - w = (u_char *)&ipov;
> + sum = ((u_int)nxt << 16) | htons(len);
> /* assumes sizeof(ipov) == 20 */
> - __asm volatile(" lduw [%5 + 0], %1; "
> - " lduw [%5 + 4], %2; "
> - " lduw [%5 + 8], %3; add %0, %1, %0; "
> - " lduw [%5 + 12], %1; add %0, %2, %0; "
> - " lduw [%5 + 16], %2; add %0, %3, %0; "
> + __asm volatile(
> + " lduw [%5 + 12], %1; "
> + " lduw [%5 + 16], %2; "
> " mov -1, %3; add %0, %1, %0; "
> " srl %3, 0, %3; add %0, %2, %0; "
> - " srlx %0, 32, %2; and %0, %3, %1; "
> + " srlx %0, 32, %2; "
> " add %0, %2, %0; "
> : "=r" (sum), "=&r" (tmp1), "=&r" (tmp2), "=&r" (tmp3)
> - : "0" (sum), "r" (w));
> + : "0" (sum), "r" (&ipov));
I might be missing something, but is the temporary register %3 needed
at all?
> }
>
> /* skip unnecessary part */
> Index: sys/netinet/in4_cksum.c
> ===================================================================
> RCS file: /OpenBSD/src/sys/netinet/in4_cksum.c,v
> retrieving revision 1.10
> diff -u -p -r1.10 in4_cksum.c
> --- sys/netinet/in4_cksum.c 8 Sep 2014 06:24:13 -0000 1.10
> +++ sys/netinet/in4_cksum.c 25 Jan 2022 20:21:09 -0000
> @@ -111,15 +111,15 @@ in4_cksum(struct mbuf *m, u_int8_t nxt,
> panic("in4_cksum: offset too short");
> if (m->m_len < sizeof(struct ip))
> panic("in4_cksum: bad mbuf chain");
> - bzero(&u.ipov, sizeof(u.ipov));
> - u.ipov.ih_len = htons(len);
> + u.ipov.ih_x1[8] = 0;
> u.ipov.ih_pr = nxt;
> + u.ipov.ih_len = htons(len);
> u.ipov.ih_src = mtod(m, struct ip *)->ip_src;
> u.ipov.ih_dst = mtod(m, struct ip *)->ip_dst;
> w = u.w;
> - /* assumes sizeof(ipov) == 20 */
> - sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; sum += w[4];
> - sum += w[5]; sum += w[6]; sum += w[7]; sum += w[8]; sum += w[9];
> + /* assumes sizeof(ipov) == 20 and first 8 bytes are zeroes */
> + sum += w[4]; sum += w[5]; sum += w[6];
> + sum += w[7]; sum += w[8]; sum += w[9];
> }
>
> /* skip unnecessary part */
>
Tested on octeon and riscv64. I disabled hardware checksumming in the
NIC drivers and checked netstat -s output while moving packets with
tcpbench against an unpatched system.