On Tue, Jan 25, 2022 at 08:23:18PM +0000, Miod Vallat wrote:
> in4_cksum(), used to compute packet checksums for the legacy internet
> protocol, has been hand-optimized for speed on most elderly platforms,
> with the most recent pieces of silicon using a portable C
> implementation.
> 
> Most of these implementations, in a not-so-uncommon case, need to
> checksum an extra (``overlay'') header, and invoke memset or bzero on
> such an overlay, prior to initializing it and checksumming it.
> 
> However, except for one byte, the zeroed parts are useless since they
> will not change the checksum, so that memset/bzero call can be removed,
> and the sum can omit the first 8 bytes which will always be zero.
> 
> The following diff implements that idea. Plus on sparc64 you get one
> useless assembly instruction removed, for free, isn't that awesome?
> 
> Affected platforms: alpha, amd64, arm64, hppa, landisk, luna88k, macppc,
> octeon, powerpc64, riscv64, sparc64. No need to test on armv7 and i386.
> 
> Index: sys/arch/alpha/alpha/in_cksum.c
> ===================================================================
> RCS file: /OpenBSD/src/sys/arch/alpha/alpha/in_cksum.c,v
> retrieving revision 1.9
> diff -u -p -r1.9 in_cksum.c
> --- sys/arch/alpha/alpha/in_cksum.c   21 Aug 2014 14:24:08 -0000      1.9
> +++ sys/arch/alpha/alpha/in_cksum.c   25 Jan 2022 20:21:06 -0000
> @@ -212,14 +212,13 @@ in4_cksum(struct mbuf *m, u_int8_t nxt, 
>                       panic("in4_cksum: bad mbuf chain");
>  #endif
>  
> -             memset(&ipov, 0, sizeof(ipov));
> -
> -             ipov.ih_len = htons(len);
> +             ipov.ih_x1[8] = 0;
>               ipov.ih_pr = nxt;
> +             ipov.ih_len = htons(len);
>               ipov.ih_src = mtod(m, struct ip *)->ip_src;
>               ipov.ih_dst = mtod(m, struct ip *)->ip_dst;
>  
> -             sum += in_cksumdata((caddr_t) &ipov, sizeof(ipov));
> +             sum += in_cksumdata((caddr_t) &ipov + 8, sizeof(ipov) - 8);

I think this would be clearer with a comment.

                /* assumes first 8 bytes are zeroes */
                sum += in_cksumdata((caddr_t) &ipov + 8, sizeof(ipov) - 8);

>       }
>  
>       /* skip over unnecessary part */
> Index: sys/arch/m88k/m88k/in_cksum.c
> ===================================================================
> RCS file: /OpenBSD/src/sys/arch/m88k/m88k/in_cksum.c,v
> retrieving revision 1.4
> diff -u -p -r1.4 in_cksum.c
> --- sys/arch/m88k/m88k/in_cksum.c     21 Aug 2014 14:24:08 -0000      1.4
> +++ sys/arch/m88k/m88k/in_cksum.c     25 Jan 2022 20:21:07 -0000
> @@ -95,19 +95,22 @@ in4_cksum(struct mbuf *m, uint8_t nxt, i
>  {
>       u_int16_t *w;
>       u_int sum = 0;
> -     struct ipovly ipov;
> +     union {
> +             struct ipovly ipov;
> +             u_int16_t w[10];
> +     } u;
>  
>       if (nxt != 0) {
>               /* pseudo header */
> -             bzero(&ipov, sizeof(ipov));
> -             ipov.ih_len = htons(len);
> -             ipov.ih_pr = nxt; 
> -             ipov.ih_src = mtod(m, struct ip *)->ip_src; 
> -             ipov.ih_dst = mtod(m, struct ip *)->ip_dst;
> -             w = (u_int16_t *)&ipov;
> -             /* assumes sizeof(ipov) == 20 */
> -             sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; sum += w[4];
> -             sum += w[5]; sum += w[6]; sum += w[7]; sum += w[8]; sum += w[9];
> +             u.ipov.ih_x1[8] = 0; 
> +             u.ipov.ih_pr = nxt; 
> +             u.ipov.ih_len = htons(len);
> +             u.ipov.ih_src = mtod(m, struct ip *)->ip_src; 
> +             u.ipov.ih_dst = mtod(m, struct ip *)->ip_dst;
> +             w = u.w;
> +             /* assumes sizeof(ipov) == 20 and first 8 bytes are zeroes */
> +             sum += w[4]; sum += w[5]; sum += w[6];
> +             sum += w[7]; sum += w[8]; sum += w[9];

Please remove the trailing space that some of the changed lines have.

>       }
>  
>       /* skip unnecessary part */
> Index: sys/arch/powerpc/powerpc/in_cksum.c
> ===================================================================
> RCS file: /OpenBSD/src/sys/arch/powerpc/powerpc/in_cksum.c,v
> retrieving revision 1.10
> diff -u -p -r1.10 in_cksum.c
> --- sys/arch/powerpc/powerpc/in_cksum.c       22 Jul 2014 10:35:35 -0000      
> 1.10
> +++ sys/arch/powerpc/powerpc/in_cksum.c       25 Jan 2022 20:21:07 -0000
> @@ -254,15 +254,15 @@ in4_cksum(struct mbuf *m, uint8_t nxt, i
>  
>       if (nxt != 0) {
>               /* pseudo header */
> -             memset(&u.ipov, 0, sizeof(u.ipov));
> -             u.ipov.ih_len = htons(len);
> +             u.ipov.ih_x1[8] = 0;
>               u.ipov.ih_pr = nxt; 
> +             u.ipov.ih_len = htons(len);
>               u.ipov.ih_src = mtod(m, struct ip *)->ip_src; 
>               u.ipov.ih_dst = mtod(m, struct ip *)->ip_dst;
>               w = u.w;
> -             /* assumes sizeof(ipov) == 20 */
> -             sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; sum += w[4];
> -             sum += w[5]; sum += w[6]; sum += w[7]; sum += w[8]; sum += w[9];
> +             /* assumes sizeof(ipov) == 20 and first 8 bytes are zeroes */
> +             sum += w[4]; sum += w[5]; sum += w[6];
> +             sum += w[7]; sum += w[8]; sum += w[9];
>       }
>  
>       /* skip unnecessary part */
> Index: sys/arch/sparc64/sparc64/in4_cksum.c
> ===================================================================
> RCS file: /OpenBSD/src/sys/arch/sparc64/sparc64/in4_cksum.c,v
> retrieving revision 1.7
> diff -u -p -r1.7 in4_cksum.c
> --- sys/arch/sparc64/sparc64/in4_cksum.c      24 Aug 2014 20:06:57 -0000      
> 1.7
> +++ sys/arch/sparc64/sparc64/in4_cksum.c      25 Jan 2022 20:21:07 -0000
> @@ -93,7 +93,6 @@ extern int in_cksum_internal(struct mbuf
>  int
>  in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len)
>  {
> -     u_char *w;
>       u_int sum = 0;
>       struct ipovly ipov;
>  
> @@ -106,24 +105,19 @@ in4_cksum(struct mbuf *m, u_int8_t nxt, 
>  
>       if (nxt != 0) {
>               /* pseudo header */
> -             memset(&ipov, 0, sizeof(ipov));
> -             ipov.ih_len = htons(len);
> -             ipov.ih_pr = nxt; 
>               ipov.ih_src = mtod(m, struct ip *)->ip_src; 
>               ipov.ih_dst = mtod(m, struct ip *)->ip_dst;
> -             w = (u_char *)&ipov;
> +             sum = ((u_int)nxt << 16) | htons(len);
>               /* assumes sizeof(ipov) == 20 */
> -             __asm volatile(" lduw [%5 + 0], %1; "
> -                     " lduw [%5 + 4], %2; "
> -                     " lduw [%5 + 8], %3; add %0, %1, %0; "
> -                     " lduw [%5 + 12], %1; add %0, %2, %0; "
> -                     " lduw [%5 + 16], %2; add %0, %3, %0; "
> +             __asm volatile(
> +                     " lduw [%5 + 12], %1; "
> +                     " lduw [%5 + 16], %2; "
>                       " mov -1, %3; add %0, %1, %0; "
>                       " srl %3, 0, %3; add %0, %2, %0; "
> -                     " srlx %0, 32, %2; and %0, %3, %1; "
> +                     " srlx %0, 32, %2; "
>                       " add %0, %2, %0; "
>                       : "=r" (sum), "=&r" (tmp1), "=&r" (tmp2), "=&r" (tmp3)
> -                     : "0" (sum), "r" (w));
> +                     : "0" (sum), "r" (&ipov));

I might be missing something, but is the temporary register %3 needed
at all?

>       }
>  
>       /* skip unnecessary part */
> Index: sys/netinet/in4_cksum.c
> ===================================================================
> RCS file: /OpenBSD/src/sys/netinet/in4_cksum.c,v
> retrieving revision 1.10
> diff -u -p -r1.10 in4_cksum.c
> --- sys/netinet/in4_cksum.c   8 Sep 2014 06:24:13 -0000       1.10
> +++ sys/netinet/in4_cksum.c   25 Jan 2022 20:21:09 -0000
> @@ -111,15 +111,15 @@ in4_cksum(struct mbuf *m, u_int8_t nxt, 
>                       panic("in4_cksum: offset too short");
>               if (m->m_len < sizeof(struct ip))
>                       panic("in4_cksum: bad mbuf chain");
> -             bzero(&u.ipov, sizeof(u.ipov));
> -             u.ipov.ih_len = htons(len);
> +             u.ipov.ih_x1[8] = 0;
>               u.ipov.ih_pr = nxt;
> +             u.ipov.ih_len = htons(len);
>               u.ipov.ih_src = mtod(m, struct ip *)->ip_src;
>               u.ipov.ih_dst = mtod(m, struct ip *)->ip_dst;
>               w = u.w;
> -             /* assumes sizeof(ipov) == 20 */
> -             sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; sum += w[4];
> -             sum += w[5]; sum += w[6]; sum += w[7]; sum += w[8]; sum += w[9];
> +             /* assumes sizeof(ipov) == 20 and first 8 bytes are zeroes */
> +             sum += w[4]; sum += w[5]; sum += w[6];
> +             sum += w[7]; sum += w[8]; sum += w[9];
>       }
>  
>       /* skip unnecessary part */
> 

Tested on octeon and riscv64. I disabled hardware checksumming in the
NIC drivers and checked netstat -s output while moving packets with
tcpbench against an unpatched system.

Reply via email to