> > > > > > +/**
> > > > > > + * Copy bytes from one location to another,
> > > > > > + * locations should not overlap.
> > > > > > + * Use with n <= 16.
> > > > > > + *
> > > > > > + * Note: Copying uninitialized memory is perfectly
> acceptable.
> > > > > > + * Using e.g. memcpy(dst, src, 8) instead of
> > > > > > + * *(unaligned_uint64_t*) = *(const unaligned_uint64_t *)src
> > > > > > + * avoids compiler warnings about source data may be
> > > uninitialized
> > > > > > + * [-Wmaybe-uninitialized].
> > > > > > + *
> > > > > > + * Note: Using "n & X" generates 3-byte "test" instructions,
> > > > > > + * instead of "n >= X", which would generate 4-byte "cmp"
> > > > > instructions.
> > > > > > + */
> > > > > > +static __rte_always_inline void *
> > > > > > +rte_mov16_or_less(void *dst, const void *src, size_t n)
> > > > > > +{
> > > > > > +   /* Faster way when size is known at build time. */
> > > > > > +   if (__rte_constant(n)) {
> > > > > > +           if (n == 2)
> > > > > > +                   return memcpy(dst, src, 2);
> > > > > > +           if (n == 4)
> > > > > > +                   return memcpy(dst, src, 4);
> > > > > > +           if (n == 6) /* 4 + 2 */
> > > > > > +                   return memcpy(dst, src, 6);
> > > > > > +           if (n == 8)
> > > > > > +                   return memcpy(dst, src, 8);
> > > > > > +           if (n == 10) /* 8 + 2 */
> > > > > > +                   return memcpy(dst, src, 10);
> > > > > > +           if (n == 12) /* 8 + 4 */
> > > > > > +                   return memcpy(dst, src, 12);
> > > > > > +           if (n == 16) {
> > > > > > +                   rte_mov16((uint8_t *)dst, (const uint8_t
> > > *)src);
> > > > > > +                   return dst;
> > > > > > +           }
> > > > > > +   }
> > > > > > +
> > > > > > +   if (n & 0x18) { /* n >= 8 */
> > > > >
> > > > > Probably 'n & 0x8'?
> > > >
> > > > It's intentional, to catch n == 0x10 too.
> > > > It seems the associated comment should be more verbose. How
> about:
> > > > if (n & 0x18) { /* n >= 8, including n == 0x10, hence n & 0x18 */
> > >
> > > > OK, then why not simply: if (n >= 8)?
> >
> > The reason mentioned in the function description:
> >  * Note: Using "n & X" generates 3-byte "test" instructions,
> >  * instead of "n >= X", which would generate 4-byte "cmp"
> instructions.
> Ah, I see the comment now; sorry, I missed it the first time.
> Indeed, 'cmp' will be one byte longer.
> Though for non-constant 'n' gcc generates 90B of code for both
> versions:
> https://godbolt.org/z/K6861qGcr

True. The saved byte becomes an extra byte of NOP padding.

There might also be an opportunity to eliminate one more branch by reorganizing
the outer comparison like so:

if (n <= 32)
        if (n <= 16)
                return rte_mov16_or_less (dst, src, n);
        else
                return rte_mov17_to_32(dst, src, n);
else if (n <= 64)
        return rte_mov33_to_64(dst, src, n);

Or:

if (n <= 16)
        return rte_mov16_or_less (dst, src, n);
else if (n <= 64)
        if (n <= 32)
                return rte_mov17_to_32(dst, src, n);
        else
                return rte_mov33_to_64(dst, src, n);

I'll play around with it some more and post a v4.

> 
> 
> >
> > I'll move that comment down here, just before the comparisons.
> >
> > >
> > > > >
> > > > > > +           /* copy 8 ~ 16 bytes */
> > > > > > +           memcpy(dst, src, 8);
> > > > > > +           memcpy((uint8_t *)dst - 8 + n, (const uint8_t *)src -
> > > 8 +
> > > > > n, 8);
> > > > > > +   } else if (n & 0x4) {
> > > > > > +           /* copy 4 ~ 7 bytes */
> > > > > > +           memcpy(dst, src, 4);
> > > > > > +           memcpy((uint8_t *)dst - 4 + n, (const uint8_t *)src -
> > > 4 +
> > > > > n, 4);
> > > > > > +   } else if (n & 0x2) {
> > > > > > +           /* copy 2 ~ 3 bytes */
> > > > > > +           memcpy(dst, src, 2);
> > > > > > +           memcpy((uint8_t *)dst - 2 + n, (const uint8_t *)src -
> > > 2 +
> > > > > n, 2);
> > > > > > +   } else if (n & 0x1) {
> > > > > > +           /* copy 1 byte */
> > > > > > +           memcpy(dst, src, 1);
> > > > > > +   }
> > > > > > +   return dst;
> > > > > > +}

Reply via email to