It seems we're sticking with the C memcpy for a while (which does the overlap check and logging), but that means we're missing out on the potential asm speedup. Let's try for the best of both worlds by having the C memcpy call into memmove. Yes, memmove will do another direction test, but then it will go zip zoom fast. On certain somewhat popular architectures, for instance, the asm memmove can skip the alignment checks.
Index: string/memcpy.c =================================================================== RCS file: /cvs/src/lib/libc/string/memcpy.c,v retrieving revision 1.2 diff -u -p -r1.2 memcpy.c --- string/memcpy.c 31 Aug 2015 02:53:57 -0000 1.2 +++ string/memcpy.c 5 Sep 2016 10:05:50 -0000 @@ -36,26 +36,14 @@ #include <syslog.h> /* - * sizeof(word) MUST BE A POWER OF TWO - * SO THAT wmask BELOW IS ALL ONES - */ -typedef long word; /* "word" used for optimal copy speed */ - -#define wsize sizeof(word) -#define wmask (wsize - 1) - -/* * Copy a block of memory, not handling overlap. */ void * -memcpy(void *dst0, const void *src0, size_t length) +memcpy(void *dst, const void *src, size_t length) { - char *dst = dst0; - const char *src = src0; - size_t t; if (length == 0 || dst == src) /* nothing to do */ - goto done; + return dst; if ((dst < src && dst + length > src) || (src < dst && src + length > dst)) { @@ -65,36 +53,7 @@ memcpy(void *dst0, const void *src0, siz abort(); } - /* - * Macros: loop-t-times; and loop-t-times, t>0 - */ -#define TLOOP(s) if (t) TLOOP1(s) -#define TLOOP1(s) do { s; } while (--t) + return memmove(dst, src, length); - /* - * Copy forward. - */ - t = (long)src; /* only need low bits */ - if ((t | (long)dst) & wmask) { - /* - * Try to align operands. This cannot be done - * unless the low bits match. - */ - if ((t ^ (long)dst) & wmask || length < wsize) - t = length; - else - t = wsize - (t & wmask); - length -= t; - TLOOP1(*dst++ = *src++); - } - /* - * Copy whole words, then mop up any trailing bytes. - */ - t = length / wsize; - TLOOP(*(word *)dst = *(word *)src; src += wsize; dst += wsize); - t = length & wmask; - TLOOP(*dst++ = *src++); -done: - return (dst0); } DEF_STRONG(memcpy);