TL;DR: on master, the glibc test string/test-memmove fails on my machine and I don't know why. The other tests work fine.
$ elf/ld.so --inhibit-cache --library-path . string/test-memmove simple_memmove __memmove_ssse3_rep __memmove_ssse3 __memmove_sse2_unaligned __memmove_ia32 string/test-memmove: Wrong result in function __memmove_sse2_unaligned dst "0x70000084" src "0x70000000" offset "43297733" https://sourceware.org/git/?p=glibc.git;a=blob;f=string/test-memmove.c;h=64e3651ba40604e47ddf6d633f4d0aea4644f60a;hb=HEAD Long story: I've trimmed __memmove_sse2_unaligned implementation down to test-memmove-xmm-unaligned.c (attached). It's supposed to show failed memmove attempts when those happen: $ gcc -ggdb3 -O2 -m32 test-memmove-xmm-unaligned.c -o test-memmove-xmm-unaligned -Wall && ./test-memmove-xmm-unaligned Bad result in memmove(dst=0xe7d44110, src=0xe7d44010, len=134217728): offset= 3786689; expected=0039C7C1( 3786689) actual=0039C7C3( 3786691) bit_mismatch=00000002; iteration=1 Bad result in memmove(dst=0xe7d44110, src=0xe7d44010, len=134217728): offset= 3786689; expected=0039C7C1( 3786689) actual=0039C7C3( 3786691) bit_mismatch=00000002; iteration=3 Bad result in memmove(dst=0xe7d44110, src=0xe7d44010, len=134217728): offset= 5448641; expected=005323C1( 5448641) actual=005323C3( 5448643) bit_mismatch=00000002; iteration=5 Bad result in memmove(dst=0xe7d44110, src=0xe7d44010, len=134217728): offset=29022145; expected=01BAD7C1(29022145) actual=01BAD7C3(29022147) bit_mismatch=00000002; iteration=9 $ gcc -ggdb3 -O2 -m64 test-memmove-xmm-unaligned.c -o test-memmove-xmm-unaligned -Wall && ./test-memmove-xmm-unaligned Bad result in memmove(dst=0x7fa4658bf110, src=0x7fa4658bf010, len=134217728): offset=25257857; expected=01816781(25257857) actual=01816783(25257859) bit_mismatch=00000002; iteration=43 Bad result in memmove(dst=0x7fa4658bf110, src=0x7fa4658bf010, len=134217728): offset=28109697; expected=01ACEB81(28109697) actual=01ACEB83(28109699) bit_mismatch=00000002; iteration=112 Bad result in memmove(dst=0x7fa4658bf110, src=0x7fa4658bf010, len=134217728): offset=18257633; 
expected=011696E1(18257633) actual=011696E3(18257635) bit_mismatch=00000002; iteration=363 Bad result in memmove(dst=0x7fa4658bf110, src=0x7fa4658bf010, len=134217728): offset=26981249; expected=019BB381(26981249) actual=019BB383(26981251) bit_mismatch=00000002; iteration=437 Note that it is a single-bit corruption happening occasionally (not on every iteration). -m32 is way more error-prone than -m64. The test example roughly implements these 2 loops: This fails: sfence loop { movdqu [src++],%xmm0 movntdq %xmm0,[dst++] } sfence This works: sfence loop { movdqu [src++],%xmm0 movdqu %xmm0,[dst++] } sfence Failures happen only on a Sandy Bridge CPU: Intel(R) Core(TM) i7-2700K CPU @ 3.50GHz; the kernel is 4.17.0-11928-g2837461dbe6f. The problem is not reproducible instantly after reboot. The machine has to be heavily loaded to start corrupting memory. A few hours of memtest86+ do not reveal any memory failures. I wonder whether anyone else can reproduce this failure or whether I should start looking for a new CPU. From the above it looks as if movntdq does not play well with XMM context save/restore and there is an 'mfence' missing somewhere in interrupt handling. If there are no obvious problems with glibc's memmove() or my small test, what can I do to rule out/pin down a hardware or kernel problem? Thanks! -- Sergei
/*
 * Stress test that emulates the large-copy loop of glibc's
 * __memmove_sse2_unaligned (unaligned SSE2 loads followed by non-temporal
 * stores) and reports every 32-bit word that arrives corrupted.
 *
 * Test as:
 *   $ gcc -ggdb3 -O2 -m32 test-memmove-xmm-unaligned.c \
 *         -o test-memmove-xmm-unaligned -Wall && ./test-memmove-xmm-unaligned
 *
 * Build with -DUSE_STREAMING_STORES=0 to use ordinary movdqu stores
 * instead of movntdq.
 *
 * Error example:
 *   Bad result in memmove(dst=0xd7cf5094, src=0xd7cf5010, len=268435456): offset= 8031729; expected=007A8DF1( 8031729) actual=007A8DF3( 8031731) bit_mismatch=00000002; iteration=2
 *   Bad result in memmove(dst=0xd7cf5094, src=0xd7cf5010, len=268435456): offset=43626993; expected=0299B1F1(43626993) actual=0299B1F3(43626995) bit_mismatch=00000002; iteration=3
 *   Bad result in memmove(dst=0xd7cf5094, src=0xd7cf5010, len=268435456): offset=25404913; expected=0183A5F1(25404913) actual=0183A5F3(25404915) bit_mismatch=00000002; iteration=4
 *   ...
 */

#include <string.h>    /* memmove */
#include <stdlib.h>    /* malloc, exit */
#include <stdio.h>     /* fprintf, perror */
#include <sys/mman.h>  /* mlock() */
#include <emmintrin.h> /* movdqu, sfence, movntdq */

typedef unsigned int u32;

/* 1: store with movntdq (non-temporal); 0: store with movdqu. */
#ifndef USE_STREAMING_STORES
#define USE_STREAMING_STORES 1
#endif

enum {
    VECTORS_PER_BLOCK = 8,    /* 16-byte vectors moved per unrolled iteration */
    DST_OFFSET_ELEMENTS = 64  /* destination offset from source, in u32 units */
};

static void memmove_si128u(__m128i_u *dest, __m128i_u const *src, size_t items)
    __attribute__((noinline));

/*
 * Copy `items` 16-byte vectors from `src` to `dest`, walking backwards from
 * the highest address.
 *
 * Emulates the optimised block of __memmove_sse2_unaligned:
 *   sfence
 *   loop(backwards) {
 *     8x movdqu  mem->%xmm{N}
 *     8x movntdq %xmm{N}->mem
 *   }
 * source: https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S;h=9aa17de99c9c3415a9b5ac28fd9f1eb4457f916d;hb=HEAD#l244
 *
 * ASSUME: (uintptr_t)dest > (uintptr_t)src, so a backward copy is safe for
 * overlapping ranges. With streaming stores enabled, `dest` must also be
 * 16-byte aligned, as required by _mm_stream_si128.
 *
 * Any `items % 8` leftover vectors at the top of the range are copied one at
 * a time first, so the unrolled loop always sees a multiple of 8.
 */
static void memmove_si128u(__m128i_u *dest, __m128i_u const *src, size_t items)
{
    size_t i = items; /* number of vectors still to copy; next one is i - 1 */

    _mm_sfence();

    /* Leftover vectors that do not fill a whole block (none in this test). */
    for (; i % VECTORS_PER_BLOCK != 0; i--) {
        __m128i v = _mm_loadu_si128(src + i - 1);

        if (USE_STREAMING_STORES) {
            _mm_stream_si128((__m128i *)(dest + i - 1), v); // movntdq
        } else {
            _mm_storeu_si128(dest + i - 1, v);              // movdqu
        }
    }

    for (; i != 0; i -= VECTORS_PER_BLOCK) {
        __m128i_u const *s = src + i - 1;
        __m128i_u *d = dest + i - 1;

        /* All eight loads are issued before any store, as in glibc. */
        __m128i xmm0 = _mm_loadu_si128(s - 0); // movdqu
        __m128i xmm1 = _mm_loadu_si128(s - 1); // movdqu
        __m128i xmm2 = _mm_loadu_si128(s - 2); // movdqu
        __m128i xmm3 = _mm_loadu_si128(s - 3); // movdqu
        __m128i xmm4 = _mm_loadu_si128(s - 4); // movdqu
        __m128i xmm5 = _mm_loadu_si128(s - 5); // movdqu
        __m128i xmm6 = _mm_loadu_si128(s - 6); // movdqu
        __m128i xmm7 = _mm_loadu_si128(s - 7); // movdqu

        if (!USE_STREAMING_STORES) {
            // this variant works:
            _mm_storeu_si128(d - 0, xmm0); // movdqu
            _mm_storeu_si128(d - 1, xmm1); // movdqu
            _mm_storeu_si128(d - 2, xmm2); // movdqu
            _mm_storeu_si128(d - 3, xmm3); // movdqu
            _mm_storeu_si128(d - 4, xmm4); // movdqu
            _mm_storeu_si128(d - 5, xmm5); // movdqu
            _mm_storeu_si128(d - 6, xmm6); // movdqu
            _mm_storeu_si128(d - 7, xmm7); // movdqu
        } else {
            _mm_stream_si128((__m128i *)(d - 0), xmm0); // movntdq
            _mm_stream_si128((__m128i *)(d - 1), xmm1); // movntdq
            _mm_stream_si128((__m128i *)(d - 2), xmm2); // movntdq
            _mm_stream_si128((__m128i *)(d - 3), xmm3); // movntdq
            _mm_stream_si128((__m128i *)(d - 4), xmm4); // movntdq
            _mm_stream_si128((__m128i *)(d - 5), xmm5); // movntdq
            _mm_stream_si128((__m128i *)(d - 6), xmm6); // movntdq
            _mm_stream_si128((__m128i *)(d - 7), xmm7); // movntdq
        }
    }

    _mm_sfence();
}

static void do_memmove(u32 *buf, size_t buf_elements, size_t iter)
    __attribute__((noinline));

/*
 * Run one fill / move / verify round on `buf`.
 *
 * The first half of `buf` (buf_elements / 2 words) is filled with
 * 0, 1, 2, 3, ..., moved DST_OFFSET_ELEMENTS words up with memmove_si128u,
 * and every moved word is then compared with its expected value. Each
 * mismatch is reported on stderr together with the iteration number `iter`.
 *
 * buf_elements must be large enough that the shifted copy still fits, i.e.
 * buf_elements / 2 + DST_OFFSET_ELEMENTS <= buf_elements.
 */
static void do_memmove(u32 *buf, size_t buf_elements, size_t iter)
{
    size_t elements_to_move = buf_elements / 2;
    u32 *dst = buf + DST_OFFSET_ELEMENTS;

    // "memset" buffer with 0, 1, 2, 3, ...
    for (u32 i = 0; i < elements_to_move; i++) {
        buf[i] = i;
    }

    // stand-in for __memmove_sse2_unaligned:
    //   memmove(dst, buf, elements_to_move * sizeof (u32));
    memmove_si128u((__m128i_u *)dst, (__m128i_u const *)buf,
                   elements_to_move * sizeof(u32) / sizeof(__m128i));

    // validate that the target buffer holds 0, 1, 2, 3, ...
    for (u32 i = 0; i < elements_to_move; i++) {
        u32 v = dst[i];

        if (v != i) {
            /* %p requires void *, and size_t is printed with %zu. */
            fprintf(stderr,
                    "Bad result in memmove(dst=%p, src=%p, len=%zu)"
                    ": offset=%8u; expected=%08X(%8u) actual=%08X(%8u) bit_mismatch=%08X; iteration=%zu\n",
                    (void *)dst, (void *)buf, elements_to_move * sizeof(u32),
                    i, i, i, v, v, v ^ i, iter);
        }
    }
}

/*
 * Allocate a 256 MiB buffer, try to lock it in RAM, and repeat the
 * fill / move / verify round forever. The program only ends when it is
 * interrupted, so the buffer is reclaimed by the OS at process exit.
 */
int main(void)
{
    size_t size = (size_t)256 * 1024 * 1024;
    void *buf = malloc(size);

    if (buf == NULL) {
        perror("malloc");
        return EXIT_FAILURE;
    }

    /* Locking is best-effort: the test is still meaningful without it. */
    if (mlock(buf, size) != 0) {
        perror("mlock (continuing with unlocked memory)");
    }

    // wait for a failure
    for (size_t n = 0; ; ++n) {
        do_memmove(buf, size / sizeof(u32), n);
    }
}
pgpUaSMyaxli4.pgp
Description: Цифровая подпись OpenPGP