Hi Linhaifeng,
On Tue, May 12, 2015 at 1:13 AM, Linhaifeng <haifeng.lin at huawei.com> wrote: > Hi, Ravi Kerur > > On 2015/5/9 5:19, Ravi Kerur wrote: > > Preliminary results on Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz, Ubuntu > > 14.04 x86_64 show comparisons using AVX/SSE instructions taking 1/3rd > > CPU ticks for 16, 32, 48 and 64 bytes comparison. In addition, > > I wrote a program to test rte_memcmp and I have a question about the > result. > Why does it cost the same number of CPU ticks for 128, 256, 512, 1024 and 1500 bytes? Is there any > problem in > my test? > > If you can wait until Thursday I will probably send a v3 patch which will have full memcmp support. In your program try with a volatile pointer and see if it helps. > > [root at localhost test]# gcc avx_test.c -O3 -I > /data/linhf/v2r2c00/open-source/dpdk/dpdk-2.0.0/x86_64-native-linuxapp-gcc/include/ > -mavx2 -DRTE_MACHINE_CPUFLAG_AVX2 > [root at localhost test]# ./a.out 0 > each test run 100000000 times > copy 16 bytes costs average 7(rte_memcmp) 10(memcmp) ticks > copy 32 bytes costs average 9(rte_memcmp) 11(memcmp) ticks > copy 64 bytes costs average 6(rte_memcmp) 13(memcmp) ticks > copy 128 bytes costs average 11(rte_memcmp) 14(memcmp) ticks > copy 256 bytes costs average 9(rte_memcmp) 14(memcmp) ticks > copy 512 bytes costs average 9(rte_memcmp) 14(memcmp) ticks > copy 1024 bytes costs average 9(rte_memcmp) 14(memcmp) ticks > copy 1500 bytes costs average 11(rte_memcmp) 14(memcmp) ticks > [root at localhost test]# ./a.out 1 > each test run 100000000 times > copy 16 bytes costs average 2(rte_memcpy) 10(memcpy) ticks > copy 32 bytes costs average 2(rte_memcpy) 10(memcpy) ticks > copy 64 bytes costs average 3(rte_memcpy) 10(memcpy) ticks > copy 128 bytes costs average 7(rte_memcpy) 12(memcpy) ticks > copy 256 bytes costs average 9(rte_memcpy) 23(memcpy) ticks > copy 512 bytes costs average 14(rte_memcpy) 34(memcpy) ticks > copy 1024 bytes costs average 37(rte_memcpy) 61(memcpy) ticks > copy 1500 bytes costs average 62(rte_memcpy) 
87(memcpy) ticks > > > Here is my program: > > #include <stdio.h> > #include <rte_cycles.h> > #include <smmintrin.h> > #include <rte_memcpy.h> > #include <rte_memcmp.h> > > #define TIMES 100000000L > > void test_memcpy(size_t n) > { > uint64_t start, end, i, start2, end2; > uint8_t *src, *dst; > > src = (uint8_t*)malloc(n * sizeof(uint8_t)); > dst = (uint8_t*)malloc(n * sizeof(uint8_t)); > > start = rte_rdtsc(); > for (i = 0; i < TIMES; i++) { > rte_memcpy(dst, src, n); > } > end = rte_rdtsc(); > > start2 = rte_rdtsc(); > for (i = 0; i < TIMES; i++) { > memcpy(dst, src, n); > } > end2 = rte_rdtsc(); > > > free(src); > free(dst); > > printf("copy %u bytes costs average %llu(rte_memcpy) %llu(memcpy) > ticks\n", n, (end - start)/TIMES, (end2 - start2)/TIMES); > } > > int test_memcmp(size_t n) > { > uint64_t start, end, i, start2, end2, j; > uint8_t *src, *dst; > int *ret; > > src = (uint8_t*)malloc(n * sizeof(uint8_t)); > dst = (uint8_t*)malloc(n * sizeof(uint8_t)); > ret = (int*)malloc(TIMES * sizeof(int)); > > start = rte_rdtsc(); > for (i = 0; i < TIMES; i++) { > ret[i] = rte_memcmp(dst, src, n); > } > end = rte_rdtsc(); > > start2 = rte_rdtsc(); > for (i = 0; i < TIMES; i++) { > ret[i] = memcmp(dst, src, n); > } > end2 = rte_rdtsc(); > > // avoid gcc to optimize memcmp > for (i = 0; i < TIMES; i++) { > t += ret[i]; > } > > free(src); > free(dst); > > printf("copy %u bytes costs average %llu(rte_memcmp) %llu(memcmp) > ticks\n", n, (end - start)/TIMES, (end2 - start2)/TIMES); > return t; > } > > > > > int main(int narg, char** args) > { > printf("each test run %llu times\n", TIMES); > > if (narg < 2) { > printf("usage:./avx_test 0/1 1:test memcpy 0:test > memcmp\n"); > return -1; > } > > if (atoi(args[1])) { > test_memcpy(16); > test_memcpy(32); > test_memcpy(64); > test_memcpy(128); > test_memcpy(256); > test_memcpy(512); > test_memcpy(1024); > test_memcpy(1500); > } else { > test_memcmp(16); > test_memcmp(32); > test_memcmp(64); > test_memcmp(128); > 
test_memcmp(256); > test_memcmp(512); > test_memcmp(1024); > test_memcmp(1500); > } > } > > > > > > >