Hi Thomas,

I hope this helps answer your question.
We (the AMD Linux library support team) have implemented a custom-tailored memcpy
solution that closely matches the DPDK use-case requirements below (a rough
intrinsics sketch of the idea follows the list):
1)      Minimum 64B packet length with cache-aligned source and destination.
2)      Non-temporal load and temporal store for a cache-aligned source on both
the RX and TX paths. We could not use non-temporal stores on the TX path, because
AVX2 non-temporal loads/stores work only with 32B-aligned addresses.
3)      The solution works on all AVX2-capable AMD machines.
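
For illustration only (this is not the submitted patch; the function name and
exact shape are ours), points 1) and 2) could be sketched with AVX2 intrinsics
roughly as follows, assuming a 32B-aligned source and a length that is a
multiple of 32B:

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

/*
 * Illustrative sketch only, not the patch code: non-temporal (streaming)
 * loads from a 32B-aligned source, regular temporal stores to the
 * destination. Assumes size is a multiple of 32 bytes.
 */
static inline void *
copy_ntload_tstore(void *dst, const void *src, size_t size)
{
        const uint8_t *s = (const uint8_t *)src;
        uint8_t *d = (uint8_t *)dst;
        size_t off;

        for (off = 0; off + 32 <= size; off += 32) {
                /* vmovntdqa: requires a 32B-aligned address on AVX2 */
                __m256i v = _mm256_stream_load_si256(
                                (const __m256i *)(s + off));
                /* vmovdqu: temporal, unaligned store */
                _mm256_storeu_si256((__m256i *)(d + off), v);
        }
        return dst;
}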

Internally, we have completed integrity testing and benchmarking of the
solution and measured gains of 8.4% to 14.5%, specifically on Milan CPUs
(3rd Gen EPYC processors).

Thanks for your support,
Keesang

-----Original Message-----
From: Thomas Monjalon <tho...@monjalon.net>
Sent: Tuesday, October 19, 2021 5:31 AM
To: Aman Kumar <aman.ku...@vvdntech.in>
Cc: dev@dpdk.org; rasl...@nvidia.com; as...@nvidia.com; s...@nvidia.com; 
viachesl...@nvidia.com; akozy...@nvidia.com; ma...@nvidia.com; 
anatoly.bura...@intel.com; Song, Keesang <keesang.s...@amd.com>; 
aman.ku...@vvdntech.in; jerinjac...@gmail.com; bruce.richard...@intel.com; 
konstantin.anan...@intel.com; david.march...@redhat.com
Subject: Re: [dpdk-dev] [PATCH v2 1/2] lib/eal: add amd epyc2 memcpy routine to 
eal

19/10/2021 12:47, Aman Kumar:
> This patch provides rte_memcpy* calls optimized for AMD EPYC
> platforms. Use config/x86/x86_amd_epyc_linux_gcc as cross-file with
> meson to build dpdk for AMD EPYC platforms.

Please split in 2 patches: platform & memcpy.

What optimization is specific to EPYC?

I dislike the asm code below.
What is AMD specific inside?
Can it use compiler intrinsics as it is done elsewhere?

> +static __rte_always_inline void *
> +rte_memcpy_aligned_ntload_tstore16_amdepyc2(void *dst,
> +                                         const void *src,
> +                                         size_t size) {
> +     asm volatile goto("movq %0, %%rsi\n\t"
> +     "movq %1, %%rdi\n\t"
> +     "movq %2, %%rdx\n\t"
> +     "cmpq   $(128), %%rdx\n\t"
> +     "jb     202f\n\t"
> +     "201:\n\t"
> +     "vmovntdqa (%%rsi), %%ymm0\n\t"
> +     "vmovntdqa 32(%%rsi), %%ymm1\n\t"
> +     "vmovntdqa 64(%%rsi), %%ymm2\n\t"
> +     "vmovntdqa 96(%%rsi), %%ymm3\n\t"
> +     "vmovdqu  %%ymm0, (%%rdi)\n\t"
> +     "vmovdqu  %%ymm1, 32(%%rdi)\n\t"
> +     "vmovdqu  %%ymm2, 64(%%rdi)\n\t"
> +     "vmovdqu  %%ymm3, 96(%%rdi)\n\t"
> +     "addq   $128, %%rsi\n\t"
> +     "addq   $128, %%rdi\n\t"
> +     "subq   $128, %%rdx\n\t"
> +     "jz     %l[done]\n\t"
> +     "cmpq   $128, %%rdx\n\t" /*Vector Size 32B.  */
> +     "jae    201b\n\t"
> +     "202:\n\t"
> +     "cmpq   $64, %%rdx\n\t"
> +     "jb     203f\n\t"
> +     "vmovntdqa (%%rsi), %%ymm0\n\t"
> +     "vmovntdqa 32(%%rsi), %%ymm1\n\t"
> +     "vmovdqu  %%ymm0, (%%rdi)\n\t"
> +     "vmovdqu  %%ymm1, 32(%%rdi)\n\t"
> +     "addq   $64, %%rsi\n\t"
> +     "addq   $64, %%rdi\n\t"
> +     "subq   $64, %%rdx\n\t"
> +     "jz     %l[done]\n\t"
> +     "203:\n\t"
> +     "cmpq   $32, %%rdx\n\t"
> +     "jb     204f\n\t"
> +     "vmovntdqa (%%rsi), %%ymm0\n\t"
> +     "vmovdqu  %%ymm0, (%%rdi)\n\t"
> +     "addq   $32, %%rsi\n\t"
> +     "addq   $32, %%rdi\n\t"
> +     "subq   $32, %%rdx\n\t"
> +     "jz     %l[done]\n\t"
> +     "204:\n\t"
> +     "cmpb   $16, %%dl\n\t"
> +     "jb     205f\n\t"
> +     "vmovntdqa (%%rsi), %%xmm0\n\t"
> +     "vmovdqu  %%xmm0, (%%rdi)\n\t"
> +     "addq   $16, %%rsi\n\t"
> +     "addq   $16, %%rdi\n\t"
> +     "subq   $16, %%rdx\n\t"
> +     "jz     %l[done]\n\t"
> +     "205:\n\t"
> +     "cmpb   $2, %%dl\n\t"
> +     "jb     208f\n\t"
> +     "cmpb   $4, %%dl\n\t"
> +     "jbe    207f\n\t"
> +     "cmpb   $8, %%dl\n\t"
> +     "jbe    206f\n\t"
> +     "movq   -8(%%rsi,%%rdx), %%rcx\n\t"
> +     "movq   (%%rsi), %%rsi\n\t"
> +     "movq   %%rcx, -8(%%rdi,%%rdx)\n\t"
> +     "movq   %%rsi, (%%rdi)\n\t"
> +     "jmp    %l[done]\n\t"
> +     "206:\n\t"
> +     "movl   -4(%%rsi,%%rdx), %%ecx\n\t"
> +     "movl   (%%rsi), %%esi\n\t"
> +     "movl   %%ecx, -4(%%rdi,%%rdx)\n\t"
> +     "movl   %%esi, (%%rdi)\n\t"
> +     "jmp    %l[done]\n\t"
> +     "207:\n\t"
> +     "movzwl -2(%%rsi,%%rdx), %%ecx\n\t"
> +     "movzwl (%%rsi), %%esi\n\t"
> +     "movw   %%cx, -2(%%rdi,%%rdx)\n\t"
> +     "movw   %%si, (%%rdi)\n\t"
> +     "jmp    %l[done]\n\t"
> +     "208:\n\t"
> +     "movzbl (%%rsi), %%ecx\n\t"
> +     "movb   %%cl, (%%rdi)"
> +     :
> +     : "r"(src), "r"(dst), "r"(size)
> +     : "rcx", "rdx", "rsi", "rdi", "ymm0", "ymm1", "ymm2", "ymm3", "memory"
> +     : done
> +     );
> +done:
> +     return dst;
> +}
> +
> +static __rte_always_inline void *
> +rte_memcpy_generic(void *dst, const void *src, size_t len) {
> +     asm goto("movq  %0, %%rsi\n\t"
> +     "movq   %1, %%rdi\n\t"
> +     "movq   %2, %%rdx\n\t"
> +     "movq    %%rdi, %%rax\n\t"
> +     "cmp     $32, %%rdx\n\t"
> +     "jb      101f\n\t"
> +     "cmp     $(32 * 2), %%rdx\n\t"
> +     "ja      108f\n\t"
> +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm1\n\t"
> +     "vmovdqu   %%ymm0, (%%rdi)\n\t"
> +     "vmovdqu   %%ymm1, -32(%%rdi,%%rdx)\n\t"
> +     "vzeroupper\n\t"
> +     "jmp %l[done]\n\t"
> +     "101:\n\t"
> +     /* Less than 1 VEC.  */
> +     "cmpb    $32, %%dl\n\t"
> +     "jae     103f\n\t"
> +     "cmpb    $16, %%dl\n\t"
> +     "jae     104f\n\t"
> +     "cmpb    $8, %%dl\n\t"
> +     "jae     105f\n\t"
> +     "cmpb    $4, %%dl\n\t"
> +     "jae     106f\n\t"
> +     "cmpb    $1, %%dl\n\t"
> +     "ja      107f\n\t"
> +     "jb      102f\n\t"
> +     "movzbl  (%%rsi), %%ecx\n\t"
> +     "movb    %%cl, (%%rdi)\n\t"
> +     "102:\n\t"
> +     "jmp %l[done]\n\t"
> +     "103:\n\t"
> +     /* From 32 to 63.  No branch when size == 32.  */
> +     "vmovdqu (%%rsi), %%ymm0\n\t"
> +     "vmovdqu -32(%%rsi,%%rdx), %%ymm1\n\t"
> +     "vmovdqu %%ymm0, (%%rdi)\n\t"
> +     "vmovdqu %%ymm1, -32(%%rdi,%%rdx)\n\t"
> +     "vzeroupper\n\t"
> +     "jmp %l[done]\n\t"
> +     /* From 16 to 31.  No branch when size == 16.  */
> +     "104:\n\t"
> +     "vmovdqu (%%rsi), %%xmm0\n\t"
> +     "vmovdqu -16(%%rsi,%%rdx), %%xmm1\n\t"
> +     "vmovdqu %%xmm0, (%%rdi)\n\t"
> +     "vmovdqu %%xmm1, -16(%%rdi,%%rdx)\n\t"
> +     "jmp %l[done]\n\t"
> +     "105:\n\t"
> +     /* From 8 to 15.  No branch when size == 8.  */
> +     "movq    -8(%%rsi,%%rdx), %%rcx\n\t"
> +     "movq    (%%rsi), %%rsi\n\t"
> +     "movq    %%rcx, -8(%%rdi,%%rdx)\n\t"
> +     "movq    %%rsi, (%%rdi)\n\t"
> +     "jmp %l[done]\n\t"
> +     "106:\n\t"
> +     /* From 4 to 7.  No branch when size == 4.  */
> +     "movl    -4(%%rsi,%%rdx), %%ecx\n\t"
> +     "movl    (%%rsi), %%esi\n\t"
> +     "movl    %%ecx, -4(%%rdi,%%rdx)\n\t"
> +     "movl    %%esi, (%%rdi)\n\t"
> +     "jmp %l[done]\n\t"
> +     "107:\n\t"
> +     /* From 2 to 3.  No branch when size == 2.  */
> +     "movzwl  -2(%%rsi,%%rdx), %%ecx\n\t"
> +     "movzwl  (%%rsi), %%esi\n\t"
> +     "movw    %%cx, -2(%%rdi,%%rdx)\n\t"
> +     "movw    %%si, (%%rdi)\n\t"
> +     "jmp %l[done]\n\t"
> +     "108:\n\t"
> +     /* More than 2 * VEC and there may be overlap between destination */
> +     /* and source.  */
> +     "cmpq    $(32 * 8), %%rdx\n\t"
> +     "ja      111f\n\t"
> +     "cmpq    $(32 * 4), %%rdx\n\t"
> +     "jb      109f\n\t"
> +     /* Copy from 4 * VEC to 8 * VEC, inclusively. */
> +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> +     "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> +     "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm4\n\t"
> +     "vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm5\n\t"
> +     "vmovdqu   -(32 * 3)(%%rsi,%%rdx), %%ymm6\n\t"
> +     "vmovdqu   -(32 * 4)(%%rsi,%%rdx), %%ymm7\n\t"
> +     "vmovdqu   %%ymm0, (%%rdi)\n\t"
> +     "vmovdqu   %%ymm1, 32(%%rdi)\n\t"
> +     "vmovdqu   %%ymm2, (32 * 2)(%%rdi)\n\t"
> +     "vmovdqu   %%ymm3, (32 * 3)(%%rdi)\n\t"
> +     "vmovdqu   %%ymm4, -32(%%rdi,%%rdx)\n\t"
> +     "vmovdqu   %%ymm5, -(32 * 2)(%%rdi,%%rdx)\n\t"
> +     "vmovdqu   %%ymm6, -(32 * 3)(%%rdi,%%rdx)\n\t"
> +     "vmovdqu   %%ymm7, -(32 * 4)(%%rdi,%%rdx)\n\t"
> +     "vzeroupper\n\t"
> +     "jmp %l[done]\n\t"
> +     "109:\n\t"
> +     /* Copy from 2 * VEC to 4 * VEC. */
> +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm2\n\t"
> +     "vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm3\n\t"
> +     "vmovdqu   %%ymm0, (%%rdi)\n\t"
> +     "vmovdqu   %%ymm1, 32(%%rdi)\n\t"
> +     "vmovdqu   %%ymm2, -32(%%rdi,%%rdx)\n\t"
> +     "vmovdqu   %%ymm3, -(32 * 2)(%%rdi,%%rdx)\n\t"
> +     "vzeroupper\n\t"
> +     "110:\n\t"
> +     "jmp %l[done]\n\t"
> +     "111:\n\t"
> +     "cmpq    %%rsi, %%rdi\n\t"
> +     "ja      113f\n\t"
> +     /* Source == destination is less common.  */
> +     "je      110b\n\t"
> +     /* Load the first VEC and last 4 * VEC to
> +      * support overlapping addresses.
> +      */
> +     "vmovdqu   (%%rsi), %%ymm4\n\t"
> +     "vmovdqu   -32(%%rsi, %%rdx), %%ymm5\n\t"
> +     "vmovdqu   -(32 * 2)(%%rsi, %%rdx), %%ymm6\n\t"
> +     "vmovdqu   -(32 * 3)(%%rsi, %%rdx), %%ymm7\n\t"
> +     "vmovdqu   -(32 * 4)(%%rsi, %%rdx), %%ymm8\n\t"
> +     /* Save start and stop of the destination buffer.  */
> +     "movq    %%rdi, %%r11\n\t"
> +     "leaq    -32(%%rdi, %%rdx), %%rcx\n\t"
> +     /* Align destination for aligned stores in the loop.  Compute */
> +     /* how much destination is misaligned.  */
> +     "movq    %%rdi, %%r8\n\t"
> +     "andq    $(32 - 1), %%r8\n\t"
> +     /* Get the negative of offset for alignment.  */
> +     "subq    $32, %%r8\n\t"
> +     /* Adjust source.  */
> +     "subq    %%r8, %%rsi\n\t"
> +     /* Adjust destination which should be aligned now.  */
> +     "subq    %%r8, %%rdi\n\t"
> +     /* Adjust length.  */
> +     "addq    %%r8, %%rdx\n\t"
> +     /* Check non-temporal store threshold.  */
> +     "cmpq    $(1024*1024), %%rdx\n\t"
> +     "ja      115f\n\t"
> +     "112:\n\t"
> +     /* Copy 4 * VEC a time forward.  */
> +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> +     "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> +     "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> +     "addq    $(32 * 4), %%rsi\n\t"
> +     "subq    $(32 * 4), %%rdx\n\t"
> +     "vmovdqa   %%ymm0, (%%rdi)\n\t"
> +     "vmovdqa   %%ymm1, 32(%%rdi)\n\t"
> +     "vmovdqa   %%ymm2, (32 * 2)(%%rdi)\n\t"
> +     "vmovdqa   %%ymm3, (32 * 3)(%%rdi)\n\t"
> +     "addq    $(32 * 4), %%rdi\n\t"
> +     "cmpq    $(32 * 4), %%rdx\n\t"
> +     "ja      112b\n\t"
> +     /* Store the last 4 * VEC.  */
> +     "vmovdqu   %%ymm5, (%%rcx)\n\t"
> +     "vmovdqu   %%ymm6, -32(%%rcx)\n\t"
> +     "vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
> +     "vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
> +     /* Store the first VEC.  */
> +     "vmovdqu   %%ymm4, (%%r11)\n\t"
> +     "vzeroupper\n\t"
> +     "jmp %l[done]\n\t"
> +     "113:\n\t"
> +     /* Load the first 4*VEC and last VEC to support overlapping addresses.*/
> +     "vmovdqu   (%%rsi), %%ymm4\n\t"
> +     "vmovdqu   32(%%rsi), %%ymm5\n\t"
> +     "vmovdqu   (32 * 2)(%%rsi), %%ymm6\n\t"
> +     "vmovdqu   (32 * 3)(%%rsi), %%ymm7\n\t"
> +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm8\n\t"
> +     /* Save stop of the destination buffer.  */
> +     "leaq    -32(%%rdi, %%rdx), %%r11\n\t"
> +     /* Align destination end for aligned stores in the loop.  Compute */
> +     /* how much destination end is misaligned.  */
> +     "leaq    -32(%%rsi, %%rdx), %%rcx\n\t"
> +     "movq    %%r11, %%r9\n\t"
> +     "movq    %%r11, %%r8\n\t"
> +     "andq    $(32 - 1), %%r8\n\t"
> +     /* Adjust source.  */
> +     "subq    %%r8, %%rcx\n\t"
> +     /* Adjust the end of destination which should be aligned now.  */
> +     "subq    %%r8, %%r9\n\t"
> +     /* Adjust length.  */
> +     "subq    %%r8, %%rdx\n\t"
> +      /* Check non-temporal store threshold.  */
> +     "cmpq    $(1024*1024), %%rdx\n\t"
> +     "ja      117f\n\t"
> +     "114:\n\t"
> +     /* Copy 4 * VEC a time backward.  */
> +     "vmovdqu   (%%rcx), %%ymm0\n\t"
> +     "vmovdqu   -32(%%rcx), %%ymm1\n\t"
> +     "vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
> +     "vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
> +     "subq    $(32 * 4), %%rcx\n\t"
> +     "subq    $(32 * 4), %%rdx\n\t"
> +     "vmovdqa   %%ymm0, (%%r9)\n\t"
> +     "vmovdqa   %%ymm1, -32(%%r9)\n\t"
> +     "vmovdqa   %%ymm2, -(32 * 2)(%%r9)\n\t"
> +     "vmovdqa   %%ymm3, -(32 * 3)(%%r9)\n\t"
> +     "subq    $(32 * 4), %%r9\n\t"
> +     "cmpq    $(32 * 4), %%rdx\n\t"
> +     "ja      114b\n\t"
> +     /* Store the first 4 * VEC. */
> +     "vmovdqu   %%ymm4, (%%rdi)\n\t"
> +     "vmovdqu   %%ymm5, 32(%%rdi)\n\t"
> +     "vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
> +     "vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
> +     /* Store the last VEC. */
> +     "vmovdqu   %%ymm8, (%%r11)\n\t"
> +     "vzeroupper\n\t"
> +     "jmp %l[done]\n\t"
> +
> +     "115:\n\t"
> +     /* Don't use non-temporal store if there is overlap between */
> +     /* destination and source since destination may be in cache */
> +     /* when source is loaded. */
> +     "leaq    (%%rdi, %%rdx), %%r10\n\t"
> +     "cmpq    %%r10, %%rsi\n\t"
> +     "jb      112b\n\t"
> +     "116:\n\t"
> +     /* Copy 4 * VEC a time forward with non-temporal stores.  */
> +     "prefetcht0 (32*4*2)(%%rsi)\n\t"
> +     "prefetcht0 (32*4*2 + 64)(%%rsi)\n\t"
> +     "prefetcht0 (32*4*3)(%%rsi)\n\t"
> +     "prefetcht0 (32*4*3 + 64)(%%rsi)\n\t"
> +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> +     "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> +     "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> +     "addq    $(32*4), %%rsi\n\t"
> +     "subq    $(32*4), %%rdx\n\t"
> +     "vmovntdq  %%ymm0, (%%rdi)\n\t"
> +     "vmovntdq  %%ymm1, 32(%%rdi)\n\t"
> +     "vmovntdq  %%ymm2, (32 * 2)(%%rdi)\n\t"
> +     "vmovntdq  %%ymm3, (32 * 3)(%%rdi)\n\t"
> +     "addq    $(32*4), %%rdi\n\t"
> +     "cmpq    $(32*4), %%rdx\n\t"
> +     "ja      116b\n\t"
> +     "sfence\n\t"
> +     /* Store the last 4 * VEC.  */
> +     "vmovdqu   %%ymm5, (%%rcx)\n\t"
> +     "vmovdqu   %%ymm6, -32(%%rcx)\n\t"
> +     "vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
> +     "vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
> +     /* Store the first VEC.  */
> +     "vmovdqu   %%ymm4, (%%r11)\n\t"
> +     "vzeroupper\n\t"
> +     "jmp %l[done]\n\t"
> +     "117:\n\t"
> +     /* Don't use non-temporal store if there is overlap between */
> +     /* destination and source since destination may be in cache */
> +     /* when source is loaded.  */
> +     "leaq    (%%rcx, %%rdx), %%r10\n\t"
> +     "cmpq    %%r10, %%r9\n\t"
> +     "jb      114b\n\t"
> +     "118:\n\t"
> +     /* Copy 4 * VEC a time backward with non-temporal stores. */
> +     "prefetcht0 (-32 * 4 * 2)(%%rcx)\n\t"
> +     "prefetcht0 (-32 * 4 * 2 - 64)(%%rcx)\n\t"
> +     "prefetcht0 (-32 * 4 * 3)(%%rcx)\n\t"
> +     "prefetcht0 (-32 * 4 * 3 - 64)(%%rcx)\n\t"
> +     "vmovdqu   (%%rcx), %%ymm0\n\t"
> +     "vmovdqu   -32(%%rcx), %%ymm1\n\t"
> +     "vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
> +     "vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
> +     "subq    $(32*4), %%rcx\n\t"
> +     "subq    $(32*4), %%rdx\n\t"
> +     "vmovntdq  %%ymm0, (%%r9)\n\t"
> +     "vmovntdq  %%ymm1, -32(%%r9)\n\t"
> +     "vmovntdq  %%ymm2, -(32 * 2)(%%r9)\n\t"
> +     "vmovntdq  %%ymm3, -(32 * 3)(%%r9)\n\t"
> +     "subq    $(32 * 4), %%r9\n\t"
> +     "cmpq    $(32 * 4), %%rdx\n\t"
> +     "ja      118b\n\t"
> +     "sfence\n\t"
> +     /* Store the first 4 * VEC.  */
> +     "vmovdqu   %%ymm4, (%%rdi)\n\t"
> +     "vmovdqu   %%ymm5, 32(%%rdi)\n\t"
> +     "vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
> +     "vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
> +     /* Store the last VEC.  */
> +     "vmovdqu   %%ymm8, (%%r11)\n\t"
> +     "vzeroupper\n\t"
> +     "jmp %l[done]"
> +     :
> +     : "r"(src), "r"(dst), "r"(len)
> +     : "rax", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", 
> "ymm0",
> +     "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "memory"
> +     : done
> +     );
> +done:
> +     return dst;
> +}


