https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87743
--- Comment #2 from H.J. Lu <hjl.tools at gmail dot com> ---
[hjl@gnu-efi-2 pr87317]$ cat y.c
#define MAX 4

long long int dst[MAX];
short src[MAX];

void
foo (void)
{
  int i;
  for (i = 0; i < MAX; i++)
    dst[i] = src[i];
}
[hjl@gnu-efi-2 pr87317]$ /export/ssd/build/tools-build/glibc-many/install/compilers/aarch64-linux-gnu/bin/aarch64-glibc-linux-gnu-gcc -S -O3 y.c
[hjl@gnu-efi-2 pr87317]$ cat y.s
        .arch armv8-a
        .file "y.c"
        .text
        .align 2
        .p2align 3,,7
        .global foo
        .type foo, %function
foo:
.LFB0:
        .cfi_startproc
        adrp x3, src
        add x1, x3, :lo12:src
        adrp x2, dst
        add x0, x2, :lo12:dst
        ldrsh x5, [x3, #:lo12:src]
        ldrsh x4, [x1, 2]
        ldrsh x3, [x1, 4]
        ldrsh x1, [x1, 6]
        str x5, [x2, #:lo12:dst]
        stp x4, x3, [x0, 8]
        str x1, [x0, 24]
        ret
        .cfi_endproc
.LFE0:
        .size foo, .-foo
        .comm src,8,8
        .comm dst,32,8
        .ident "GCC: (GNU) 8.2.1 20180922"
        .section .note.GNU-stack,"",@progbits
[hjl@gnu-efi-2 pr87317]$ gcc -march=haswell -S -O3 y.c
[hjl@gnu-efi-2 pr87317]$ cat y.s
        .file "y.c"
        .text
        .p2align 4,,15
        .globl foo
        .type foo, @function
foo:
.LFB0:
        .cfi_startproc
        movswq src(%rip), %rax
        movswq src+4(%rip), %rcx
        movswq src+6(%rip), %rdx
        vmovq %rax, %xmm0
        movswq src+2(%rip), %rax
        vmovq %rcx, %xmm1
        vpinsrq $1, %rdx, %xmm1, %xmm1
        vpinsrq $1, %rax, %xmm0, %xmm0
        vinserti128 $0x1, %xmm1, %ymm0, %ymm0
        vmovdqu %ymm0, dst(%rip)
        vzeroupper
        ret
        .cfi_endproc
.LFE0:
        .size foo, .-foo
        .comm src,8,8
        .comm dst,32,32
        .ident "GCC: (GNU) 8.2.1 20181011 (Red Hat 8.2.1-4)"
        .section .note.GNU-stack,"",@progbits
[hjl@gnu-efi-2 pr87317]$

I don't see much difference between x86-64 and arm64.