gcc version 4.5.0-rc20100406 /**************/ #include <arm_neon.h>
void x(int32x4_t a, int32x4_t b, int32x4_t *p) { #define X(n) p[n] = vaddq_s32(p[n], a); p[n] = vorrq_s32(p[n], b); X(0); X(1); X(2); X(3); X(4); X(5); X(6); X(7); X(8); X(9); X(10); X(11); X(12); } /**************/ # gcc -O2 -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=hard -c test.c # objdump -d test.o 00000000 <x>: 0: edd0eb2c vldr d30, [r0, #176] ; 0xb0 4: edd0fb2e vldr d31, [r0, #184] ; 0xb8 8: ecd02b04 vldmia r0, {d18-d19} c: ed2d8b10 vpush {d8-d15} 10: ed904b30 vldr d4, [r0, #192] ; 0xc0 14: ed905b32 vldr d5, [r0, #200] ; 0xc8 18: e24dd020 sub sp, sp, #32 1c: f22ec8c0 vadd.i32 q6, q15, q0 20: f26228c0 vadd.i32 q9, q9, q0 24: edd04b08 vldr d20, [r0, #32] 28: edd05b0a vldr d21, [r0, #40] ; 0x28 2c: edd0cb18 vldr d28, [r0, #96] ; 0x60 30: edd0db1a vldr d29, [r0, #104] ; 0x68 34: f264e840 vadd.i32 q15, q2, q0 38: f26448c0 vadd.i32 q10, q10, q0 3c: f26cc8c0 vadd.i32 q14, q14, q0 40: edd00b04 vldr d16, [r0, #16] 44: edd01b06 vldr d17, [r0, #24] 48: edd0ab14 vldr d26, [r0, #80] ; 0x50 4c: edd0bb16 vldr d27, [r0, #88] ; 0x58 50: ec8dcb04 vstmia sp, {d12-d13} 54: f222c1d2 vorr q6, q9, q1 58: f26008c0 vadd.i32 q8, q8, q0 5c: f26aa8c0 vadd.i32 q13, q13, q0 60: edd06b0c vldr d22, [r0, #48] ; 0x30 64: edd07b0e vldr d23, [r0, #56] ; 0x38 68: edd08b10 vldr d24, [r0, #64] ; 0x40 6c: edd09b12 vldr d25, [r0, #72] ; 0x48 70: ed906b1c vldr d6, [r0, #112] ; 0x70 74: ed907b1e vldr d7, [r0, #120] ; 0x78 78: ed908b20 vldr d8, [r0, #128] ; 0x80 7c: ed909b22 vldr d9, [r0, #136] ; 0x88 80: ed90ab24 vldr d10, [r0, #144] ; 0x90 84: ed90bb26 vldr d11, [r0, #152] ; 0x98 88: ed90eb28 vldr d14, [r0, #160] ; 0xa0 8c: ed90fb2a vldr d15, [r0, #168] ; 0xa8 90: edcdeb04 vstr d30, [sp, #16] 94: edcdfb06 vstr d31, [sp, #24] 98: ec80cb04 vstmia r0, {d12-d13} 9c: f224c1d2 vorr q6, q10, q1 a0: f26c41d2 vorr q10, q14, q1 a4: ecddcb04 vldmia sp, {d28-d29} a8: f26021d2 vorr q9, q8, q1 ac: f26888c0 vadd.i32 q12, q12, q0 b0: f26ae1d2 vorr q15, q13, q1 b4: f26668c0 vadd.i32 q11, q11, q0 b8: f26ca1d2 vorr q13, 
q14, q1 bc: f2266840 vadd.i32 q3, q3, q0 c0: f2288840 vadd.i32 q4, q4, q0 c4: f22aa840 vadd.i32 q5, q5, q0 c8: f22ee840 vadd.i32 q7, q7, q0 cc: edddcb04 vldr d28, [sp, #16] d0: eddddb06 vldr d29, [sp, #24] d4: f22601d2 vorr q0, q11, q1 d8: f22841d2 vorr q2, q12, q1 dc: f2680152 vorr q8, q4, q1 e0: f26a6152 vorr q11, q5, q1 e4: f26e8152 vorr q12, q7, q1 e8: edc02b04 vstr d18, [r0, #16] ec: edc03b06 vstr d19, [r0, #24] f0: f2662152 vorr q9, q3, q1 f4: f22c21d2 vorr q1, q14, q1 f8: ed80cb08 vstr d12, [r0, #32] fc: ed80db0a vstr d13, [r0, #40] ; 0x28 100: ed800b0c vstr d0, [r0, #48] ; 0x30 104: ed801b0e vstr d1, [r0, #56] ; 0x38 108: ed804b10 vstr d4, [r0, #64] ; 0x40 10c: ed805b12 vstr d5, [r0, #72] ; 0x48 110: edc0eb14 vstr d30, [r0, #80] ; 0x50 114: edc0fb16 vstr d31, [r0, #88] ; 0x58 118: edc04b18 vstr d20, [r0, #96] ; 0x60 11c: edc05b1a vstr d21, [r0, #104] ; 0x68 120: edc02b1c vstr d18, [r0, #112] ; 0x70 124: edc03b1e vstr d19, [r0, #120] ; 0x78 128: edc00b20 vstr d16, [r0, #128] ; 0x80 12c: edc01b22 vstr d17, [r0, #136] ; 0x88 130: edc06b24 vstr d22, [r0, #144] ; 0x90 134: edc07b26 vstr d23, [r0, #152] ; 0x98 138: edc08b28 vstr d24, [r0, #160] ; 0xa0 13c: edc09b2a vstr d25, [r0, #168] ; 0xa8 140: edc0ab2c vstr d26, [r0, #176] ; 0xb0 144: edc0bb2e vstr d27, [r0, #184] ; 0xb8 148: ed802b30 vstr d2, [r0, #192] ; 0xc0 14c: ed803b32 vstr d3, [r0, #200] ; 0xc8 150: e28dd020 add sp, sp, #32 154: ecbd8b10 vpop {d8-d15} 158: e12fff1e bx lr This shows multiple performance problems: 1. The use of inherently slower VLDR/VSTR instructions instead of VLD1/VST1. 2. Failure to make proper use of ARM Cortex-A8 NEON LS/ALU dual issue. 3. Unnecessary spills to the stack. This is a general issue with NEON intrinsics, causing serious performance problems for practically any nontrivial code. I guess this itself can be a meta-bug, with each individual performance issue tracked separately. 
-- Summary: Poor instruction selection, scheduling and register allocation for ARM NEON intrinsics Product: gcc Version: 4.5.0 Status: UNCONFIRMED Severity: enhancement Priority: P3 Component: target AssignedTo: unassigned at gcc dot gnu dot org ReportedBy: siarhei dot siamashka at gmail dot com GCC target triplet: armv7l-unknown-linux-gnueabi http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43725