On Thu, Oct 02, 2014 at 11:15:10AM +0400, Evgeny wrote: > Hold on. The patch has a conflict with previously approved here.
Which? In any case, I've already committed the patch. That said, if what you care about now is pr52252-atom.c with -O2 -ftree-vectorize -mavx2, let's look at all the permutations it needs below. The patch I've committed improved f3, from 3x vpshufb + 2x vpermq + 2x vpor into vperm2i128 $33 + vpalignr $11, and left the other permutations as is. So, do you have suggestions on which insns we should use for the other permutations? Then we can figure out how to best achieve that. Note that with dozens of permutation instructions in the ISA, the computational complexity of finding the optimal sequence might be too expensive. typedef unsigned char V __attribute__ ((vector_size (32))); V a, b, c; void f1 (void) { c = __builtin_shuffle (a, b, (V) { 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 }); } void f2 (void) { c = __builtin_shuffle (a, b, (V) { 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29 }); } void f3 (void) { c = __builtin_shuffle (a, b, (V) { 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42 }); } void f4 (void) { c = __builtin_shuffle (a, b, (V) { 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63 }); } void f5 (void) { c = __builtin_shuffle (a, b, (V) { 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52 }); } void f6 (void) { c = __builtin_shuffle (a, b, (V) { 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53 }); } Jakub