Hi! As the testcase shows, the C/C++ FEs narrow the shift counters from whatever type they had originally to unsigned int (previously signed int). Then the vect-patterns code, to be able to use vector by vector shifts, attempts to narrow or widen them again to the right type. If there is already a cast from the right precision, it just uses the rhs1 of that cast, otherwise it adds a cast to the pattern, which performs the needed widening or narrowing.
Unfortunately, we have information loss during optimizations; we don't know anymore if it was say: long a[64], b[64]; void foo (void) { for (int i = 0; i < 64; i++) a[i] <<= b[i]; // Here we don't need any masking, it would be UB // if b isn't in range } void bar (void) { for (int i = 0; i < 64; i++) a[i] <<= (unsigned) b[i]; // But here we can't just use b[i] as the // shift count, because the upper bits are to be masked off. } void baz (void) { for (int i = 0; i < 64; i++) a[i] <<= b[i] - 0x7200000000ULL; // And here the optimizers will likely // optimize away the subtraction, because there // is an implicit cast to (unsigned int). We need // to mask instead of using b[i] directly. } But, not casting say long long shift counters to unsigned int would penalize other code by computing unneeded operations. So I'm afraid we want the following fix, which I've bootstrapped/regtested on x86_64-linux and i686-linux. For short/char shifts we don't need this of course. Ok for trunk? 2016-03-22 Jakub Jelinek <ja...@redhat.com> PR tree-optimization/70354 * tree-vect-patterns.c (vect_recog_vector_vector_shift_pattern): If oprnd0 is wider than oprnd1 and there is a cast from the wider type to oprnd1, mask it with the mask of the narrower type. * gcc.dg/vect/pr70354-1.c: New test. * gcc.dg/vect/pr70354-2.c: New test. * gcc.target/i386/avx2-pr70354-1.c: New test. * gcc.target/i386/avx2-pr70354-2.c: New test.
--- gcc/tree-vect-patterns.c.jj 2016-03-04 15:42:12.000000000 +0100 +++ gcc/tree-vect-patterns.c 2016-03-22 15:28:24.403579426 +0100 @@ -2097,7 +2097,20 @@ vect_recog_vector_vector_shift_pattern ( if (TYPE_MODE (TREE_TYPE (rhs1)) == TYPE_MODE (TREE_TYPE (oprnd0)) && TYPE_PRECISION (TREE_TYPE (rhs1)) == TYPE_PRECISION (TREE_TYPE (oprnd0))) - def = rhs1; + { + if (TYPE_PRECISION (TREE_TYPE (oprnd1)) + >= TYPE_PRECISION (TREE_TYPE (rhs1))) + def = rhs1; + else + { + tree mask + = build_low_bits_mask (TREE_TYPE (rhs1), + TYPE_PRECISION (TREE_TYPE (oprnd1))); + def = vect_recog_temp_ssa_var (TREE_TYPE (rhs1), NULL); + def_stmt = gimple_build_assign (def, BIT_AND_EXPR, rhs1, mask); + new_pattern_def_seq (stmt_vinfo, def_stmt); + } + } } if (def == NULL_TREE) --- gcc/testsuite/gcc.dg/vect/pr70354-1.c.jj 2016-03-22 15:36:42.210847707 +0100 +++ gcc/testsuite/gcc.dg/vect/pr70354-1.c 2016-03-22 15:47:45.448878909 +0100 @@ -0,0 +1,50 @@ +/* PR tree-optimization/70354 */ +/* { dg-do run } */ + +#ifndef main +#include "tree-vect.h" +#endif + +long long int b[64], c[64], g[64]; +unsigned long long int a[64], d[64], e[64], f[64], h[64]; + +__attribute__ ((noinline, noclone)) void +foo (void) +{ + int i; + for (i = 0; i < 64; i++) + { + d[i] = h[i] << (((((unsigned long long int) b[i] * e[i]) + << (-a[i] - 3752448776177690134ULL)) + - 8214565720323784703ULL) - 1ULL); + e[i] = (_Bool) (f[i] + (unsigned long long int) g[i]); + g[i] = c[i]; + } +} + +int +main () +{ + int i; +#ifndef main + check_vect (); +#endif + if (__CHAR_BIT__ != 8 || sizeof (long long int) != 8) + return 0; + for (i = 0; i < 64; ++i) + { + a[i] = 14694295297531861425ULL; + b[i] = -1725558902283030715LL; + c[i] = 4402992416302558097LL; + e[i] = 6297173129107286501ULL; + f[i] = 13865724171235650855ULL; + g[i] = 982871027473857427LL; + h[i] = 8193845517487445944ULL; + } + foo (); + for (i = 0; i < 64; i++) + if (d[i] != 8193845517487445944ULL || e[i] != 1 + || g[i] != 4402992416302558097ULL) + abort (); + return 0; 
+} --- gcc/testsuite/gcc.dg/vect/pr70354-2.c.jj 2016-03-22 15:36:45.527802852 +0100 +++ gcc/testsuite/gcc.dg/vect/pr70354-2.c 2016-03-22 16:07:09.397164461 +0100 @@ -0,0 +1,37 @@ +/* PR tree-optimization/70354 */ +/* { dg-do run } */ + +#ifndef main +#include "tree-vect.h" +#endif + +unsigned long long a[64], b[64]; + +__attribute__((noinline, noclone)) void +foo (void) +{ + int i; + for (i = 0; i < 64; i++) + a[i] <<= (b[i] - 0x1200000000ULL); +} + +int +main () +{ + int i; +#ifndef main + check_vect (); +#endif + if (__CHAR_BIT__ != 8 || sizeof (long long int) != 8) + return 0; + for (i = 0; i < 64; i++) + { + a[i] = 0x1234ULL; + b[i] = 0x1200000000ULL + (i % 54); + } + foo (); + for (i = 0; i < 64; i++) + if (a[i] != (0x1234ULL << (i % 54))) + abort (); + return 0; +} --- gcc/testsuite/gcc.target/i386/avx2-pr70354-1.c.jj 2016-03-22 15:44:42.228356553 +0100 +++ gcc/testsuite/gcc.target/i386/avx2-pr70354-1.c 2016-03-22 15:47:26.738131930 +0100 @@ -0,0 +1,16 @@ +/* PR tree-optimization/70354 */ +/* { dg-do run } */ +/* { dg-options "-O2 -ftree-vectorize -mavx2" } */ +/* { dg-require-effective-target avx2 } */ + +#include "avx2-check.h" + +#define main() do_main () + +#include "../../gcc.dg/vect/pr70354-1.c" + +static void +avx2_test (void) +{ + do_main (); +} --- gcc/testsuite/gcc.target/i386/avx2-pr70354-2.c.jj 2016-03-22 15:44:50.283247629 +0100 +++ gcc/testsuite/gcc.target/i386/avx2-pr70354-2.c 2016-03-22 15:44:55.856172268 +0100 @@ -0,0 +1,16 @@ +/* PR tree-optimization/70354 */ +/* { dg-do run } */ +/* { dg-options "-O2 -ftree-vectorize -mavx2" } */ +/* { dg-require-effective-target avx2 } */ + +#include "avx2-check.h" + +#define main() do_main () + +#include "../../gcc.dg/vect/pr70354-2.c" + +static void +avx2_test (void) +{ + do_main (); +} Jakub