[gcc r16-7601] tree-optimization/124068 - fix missed AVX2 vectorization of shift

Richard Biener via Gcc-cvs Fri, 20 Feb 2026 06:16:49 -0800

https://gcc.gnu.org/g:cdc4d4ada2e09f307c0bce6394352079088186a1


commit r16-7601-gcdc4d4ada2e09f307c0bce6394352079088186a1
Author: Richard Biener <[email protected]>
Date:   Wed Feb 18 13:46:38 2026 +0100

    tree-optimization/124068 - fix missed AVX2 vectorization of shift
    
    The following fixes a regression in AVX2 vectorization because on
    trunk we now correctly determine that we can shorten a shift operation
    but we never really bothered to check that we can implement the
    resulting operation.  With the patch we now check this.  For shifts
    and rotates we have the choice between vector-vector and vector-scalar
    operations which in the end depends on whether we perform SLP or not
    and how the shift operand matches up.  The patch heuristically
    assumes that constant or external shifts can be handled by vector-scalar
    operations.
    
    As the reason we were not checking for target support was to allow
    recursive matching of other patterns, the following still errs on that
    side in case the original operation was not supported by the target or
    it is binary and the 2nd operand is a constant.  This helps avoid
    regressions in
    gcc.dg/vect/vect-over-widen-13.c and gcc.dg/vect/vect-div-bitmask-1.c
    and gcc.target/aarch64/sve2/div-by-bitmask_1.c where the operation in
    question is integer division.
    
            PR tree-optimization/124068
            * tree-vect-patterns.cc (target_has_vecop_for_code): Move
            earlier, add defaulted optab_subtype parameter.
            (vect_recog_over_widening_pattern): Check that the target
            supports the narrowed operation before committing to the
            pattern.
    
            * gcc.target/i386/vect-shift-1.c: New testcase.

Diff:
---
 gcc/testsuite/gcc.target/i386/vect-shift-1.c | 10 ++++++
 gcc/tree-vect-patterns.cc                    | 52 +++++++++++++++++++---------
 2 files changed, 46 insertions(+), 16 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/vect-shift-1.c b/gcc/testsuite/gcc.target/i386/vect-shift-1.c
new file mode 100644
index 000000000000..eb31ef48ba06
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-shift-1.c
@@ -0,0 +1,10 @@
+/* { dg-do compile { target avx2 } } */
+/* { dg-options "-O2 -mavx2 -mno-avx512f -fdump-tree-vect-details" } */
+
+void f (short* acc)
+{
+  for (unsigned char row = 0; row < 16; ++row)
+    acc[row] = acc[row] << row;
+}
+
+/* { dg-final { scan-tree-dump "optimized: loop vectorized" "vect" } } */
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 97130206a214..a5413ff00a3e 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -1036,6 +1036,17 @@ vect_reassociating_reduction_p (vec_info *vinfo,
   return true;
 }
 
+/* Return true iff the target has a vector optab implementing the operation
+   CODE on type VECTYPE with SUBTYPE.  */
+
+static bool
+target_has_vecop_for_code (tree_code code, tree vectype,
+                          enum optab_subtype subtype = optab_vector)
+{
+  optab voptab = optab_for_tree_code (code, vectype, subtype);
+  return voptab && can_implement_p (voptab, TYPE_MODE (vectype));
+}
+
 /* match.pd function to match
    (cond (cmp@3 a b) (convert@1 c) (convert@2 d))
    with conditions:
@@ -3160,16 +3171,36 @@ vect_recog_over_widening_pattern (vec_info *vinfo,
       && (code == PLUS_EXPR || code == MINUS_EXPR || code == MULT_EXPR))
     op_type = build_nonstandard_integer_type (new_precision, true);
 
-  /* We specifically don't check here whether the target supports the
-     new operation, since it might be something that a later pattern
-     wants to rewrite anyway.  If targets have a minimum element size
-     for some optabs, we should pattern-match smaller ops to larger ops
-     where beneficial.  */
   tree new_vectype = get_vectype_for_scalar_type (vinfo, new_type);
   tree op_vectype = get_vectype_for_scalar_type (vinfo, op_type);
   if (!new_vectype || !op_vectype)
     return NULL;
 
+  /* Verify we can handle the new operation.  For shifts and rotates
+     apply heuristic of whether we are likely facing vector-vector or
+     vector-scalar operation.  Since we are eventually expecting that
+     a later pattern might eventually want to rewrite an unsupported
+     into a supported case error on that side in case the original
+     operation was not supported either or this is a binary operation
+     and the 2nd operand is constant.  */
+  if (code == RSHIFT_EXPR || code == LSHIFT_EXPR || code == RROTATE_EXPR)
+    {
+      if (!target_has_vecop_for_code (code, op_vectype, optab_vector)
+         && ((unprom[1].dt != vect_external_def
+              && unprom[1].dt != vect_constant_def)
+             || !target_has_vecop_for_code (code, op_vectype, optab_scalar))
+         && !(!target_has_vecop_for_code (code, *type_out, optab_vector)
+              && ((unprom[1].dt != vect_external_def
+                   || unprom[1].dt != vect_constant_def)
+                  || !target_has_vecop_for_code (code, *type_out,
+                                                 optab_scalar))))
+       return NULL;
+    }
+  else if (!target_has_vecop_for_code (code, op_vectype, optab_vector)
+          && (target_has_vecop_for_code (code, *type_out, optab_vector)
+              && !(nops == 2 && unprom[1].dt == vect_constant_def)))
+    return NULL;
+
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location, "demoting %T to %T\n",
                     type, new_type);
@@ -4151,17 +4182,6 @@ vect_recog_vector_vector_shift_pattern (vec_info *vinfo,
   return pattern_stmt;
 }
 
-/* Return true iff the target has a vector optab implementing the operation
-   CODE on type VECTYPE.  */
-
-static bool
-target_has_vecop_for_code (tree_code code, tree vectype)
-{
-  optab voptab = optab_for_tree_code (code, vectype, optab_vector);
-  return voptab
-        && can_implement_p (voptab, TYPE_MODE (vectype));
-}
-
 /* Verify that the target has optabs of VECTYPE to perform all the steps
    needed by the multiplication-by-immediate synthesis algorithm described by
    ALG and VAR.  If SYNTH_SHIFT_P is true ensure that vector addition is

[gcc r16-7601] tree-optimization/124068 - fix missed AVX2 vectorization of shift

Reply via email to