Hi,

In a previous patch I did most of the work for this, but forgot to change the check for number of arguments matching between call and simdclone. This check should accept calls without a mask to be matched against simdclones with mask arguments. I also added tests to verify this feature actually works.


For the simd-builtins tests I decided to remove the sin (double) simdclone which would now be used, because it was inbranch and we enable their use for not inbranch. Given the nature of the test, removing it made more sense, but thats not a strong opinion, happy to change.

Bootstrapped and regression tested on aarch64-unknown-linux-gnu and x86_64-pc-linux-gnu.

OK for trunk?

PS: I'll be away for two weeks from tomorrow, it would be really nice if this can go in for gcc-14, otherwise the previous work I did for this won't have any actual visible effect :(


gcc/ChangeLog:

        * tree-vect-stmts.cc (vectorizable_simd_clone_call): Allow unmasked
        calls to use masked simdclones.

gcc/testsuite/ChangeLog:

        * gcc.dg/vect/vect-simd-clone-20.c: New file.
        * gfortran.dg/simd-builtins-1.h: Adapt.
        * gfortran.dg/simd-builtins-6.f90: Adapt.
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-20.c 
b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-20.c
new file mode 100644
index 
0000000000000000000000000000000000000000..9f51a68f3a0c8851af2cd26bd8235c771b851d7d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-20.c
@@ -0,0 +1,87 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-additional-options "-fopenmp-simd --param vect-epilogues-nomask=0" } */
+/* { dg-additional-options "-mavx" { target avx_runtime } } */
+
+/* Test that simd inbranch clones work correctly.  */
+
+#ifndef TYPE
+#define TYPE int
+#endif
+
+/* A simple function that will be cloned.  */
+#pragma omp declare simd inbranch
+TYPE __attribute__((noinline))
+foo (TYPE a)
+{
+  return a + 1;
+}
+
+/* Check that "inbranch" clones are called correctly.  */
+
+void __attribute__((noipa))
+masked (TYPE * __restrict a, TYPE * __restrict b, int size)
+{
+  #pragma omp simd
+  for (int i = 0; i < size; i++)
+    b[i] = foo(a[i]);
+}
+
+/* Check that "inbranch" works when there might be unrolling.  */
+
+void __attribute__((noipa))
+masked_fixed (TYPE * __restrict a, TYPE * __restrict b)
+{
+  #pragma omp simd
+  for (int i = 0; i < 128; i++)
+    b[i] = foo(a[i]);
+}
+
+/* Validate the outputs.  */
+
+void
+check_masked (TYPE *b, int size)
+{
+  for (int i = 0; i < size; i++)
+    if (b[i] != (TYPE)(i + 1))
+      {
+       __builtin_printf ("error at %d\n", i);
+       __builtin_exit (1);
+      }
+}
+
+int
+main ()
+{
+  TYPE a[1024];
+  TYPE b[1024];
+
+  for (int i = 0; i < 1024; i++)
+    a[i] = i;
+
+  masked_fixed (a, b);
+  check_masked (b, 128);
+
+  /* Test various sizes to cover machines with different vectorization
+     factors.  */
+  for (int size = 8; size <= 1024; size *= 2)
+    {
+      masked (a, b, size);
+      check_masked (b, size);
+    }
+
+  /* Test sizes that might exercise the partial vector code-path.  */
+  for (int size = 8; size <= 1024; size *= 2)
+    {
+      masked (a, b, size-4);
+      check_masked (b, size-4);
+    }
+
+  return 0;
+}
+
+/* Ensure the the in-branch simd clones are used on targets that support them. 
 */
+/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 2 "vect" 
{ target { aarch64*-*-* } } } } */
+/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 4 "vect" 
{ target { x86_64*-*-* } } } } */
+
+/* The LTO test produces two dump files and we scan the wrong one.  */
+/* { dg-skip-if "" { *-*-* } { "-flto" } { "" } } */
diff --git a/gcc/testsuite/gfortran.dg/simd-builtins-1.h 
b/gcc/testsuite/gfortran.dg/simd-builtins-1.h
index 
88d555cf41ad065ea525a63d7c05d15d3e5b54ed..08b73514a67d5791d35203530d039741946e9dcc
 100644
--- a/gcc/testsuite/gfortran.dg/simd-builtins-1.h
+++ b/gcc/testsuite/gfortran.dg/simd-builtins-1.h
@@ -1,4 +1,3 @@
-!GCC$ builtin (sin) attributes simd (inbranch)
 !GCC$ builtin (sinf) attributes simd (notinbranch)
 !GCC$ builtin (cosf) attributes simd
 !GCC$ builtin (cosf) attributes simd (notinbranch)
diff --git a/gcc/testsuite/gfortran.dg/simd-builtins-6.f90 
b/gcc/testsuite/gfortran.dg/simd-builtins-6.f90
index 
60bcac78f3e0cc492930f3eb73cf97065312dc1c..2c68f9f1818a35674a0aef15793aa312a48199a8
 100644
--- a/gcc/testsuite/gfortran.dg/simd-builtins-6.f90
+++ b/gcc/testsuite/gfortran.dg/simd-builtins-6.f90
@@ -2,7 +2,6 @@
 ! { dg-additional-options "-nostdinc -Ofast -fdump-tree-optimized" }
 ! { dg-additional-options "-msse2 -mno-avx" { target i?86-*-linux* 
x86_64-*-linux* } }
 
-!GCC$ builtin (sin) attributes simd (inbranch)
 !GCC$ builtin (sinf) attributes simd (notinbranch)
 !GCC$ builtin (cosf) attributes simd
 !GCC$ builtin (cosf) attributes simd (notinbranch)
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 
a9200767f67a4c9a8e106259be97a7bc7cd7e9dc..5f262cae2aae784e3ef4fd07455b7aa742797b51
 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -4153,10 +4153,19 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
       {
        unsigned int this_badness = 0;
        unsigned int num_calls;
+       /* The number of arguments in the call and the number of parameters in
+          the simdclone should match.  However, when the simdclone is
+          'inbranch', it could have one more paramater than nargs when using
+          an inbranch simdclone to call a non-inbranch call, either in a
+          non-masked loop using a all true constant mask, or inside a masked
+          loop using it's mask.  */
+       size_t simd_nargs = n->simdclone->nargs;
+       if (!masked_call_offset && n->simdclone->inbranch)
+         simd_nargs--;
        if (!constant_multiple_p (vf * group_size, n->simdclone->simdlen,
                                  &num_calls)
            || (!n->simdclone->inbranch && (masked_call_offset > 0))
-           || nargs != n->simdclone->nargs)
+           || (nargs != simd_nargs))
          continue;
        if (num_calls != 1)
          this_badness += exact_log2 (num_calls) * 4096;

Reply via email to