Hi All,

There was a bug in the ACLE specication for dot product which has now
been fixed[1].  This means some intrinsics were missing and are added by this
patch.

Bootstrapped and regtested on arm-none-linux-gnueabihf and no issues.

Ok for master?

[1] https://github.com/ARM-software/acle/releases/tag/r2021Q3

Thanks,
Tamar

gcc/ChangeLog:

        * config/arm/arm_neon.h (vusdotq_s32, vusdot_laneq_s32,
        vusdotq_laneq_s32, vsudot_laneq_s32, vsudotq_laneq_s32): New
        * config/arm/arm_neon_builtins.def (usdot): Add V16QI.
        (usdot_laneq, sudot_laneq): New.
        * config/arm/neon.md (neon_<sup>dot_laneq<vsi2qi>): New.
        (neon_<sup>dot_lane<vsi2qi>): Remote unneeded code.

gcc/testsuite/ChangeLog:

        * gcc.target/arm/simd/vdot-2-1.c: Add new tests.
        * gcc.target/arm/simd/vdot-2-2.c: Likewise and fix output.

--- inline copy of patch -- 
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 
af6ac63dc3b47830d92f199d93153ff510f658e9..2255d600549a2a1e5dbcebc03f7d6a63bab9f5aa
 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -18930,6 +18930,13 @@ vusdot_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b)
   return __builtin_neon_usdotv8qi_ssus (__r, __a, __b);
 }
 
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusdotq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b)
+{
+  return __builtin_neon_usdotv16qi_ssus (__r, __a, __b);
+}
+
 __extension__ extern __inline int32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vusdot_lane_s32 (int32x2_t __r, uint8x8_t __a,
@@ -18962,6 +18969,38 @@ vsudotq_lane_s32 (int32x4_t __r, int8x16_t __a,
   return __builtin_neon_sudot_lanev16qi_sssus (__r, __a, __b, __index);
 }
 
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusdot_laneq_s32 (int32x2_t __r, uint8x8_t __a,
+                 int8x16_t __b, const int __index)
+{
+  return __builtin_neon_usdot_laneqv8qi_ssuss (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusdotq_laneq_s32 (int32x4_t __r, uint8x16_t __a,
+                  int8x16_t __b, const int __index)
+{
+  return __builtin_neon_usdot_laneqv16qi_ssuss (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsudot_laneq_s32 (int32x2_t __r, int8x8_t __a,
+                 uint8x16_t __b, const int __index)
+{
+  return __builtin_neon_sudot_laneqv8qi_sssus (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsudotq_laneq_s32 (int32x4_t __r, int8x16_t __a,
+                  uint8x16_t __b, const int __index)
+{
+  return __builtin_neon_sudot_laneqv16qi_sssus (__r, __a, __b, __index);
+}
+
 #pragma GCC pop_options
 
 #pragma GCC pop_options
diff --git a/gcc/config/arm/arm_neon_builtins.def 
b/gcc/config/arm/arm_neon_builtins.def
index 
f83dd4327c16c0af68f72eb6d9ca8cf21e2e56b5..1c150ed3b650a003b44901b4d160a7d6f595f057
 100644
--- a/gcc/config/arm/arm_neon_builtins.def
+++ b/gcc/config/arm/arm_neon_builtins.def
@@ -345,9 +345,11 @@ VAR2 (UMAC_LANE, udot_lane, v8qi, v16qi)
 VAR2 (MAC_LANE, sdot_laneq, v8qi, v16qi)
 VAR2 (UMAC_LANE, udot_laneq, v8qi, v16qi)
 
-VAR1 (USTERNOP, usdot, v8qi)
+VAR2 (USTERNOP, usdot, v8qi, v16qi)
 VAR2 (USMAC_LANE_QUADTUP, usdot_lane, v8qi, v16qi)
 VAR2 (SUMAC_LANE_QUADTUP, sudot_lane, v8qi, v16qi)
+VAR2 (USMAC_LANE_QUADTUP, usdot_laneq, v8qi, v16qi)
+VAR2 (SUMAC_LANE_QUADTUP, sudot_laneq, v8qi, v16qi)
 
 VAR4 (BINOP, vcadd90, v4hf, v2sf, v8hf, v4sf)
 VAR4 (BINOP, vcadd270, v4hf, v2sf, v8hf, v4sf)
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 
848166311b5f82c5facb66e97c2260a5aba5d302..1707d8e625079b83497a3db44db5e33405bb5fa1
 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -2977,9 +2977,33 @@ (define_insn "neon_<sup>dot_lane<vsi2qi>"
            DOTPROD_I8MM)
          (match_operand:VCVTI 1 "register_operand" "0")))]
   "TARGET_I8MM"
+  "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]"
+  [(set_attr "type" "neon_dot<q>")]
+)
+
+;; These instructions map to the __builtins for the Dot Product
+;; indexed operations in the v8.6 I8MM extension.
+(define_insn "neon_<sup>dot_laneq<vsi2qi>"
+  [(set (match_operand:VCVTI 0 "register_operand" "=w")
+       (plus:VCVTI
+         (unspec:VCVTI [(match_operand:<VSI2QI> 2 "register_operand" "w")
+                        (match_operand:V16QI 3 "register_operand" "t")
+                        (match_operand:SI 4 "immediate_operand" "i")]
+                        DOTPROD_I8MM)
+         (match_operand:VCVTI 1 "register_operand" "0")))]
+  "TARGET_I8MM"
   {
-    operands[4] = GEN_INT (INTVAL (operands[4]));
-    return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]";
+    int lane = INTVAL (operands[4]);
+    if (lane > GET_MODE_NUNITS (V2SImode) - 1)
+      {
+       operands[4] = GEN_INT (lane - GET_MODE_NUNITS (V2SImode));
+       return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %f3[%c4]";
+      }
+    else
+      {
+       operands[4] = GEN_INT (lane);
+       return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %e3[%c4]";
+      }
   }
   [(set_attr "type" "neon_dot<q>")]
 )
diff --git a/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c 
b/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c
index 
88b80cff2329d9c502f40a31bbef70d26251c909..35d713f6a60d3d5880ddc8b43e238b7403b4f135
 100644
--- a/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c
+++ b/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c
@@ -2,7 +2,7 @@
 /* { dg-require-effective-target arm_hard_ok } */
 /* { dg-require-effective-target arm_v8_2a_i8mm_ok } */
 /* { dg-add-options arm_v8_2a_i8mm }  */
-/* { dg-additional-options "-O -save-temps -mfloat-abi=hard" } */
+/* { dg-additional-options "-O -save-temps -mfloat-abi=hard -mfpu=auto" } */
 /* { dg-final { check-function-bodies "**" "" } } */
 
 #include <arm_neon.h>
@@ -20,6 +20,17 @@ int32x2_t usfoo (int32x2_t r, uint8x8_t x, int8x8_t y)
   return vusdot_s32 (r, x, y);
 }
 
+/*
+**usfooq:
+**     ...
+**     vusdot\.s8      q0, q1, q2
+**     bx      lr
+*/
+int32x4_t usfooq (int32x4_t r, uint8x16_t x, int8x16_t y)
+{
+  return vusdotq_s32 (r, x, y);
+}
+
 /*
 **usfoo_lane:
 **     ...
@@ -66,6 +77,52 @@ int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, uint8x8_t y)
   return vsudotq_lane_s32 (r, x, y, 1);
 }
 
+/*
+**usfoo_laneq:
+**     ...
+**     vusdot\.s8      d0, d1, d3\[0\]
+**     bx      lr
+*/
+int32x2_t usfoo_laneq (int32x2_t r, uint8x8_t x, int8x16_t y)
+{
+  return vusdot_laneq_s32 (r, x, y, 2);
+}
+
+/*
+**usfooq_laneq:
+**     ...
+**     vusdot\.s8      q0, q1, d5\[1\]
+**     bx      lr
+*/
+int32x4_t usfooq_laneq (int32x4_t r, uint8x16_t x, int8x16_t y)
+{
+  return vusdotq_laneq_s32 (r, x, y, 3);
+}
+
+/* Signed-Unsigned Dot Product instructions.  */
+
+/*
+**sfoo_laneq:
+**     ...
+**     vsudot\.u8      d0, d1, d3\[0\]
+**     bx      lr
+*/
+int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, uint8x16_t y)
+{
+  return vsudot_laneq_s32 (r, x, y, 2);
+}
+
+/*
+**sfooq_laneq:
+**     ...
+**     vsudot\.u8      q0, q1, d5\[1\]
+**     bx      lr
+*/
+int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, uint8x16_t y)
+{
+  return vsudotq_laneq_s32 (r, x, y, 3);
+}
+
 /*
 **usfoo_untied:
 **     ...
diff --git a/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c 
b/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c
index 
1c74718ca5644be05b4d4839c3a7ea40bff11e40..c57dd423dbc45b2f9f7890ada0f081f80381b05c
 100644
--- a/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c
+++ b/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c
@@ -2,7 +2,7 @@
 /* { dg-require-effective-target arm_hard_ok } */
 /* { dg-require-effective-target arm_v8_2a_i8mm_ok } */
 /* { dg-add-options arm_v8_2a_i8mm }  */
-/* { dg-additional-options "-O -save-temps -mbig-endian -mfloat-abi=hard" } */
+/* { dg-additional-options "-O -save-temps -mfloat-abi=hard -mbig-endian 
-mfpu=auto" } */
 /* { dg-final { check-function-bodies "**" "" } } */
 
 #include <arm_neon.h>
@@ -20,6 +20,17 @@ int32x2_t usfoo (int32x2_t r, uint8x8_t x, int8x8_t y)
   return vusdot_s32 (r, x, y);
 }
 
+/*
+**usfooq:
+**     ...
+**     vusdot\.s8      q0, q1, q2
+**     bx      lr
+*/
+int32x4_t usfooq (int32x4_t r, uint8x16_t x, int8x16_t y)
+{
+  return vusdotq_s32 (r, x, y);
+}
+
 /*
 **usfoo_lane:
 **     ...
@@ -66,6 +77,52 @@ int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, uint8x8_t y)
   return vsudotq_lane_s32 (r, x, y, 1);
 }
 
+/*
+**usfoo_laneq:
+**     ...
+**     vusdot\.s8      d0, d1, d3\[0\]
+**     bx      lr
+*/
+int32x2_t usfoo_laneq (int32x2_t r, uint8x8_t x, int8x16_t y)
+{
+  return vusdot_laneq_s32 (r, x, y, 2);
+}
+
+/*
+**usfooq_laneq:
+**     ...
+**     vusdot\.s8      q0, q1, d5\[1\]
+**     bx      lr
+*/
+int32x4_t usfooq_laneq (int32x4_t r, uint8x16_t x, int8x16_t y)
+{
+  return vusdotq_laneq_s32 (r, x, y, 3);
+}
+
+/* Signed-Unsigned Dot Product instructions.  */
+
+/*
+**sfoo_laneq:
+**     ...
+**     vsudot\.u8      d0, d1, d3\[0\]
+**     bx      lr
+*/
+int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, uint8x16_t y)
+{
+  return vsudot_laneq_s32 (r, x, y, 2);
+}
+
+/*
+**sfooq_laneq:
+**     ...
+**     vsudot\.u8      q0, q1, d5\[1\]
+**     bx      lr
+*/
+int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, uint8x16_t y)
+{
+  return vsudotq_laneq_s32 (r, x, y, 3);
+}
+
 /*
 **usfoo_untied:
 **     ...
@@ -89,3 +146,4 @@ int32x2_t usfoo_lane_untied (int32x2_t unused, int32x2_t r, 
uint8x8_t x, int8x8_
 {
   return vusdot_lane_s32 (r, x, y, 0);
 }
+


-- 
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index af6ac63dc3b47830d92f199d93153ff510f658e9..2255d600549a2a1e5dbcebc03f7d6a63bab9f5aa 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -18930,6 +18930,13 @@ vusdot_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b)
   return __builtin_neon_usdotv8qi_ssus (__r, __a, __b);
 }
 
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusdotq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b)
+{
+  return __builtin_neon_usdotv16qi_ssus (__r, __a, __b);
+}
+
 __extension__ extern __inline int32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vusdot_lane_s32 (int32x2_t __r, uint8x8_t __a,
@@ -18962,6 +18969,38 @@ vsudotq_lane_s32 (int32x4_t __r, int8x16_t __a,
   return __builtin_neon_sudot_lanev16qi_sssus (__r, __a, __b, __index);
 }
 
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusdot_laneq_s32 (int32x2_t __r, uint8x8_t __a,
+		  int8x16_t __b, const int __index)
+{
+  return __builtin_neon_usdot_laneqv8qi_ssuss (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusdotq_laneq_s32 (int32x4_t __r, uint8x16_t __a,
+		   int8x16_t __b, const int __index)
+{
+  return __builtin_neon_usdot_laneqv16qi_ssuss (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsudot_laneq_s32 (int32x2_t __r, int8x8_t __a,
+		  uint8x16_t __b, const int __index)
+{
+  return __builtin_neon_sudot_laneqv8qi_sssus (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsudotq_laneq_s32 (int32x4_t __r, int8x16_t __a,
+		   uint8x16_t __b, const int __index)
+{
+  return __builtin_neon_sudot_laneqv16qi_sssus (__r, __a, __b, __index);
+}
+
 #pragma GCC pop_options
 
 #pragma GCC pop_options
diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def
index f83dd4327c16c0af68f72eb6d9ca8cf21e2e56b5..1c150ed3b650a003b44901b4d160a7d6f595f057 100644
--- a/gcc/config/arm/arm_neon_builtins.def
+++ b/gcc/config/arm/arm_neon_builtins.def
@@ -345,9 +345,11 @@ VAR2 (UMAC_LANE, udot_lane, v8qi, v16qi)
 VAR2 (MAC_LANE, sdot_laneq, v8qi, v16qi)
 VAR2 (UMAC_LANE, udot_laneq, v8qi, v16qi)
 
-VAR1 (USTERNOP, usdot, v8qi)
+VAR2 (USTERNOP, usdot, v8qi, v16qi)
 VAR2 (USMAC_LANE_QUADTUP, usdot_lane, v8qi, v16qi)
 VAR2 (SUMAC_LANE_QUADTUP, sudot_lane, v8qi, v16qi)
+VAR2 (USMAC_LANE_QUADTUP, usdot_laneq, v8qi, v16qi)
+VAR2 (SUMAC_LANE_QUADTUP, sudot_laneq, v8qi, v16qi)
 
 VAR4 (BINOP, vcadd90, v4hf, v2sf, v8hf, v4sf)
 VAR4 (BINOP, vcadd270, v4hf, v2sf, v8hf, v4sf)
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 848166311b5f82c5facb66e97c2260a5aba5d302..1707d8e625079b83497a3db44db5e33405bb5fa1 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -2977,9 +2977,33 @@ (define_insn "neon_<sup>dot_lane<vsi2qi>"
 	    DOTPROD_I8MM)
 	  (match_operand:VCVTI 1 "register_operand" "0")))]
   "TARGET_I8MM"
+  "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]"
+  [(set_attr "type" "neon_dot<q>")]
+)
+
+;; These instructions map to the __builtins for the Dot Product
+;; indexed operations in the v8.6 I8MM extension.
+(define_insn "neon_<sup>dot_laneq<vsi2qi>"
+  [(set (match_operand:VCVTI 0 "register_operand" "=w")
+	(plus:VCVTI
+	  (unspec:VCVTI [(match_operand:<VSI2QI> 2 "register_operand" "w")
+			 (match_operand:V16QI 3 "register_operand" "t")
+			 (match_operand:SI 4 "immediate_operand" "i")]
+			 DOTPROD_I8MM)
+	  (match_operand:VCVTI 1 "register_operand" "0")))]
+  "TARGET_I8MM"
   {
-    operands[4] = GEN_INT (INTVAL (operands[4]));
-    return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]";
+    int lane = INTVAL (operands[4]);
+    if (lane > GET_MODE_NUNITS (V2SImode) - 1)
+      {
+	operands[4] = GEN_INT (lane - GET_MODE_NUNITS (V2SImode));
+	return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %f3[%c4]";
+      }
+    else
+      {
+	operands[4] = GEN_INT (lane);
+	return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %e3[%c4]";
+      }
   }
   [(set_attr "type" "neon_dot<q>")]
 )
diff --git a/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c b/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c
index 88b80cff2329d9c502f40a31bbef70d26251c909..35d713f6a60d3d5880ddc8b43e238b7403b4f135 100644
--- a/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c
+++ b/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c
@@ -2,7 +2,7 @@
 /* { dg-require-effective-target arm_hard_ok } */
 /* { dg-require-effective-target arm_v8_2a_i8mm_ok } */
 /* { dg-add-options arm_v8_2a_i8mm }  */
-/* { dg-additional-options "-O -save-temps -mfloat-abi=hard" } */
+/* { dg-additional-options "-O -save-temps -mfloat-abi=hard -mfpu=auto" } */
 /* { dg-final { check-function-bodies "**" "" } } */
 
 #include <arm_neon.h>
@@ -20,6 +20,17 @@ int32x2_t usfoo (int32x2_t r, uint8x8_t x, int8x8_t y)
   return vusdot_s32 (r, x, y);
 }
 
+/*
+**usfooq:
+**	...
+**	vusdot\.s8	q0, q1, q2
+**	bx	lr
+*/
+int32x4_t usfooq (int32x4_t r, uint8x16_t x, int8x16_t y)
+{
+  return vusdotq_s32 (r, x, y);
+}
+
 /*
 **usfoo_lane:
 **	...
@@ -66,6 +77,52 @@ int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, uint8x8_t y)
   return vsudotq_lane_s32 (r, x, y, 1);
 }
 
+/*
+**usfoo_laneq:
+**	...
+**	vusdot\.s8	d0, d1, d3\[0\]
+**	bx	lr
+*/
+int32x2_t usfoo_laneq (int32x2_t r, uint8x8_t x, int8x16_t y)
+{
+  return vusdot_laneq_s32 (r, x, y, 2);
+}
+
+/*
+**usfooq_laneq:
+**	...
+**	vusdot\.s8	q0, q1, d5\[1\]
+**	bx	lr
+*/
+int32x4_t usfooq_laneq (int32x4_t r, uint8x16_t x, int8x16_t y)
+{
+  return vusdotq_laneq_s32 (r, x, y, 3);
+}
+
+/* Signed-Unsigned Dot Product instructions.  */
+
+/*
+**sfoo_laneq:
+**	...
+**	vsudot\.u8	d0, d1, d3\[0\]
+**	bx	lr
+*/
+int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, uint8x16_t y)
+{
+  return vsudot_laneq_s32 (r, x, y, 2);
+}
+
+/*
+**sfooq_laneq:
+**	...
+**	vsudot\.u8	q0, q1, d5\[1\]
+**	bx	lr
+*/
+int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, uint8x16_t y)
+{
+  return vsudotq_laneq_s32 (r, x, y, 3);
+}
+
 /*
 **usfoo_untied:
 **	...
diff --git a/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c b/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c
index 1c74718ca5644be05b4d4839c3a7ea40bff11e40..c57dd423dbc45b2f9f7890ada0f081f80381b05c 100644
--- a/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c
+++ b/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c
@@ -2,7 +2,7 @@
 /* { dg-require-effective-target arm_hard_ok } */
 /* { dg-require-effective-target arm_v8_2a_i8mm_ok } */
 /* { dg-add-options arm_v8_2a_i8mm }  */
-/* { dg-additional-options "-O -save-temps -mbig-endian -mfloat-abi=hard" } */
+/* { dg-additional-options "-O -save-temps -mfloat-abi=hard -mbig-endian -mfpu=auto" } */
 /* { dg-final { check-function-bodies "**" "" } } */
 
 #include <arm_neon.h>
@@ -20,6 +20,17 @@ int32x2_t usfoo (int32x2_t r, uint8x8_t x, int8x8_t y)
   return vusdot_s32 (r, x, y);
 }
 
+/*
+**usfooq:
+**	...
+**	vusdot\.s8	q0, q1, q2
+**	bx	lr
+*/
+int32x4_t usfooq (int32x4_t r, uint8x16_t x, int8x16_t y)
+{
+  return vusdotq_s32 (r, x, y);
+}
+
 /*
 **usfoo_lane:
 **	...
@@ -66,6 +77,52 @@ int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, uint8x8_t y)
   return vsudotq_lane_s32 (r, x, y, 1);
 }
 
+/*
+**usfoo_laneq:
+**	...
+**	vusdot\.s8	d0, d1, d3\[0\]
+**	bx	lr
+*/
+int32x2_t usfoo_laneq (int32x2_t r, uint8x8_t x, int8x16_t y)
+{
+  return vusdot_laneq_s32 (r, x, y, 2);
+}
+
+/*
+**usfooq_laneq:
+**	...
+**	vusdot\.s8	q0, q1, d5\[1\]
+**	bx	lr
+*/
+int32x4_t usfooq_laneq (int32x4_t r, uint8x16_t x, int8x16_t y)
+{
+  return vusdotq_laneq_s32 (r, x, y, 3);
+}
+
+/* Signed-Unsigned Dot Product instructions.  */
+
+/*
+**sfoo_laneq:
+**	...
+**	vsudot\.u8	d0, d1, d3\[0\]
+**	bx	lr
+*/
+int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, uint8x16_t y)
+{
+  return vsudot_laneq_s32 (r, x, y, 2);
+}
+
+/*
+**sfooq_laneq:
+**	...
+**	vsudot\.u8	q0, q1, d5\[1\]
+**	bx	lr
+*/
+int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, uint8x16_t y)
+{
+  return vsudotq_laneq_s32 (r, x, y, 3);
+}
+
 /*
 **usfoo_untied:
 **	...
@@ -89,3 +146,4 @@ int32x2_t usfoo_lane_untied (int32x2_t unused, int32x2_t r, uint8x8_t x, int8x8_
 {
   return vusdot_lane_s32 (r, x, y, 0);
 }
+

Reply via email to