Hi,

The vst<n>_lane_<lane_type> intrinsics should write
(sizeof (lane_type) * n) bytes to memory.

In their current form, their asm constraints suggest a write size of
(sizeof (vector_type) * n). This is anywhere from 1 to 16 times too
much data, can cause huge headaches with dead store elimination.

This patch better models how much data we will be writing, which in
turn lets us eliminate the memory clobber. Together, we avoid the
problems with dead store elimination.

Tested with aarch64.exp and checked the C++ neon mangling test which
often breaks when you do these ugly casts.

OK?

Thanks,
James

---
gcc/

2013-10-29  James Greenhalgh  <james.greenha...@arm.com>

        * config/aarch64/arm_neon.h
        (__ST2_LANE_FUNC): Better model data size.
        (__ST3_LANE_FUNC): Likewise.
        (__ST4_LANE_FUNC): Likewise.
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 787ff15..7a63ea1 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -14704,16 +14704,19 @@ __LD4_LANE_FUNC (uint64x2x4_t, uint64_t, 2d, d, u64, q)
 
 #define __ST2_LANE_FUNC(intype, ptrtype, regsuffix,			\
 			lnsuffix, funcsuffix, Q)			\
+  typedef struct { ptrtype __x[2]; } __ST2_LANE_STRUCTURE_##intype;	\
   __extension__ static __inline void					\
   __attribute__ ((__always_inline__))					\
-  vst2 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr,		\
+  vst2 ## Q ## _lane_ ## funcsuffix (ptrtype *ptr,			\
 				     intype b, const int c)		\
   {									\
+    __ST2_LANE_STRUCTURE_##intype *__p =				\
+				(__ST2_LANE_STRUCTURE_##intype *)ptr;	\
     __asm__ ("ld1 {v16." #regsuffix ", v17." #regsuffix "}, %1\n\t"	\
 	     "st2 {v16." #lnsuffix ", v17." #lnsuffix "}[%2], %0\n\t"	\
-	     : "=Q"(*(intype *) ptr)					\
+	     : "=Q"(*__p)						\
 	     : "Q"(b), "i"(c)						\
-	     : "memory", "v16", "v17");					\
+	     : "v16", "v17");						\
   }
 
 __ST2_LANE_FUNC (int8x8x2_t, int8_t, 8b, b, s8,)
@@ -14743,16 +14746,19 @@ __ST2_LANE_FUNC (uint64x2x2_t, uint64_t, 2d, d, u64, q)
 
 #define __ST3_LANE_FUNC(intype, ptrtype, regsuffix,			\
 			lnsuffix, funcsuffix, Q)			\
+  typedef struct { ptrtype __x[3]; } __ST3_LANE_STRUCTURE_##intype;	\
   __extension__ static __inline void					\
   __attribute__ ((__always_inline__))					\
-  vst3 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr,		\
+  vst3 ## Q ## _lane_ ## funcsuffix (ptrtype *ptr,			\
 				     intype b, const int c)		\
   {									\
+    __ST3_LANE_STRUCTURE_##intype *__p =				\
+				(__ST3_LANE_STRUCTURE_##intype *)ptr;	\
     __asm__ ("ld1 {v16." #regsuffix " - v18." #regsuffix "}, %1\n\t"	\
 	     "st3 {v16." #lnsuffix " - v18." #lnsuffix "}[%2], %0\n\t"	\
-	     : "=Q"(*(intype *) ptr)					\
+	     : "=Q"(*__p)						\
 	     : "Q"(b), "i"(c)						\
-	     : "memory", "v16", "v17", "v18");				\
+	     : "v16", "v17", "v18");					\
   }
 
 __ST3_LANE_FUNC (int8x8x3_t, int8_t, 8b, b, s8,)
@@ -14782,16 +14788,19 @@ __ST3_LANE_FUNC (uint64x2x3_t, uint64_t, 2d, d, u64, q)
 
 #define __ST4_LANE_FUNC(intype, ptrtype, regsuffix,			\
 			lnsuffix, funcsuffix, Q)			\
+  typedef struct { ptrtype __x[4]; } __ST4_LANE_STRUCTURE_##intype;	\
   __extension__ static __inline void					\
   __attribute__ ((__always_inline__))					\
-  vst4 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr,		\
+  vst4 ## Q ## _lane_ ## funcsuffix (ptrtype *ptr,			\
 				     intype b, const int c)		\
   {									\
+    __ST4_LANE_STRUCTURE_##intype *__p =				\
+				(__ST4_LANE_STRUCTURE_##intype *)ptr;	\
     __asm__ ("ld1 {v16." #regsuffix " - v19." #regsuffix "}, %1\n\t"	\
 	     "st4 {v16." #lnsuffix " - v19." #lnsuffix "}[%2], %0\n\t"	\
-	     : "=Q"(*(intype *) ptr)					\
+	     : "=Q"(*__p)						\
 	     : "Q"(b), "i"(c)						\
-	     : "memory", "v16", "v17", "v18", "v19");			\
+	     : "v16", "v17", "v18", "v19");				\
   }
 
 __ST4_LANE_FUNC (int8x8x4_t, int8_t, 8b, b, s8,)

Reply via email to