Module Name:    src
Committed By:   riastradh
Date:           Sun Aug  9 02:49:38 UTC 2020

Modified Files:
        src/sys/crypto/aes/arch/arm: arm_neon.h
        src/sys/crypto/chacha/arch/arm: arm_neon.h

Log Message:
Fix some clang neon intrinsics.

Compile-tested only, with -Wno-nonportable-vector-initializers.  Need
to address -- and test -- this stuff properly but this is progress.


To generate a diff of this commit:
cvs rdiff -u -r1.9 -r1.10 src/sys/crypto/aes/arch/arm/arm_neon.h
cvs rdiff -u -r1.5 -r1.6 src/sys/crypto/chacha/arch/arm/arm_neon.h

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/aes/arch/arm/arm_neon.h
diff -u src/sys/crypto/aes/arch/arm/arm_neon.h:1.9 src/sys/crypto/aes/arch/arm/arm_neon.h:1.10
--- src/sys/crypto/aes/arch/arm/arm_neon.h:1.9	Sun Aug  9 02:48:38 2020
+++ src/sys/crypto/aes/arch/arm/arm_neon.h	Sun Aug  9 02:49:38 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: arm_neon.h,v 1.9 2020/08/09 02:48:38 riastradh Exp $	*/
+/*	$NetBSD: arm_neon.h,v 1.10 2020/08/09 02:49:38 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -85,6 +85,8 @@ typedef __attribute__((neon_vector_type(
 typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t;
 typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t;
 
+typedef __attribute__((neon_vector_type(8))) int8_t int8x8_t;
+
 typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t;
 
 typedef struct { uint8x8_t val[2]; } uint8x8x2_t;
@@ -218,7 +220,7 @@ vextq_u8(uint8x16_t __lo, uint8x16_t __h
 	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);			      \
 	uint8x16_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r,	      \
 	    (int8x16_t)__hi_r, (__i), 48);				      \
-	return __builtin_shufflevector(__r, __r,			      \
+	__builtin_shufflevector(__r, __r,				      \
 	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);			      \
 })
 #endif	/* __LITTLE_ENDIAN */
@@ -326,19 +328,37 @@ vqtbl1q_u8(uint8x16_t __tab, uint8x16_t 
 	return (uint8x16_t)__out64;
 #endif
 #elif defined(__clang__)
-#ifdef __LITTLE_ENDIAN__
-	return (uint8x16_t)__builtin_neon_vqtbl1q_v((int8x16_t)__tab,
-	    (int8x16_t)__idx, 48);
-#else
-	uint32x4_t __lo_r = __builtin_shufflevector(__lo, __lo,
+#ifndef __LITTLE_ENDIAN__
+	__tab = __builtin_shufflevector(__tab, __tab,
 	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
-	uint32x4_t __hi_r = __builtin_shufflevector(__hi, __hi,
+	__idx = __builtin_shufflevector(__idx, __idx,
 	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
-	uint32x4_t __r = __builtin_neon_vqtbl1q_v((int8x16_t)__tab,
-	    (int8x16_t)__idx, __i, 48);
-	return __builtin_shufflevector(__r, __r,
+#endif
+	uint8x16_t __r;
+#ifdef __aarch64__
+	__r = __builtin_neon_vqtbl1q_v((int8x16_t)__tab, (int8x16_t)__idx, 48);
+#else
+	uint64x2_t __tab64 = (uint64x2_t)__tab;
+	uint8x8_t __tablo = (uint8x8_t)__tab64[0];
+	uint8x8_t __tabhi = (uint8x8_t)__tab64[1];
+	uint64x2_t __idx64, __out64;
+	int8x8_t __idxlo, __idxhi, __outlo, __outhi;
+
+	__idx64 = (uint64x2_t)__idx;
+	__idxlo = (int8x8_t)__idx64[0];
+	__idxhi = (int8x8_t)__idx64[1];
+	__outlo = (uint8x8_t)__builtin_neon_vtbl2_v((int8x8_t)__tablo,
+	    (int8x8_t)__tabhi, (int8x8_t)__idxlo, 16);
+	__outhi = (uint8x8_t)__builtin_neon_vtbl2_v((int8x8_t)__tablo,
+	    (int8x8_t)__tabhi, (int8x8_t)__idxhi, 16);
+	__out64 = (uint64x2_t) { (uint64_t)__outlo, (uint64_t)__outhi };
+	__r = (uint8x16_t)__out64;
+#endif
+#ifndef __LITTLE_ENDIAN__
+	__r = __builtin_shufflevector(__r, __r,
 	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
 #endif
+	return __r;
 #endif
 }
 
@@ -579,7 +599,7 @@ vsriq_n_u32(uint32x4_t __vins, uint32x4_
 	(int32x4_t)__builtin_neon_vsriq_n_v((int32x4_t)(__vins),	      \
 	    (int32x4_t)(__vsh), (__bits), 34)
 #else
-#define	vsliq_n_s32(__vins, __vsh, __bits) (				      \
+#define	vsriq_n_s32(__vins, __vsh, __bits) (				      \
 {									      \
 	int32x4_t __tvins = (__vins);					      \
 	int32x4_t __tvsh = (__vsh);					      \

Index: src/sys/crypto/chacha/arch/arm/arm_neon.h
diff -u src/sys/crypto/chacha/arch/arm/arm_neon.h:1.5 src/sys/crypto/chacha/arch/arm/arm_neon.h:1.6
--- src/sys/crypto/chacha/arch/arm/arm_neon.h:1.5	Sun Aug  9 02:48:38 2020
+++ src/sys/crypto/chacha/arch/arm/arm_neon.h	Sun Aug  9 02:49:38 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: arm_neon.h,v 1.5 2020/08/09 02:48:38 riastradh Exp $	*/
+/*	$NetBSD: arm_neon.h,v 1.6 2020/08/09 02:49:38 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -85,6 +85,8 @@ typedef __attribute__((neon_vector_type(
 typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t;
 typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t;
 
+typedef __attribute__((neon_vector_type(8))) int8_t int8x8_t;
+
 typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t;
 
 typedef struct { uint8x8_t val[2]; } uint8x8x2_t;
@@ -218,7 +220,7 @@ vextq_u8(uint8x16_t __lo, uint8x16_t __h
 	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);			      \
 	uint8x16_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r,	      \
 	    (int8x16_t)__hi_r, (__i), 48);				      \
-	return __builtin_shufflevector(__r, __r,			      \
+	__builtin_shufflevector(__r, __r,				      \
 	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);			      \
 })
 #endif	/* __LITTLE_ENDIAN */
@@ -326,19 +328,37 @@ vqtbl1q_u8(uint8x16_t __tab, uint8x16_t 
 	return (uint8x16_t)__out64;
 #endif
 #elif defined(__clang__)
-#ifdef __LITTLE_ENDIAN__
-	return (uint8x16_t)__builtin_neon_vqtbl1q_v((int8x16_t)__tab,
-	    (int8x16_t)__idx, 48);
-#else
-	uint32x4_t __lo_r = __builtin_shufflevector(__lo, __lo,
+#ifndef __LITTLE_ENDIAN__
+	__tab = __builtin_shufflevector(__tab, __tab,
 	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
-	uint32x4_t __hi_r = __builtin_shufflevector(__hi, __hi,
+	__idx = __builtin_shufflevector(__idx, __idx,
 	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
-	uint32x4_t __r = __builtin_neon_vqtbl1q_v((int8x16_t)__tab,
-	    (int8x16_t)__idx, __i, 48);
-	return __builtin_shufflevector(__r, __r,
+#endif
+	uint8x16_t __r;
+#ifdef __aarch64__
+	__r = __builtin_neon_vqtbl1q_v((int8x16_t)__tab, (int8x16_t)__idx, 48);
+#else
+	uint64x2_t __tab64 = (uint64x2_t)__tab;
+	uint8x8_t __tablo = (uint8x8_t)__tab64[0];
+	uint8x8_t __tabhi = (uint8x8_t)__tab64[1];
+	uint64x2_t __idx64, __out64;
+	int8x8_t __idxlo, __idxhi, __outlo, __outhi;
+
+	__idx64 = (uint64x2_t)__idx;
+	__idxlo = (int8x8_t)__idx64[0];
+	__idxhi = (int8x8_t)__idx64[1];
+	__outlo = (uint8x8_t)__builtin_neon_vtbl2_v((int8x8_t)__tablo,
+	    (int8x8_t)__tabhi, (int8x8_t)__idxlo, 16);
+	__outhi = (uint8x8_t)__builtin_neon_vtbl2_v((int8x8_t)__tablo,
+	    (int8x8_t)__tabhi, (int8x8_t)__idxhi, 16);
+	__out64 = (uint64x2_t) { (uint64_t)__outlo, (uint64_t)__outhi };
+	__r = (uint8x16_t)__out64;
+#endif
+#ifndef __LITTLE_ENDIAN__
+	__r = __builtin_shufflevector(__r, __r,
 	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
 #endif
+	return __r;
 #endif
 }
 
@@ -579,7 +599,7 @@ vsriq_n_u32(uint32x4_t __vins, uint32x4_
 	(int32x4_t)__builtin_neon_vsriq_n_v((int32x4_t)(__vins),	      \
 	    (int32x4_t)(__vsh), (__bits), 34)
 #else
-#define	vsliq_n_s32(__vins, __vsh, __bits) (				      \
+#define	vsriq_n_s32(__vins, __vsh, __bits) (				      \
 {									      \
 	int32x4_t __tvins = (__vins);					      \
 	int32x4_t __tvsh = (__vsh);					      \

Reply via email to