On Tue, 26 Feb 2019 at 05:03, <ndesaulni...@google.com> wrote: > > Clang warns: vector initializers are not compatible with NEON intrinsics > in big endian mode [-Wnonportable-vector-initialization] > > While this is usually the case, it's not an issue for this case since > we're initializing the uint8x16_t (16x uint8_t's) with the same value. > > Instead, use vdupq_n_u8 which both compilers lower into a single movi > instruction: https://godbolt.org/z/vBrgzt > > This avoids the static storage for a constant value. > > Link: https://github.com/ClangBuiltLinux/linux/issues/214 > Suggested-by: Nathan Chancellor <natechancel...@gmail.com> > Signed-off-by: Nick Desaulniers <ndesaulni...@google.com>
Much better, thanks. Did you double-check that the intrinsic exists on 32-bit ARM as well? I assume it does, but please make sure if you haven't yet. If so, Reviewed-by: Ard Biesheuvel <ard.biesheu...@linaro.org> > --- > lib/raid6/neon.uc | 5 ++--- > lib/raid6/recov_neon_inner.c | 7 ++----- > 2 files changed, 4 insertions(+), 8 deletions(-) > > diff --git a/lib/raid6/neon.uc b/lib/raid6/neon.uc > index d5242f544551..b7c68030da4f 100644 > --- a/lib/raid6/neon.uc > +++ b/lib/raid6/neon.uc > @@ -28,7 +28,6 @@ > > typedef uint8x16_t unative_t; > > -#define NBYTES(x) ((unative_t){x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x}) > #define NSIZE sizeof(unative_t) > > /* > @@ -61,7 +60,7 @@ void raid6_neon$#_gen_syndrome_real(int disks, unsigned > long bytes, void **ptrs) > int d, z, z0; > > register unative_t wd$$, wq$$, wp$$, w1$$, w2$$; > - const unative_t x1d = NBYTES(0x1d); > + const unative_t x1d = vdupq_n_u8(0x1d); > > z0 = disks - 3; /* Highest data disk */ > p = dptr[z0+1]; /* XOR parity */ > @@ -92,7 +91,7 @@ void raid6_neon$#_xor_syndrome_real(int disks, int start, > int stop, > int d, z, z0; > > register unative_t wd$$, wq$$, wp$$, w1$$, w2$$; > - const unative_t x1d = NBYTES(0x1d); > + const unative_t x1d = vdupq_n_u8(0x1d); > > z0 = stop; /* P/Q right side optimization */ > p = dptr[disks-2]; /* XOR parity */ > diff --git a/lib/raid6/recov_neon_inner.c b/lib/raid6/recov_neon_inner.c > index 8cd20c9f834a..7d00c31a6547 100644 > --- a/lib/raid6/recov_neon_inner.c > +++ b/lib/raid6/recov_neon_inner.c > @@ -10,11 +10,6 @@ > > #include <arm_neon.h> > > -static const uint8x16_t x0f = { > - 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, > - 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, > -}; > - > #ifdef CONFIG_ARM > /* > * AArch32 does not provide this intrinsic natively because it does not > @@ -41,6 +36,7 @@ void __raid6_2data_recov_neon(int bytes, uint8_t *p, > uint8_t *q, uint8_t *dp, > uint8x16_t pm1 = vld1q_u8(pbmul + 16); > uint8x16_t qm0 = vld1q_u8(qmul); > 
uint8x16_t qm1 = vld1q_u8(qmul + 16); > + uint8x16_t x0f = vdupq_n_u8(0x0f); > > /* > * while ( bytes-- ) { > @@ -87,6 +83,7 @@ void __raid6_datap_recov_neon(int bytes, uint8_t *p, > uint8_t *q, uint8_t *dq, > { > uint8x16_t qm0 = vld1q_u8(qmul); > uint8x16_t qm1 = vld1q_u8(qmul + 16); > + uint8x16_t x0f = vdupq_n_u8(0x0f); > > /* > * while (bytes--) { > -- > 2.21.0.rc2.261.ga7da99ff1b-goog >