Re: [PATCH 1/2] SVE intrinsics: Add strength reduction for division by constant.

Kyrylo Tkachov Tue, 30 Jul 2024 04:22:45 -0700

Hi Jennifer,

> On 30 Jul 2024, at 09:47, Jennifer Schmitz <jschm...@nvidia.com> wrote:
> 
> Dear Richard,
> Thanks for the feedback. Great to see this patch approved! I made the changes 
> as suggested.
> Best,
> Jennifer
> <0001-SVE-intrinsics-Add-strength-reduction-for-division-b.patch>


Thanks, I’m okay with the patch as well and have pushed it to trunk with 
7cde140863e.
To commit future patches yourself you should apply for Write After Approval 
commit access by filling in the form at 
https://sourceware.org/cgi-bin/pdw/ps_form.cgi . You can use my email address 
as approver.

Kyrill


> 
>> On 29 Jul 2024, at 22:55, Richard Sandiford <richard.sandif...@arm.com> 
>> wrote:
>> 
>> External email: Use caution opening links or attachments
>> 
>> 
>> Thanks for doing this.
>> 
>> Jennifer Schmitz <jschm...@nvidia.com> writes:
>>> [...]
>>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c 
>>> b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c
>>> index c49ca1aa524..6500b64c41b 100644
>>> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c
>>> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c
>>> @@ -1,6 +1,9 @@
>>> /* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
>>> 
>>> #include "test_sve_acle.h"
>>> +#include <stdint.h>
>>> +
>> 
>> I think it'd better to drop the explicit include of stdint.h.  arm_sve.h
>> is defined to include stdint.h itself, and we rely on that elsewhere.
>> 
>> Same for div_s64.c.
> Done.
>> 
>>> +#define MAXPOW 1<<30
>>> 
>>> /*
>>> ** div_s32_m_tied1:
>>> @@ -53,10 +56,27 @@ TEST_UNIFORM_ZX (div_w0_s32_m_untied, svint32_t, 
>>> int32_t,
>>>              z0 = svdiv_n_s32_m (p0, z1, x0),
>>>              z0 = svdiv_m (p0, z1, x0))
>>> 
>>> +/*
>>> +** div_1_s32_m_tied1:
>>> +**   sel     z0\.s, p0, z0\.s, z0\.s
>>> +**   ret
>>> +*/
>>> +TEST_UNIFORM_Z (div_1_s32_m_tied1, svint32_t,
>>> +             z0 = svdiv_n_s32_m (p0, z0, 1),
>>> +             z0 = svdiv_m (p0, z0, 1))
>>> +
>>> +/*
>>> +** div_1_s32_m_untied:
>>> +**   sel     z0\.s, p0, z1\.s, z1\.s
>>> +**   ret
>>> +*/
>>> +TEST_UNIFORM_Z (div_1_s32_m_untied, svint32_t,
>>> +             z0 = svdiv_n_s32_m (p0, z1, 1),
>>> +             z0 = svdiv_m (p0, z1, 1))
>>> +
>> 
>> [ Thanks for adding the tests (which look good to me).  If the output
>> ever improves in future, we can "defend" the improvement by changing
>> the test.  But in the meantime, the above defends something that is
>> known to work. ]
>> 
>>> [...]
>>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/div_const_run.c 
>>> b/gcc/testsuite/gcc.target/aarch64/sve/div_const_run.c
>>> new file mode 100644
>>> index 00000000000..1a3c25b817d
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/aarch64/sve/div_const_run.c
>>> @@ -0,0 +1,91 @@
>>> +/* { dg-do run { target aarch64_sve128_hw } } */
>>> +/* { dg-options "-O2 -msve-vector-bits=128" } */
>>> +
>>> +#include <arm_sve.h>
>>> +#include <stdint.h>
>>> +
>>> +typedef svbool_t pred __attribute__((arm_sve_vector_bits(128)));
>>> +typedef svfloat16_t svfloat16_ __attribute__((arm_sve_vector_bits(128)));
>>> +typedef svfloat32_t svfloat32_ __attribute__((arm_sve_vector_bits(128)));
>>> +typedef svfloat64_t svfloat64_ __attribute__((arm_sve_vector_bits(128)));
>>> +typedef svint32_t svint32_ __attribute__((arm_sve_vector_bits(128)));
>>> +typedef svint64_t svint64_ __attribute__((arm_sve_vector_bits(128)));
>>> +typedef svuint32_t svuint32_ __attribute__((arm_sve_vector_bits(128)));
>>> +typedef svuint64_t svuint64_ __attribute__((arm_sve_vector_bits(128)));
>>> +
>>> +#define F(T, TS, P, OP1, OP2)                                              
>>>   \
>>> +{                                                                    \
>>> +  T##_t op1 = (T##_t) OP1;                                           \
>>> +  T##_t op2 = (T##_t) OP2;                                           \
>>> +  sv##T##_ res = svdiv_##P (pg, svdup_##TS (op1), svdup_##TS (op2)); \
>>> +  sv##T##_ exp = svdup_##TS (op1 / op2);                             \
>>> +  if (svptest_any (pg, svcmpne (pg, exp, res)))                            
>>>   \
>>> +    __builtin_abort ();                                                    
>>>   \
>>> +                                                                     \
>>> +  sv##T##_ res_n = svdiv_##P (pg, svdup_##TS (op1), op2);            \
>>> +  if (svptest_any (pg, svcmpne (pg, exp, res_n)))                    \
>>> +    __builtin_abort ();                                                    
>>>   \
>>> +}
>>> +
>>> +#define TEST_TYPES_1(T, TS)                                          \
>>> +  F (T, TS, m, 79, 16)                                                     
>>>   \
>>> +  F (T, TS, z, 79, 16)                                                     
>>>   \
>>> +  F (T, TS, x, 79, 16)
>>> +
>>> +#define TEST_TYPES                                                   \
>>> +  TEST_TYPES_1 (float16, f16)                                              
>>>   \
>>> +  TEST_TYPES_1 (float32, f32)                                              
>>>   \
>>> +  TEST_TYPES_1 (float64, f64)                                              
>>>   \
>>> +  TEST_TYPES_1 (int32, s32)                                          \
>>> +  TEST_TYPES_1 (int64, s64)                                          \
>>> +  TEST_TYPES_1 (uint32, u32)                                         \
>>> +  TEST_TYPES_1 (uint64, u64)
>>> +
>>> +#define TEST_VALUES_S_1(B, OP1, OP2)                                 \
>>> +  F (int##B, s##B, x, OP1, OP2)
>>> +
>>> +#define TEST_VALUES_S                                                      
>>>   \
>>> +  TEST_VALUES_S_1 (32, INT32_MIN, INT32_MIN)                         \
>>> +  TEST_VALUES_S_1 (64, INT64_MIN, INT64_MIN)                         \
>>> +  TEST_VALUES_S_1 (32, -7, 4)                                              
>>>   \
>>> +  TEST_VALUES_S_1 (64, -7, 4)                                              
>>>   \
>>> +  TEST_VALUES_S_1 (32, INT32_MAX, (1 << 30))                         \
>>> +  TEST_VALUES_S_1 (64, INT64_MAX, (1ULL << 62))                            
>>>   \
>>> +  TEST_VALUES_S_1 (32, INT32_MIN, (1 << 30))                         \
>>> +  TEST_VALUES_S_1 (64, INT64_MIN, (1ULL << 62))                            
>>>   \
>>> +  TEST_VALUES_S_1 (32, INT32_MAX, 1)                                 \
>>> +  TEST_VALUES_S_1 (64, INT64_MAX, 1)                                 \
>>> +  TEST_VALUES_S_1 (32, INT32_MIN, 16)                                      
>>>   \
>>> +  TEST_VALUES_S_1 (64, INT64_MIN, 16)                                      
>>>   \
>>> +  TEST_VALUES_S_1 (32, INT32_MAX, -5)                                      
>>>   \
>>> +  TEST_VALUES_S_1 (64, INT64_MAX, -5)                                      
>>>   \
>>> +  TEST_VALUES_S_1 (32, INT32_MIN, -4)                                      
>>>   \
>>> +  TEST_VALUES_S_1 (64, INT64_MIN, -4)
>>> +
>>> +#define TEST_VALUES_U_1(B, OP1, OP2)                                 \
>>> +  F (uint##B, u##B, x, OP1, OP2)
>>> +
>>> +#define TEST_VALUES_U                                                      
>>>   \
>>> +  TEST_VALUES_U_1 (32, UINT32_MAX, UINT32_MAX)                             
>>>   \
>>> +  TEST_VALUES_U_1 (64, UINT64_MAX, UINT64_MAX)                             
>>>   \
>>> +  TEST_VALUES_U_1 (32, UINT32_MAX, (1 << 31))                              
>>>   \
>>> +  TEST_VALUES_U_1 (64, UINT64_MAX, (1ULL << 63))                     \
>>> +  TEST_VALUES_U_1 (32, 7, 4)                                         \
>>> +  TEST_VALUES_U_1 (64, 7, 4)                                         \
>>> +  TEST_VALUES_U_1 (32, 7, 3)                                         \
>>> +  TEST_VALUES_U_1 (64, 7, 3)                                         \
>>> +  TEST_VALUES_U_1 (32, 11, 1)                                              
>>>   \
>>> +  TEST_VALUES_U_1 (64, 11, 1)
>>> +
>>> +#define TEST_VALUES                                                  \
>>> +  TEST_VALUES_S                                                            
>>>   \
>>> +  TEST_VALUES_U
>>> +
>>> +int
>>> +main (void)
>>> +{
>>> +  const pred pg = svptrue_b64 ();
>> 
>> I think this should svptrue_b8 instead.  As it stands, the:
>> 
>> if (svptest_any (pg, svcmpne (pg, ...)))
>>   __builtin_abort ();
>> 
>> tests will only check the first element in each 64-bit chunk.
> Done.
>> 
>> OK with those changes from my POV, but please give others 24 hours
>> to comment.
>> 
>> Thanks,
>> Richard
>> 
>>> +  TEST_TYPES
>>> +  TEST_VALUES
>>> +  return 0;
>>> +}
>

Re: [PATCH 1/2] SVE intrinsics: Add strength reduction for division by constant.

Reply via email to