Hi Claudio,

> On 22 Jul 2024, at 11:30, Claudio Bantaloukas <claudio.bantalou...@arm.com> 
> wrote:
> 
> External email: Use caution opening links or attachments
> 
> 
> The ACLE declares several helper types and functions to
> facilitate construction of `fpm` arguments.
> 
> gcc/ChangeLog:
> 
>        * config/aarch64/arm_acle.h (fpm_t): New type representing fpmr values.
>        (enum __ARM_FPM_FORMAT): New enum representing valid fp8 formats.
>        (enum __ARM_FPM_OVERFLOW): New enum representing how some fp8
>        calculations work.
>        (arm_fpm_init): New.
>        (arm_set_fpm_src1_format): Likewise.
>        (arm_set_fpm_src2_format): Likewise.
>        (arm_set_fpm_dst_format): Likewise.
>        (arm_set_fpm_overflow_cvt): Likewise.
>        (arm_set_fpm_overflow_mul): Likewise.
>        (arm_set_fpm_lscale): Likewise.
>        (arm_set_fpm_lscale2): Likewise.
>        (arm_set_fpm_nscale): Likewise.

According to the FP8 pull request in ACLE:
https://github.com/ARM-software/acle/pull/323/files
These functions should all be in the implementation namespace, that is they 
should begin with __arm*.

> 
> gcc/testsuite/ChangeLog:
> 
>        * gcc.target/aarch64/acle/fp8-helpers.c: New test of fpmr helper 
> functions.
> ---
> gcc/config/aarch64/arm_acle.h                 | 37 +++++++++++++
> .../gcc.target/aarch64/acle/fp8-helpers.c     | 52 +++++++++++++++++++
> 2 files changed, 89 insertions(+)
> create mode 100644 gcc/testsuite/gcc.target/aarch64/acle/fp8-helpers.c
> 
> diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h
> index 2aa681090fa..4f0c600582a 100644
> --- a/gcc/config/aarch64/arm_acle.h
> +++ b/gcc/config/aarch64/arm_acle.h
> @@ -385,6 +385,43 @@ __rndrrs (uint64_t *__res)
> 
> #pragma GCC pop_options
> 
> +#ifdef __ARM_FEATURE_FP8
> +
> +typedef uint64_t fpm_t;
> +
> +enum __ARM_FPM_FORMAT
> +{
> +  __ARM_FPM_E5M2,
> +  __ARM_FPM_E4M3,
> +};
> +
> +enum __ARM_FPM_OVERFLOW
> +{
> +  __ARM_FPM_INFNAN,
> +  __ARM_FPM_SATURATE,
> +};
> +
> +#define arm_fpm_init() (0)
> +
> +#define arm_set_fpm_src1_format(__fpm, __format) \
> +  ((__fpm & ~(uint64_t)0x7) | (__format & (uint64_t)0x7))
> +#define arm_set_fpm_src2_format(__fpm, __format) \
> +  ((__fpm & ~((uint64_t)0x7 << 3)) | ((__format & (uint64_t)0x7) << 3))
> +#define arm_set_fpm_dst_format(__fpm, __format) \
> +  ((__fpm & ~((uint64_t)0x7 << 6)) | ((__format & (uint64_t)0x7) << 6))
> +#define arm_set_fpm_overflow_cvt(__fpm, __behaviour) \
> +  ((__fpm & ~((uint64_t)0x1 << 15)) | ((__behaviour & (uint64_t)0x1) << 15))
> +#define arm_set_fpm_overflow_mul(__fpm, __behaviour) \
> +  ((__fpm & ~((uint64_t)0x1 << 14)) | ((__behaviour & (uint64_t)0x1) << 14))
> +#define arm_set_fpm_lscale(__fpm, __scale) \
> +  ((__fpm & ~((uint64_t)0x7f << 16)) | ((__scale & (uint64_t)0x7f) << 16))
> +#define arm_set_fpm_lscale2(__fpm, __scale) \
> +  ((__fpm & ~((uint64_t)0x3f << 32)) | ((__scale & (uint64_t)0x3f) << 32))
> +#define arm_set_fpm_nscale(__fpm, __scale) \
> +  ((__fpm & ~((uint64_t)0xff << 24)) | ((__scale & (uint64_t)0xff) << 24))
> +

I’d feel more comfortable if these were implemented as proper intrinsics in 
aarch64-builtins.cc <http://aarch64-builtins.cc/> rather than open-coding them 
like that. For one it would guarantee tighter control over the code generation.
This version would likely fluctuate a lot based on optimization level, 
evolution of unrelated passes in the mined and other things.
Additionally, the ACLE mandates certain ranges for the scale parameter in these 
intrinsics. Having them as builtins would allow us to add validation code to 
help the user here.

> +#endif
> +
> #ifdef __cplusplus
> }
> #endif
> diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fp8-helpers.c 
> b/gcc/testsuite/gcc.target/aarch64/acle/fp8-helpers.c
> new file mode 100644
> index 00000000000..e654326c387
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/acle/fp8-helpers.c
> @@ -0,0 +1,52 @@
> +/* Test the fp8 ACLE helper functions.  */
> +/* { dg-do compile } */
> +/* { dg-options "-std=c90 -pedantic-errors -O1 -march=armv9.4-a+fp8" } */
> +
> +#include <arm_acle.h>
> +
> +void
> +test_prepare_fpmr_sysreg ()
> +{
> +
> +#define _S_EQ(expr, expected)                                                
>   \
> +  _Static_assert (expr == expected, #expr " == " #expected)
> +
> +  _S_EQ (arm_fpm_init (), 0);
> +
> +  /* Bits [2:0] */
> +  _S_EQ (arm_set_fpm_src1_format (arm_fpm_init (), __ARM_FPM_E5M2), 0);
> +  _S_EQ (arm_set_fpm_src1_format (arm_fpm_init (), __ARM_FPM_E4M3), 0x1);
> +
> +  /* Bits [5:3] */
> +  _S_EQ (arm_set_fpm_src2_format (arm_fpm_init (), __ARM_FPM_E5M2), 0);
> +  _S_EQ (arm_set_fpm_src2_format (arm_fpm_init (), __ARM_FPM_E4M3), 0x8);
> +
> +  /* Bits [8:6] */
> +  _S_EQ (arm_set_fpm_dst_format (arm_fpm_init (), __ARM_FPM_E5M2), 0);
> +  _S_EQ (arm_set_fpm_dst_format (arm_fpm_init (), __ARM_FPM_E4M3), 0x40);
> +
> +  /* Bit 14 */
> +  _S_EQ (arm_set_fpm_overflow_mul (arm_fpm_init (), __ARM_FPM_INFNAN), 0);
> +  _S_EQ (arm_set_fpm_overflow_mul (arm_fpm_init (), __ARM_FPM_SATURATE),
> + 0x4000);
> +
> +  /* Bit 15 */
> +  _S_EQ (arm_set_fpm_overflow_cvt (arm_fpm_init (), __ARM_FPM_INFNAN), 0);
> +  _S_EQ (arm_set_fpm_overflow_cvt (arm_fpm_init (), __ARM_FPM_SATURATE),
> + 0x8000);
> +
> +  /* Bits [22:16] */
> +  _S_EQ (arm_set_fpm_lscale (arm_fpm_init (), 0), 0);
> +  _S_EQ (arm_set_fpm_lscale (arm_fpm_init (), 127), 0x7F0000);
> +
> +  /* Bits [37:32] */
> +  _S_EQ (arm_set_fpm_lscale2 (arm_fpm_init (), 0), 0);
> +  _S_EQ (arm_set_fpm_lscale2 (arm_fpm_init (), 63), 0x3F00000000);
> +
> +  /* Bits [31:24] */
> +  _S_EQ (arm_set_fpm_nscale (arm_fpm_init (), 0), 0);
> +  _S_EQ (arm_set_fpm_nscale (arm_fpm_init (), 127), 0x7F000000);
> +  _S_EQ (arm_set_fpm_nscale (arm_fpm_init (), -128), 0x80000000);
> +

Given my comment above, I’d expect to see some code generation tests as well to 
check that the FPM modification functions above generate tight code.

Thanks,
Kyrill


> +#undef _S_EQ
> +}

Reply via email to