[clang] [llvm] [Offload][CUDA] Add initial cuda_runtime.h overlay (PR #94821)
@@ -0,0 +1,30 @@ +// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t +// RUN: %t | %fcheck-generic + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO arsenm wrote: Better to enable supported cases? https://github.com/llvm/llvm-project/pull/94821 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Clang][AMDGPU] Add a new builtin type for buffer rsrc (PR #94830)
@@ -0,0 +1,9 @@ +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple amdgcn %s -emit-llvm -o - | FileCheck %s arsenm wrote: Why do you need -fclang-abi-compat=latest? https://github.com/llvm/llvm-project/pull/94830 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Clang][AMDGPU] Add a new builtin type for buffer rsrc (PR #94830)
@@ -0,0 +1,21 @@ +//===-- AMDGPUTypes.def - Metadata about AMDGPU types ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +// This file defines various AMDGPU builtin types. +// +//===--===// + +#ifndef AMDGPU_OPAQUE_TYPE +#define AMDGPU_OPAQUE_TYPE(Name, MangledName, Id, SingletonId) \ + AMDGPU_TYPE(Name, Id, SingletonId) +#endif + +AMDGPU_OPAQUE_TYPE("__buffer_rsrc_t", "__buffer_rsrc_t", AMDGPUBufferRsrc, AMDGPUBufferRsrcTy) arsenm wrote: Should it include an amdgpu prefix? https://github.com/llvm/llvm-project/pull/94830 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Clang][AMDGPU] Add a new builtin type for buffer rsrc (PR #94830)
https://github.com/arsenm edited https://github.com/llvm/llvm-project/pull/94830 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Clang][AMDGPU] Add a new builtin type for buffer rsrc (PR #94830)
@@ -2200,6 +2206,9 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const { Align = 8; \ break; #include "clang/Basic/WebAssemblyReferenceTypes.def" +case BuiltinType::AMDGPUBufferRsrc: + Width = 128; + Align = 128; arsenm wrote: If we were exposing the pointer, it would be 160/192 https://github.com/llvm/llvm-project/pull/94830 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Clang][AMDGPU] Add a new builtin type for buffer rsrc (PR #94830)
https://github.com/arsenm commented: Need stacked PR that adds the make_buffer_rsrc builtin that shows its use https://github.com/llvm/llvm-project/pull/94830 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Clang][AMDGPU] Add a new builtin type for buffer rsrc (PR #94830)
@@ -1091,6 +1091,9 @@ enum PredefinedTypeIDs { // \brief WebAssembly reference types with auto numeration #define WASM_TYPE(Name, Id, SingletonId) PREDEF_TYPE_##Id##_ID, #include "clang/Basic/WebAssemblyReferenceTypes.def" +// \breif AMDGPU types with auto numeration arsenm wrote: Typo breif https://github.com/llvm/llvm-project/pull/94830 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] Intrinsic: introduce minimumnum and maximumnum (PR #93841)
@@ -16055,6 +16145,90 @@ of the two arguments. -0.0 is considered to be less than +0.0 for this intrinsic. Note that these are the semantics specified in the draft of IEEE 754-2019. +.. _i_minimumnum: + +'``llvm.minimumnum.*``' Intrinsic +^ + +Syntax: +""" + +This is an overloaded intrinsic. You can use ``llvm.minimumnum`` on any +floating-point or vector of floating-point type. Not all targets support +all types however. + +:: + + declare float @llvm.minimumnum.f32(float %Val0, float %Val1) + declare double@llvm.minimumnum.f64(double %Val0, double %Val1) + declare x86_fp80 @llvm.minimumnum.f80(x86_fp80 %Val0, x86_fp80 %Val1) + declare fp128 @llvm.minimumnum.f128(fp128 %Val0, fp128 %Val1) + declare ppc_fp128 @llvm.minimumnum.ppcf128(ppc_fp128 %Val0, ppc_fp128 %Val1) + +Overview: +" + +The '``llvm.minimumnum.*``' intrinsics return the minimum of the two +arguments, not propagating NaNs and treating -0.0 as less than +0.0. + + +Arguments: +"" + +The arguments and return value are floating-point numbers of the same +type. + +Semantics: +"" +If both operands are NaNs (including sNaN), returns qNaN. If one operand +is NaN (including sNaN) and another operand is a number, return the number. +Otherwise returns the lesser of the two arguments. -0.0 is considered to +be less than +0.0 for this intrinsic. + +Note that these are the semantics of minimumNumber specified in IEEE 754-2019. + +.. _i_maximumnum: + +'``llvm.maximumnum.*``' Intrinsic +^ + +Syntax: +""" + +This is an overloaded intrinsic. You can use ``llvm.maximumnum`` on any +floating-point or vector of floating-point type. Not all targets support +all types however. 
+ +:: + + declare float @llvm.maximumnum.f32(float %Val0, float %Val1) + declare double@llvm.maximumnum.f64(double %Val0, double %Val1) + declare x86_fp80 @llvm.maximumnum.f80(x86_fp80 %Val0, x86_fp80 %Val1) + declare fp128 @llvm.maximumnum.f128(fp128 %Val0, fp128 %Val1) + declare ppc_fp128 @llvm.maximumnum.ppcf128(ppc_fp128 %Val0, ppc_fp128 %Val1) + +Overview: +" + +The '``llvm.maximumnum.*``' intrinsics return the maximum of the two +arguments, not propagating NaNs and treating -0.0 as less than +0.0. + + +Arguments: +"" + +The arguments and return value are floating-point numbers of the same +type. + +Semantics: +"" +If both operands are NaNs (including sNaN), returns qNaN. If one operand +is NaN (including sNaN) and another operand is a number, return the number. +Otherwise returns the greater of the two arguments. -0.0 is considered to +be less than +0.0 for this intrinsic. + +Note that these are the semantics of minimumNumber specified in IEEE 754-2019. arsenm wrote: Copy paste error minimumNumber. Also like above, this is not the signaling nan behavior (where the behavior inverts from quiet nan) https://github.com/llvm/llvm-project/pull/93841 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] Intrinsic: introduce minimumnum and maximumnum (PR #93841)
@@ -16055,6 +16145,90 @@ of the two arguments. -0.0 is considered to be less than +0.0 for this intrinsic. Note that these are the semantics specified in the draft of IEEE 754-2019. +.. _i_minimumnum: + +'``llvm.minimumnum.*``' Intrinsic +^ + +Syntax: +""" + +This is an overloaded intrinsic. You can use ``llvm.minimumnum`` on any +floating-point or vector of floating-point type. Not all targets support +all types however. + +:: + + declare float @llvm.minimumnum.f32(float %Val0, float %Val1) + declare double@llvm.minimumnum.f64(double %Val0, double %Val1) + declare x86_fp80 @llvm.minimumnum.f80(x86_fp80 %Val0, x86_fp80 %Val1) + declare fp128 @llvm.minimumnum.f128(fp128 %Val0, fp128 %Val1) + declare ppc_fp128 @llvm.minimumnum.ppcf128(ppc_fp128 %Val0, ppc_fp128 %Val1) + +Overview: +" + +The '``llvm.minimumnum.*``' intrinsics return the minimum of the two +arguments, not propagating NaNs and treating -0.0 as less than +0.0. + + +Arguments: +"" + +The arguments and return value are floating-point numbers of the same +type. + +Semantics: +"" +If both operands are NaNs (including sNaN), returns qNaN. If one operand +is NaN (including sNaN) and another operand is a number, return the number. +Otherwise returns the lesser of the two arguments. -0.0 is considered to +be less than +0.0 for this intrinsic. + +Note that these are the semantics of minimumNumber specified in IEEE 754-2019. + +.. _i_maximumnum: + +'``llvm.maximumnum.*``' Intrinsic +^ + +Syntax: +""" + +This is an overloaded intrinsic. You can use ``llvm.maximumnum`` on any +floating-point or vector of floating-point type. Not all targets support +all types however. 
+ +:: + + declare float @llvm.maximumnum.f32(float %Val0, float %Val1) + declare double@llvm.maximumnum.f64(double %Val0, double %Val1) + declare x86_fp80 @llvm.maximumnum.f80(x86_fp80 %Val0, x86_fp80 %Val1) + declare fp128 @llvm.maximumnum.f128(fp128 %Val0, fp128 %Val1) + declare ppc_fp128 @llvm.maximumnum.ppcf128(ppc_fp128 %Val0, ppc_fp128 %Val1) + +Overview: +" + +The '``llvm.maximumnum.*``' intrinsics return the maximum of the two +arguments, not propagating NaNs and treating -0.0 as less than +0.0. + + +Arguments: +"" + +The arguments and return value are floating-point numbers of the same +type. + +Semantics: +"" +If both operands are NaNs (including sNaN), returns qNaN. If one operand +is NaN (including sNaN) and another operand is a number, return the number. +Otherwise returns the greater of the two arguments. -0.0 is considered to +be less than +0.0 for this intrinsic. + +Note that these are the semantics of minimumNumber specified in IEEE 754-2019. arsenm wrote: Should also state the explicit difference here, on the intrinsic, the comparison to minnum https://github.com/llvm/llvm-project/pull/93841 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] Intrinsic: introduce minimumnum and maximumnum (PR #93841)
@@ -15874,6 +15874,96 @@ The returned value is completely identical to the input except for the sign bit; in particular, if the input is a NaN, then the quiet/signaling bit and payload are perfectly preserved. +.. _i_fminmax_family: + +'``llvm.min.*``' Intrinsics Comparation +^^^ + +Standard: +" + +IEEE754 and ISO C define some min/max operations, and they have some differences +on working with qNaN/sNaN and +0.0/-0.0. Here is the list: + +.. list-table:: + :header-rows: 2 + + * - ``ISO C`` + - fmin/fmax + - none + - fmininum/fmaximum + - fminimum_num/fmaximum_num + + * - ``IEEE754`` + - none + - nimNUM/maxNUM (2008) + - minimum/maximum (2019) + - minimumNumber/maximumNumber (2019) + + * - ``+0.0 vs -0.0`` + - either one + - +0.0 > -0.0 + - +0.0 > -0.0 + - +0.0 > -0.0 + + * - ``NUM vs sNaN`` + - qNaN, invalid excpetion + - qNaN, invalid excpetion + - qNaN, invalid excpetion + - NUM, invalid excpetion arsenm wrote: Typo exception throughout https://github.com/llvm/llvm-project/pull/93841 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] Intrinsic: introduce minimumnum and maximumnum (PR #93841)
@@ -15874,6 +15874,96 @@ The returned value is completely identical to the input except for the sign bit; in particular, if the input is a NaN, then the quiet/signaling bit and payload are perfectly preserved. +.. _i_fminmax_family: + +'``llvm.min.*``' Intrinsics Comparation +^^^ + +Standard: +" + +IEEE754 and ISO C define some min/max operations, and they have some differences +on working with qNaN/sNaN and +0.0/-0.0. Here is the list: + +.. list-table:: + :header-rows: 2 + + * - ``ISO C`` + - fmin/fmax + - none + - fmininum/fmaximum + - fminimum_num/fmaximum_num + + * - ``IEEE754`` + - none + - nimNUM/maxNUM (2008) + - minimum/maximum (2019) + - minimumNumber/maximumNumber (2019) + + * - ``+0.0 vs -0.0`` + - either one + - +0.0 > -0.0 + - +0.0 > -0.0 + - +0.0 > -0.0 + + * - ``NUM vs sNaN`` + - qNaN, invalid excpetion + - qNaN, invalid excpetion + - qNaN, invalid excpetion + - NUM, invalid excpetion + + * - ``NUM vs qNaN`` + - NUM, no excpetion + - NUM, no excpetion + - qNaN, no excpetion + - NUM, no excpetion + arsenm wrote: cover nan vs. nan case https://github.com/llvm/llvm-project/pull/93841 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] Intrinsic: introduce minimumnum and maximumnum (PR #93841)
@@ -16055,6 +16145,90 @@ of the two arguments. -0.0 is considered to be less than +0.0 for this intrinsic. Note that these are the semantics specified in the draft of IEEE 754-2019. +.. _i_minimumnum: + +'``llvm.minimumnum.*``' Intrinsic +^ + +Syntax: +""" + +This is an overloaded intrinsic. You can use ``llvm.minimumnum`` on any +floating-point or vector of floating-point type. Not all targets support +all types however. + +:: + + declare float @llvm.minimumnum.f32(float %Val0, float %Val1) + declare double@llvm.minimumnum.f64(double %Val0, double %Val1) + declare x86_fp80 @llvm.minimumnum.f80(x86_fp80 %Val0, x86_fp80 %Val1) + declare fp128 @llvm.minimumnum.f128(fp128 %Val0, fp128 %Val1) + declare ppc_fp128 @llvm.minimumnum.ppcf128(ppc_fp128 %Val0, ppc_fp128 %Val1) + +Overview: +" + +The '``llvm.minimumnum.*``' intrinsics return the minimum of the two +arguments, not propagating NaNs and treating -0.0 as less than +0.0. + + +Arguments: +"" + +The arguments and return value are floating-point numbers of the same +type. + +Semantics: +"" +If both operands are NaNs (including sNaN), returns qNaN. If one operand +is NaN (including sNaN) and another operand is a number, return the number. +Otherwise returns the lesser of the two arguments. -0.0 is considered to +be less than +0.0 for this intrinsic. + +Note that these are the semantics of minimumNumber specified in IEEE 754-2019. arsenm wrote: This is not the IEEE semantics for signaling nan. For a signaling nan, returns a quiet nan, not the non-NaN operand https://github.com/llvm/llvm-project/pull/93841 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Clang][AMDGPU] Add builtins for instrinsic `llvm.amdgcn.raw.buffer.store` (PR #94576)
arsenm wrote: > "aggregates" here might even be unusual cases like `<4 x i8>` Vectors aren't aggregates and are more reasonable https://github.com/llvm/llvm-project/pull/94576 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Clang][AMDGPU] Add builtins for instrinsic `llvm.amdgcn.raw.buffer.store` (PR #94576)
arsenm wrote: > `voffset` and `soffset` are "offset that goes in VGPRs" and "offset that goes > in SGPRs", with the latter having some different bounds-checking semantics on > ... at least some of the gfx9's, IIRC. > Right, that's the problem. We need to know the parameters of the SRD in order to make use of the scalar offset. Ideally we would have one pointer operand and be able to do addressing-mode matching into soffset/voffset/imm https://github.com/llvm/llvm-project/pull/94576 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Clang][AMDGPU] Add builtins for instrinsic `llvm.amdgcn.raw.buffer.store` (PR #94576)
arsenm wrote: > 2. What I mean is that "types that work" isn't the right framing: any type > can be legalized to one or more types that work. That is, down in the isel > legalizer, if I call for, for example >```llvm >%0 = call {i64, i64, i8} @llvm.amdgcn.raw.buffer.ptr.load(ptr addrspace(8) > %rsrc, i32 %off, ...) >``` Handling arbitrary aggregates here isn't really reasonable or necessary. We can restrict this to a reasonable set of legal-ish types https://github.com/llvm/llvm-project/pull/94576 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Clang][AMDGPU] Add builtins for instrinsic `llvm.amdgcn.raw.buffer.store` (PR #94576)
arsenm wrote: > 1. For the swizzled case, that's `struct.ptr.buffer.*`, and yeah, those will > always need builtins because LLVM can't deal in 2D addressing schemes But the raw buffer intrinsics have both the soffset and voffset parameters though? Not just the struct https://github.com/llvm/llvm-project/pull/94576 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Clang][AMDGPU] Add builtins for instrinsic `llvm.amdgcn.raw.buffer.store` (PR #94576)
arsenm wrote: > Actually, even ignoring address space 7, it feels like these builtins if you > could `raw.ptr.buffer.store` any type you liked, and then they could be > type-varying in Clang? We could either have a builtin for all the types that would work, or if we want to treat them more like a normal pointer, clang could verify you only use them with types that will work https://github.com/llvm/llvm-project/pull/94576 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [APFloat] Add APFloat support for FP6 data types (PR #94735)
@@ -68,6 +68,10 @@ enum class fltNonfiniteBehavior { // `fltNanEncoding` enum. We treat all NaNs as quiet, as the available // encodings do not distinguish between signalling and quiet NaN. NanOnly, + + // This behavior is present in Float6E3M2FN and Float6E2M3FN types. + // There is no representation for Inf or NaN. + NoNanInf, arsenm wrote: Invert and call SupportsNonFinite? https://github.com/llvm/llvm-project/pull/94735 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [APFloat] Add APFloat support for FP6 data types (PR #94735)
@@ -878,6 +896,10 @@ void IEEEFloat::copySignificand(const IEEEFloat &rhs) { for the significand. If double or longer, this is a signalling NaN, which may not be ideal. If float, this is QNaN(0). */ void IEEEFloat::makeNaN(bool SNaN, bool Negative, const APInt *fill) { + if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NoNanInf) { +assert(false && "This floating point format does not support NaN\n"); +return; arsenm wrote: Use llvm_unreachable; also no dead return https://github.com/llvm/llvm-project/pull/94735 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Clang] Add timeout for GPU detection utilities (PR #94751)
https://github.com/arsenm approved this pull request. https://github.com/llvm/llvm-project/pull/94751 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Clang] Add timeout for GPU detection utilities (PR #94751)
@@ -205,7 +205,7 @@ class ToolChain { /// Executes the given \p Executable and returns the stdout. llvm::Expected<std::unique_ptr<llvm::MemoryBuffer>> - executeToolChainProgram(StringRef Executable) const; + executeToolChainProgram(StringRef Executable, unsigned Timeout = 0) const; arsenm wrote: Name this SecondsToWait to match ExecuteAndWait? https://github.com/llvm/llvm-project/pull/94751 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [APFloat] Add APFloat support for FP6 data types (PR #94735)
@@ -1881,6 +1890,20 @@ TEST(APFloatTest, getSmallest) { EXPECT_TRUE(test.isFiniteNonZero()); EXPECT_TRUE(test.isDenormal()); EXPECT_TRUE(test.bitwiseIsEqual(expected)); + + test = APFloat::getSmallest(APFloat::Float6E3M2FN(), false); + expected = APFloat(APFloat::Float6E3M2FN(), "0x0.1p0"); + EXPECT_FALSE(test.isNegative()); + EXPECT_TRUE(test.isFiniteNonZero()); + EXPECT_TRUE(test.isDenormal()); + EXPECT_TRUE(test.bitwiseIsEqual(expected)); + + test = APFloat::getSmallest(APFloat::Float6E2M3FN(), false); + expected = APFloat(APFloat::Float6E2M3FN(), "0x0.2p0"); + EXPECT_FALSE(test.isNegative()); + EXPECT_TRUE(test.isFiniteNonZero()); + EXPECT_TRUE(test.isDenormal()); + EXPECT_TRUE(test.bitwiseIsEqual(expected)); } arsenm wrote: Should also test getZero and the other special case constructors https://github.com/llvm/llvm-project/pull/94735 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [APFloat] Add APFloat support for FP6 data types (PR #94735)
@@ -47,6 +47,10 @@ static std::string convertToString(double d, unsigned Prec, unsigned Pad, return std::string(Buffer.data(), Buffer.size()); } +static bool hasNanOrInf(APFloat::Semantics S) { + return (S != APFloat::S_Float6E3M2FN) && (S != APFloat::S_Float6E2M3FN); +} arsenm wrote: Probably would be useful as a helper somewhere in APFloat or Semantics https://github.com/llvm/llvm-project/pull/94735 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [clang][CodeGen] `used` globals are fake (PR #93601)
@@ -8642,8 +8642,11 @@ The '``llvm.used``' Global Variable The ``@llvm.used`` global is an array which has :ref:`appending linkage <linkage_appending>`. This array contains a list of pointers to named global variables, functions and aliases which may optionally -have a pointer cast formed of bitcast or getelementptr. For example, a legal -use of it is: +have a pointer cast formed of bitcast or getelementptr. The address space of the +pointers is always unspecified, rather than the globals address space, since arsenm wrote: Do not say unspecified. Say it is the default address space. 0 is not an unspecified address space https://github.com/llvm/llvm-project/pull/93601 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [clang][CodeGen] Global constructors/destructors are globals (PR #93914)
https://github.com/arsenm commented: Is this redundant with #93601? https://github.com/llvm/llvm-project/pull/93914 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [clang][CodeGen] `used` globals && the payloads for global ctors & dtors are globals (PR #93601)
arsenm wrote: Commit message also needs to be updated https://github.com/llvm/llvm-project/pull/93601 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [clang][CodeGen] `used` globals && the payloads for global ctors & dtors are globals (PR #93601)
@@ -2922,18 +2922,19 @@ static void emitUsed(CodeGenModule &CGM, StringRef Name, if (List.empty()) return; + llvm::Type *UsedPtrTy = llvm::PointerType::getUnqual(CGM.getLLVMContext()); arsenm wrote: Best to just use get(Ctx, 0) https://github.com/llvm/llvm-project/pull/93601 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [clang][CodeGen] `used` globals && the payloads for global ctors & dtors are globals (PR #93601)
@@ -2922,18 +2922,19 @@ static void emitUsed(CodeGenModule &CGM, StringRef Name, if (List.empty()) return; + llvm::Type *UsedPtrTy = llvm::PointerType::getUnqual(CGM.getLLVMContext()); + // Convert List to what ConstantArray needs. SmallVector<llvm::Constant*, 8> UsedArray; UsedArray.resize(List.size()); for (unsigned i = 0, e = List.size(); i != e; ++i) { -UsedArray[i] = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( -cast<llvm::Constant>(&*List[i]), CGM.GlobalsInt8PtrTy); +UsedArray[i] = llvm::ConstantExpr::getPointerCast( arsenm wrote: Although we should get rid of it, getPointerBitCastOrAddrSpaceCast is the correct helper for now. We should have a getAddrSpaceCast that doesn't throw a fit on same typed pointers but getPointerCast will try to insert ptrtoint etc. https://github.com/llvm/llvm-project/pull/93601 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [clang][CodeGen] `used` globals && the payloads for global ctors & dtors are globals (PR #93601)
https://github.com/arsenm approved this pull request. lgtm with nit https://github.com/llvm/llvm-project/pull/93601 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [clang][CodeGen] `used` globals && the payloads for global ctors & dtors are globals (PR #93601)
https://github.com/arsenm edited https://github.com/llvm/llvm-project/pull/93601 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Clang][AMDGPU] Add builtins for instrinsic `llvm.amdgcn.raw.buffer.store` (PR #94576)
arsenm wrote: > If we do want addrspace(7), we'll need to expose `make.buffer.rsrc` and give > it a `p7` variant probably. Yes. We probably should expose some kind of custom type instead of directly using a C address_space(7) attribute https://github.com/llvm/llvm-project/pull/94576 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)
@@ -0,0 +1,19 @@ +; RUN: not --crash llc -stop-after=amdgpu-isel -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s 2>&1 | FileCheck %s arsenm wrote: This should also be repeated for all 3 intrinsics https://github.com/llvm/llvm-project/pull/89217 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)
https://github.com/arsenm requested changes to this pull request. @jayfoad's testcase fails and the same test should be repeated for all 3 intrinsics https://github.com/llvm/llvm-project/pull/89217 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)
https://github.com/arsenm edited https://github.com/llvm/llvm-project/pull/89217 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)
@@ -0,0 +1,19 @@ +; RUN: not --crash llc -stop-after=amdgpu-isel -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s 2>&1 | FileCheck %s arsenm wrote: This is not an IR verifier test, it is a codegen test that fails the machine verifier. A machine verifier test would go in test/MachineVerifier, and preferably would be written in MIR and not rely on codegen. I thought the point of this test was to show that the selection did not produce a machine verifier error after selection, so this is broken https://github.com/llvm/llvm-project/pull/89217 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Clang][AMDGPU] Use `I` to decorate imm argument for `__builtin_amdgcn_global_load_lds` (PR #94376)
https://github.com/arsenm approved this pull request. https://github.com/llvm/llvm-project/pull/94376 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Clang][AMDGPU] Add builtins for instrinsic `llvm.amdgcn.raw.buffer.store` (PR #94576)
@@ -0,0 +1,264 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu verde -emit-llvm -o - %s | FileCheck %s --check-prefixes=VERDE +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu tonga -emit-llvm -o - %s | FileCheck %s --check-prefixes=GFX8 +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -emit-llvm -o - %s | FileCheck %s --check-prefixes=GFX11 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +typedef short v2i16 __attribute__((ext_vector_type(2))); +typedef int v2i32 __attribute__((ext_vector_type(2))); +typedef half v2f16 __attribute__((ext_vector_type(2))); +typedef float v2f32 __attribute__((ext_vector_type(2))); +typedef short v4i16 __attribute__((ext_vector_type(4))); +typedef int v4i32 __attribute__((ext_vector_type(4))); +typedef half v4f16 __attribute__((ext_vector_type(4))); +typedef float v4f32 __attribute__((ext_vector_type(4))); + +// VERDE-LABEL: @test_amdgcn_raw_buffer_store_i8( +// VERDE-NEXT: entry: +// VERDE-NEXT:tail call void @llvm.amdgcn.raw.buffer.store.i8(i8 [[VDATA:%.*]], <4 x i32> [[RSRC:%.*]], i32 0, i32 0, i32 0) +// VERDE-NEXT:ret void +// +// GFX8-LABEL: @test_amdgcn_raw_buffer_store_i8( +// GFX8-NEXT: entry: +// GFX8-NEXT:tail call void @llvm.amdgcn.raw.buffer.store.i8(i8 [[VDATA:%.*]], <4 x i32> [[RSRC:%.*]], i32 0, i32 0, i32 0) +// GFX8-NEXT:ret void +// +// GFX11-LABEL: @test_amdgcn_raw_buffer_store_i8( +// GFX11-NEXT: entry: +// GFX11-NEXT:tail call void @llvm.amdgcn.raw.buffer.store.i8(i8 [[VDATA:%.*]], <4 x i32> [[RSRC:%.*]], i32 0, i32 0, i32 0) +// GFX11-NEXT:ret void +// +void test_amdgcn_raw_buffer_store_i8(char vdata, v4i32 rsrc) { arsenm wrote: Look at the actual ImmArg> values. 
It's only the cachepolicy argument https://github.com/llvm/llvm-project/pull/94576 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [amdgpu] Pass variadic arguments without splitting (PR #94083)
@@ -0,0 +1,293 @@ +// REQUIRES: amdgpu-registered-target +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature +// RUN: %clang_cc1 -cc1 -std=c23 -triple amdgcn-amd-amdhsa -emit-llvm -O1 %s -o - | FileCheck %s + +void sink_0(...); +void sink_1(int, ...); +void sink_2(double, int, ...); + +// Simple scalar values + +// CHECK-LABEL: define {{[^@]+}}@zero_varargs +// CHECK-SAME: (i32 noundef [[F0:%.*]], double noundef [[F1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT:tail call void (...) @sink_0() #[[ATTR2:[0-9]+]] +// CHECK-NEXT:tail call void (i32, ...) @sink_1(i32 noundef [[F0]]) #[[ATTR2]] +// CHECK-NEXT:tail call void (double, i32, ...) @sink_2(double noundef [[F1]], i32 noundef [[F0]]) #[[ATTR2]] +// CHECK-NEXT:ret void +// +void zero_varargs(int f0, double f1) +{ + sink_0(); + sink_1(f0); + sink_2(f1, f0); +} + +// CHECK-LABEL: define {{[^@]+}}@one_i32 +// CHECK-SAME: (i32 noundef [[F0:%.*]], double noundef [[F1:%.*]], i32 noundef [[V0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT:tail call void (...) @sink_0(i32 noundef [[V0]]) #[[ATTR2]] +// CHECK-NEXT:tail call void (i32, ...) @sink_1(i32 noundef [[F0]], i32 noundef [[V0]]) #[[ATTR2]] +// CHECK-NEXT:tail call void (double, i32, ...) @sink_2(double noundef [[F1]], i32 noundef [[F0]], i32 noundef [[V0]]) #[[ATTR2]] +// CHECK-NEXT:ret void +// +void one_i32(int f0, double f1, int v0) +{ + sink_0(v0); + sink_1(f0, v0); + sink_2(f1, f0, v0); +} + +// CHECK-LABEL: define {{[^@]+}}@one_ptr +// CHECK-SAME: (i32 noundef [[F0:%.*]], double noundef [[F1:%.*]], ptr noundef [[V0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT:tail call void (...) @sink_0(ptr noundef [[V0]]) #[[ATTR2]] +// CHECK-NEXT:tail call void (i32, ...) @sink_1(i32 noundef [[F0]], ptr noundef [[V0]]) #[[ATTR2]] +// CHECK-NEXT:tail call void (double, i32, ...) 
@sink_2(double noundef [[F1]], i32 noundef [[F0]], ptr noundef [[V0]]) #[[ATTR2]] +// CHECK-NEXT:ret void +// +void one_ptr(int f0, double f1, void* v0) +{ + sink_0(v0); + sink_1(f0, v0); + sink_2(f1, f0, v0); +} + +// CHECK-LABEL: define {{[^@]+}}@one_f64 +// CHECK-SAME: (i32 noundef [[F0:%.*]], double noundef [[F1:%.*]], double noundef [[V0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT:tail call void (...) @sink_0(double noundef [[V0]]) #[[ATTR2]] +// CHECK-NEXT:tail call void (i32, ...) @sink_1(i32 noundef [[F0]], double noundef [[V0]]) #[[ATTR2]] +// CHECK-NEXT:tail call void (double, i32, ...) @sink_2(double noundef [[F1]], i32 noundef [[F0]], double noundef [[V0]]) #[[ATTR2]] +// CHECK-NEXT:ret void +// +void one_f64(int f0, double f1, double v0) +{ + sink_0(v0); + sink_1(f0, v0); + sink_2(f1, f0, v0); +} + + +// C has various type promotion rules for variadics + +// CHECK-LABEL: define {{[^@]+}}@one_i8 +// CHECK-SAME: (i32 noundef [[F0:%.*]], double noundef [[F1:%.*]], i8 noundef signext [[V0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT:[[CONV:%.*]] = sext i8 [[V0]] to i32 +// CHECK-NEXT:tail call void (...) @sink_0(i32 noundef [[CONV]]) #[[ATTR2]] +// CHECK-NEXT:tail call void (i32, ...) @sink_1(i32 noundef [[F0]], i32 noundef [[CONV]]) #[[ATTR2]] +// CHECK-NEXT:tail call void (double, i32, ...) @sink_2(double noundef [[F1]], i32 noundef [[F0]], i32 noundef [[CONV]]) #[[ATTR2]] +// CHECK-NEXT:ret void +// +void one_i8(int f0, double f1, char v0) +{ + sink_0(v0); + sink_1(f0, v0); + sink_2(f1, f0, v0); +} + +// CHECK-LABEL: define {{[^@]+}}@one_i16 +// CHECK-SAME: (i32 noundef [[F0:%.*]], double noundef [[F1:%.*]], i16 noundef signext [[V0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT:[[CONV:%.*]] = sext i16 [[V0]] to i32 +// CHECK-NEXT:tail call void (...) @sink_0(i32 noundef [[CONV]]) #[[ATTR2]] +// CHECK-NEXT:tail call void (i32, ...) 
@sink_1(i32 noundef [[F0]], i32 noundef [[CONV]]) #[[ATTR2]] +// CHECK-NEXT:tail call void (double, i32, ...) @sink_2(double noundef [[F1]], i32 noundef [[F0]], i32 noundef [[CONV]]) #[[ATTR2]] +// CHECK-NEXT:ret void +// +void one_i16(int f0, double f1, short v0) +{ + sink_0(v0); + sink_1(f0, v0); + sink_2(f1, f0, v0); +} + +// CHECK-LABEL: define {{[^@]+}}@one_f32 +// CHECK-SAME: (i32 noundef [[F0:%.*]], double noundef [[F1:%.*]], float noundef [[V0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT:[[CONV:%.*]] = fpext float [[V0]] to double +// CHECK-NEXT:tail call void (...) @sink_0(double noundef [[CONV]]) #[[ATTR2]] +// CHECK-NEXT:tail call void (i32, ...) @sink_1(i32 noundef [[F0]], double noundef [[CONV]]) #[[ATTR2]] +// CHECK-NEXT:tail call void (double, i32, ...) @sink_2(double noundef [[F1]], i32 noundef [[F0]], double noundef
[clang] [amdgpu] Pass variadic arguments without splitting (PR #94083)
arsenm wrote: > @arsenm You're right about passing larger things indirectly. I'm intending to > land this as-is, with the types inlined, as that unblocks #93362. I'm nervous > that the extra pointer indirection will hit the same memory error that > tweaking codegen in that patch hits (it's a similar sort of pattern to the > top level argument passing) and wish to postpone that until there is a > working baseline. I do think we need to revisit some threshold at some point. We should use byref for anything that's most likely to hit the stack anyway https://github.com/llvm/llvm-project/pull/94083 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [libc] [llvm] [AMDGPU] Implement variadic functions by IR lowering (PR #93362)
@@ -0,0 +1,1037 @@ +//===-- ExpandVariadicsPass.cpp *- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +// This is an optimization pass for variadic functions. If called from codegen, +// it can serve as the implementation of variadic functions for a given target. +// +// The strategy is to turn the ... part of a variadic function into a va_list +// and fix up the call sites. The majority of the pass is target independent. +// The exceptions are the va_list type itself and the rules for where to store +// variables in memory such that va_arg can iterate over them given a va_list. +// +// The majority of the plumbing is splitting the variadic function into a +// single basic block that packs the variadic arguments into a va_list and +// a second function that does the work of the original. That packing is +// exactly what is done by va_start. Further, the transform from ... to va_list +// replaced va_start with an operation to copy a va_list from the new argument, +// which is exactly a va_copy. This is useful for reducing target-dependence. +// +// A va_list instance is a forward iterator, where the primary operation va_arg +// is dereference-then-increment. This interface forces significant convergent +// evolution between target specific implementations. The variation in runtime +// data layout is limited to that representable by the iterator, parameterised +// by the type passed to the va_arg instruction. +// +// Therefore the majority of the target specific subtlety is packing arguments +// into a stack allocated buffer such that a va_list can be initialised with it +// and the va_arg expansion for the target will find the arguments at runtime. +// +// The aggregate effect is to unblock other transforms, most critically the +// general purpose inliner. 
Known calls to variadic functions become zero cost. +// +// Consistency with clang is primarily tested by emitting va_arg using clang +// then expanding the variadic functions using this pass, followed by trying +// to constant fold the functions to no-ops. +// +// Target specific behaviour is tested in IR - mainly checking that values are +// put into positions in call frames that make sense for that particular target. +// +// There is one "clever" invariant in use. va_start intrinsics that are not +// within a varidic functions are an error in the IR verifier. When this +// transform moves blocks from a variadic function into a fixed arity one, it +// moves va_start intrinsics along with everything else. That means that the +// va_start intrinsics that need to be rewritten to use the trailing argument +// are exactly those that are in non-variadic functions so no further state +// is needed to distinguish those that need to be rewritten. +// +//===--===// + +#include "llvm/Transforms/IPO/ExpandVariadics.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Passes/OptimizationLevel.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/TargetParser/Triple.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" + +#define DEBUG_TYPE "expand-variadics" + +using namespace llvm; + +namespace { + +cl::opt ExpandVariadicsModeOption( +DEBUG_TYPE "-override", cl::desc("Override the behaviour of " DEBUG_TYPE), +cl::init(ExpandVariadicsMode::Unspecified), +cl::values(clEnumValN(ExpandVariadicsMode::Unspecified, "unspecified", + "Use the implementation defaults"), + clEnumValN(ExpandVariadicsMode::Disable, "disable", + "Disable the pass entirely"), + clEnumValN(ExpandVariadicsMode::Optimize, "optimize", + "Optimise without 
changing ABI"), + clEnumValN(ExpandVariadicsMode::Lowering, "lowering", + "Change variadic calling convention"))); + +bool commandLineOverride() { + return ExpandVariadicsModeOption != ExpandVariadicsMode::Unspecified; +} + +// Instances of this class encapsulate the target-dependant behaviour as a +// function of triple. Implementing a new ABI is adding a case to the switch +// in create(llvm::Triple) at the end of this file. +class VariadicABIInfo { +protected: + VariadicABIInfo() {} + +public: + static std::unique_ptr<VariadicABIInfo> create(llvm::Triple const &T); + + // Allow overriding whether the pass runs on a per-target basis + virtual bool enableForTarget() = 0; + + //
[clang] [libc] [llvm] [AMDGPU] Implement variadic functions by IR lowering (PR #93362)
@@ -0,0 +1,1037 @@ +//===-- ExpandVariadicsPass.cpp *- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +// This is an optimization pass for variadic functions. If called from codegen, +// it can serve as the implementation of variadic functions for a given target. +// +// The strategy is to turn the ... part of a variadic function into a va_list +// and fix up the call sites. The majority of the pass is target independent. +// The exceptions are the va_list type itself and the rules for where to store +// variables in memory such that va_arg can iterate over them given a va_list. +// +// The majority of the plumbing is splitting the variadic function into a +// single basic block that packs the variadic arguments into a va_list and +// a second function that does the work of the original. That packing is +// exactly what is done by va_start. Further, the transform from ... to va_list +// replaced va_start with an operation to copy a va_list from the new argument, +// which is exactly a va_copy. This is useful for reducing target-dependence. +// +// A va_list instance is a forward iterator, where the primary operation va_arg +// is dereference-then-increment. This interface forces significant convergent +// evolution between target specific implementations. The variation in runtime +// data layout is limited to that representable by the iterator, parameterised +// by the type passed to the va_arg instruction. +// +// Therefore the majority of the target specific subtlety is packing arguments +// into a stack allocated buffer such that a va_list can be initialised with it +// and the va_arg expansion for the target will find the arguments at runtime. +// +// The aggregate effect is to unblock other transforms, most critically the +// general purpose inliner. 
Known calls to variadic functions become zero cost. +// +// Consistency with clang is primarily tested by emitting va_arg using clang +// then expanding the variadic functions using this pass, followed by trying +// to constant fold the functions to no-ops. +// +// Target specific behaviour is tested in IR - mainly checking that values are +// put into positions in call frames that make sense for that particular target. +// +// There is one "clever" invariant in use. va_start intrinsics that are not +// within a varidic functions are an error in the IR verifier. When this +// transform moves blocks from a variadic function into a fixed arity one, it +// moves va_start intrinsics along with everything else. That means that the +// va_start intrinsics that need to be rewritten to use the trailing argument +// are exactly those that are in non-variadic functions so no further state +// is needed to distinguish those that need to be rewritten. +// +//===--===// + +#include "llvm/Transforms/IPO/ExpandVariadics.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Passes/OptimizationLevel.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/TargetParser/Triple.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" + +#define DEBUG_TYPE "expand-variadics" + +using namespace llvm; + +namespace { + +cl::opt ExpandVariadicsModeOption( +DEBUG_TYPE "-override", cl::desc("Override the behaviour of " DEBUG_TYPE), +cl::init(ExpandVariadicsMode::Unspecified), +cl::values(clEnumValN(ExpandVariadicsMode::Unspecified, "unspecified", + "Use the implementation defaults"), + clEnumValN(ExpandVariadicsMode::Disable, "disable", + "Disable the pass entirely"), + clEnumValN(ExpandVariadicsMode::Optimize, "optimize", + "Optimise without 
changing ABI"), + clEnumValN(ExpandVariadicsMode::Lowering, "lowering", + "Change variadic calling convention"))); + +bool commandLineOverride() { + return ExpandVariadicsModeOption != ExpandVariadicsMode::Unspecified; +} + +// Instances of this class encapsulate the target-dependant behaviour as a +// function of triple. Implementing a new ABI is adding a case to the switch +// in create(llvm::Triple) at the end of this file. +class VariadicABIInfo { +protected: + VariadicABIInfo() {} + +public: + static std::unique_ptr<VariadicABIInfo> create(llvm::Triple const &T); arsenm wrote: const always to the left. East const is weird
[clang] [Clang][AMDGPU] Add builtins for instrinsic `llvm.amdgcn.raw.buffer.store` (PR #94576)
@@ -0,0 +1,264 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu verde -emit-llvm -o - %s | FileCheck %s --check-prefixes=VERDE +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu tonga -emit-llvm -o - %s | FileCheck %s --check-prefixes=GFX8 +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -emit-llvm -o - %s | FileCheck %s --check-prefixes=GFX11 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +typedef short v2i16 __attribute__((ext_vector_type(2))); +typedef int v2i32 __attribute__((ext_vector_type(2))); +typedef half v2f16 __attribute__((ext_vector_type(2))); +typedef float v2f32 __attribute__((ext_vector_type(2))); +typedef short v4i16 __attribute__((ext_vector_type(4))); +typedef int v4i32 __attribute__((ext_vector_type(4))); +typedef half v4f16 __attribute__((ext_vector_type(4))); +typedef float v4f32 __attribute__((ext_vector_type(4))); + +// VERDE-LABEL: @test_amdgcn_raw_buffer_store_i8( +// VERDE-NEXT: entry: +// VERDE-NEXT:tail call void @llvm.amdgcn.raw.buffer.store.i8(i8 [[VDATA:%.*]], <4 x i32> [[RSRC:%.*]], i32 0, i32 0, i32 0) +// VERDE-NEXT:ret void +// +// GFX8-LABEL: @test_amdgcn_raw_buffer_store_i8( +// GFX8-NEXT: entry: +// GFX8-NEXT:tail call void @llvm.amdgcn.raw.buffer.store.i8(i8 [[VDATA:%.*]], <4 x i32> [[RSRC:%.*]], i32 0, i32 0, i32 0) +// GFX8-NEXT:ret void +// +// GFX11-LABEL: @test_amdgcn_raw_buffer_store_i8( +// GFX11-NEXT: entry: +// GFX11-NEXT:tail call void @llvm.amdgcn.raw.buffer.store.i8(i8 [[VDATA:%.*]], <4 x i32> [[RSRC:%.*]], i32 0, i32 0, i32 0) +// GFX11-NEXT:ret void +// +void test_amdgcn_raw_buffer_store_i8(char vdata, v4i32 rsrc) { arsenm wrote: We are trying to move to using real pointers (i.e. 
use llvm.amdgcn.raw.ptr.buffer.store instead of llvm.amdgcn.raw.buffer.store) https://github.com/llvm/llvm-project/pull/94576 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Clang][AMDGPU] Add builtins for instrinsic `llvm.amdgcn.raw.buffer.store` (PR #94576)
arsenm wrote: > Is there really a good use case for this? Can you use regular stores to > addrspace(7) instead? @krzysz00 I see these regularly used via inline asm in various ML code. We need to expose these in some way to stop people from doing that. > > Also, do you really need a separate builtin for every legal type, or is there > some way they can be type-overloaded? Yes, I imagined we would handle images similarly to the elementwise intrinsics. However, I don't think that approach works for loads. If we have to have overloads for loads, we probably should mirror it for stores. I think it makes more sense to solve the issue for the load case before the stores. They're a bit more complicated because you have the sign- vs. zero-extended cases to consider, and the overload would be on the return type. https://github.com/llvm/llvm-project/pull/94576 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Clang][AMDGPU] Use `I` to decorate imm argument for `__builtin_amdgcn_global_load_lds` (PR #94376)
https://github.com/arsenm commented: Missing non-constant tests for each parameter? https://github.com/llvm/llvm-project/pull/94376 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [amdgpu] Pass variadic arguments without splitting (PR #94083)
@@ -197,12 +202,20 @@ ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const { return ABIArgInfo::getDirect(LTy, 0, nullptr, false); } -ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty, +ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty, bool Variadic, unsigned &NumRegsLeft) const { assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow"); Ty = useFirstFieldIfTransparentUnion(Ty); + if (Variadic) { arsenm wrote: The bigger concern is that using giant aggregate types is not great IR, and is not all that well supported. A 65k element array will crash in SelectionDAG, for example. https://github.com/llvm/llvm-project/pull/94083 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [amdgpu] Pass variadic arguments without splitting (PR #94083)
@@ -0,0 +1,293 @@ +// REQUIRES: amdgpu-registered-target +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature +// RUN: %clang_cc1 -cc1 -std=c23 -triple amdgcn-amd-amdhsa -emit-llvm -O1 %s -o - | FileCheck %s + +void sink_0(...); +void sink_1(int, ...); +void sink_2(double, int, ...); + +// Simple scalar values + +// CHECK-LABEL: define {{[^@]+}}@zero_varargs +// CHECK-SAME: (i32 noundef [[F0:%.*]], double noundef [[F1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT:tail call void (...) @sink_0() #[[ATTR2:[0-9]+]] +// CHECK-NEXT:tail call void (i32, ...) @sink_1(i32 noundef [[F0]]) #[[ATTR2]] +// CHECK-NEXT:tail call void (double, i32, ...) @sink_2(double noundef [[F1]], i32 noundef [[F0]]) #[[ATTR2]] +// CHECK-NEXT:ret void +// +void zero_varargs(int f0, double f1) +{ + sink_0(); + sink_1(f0); + sink_2(f1, f0); +} + +// CHECK-LABEL: define {{[^@]+}}@one_i32 +// CHECK-SAME: (i32 noundef [[F0:%.*]], double noundef [[F1:%.*]], i32 noundef [[V0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT:tail call void (...) @sink_0(i32 noundef [[V0]]) #[[ATTR2]] +// CHECK-NEXT:tail call void (i32, ...) @sink_1(i32 noundef [[F0]], i32 noundef [[V0]]) #[[ATTR2]] +// CHECK-NEXT:tail call void (double, i32, ...) @sink_2(double noundef [[F1]], i32 noundef [[F0]], i32 noundef [[V0]]) #[[ATTR2]] +// CHECK-NEXT:ret void +// +void one_i32(int f0, double f1, int v0) +{ + sink_0(v0); + sink_1(f0, v0); + sink_2(f1, f0, v0); +} + +// CHECK-LABEL: define {{[^@]+}}@one_ptr +// CHECK-SAME: (i32 noundef [[F0:%.*]], double noundef [[F1:%.*]], ptr noundef [[V0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT:tail call void (...) @sink_0(ptr noundef [[V0]]) #[[ATTR2]] +// CHECK-NEXT:tail call void (i32, ...) @sink_1(i32 noundef [[F0]], ptr noundef [[V0]]) #[[ATTR2]] +// CHECK-NEXT:tail call void (double, i32, ...) 
@sink_2(double noundef [[F1]], i32 noundef [[F0]], ptr noundef [[V0]]) #[[ATTR2]] +// CHECK-NEXT:ret void +// +void one_ptr(int f0, double f1, void* v0) +{ + sink_0(v0); + sink_1(f0, v0); + sink_2(f1, f0, v0); +} + +// CHECK-LABEL: define {{[^@]+}}@one_f64 +// CHECK-SAME: (i32 noundef [[F0:%.*]], double noundef [[F1:%.*]], double noundef [[V0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT:tail call void (...) @sink_0(double noundef [[V0]]) #[[ATTR2]] +// CHECK-NEXT:tail call void (i32, ...) @sink_1(i32 noundef [[F0]], double noundef [[V0]]) #[[ATTR2]] +// CHECK-NEXT:tail call void (double, i32, ...) @sink_2(double noundef [[F1]], i32 noundef [[F0]], double noundef [[V0]]) #[[ATTR2]] +// CHECK-NEXT:ret void +// +void one_f64(int f0, double f1, double v0) +{ + sink_0(v0); + sink_1(f0, v0); + sink_2(f1, f0, v0); +} + + +// C has various type promotion rules for variadics + +// CHECK-LABEL: define {{[^@]+}}@one_i8 +// CHECK-SAME: (i32 noundef [[F0:%.*]], double noundef [[F1:%.*]], i8 noundef signext [[V0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT:[[CONV:%.*]] = sext i8 [[V0]] to i32 +// CHECK-NEXT:tail call void (...) @sink_0(i32 noundef [[CONV]]) #[[ATTR2]] +// CHECK-NEXT:tail call void (i32, ...) @sink_1(i32 noundef [[F0]], i32 noundef [[CONV]]) #[[ATTR2]] +// CHECK-NEXT:tail call void (double, i32, ...) @sink_2(double noundef [[F1]], i32 noundef [[F0]], i32 noundef [[CONV]]) #[[ATTR2]] +// CHECK-NEXT:ret void +// +void one_i8(int f0, double f1, char v0) +{ + sink_0(v0); + sink_1(f0, v0); + sink_2(f1, f0, v0); +} + +// CHECK-LABEL: define {{[^@]+}}@one_i16 +// CHECK-SAME: (i32 noundef [[F0:%.*]], double noundef [[F1:%.*]], i16 noundef signext [[V0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT:[[CONV:%.*]] = sext i16 [[V0]] to i32 +// CHECK-NEXT:tail call void (...) @sink_0(i32 noundef [[CONV]]) #[[ATTR2]] +// CHECK-NEXT:tail call void (i32, ...) 
@sink_1(i32 noundef [[F0]], i32 noundef [[CONV]]) #[[ATTR2]] +// CHECK-NEXT:tail call void (double, i32, ...) @sink_2(double noundef [[F1]], i32 noundef [[F0]], i32 noundef [[CONV]]) #[[ATTR2]] +// CHECK-NEXT:ret void +// +void one_i16(int f0, double f1, short v0) +{ + sink_0(v0); + sink_1(f0, v0); + sink_2(f1, f0, v0); +} + +// CHECK-LABEL: define {{[^@]+}}@one_f32 +// CHECK-SAME: (i32 noundef [[F0:%.*]], double noundef [[F1:%.*]], float noundef [[V0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT:[[CONV:%.*]] = fpext float [[V0]] to double +// CHECK-NEXT:tail call void (...) @sink_0(double noundef [[CONV]]) #[[ATTR2]] +// CHECK-NEXT:tail call void (i32, ...) @sink_1(i32 noundef [[F0]], double noundef [[CONV]]) #[[ATTR2]] +// CHECK-NEXT:tail call void (double, i32, ...) @sink_2(double noundef [[F1]], i32 noundef [[F0]], double noundef
[clang] [amdgpu] Pass variadic arguments without splitting (PR #94083)
@@ -197,12 +202,20 @@ ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const { return ABIArgInfo::getDirect(LTy, 0, nullptr, false); } -ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty, +ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty, bool Variadic, unsigned &NumRegsLeft) const { assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow"); Ty = useFirstFieldIfTransparentUnion(Ty); + if (Variadic) { arsenm wrote: Wouldn't we still want to follow the isAggregateTypeForABI rules? Large structs should still go through byref? https://github.com/llvm/llvm-project/pull/94083 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] Intrinsic: introduce minimumnum and maximumnum (PR #93841)
@@ -32,27 +32,29 @@ class StoreInst; /// These are the kinds of recurrences that we support. enum class RecurKind { - None, ///< Not a recurrence. - Add, ///< Sum of integers. - Mul, ///< Product of integers. - Or, ///< Bitwise or logical OR of integers. - And, ///< Bitwise or logical AND of integers. - Xor, ///< Bitwise or logical XOR of integers. - SMin, ///< Signed integer min implemented in terms of select(cmp()). - SMax, ///< Signed integer max implemented in terms of select(cmp()). - UMin, ///< Unsigned integer min implemented in terms of select(cmp()). - UMax, ///< Unsigned integer max implemented in terms of select(cmp()). - FAdd, ///< Sum of floats. - FMul, ///< Product of floats. - FMin, ///< FP min implemented in terms of select(cmp()). - FMax, ///< FP max implemented in terms of select(cmp()). - FMinimum, ///< FP min with llvm.minimum semantics - FMaximum, ///< FP max with llvm.maximum semantics - FMulAdd, ///< Sum of float products with llvm.fmuladd(a * b + sum). - IAnyOf, ///< Any_of reduction with select(icmp(),x,y) where one of (x,y) is -///< loop invariant, and both x and y are integer type. - FAnyOf///< Any_of reduction with select(fcmp(),x,y) where one of (x,y) is -///< loop invariant, and both x and y are integer type. + None,///< Not a recurrence. + Add, ///< Sum of integers. + Mul, ///< Product of integers. + Or, ///< Bitwise or logical OR of integers. + And, ///< Bitwise or logical AND of integers. + Xor, ///< Bitwise or logical XOR of integers. + SMin,///< Signed integer min implemented in terms of select(cmp()). + SMax,///< Signed integer max implemented in terms of select(cmp()). + UMin,///< Unsigned integer min implemented in terms of select(cmp()). + UMax,///< Unsigned integer max implemented in terms of select(cmp()). + FAdd,///< Sum of floats. + FMul,///< Product of floats. + FMin,///< FP min implemented in terms of select(cmp()). + FMax,///< FP max implemented in terms of select(cmp()). 
+ FMinimum,///< FP min with llvm.minimum semantics + FMaximum,///< FP max with llvm.maximum semantics + FMinimumnum, ///< FP min with llvm.minimumnum semantics + FMaximumnum, ///< FP max with llvm.maximumnum semantics arsenm wrote: Not sure this is tested, but updating each optimization should be split into a separate change https://github.com/llvm/llvm-project/pull/93841 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)
arsenm wrote: You should add the mentioned convergence-tokens.ll test function https://github.com/llvm/llvm-project/pull/89217 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [clang][CodeGen] Global constructors/destructors are globals (PR #93914)
arsenm wrote: > Perhaps an alternative is to tweak LangRef wording to say that these are > always emitted as unqualified ptrs, and that their ephemeral nature implies > that their AS is meaningless? I think this is the correct way to handle it. Also we'll need a few stripPointerCasts added somewhere. https://github.com/llvm/llvm-project/pull/93914 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Implement variadic functions by IR lowering (PR #93362)
@@ -0,0 +1,1023 @@ +//===-- ExpandVariadicsPass.cpp *- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +// This is an optimization pass for variadic functions. If called from codegen, +// it can serve as the implementation of variadic functions for a given target. +// +// The strategy is to turn the ... part of a variadic function into a va_list +// and fix up the call sites. The majority of the pass is target independent. +// The exceptions are the va_list type itself and the rules for where to store +// variables in memory such that va_arg can iterate over them given a va_list. +// +// The majority of the plumbing is splitting the variadic function into a +// single basic block that packs the variadic arguments into a va_list and +// a second function that does the work of the original. That packing is +// exactly what is done by va_start. Further, the transform from ... to va_list +// replaced va_start with an operation to copy a va_list from the new argument, +// which is exactly a va_copy. This is useful for reducing target-dependence. +// +// A va_list instance is a forward iterator, where the primary operation va_arg +// is dereference-then-increment. This interface forces significant convergent +// evolution between target specific implementations. The variation in runtime +// data layout is limited to that representable by the iterator, parameterised +// by the type passed to the va_arg instruction. +// +// Therefore the majority of the target specific subtlety is packing arguments +// into a stack allocated buffer such that a va_list can be initialised with it +// and the va_arg expansion for the target will find the arguments at runtime. +// +// The aggregate effect is to unblock other transforms, most critically the +// general purpose inliner. 
Known calls to variadic functions become zero cost. +// +// Consistency with clang is primarily tested by emitting va_arg using clang +// then expanding the variadic functions using this pass, followed by trying +// to constant fold the functions to no-ops. +// +// Target specific behaviour is tested in IR - mainly checking that values are +// put into positions in call frames that make sense for that particular target. +// +// There is one "clever" invariant in use. va_start intrinsics that are not +// within a varidic functions are an error in the IR verifier. When this +// transform moves blocks from a variadic function into a fixed arity one, it +// moves va_start intrinsics along with everything else. That means that the +// va_start intrinsics that need to be rewritten to use the trailing argument +// are exactly those that are in non-variadic functions so no further state +// is needed to distinguish those that need to be rewritten. +// +//===--===// + +#include "llvm/Transforms/IPO/ExpandVariadics.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Passes/OptimizationLevel.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/TargetParser/Triple.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" + +#define DEBUG_TYPE "expand-variadics" + +using namespace llvm; + +namespace { + +cl::opt ExpandVariadicsModeOption( +DEBUG_TYPE "-override", cl::desc("Override the behaviour of " DEBUG_TYPE), +cl::init(ExpandVariadicsMode::Unspecified), +cl::values(clEnumValN(ExpandVariadicsMode::Unspecified, "unspecified", + "Use the implementation defaults"), + clEnumValN(ExpandVariadicsMode::Disable, "disable", + "Disable the pass entirely"), + clEnumValN(ExpandVariadicsMode::Optimize, "optimize", + "Optimise without 
changing ABI"), + clEnumValN(ExpandVariadicsMode::Lowering, "lowering", + "Change variadic calling convention"))); + +bool commandLineOverride() { + return ExpandVariadicsModeOption != ExpandVariadicsMode::Unspecified; +} + +// Instances of this class encapsulate the target-dependant behaviour as a +// function of triple. Implementing a new ABI is adding a case to the switch +// in create(llvm::Triple) at the end of this file. +class VariadicABIInfo { +protected: + VariadicABIInfo() {} + +public: + static std::unique_ptr<VariadicABIInfo> create(llvm::Triple const &T); + + // Allow overriding whether the pass runs on a per-target basis + virtual bool enableForTarget() = 0; + + //
[clang] [llvm] [AMDGPU] Implement variadic functions by IR lowering (PR #93362)
@@ -0,0 +1,1023 @@ +//===-- ExpandVariadicsPass.cpp *- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +// This is an optimization pass for variadic functions. If called from codegen, +// it can serve as the implementation of variadic functions for a given target. +// +// The strategy is to turn the ... part of a variadic function into a va_list +// and fix up the call sites. The majority of the pass is target independent. +// The exceptions are the va_list type itself and the rules for where to store +// variables in memory such that va_arg can iterate over them given a va_list. +// +// The majority of the plumbing is splitting the variadic function into a +// single basic block that packs the variadic arguments into a va_list and +// a second function that does the work of the original. That packing is +// exactly what is done by va_start. Further, the transform from ... to va_list +// replaced va_start with an operation to copy a va_list from the new argument, +// which is exactly a va_copy. This is useful for reducing target-dependence. +// +// A va_list instance is a forward iterator, where the primary operation va_arg +// is dereference-then-increment. This interface forces significant convergent +// evolution between target specific implementations. The variation in runtime +// data layout is limited to that representable by the iterator, parameterised +// by the type passed to the va_arg instruction. +// +// Therefore the majority of the target specific subtlety is packing arguments +// into a stack allocated buffer such that a va_list can be initialised with it +// and the va_arg expansion for the target will find the arguments at runtime. +// +// The aggregate effect is to unblock other transforms, most critically the +// general purpose inliner. 
Known calls to variadic functions become zero cost. +// +// Consistency with clang is primarily tested by emitting va_arg using clang +// then expanding the variadic functions using this pass, followed by trying +// to constant fold the functions to no-ops. +// +// Target specific behaviour is tested in IR - mainly checking that values are +// put into positions in call frames that make sense for that particular target. +// +// There is one "clever" invariant in use. va_start intrinsics that are not +// within a varidic functions are an error in the IR verifier. When this +// transform moves blocks from a variadic function into a fixed arity one, it +// moves va_start intrinsics along with everything else. That means that the +// va_start intrinsics that need to be rewritten to use the trailing argument +// are exactly those that are in non-variadic functions so no further state +// is needed to distinguish those that need to be rewritten. +// +//===--===// + +#include "llvm/Transforms/IPO/ExpandVariadics.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Passes/OptimizationLevel.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/TargetParser/Triple.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" + +#define DEBUG_TYPE "expand-variadics" + +using namespace llvm; + +namespace { + +cl::opt ExpandVariadicsModeOption( +DEBUG_TYPE "-override", cl::desc("Override the behaviour of " DEBUG_TYPE), +cl::init(ExpandVariadicsMode::Unspecified), +cl::values(clEnumValN(ExpandVariadicsMode::Unspecified, "unspecified", + "Use the implementation defaults"), + clEnumValN(ExpandVariadicsMode::Disable, "disable", + "Disable the pass entirely"), + clEnumValN(ExpandVariadicsMode::Optimize, "optimize", + "Optimise without 
changing ABI"), + clEnumValN(ExpandVariadicsMode::Lowering, "lowering", + "Change variadic calling convention"))); + +bool commandLineOverride() { + return ExpandVariadicsModeOption != ExpandVariadicsMode::Unspecified; +} + +// Instances of this class encapsulate the target-dependant behaviour as a +// function of triple. Implementing a new ABI is adding a case to the switch +// in create(llvm::Triple) at the end of this file. +class VariadicABIInfo { +protected: + VariadicABIInfo() {} + +public: + static std::unique_ptr create(llvm::Triple const ); + + // Allow overriding whether the pass runs on a per-target basis + virtual bool enableForTarget() = 0; + + //
[clang] [llvm] [clang][CodeGen] Global constructors/destructors are globals (PR #93914)
arsenm wrote: > The third argument here is like for llvm.used, it's a way to associate the > entry with a global or function. If the corresponding global or function is > omitted from the output then the entry will be removed. It isn't used for > anything at run time. So I think there should be a consistent story between > llvm.used and llvm.global_[cd]tors. I briefly skimmed the codegen and as far as I could tell an addrspacecast constant expression in global_ctors won't do the right thing, we probably should fix that too https://github.com/llvm/llvm-project/pull/93914 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang][CodeGen] `used` globals && the payloads for global ctors & dtors are globals (PR #93601)
@@ -2928,12 +2928,13 @@ static void emitUsed(CodeGenModule , StringRef Name, for (unsigned i = 0, e = List.size(); i != e; ++i) { UsedArray[i] = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( -cast(&*List[i]), CGM.Int8PtrTy); arsenm wrote: You aren't emitting a real global, it's this fake special case IR construct. It doesn't fit into the boxes provided by the target global/program/alloca address spaces https://github.com/llvm/llvm-project/pull/93601 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [WIP] Expand variadic functions in IR (PR #89007)
arsenm wrote: > I think the comments here are fed into #93362 successfully, will go through > the list again to check. So #93362 is the replacement, and not the sequential next piece? Can we close this one then? https://github.com/llvm/llvm-project/pull/89007 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] Intrinsic: introduce minimumnum and maximumnum (PR #93841)
@@ -5005,8 +5007,11 @@ void computeKnownFPClass(const Value *V, const APInt , // If either operand is not NaN, the result is not NaN. if (NeverNaN && (IID == Intrinsic::minnum || IID == Intrinsic::maxnum)) Known.knownNot(fcNan); + if (NeverNaN && + (IID == Intrinsic::minimumnum || IID == Intrinsic::maximumnum)) +Known.knownNot(fcNan); - if (IID == Intrinsic::maxnum) { + if (IID == Intrinsic::maxnum || IID == Intrinsic::maximumnum) { arsenm wrote: Best to keep the value tracking handling in a separate PR. This is missing test coverage https://github.com/llvm/llvm-project/pull/93841 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] Intrinsic: introduce minimumnum and maximumnum (PR #93841)
@@ -16049,6 +16094,84 @@ of the two arguments. -0.0 is considered to be less than +0.0 for this intrinsic. Note that these are the semantics specified in the draft of IEEE 754-2019. +.. _i_minimumnum: + +'``llvm.minimumnum.*``' Intrinsic +^ + +Syntax: +""" + +This is an overloaded intrinsic. You can use ``llvm.minimumnum`` on any +floating-point or vector of floating-point type. Not all targets support +all types however. + +:: + + declare float @llvm.minimumnum.f32(float %Val0, float %Val1) + declare double@llvm.minimumnum.f64(double %Val0, double %Val1) + declare x86_fp80 @llvm.minimumnum.f80(x86_fp80 %Val0, x86_fp80 %Val1) + declare fp128 @llvm.minimumnum.f128(fp128 %Val0, fp128 %Val1) + declare ppc_fp128 @llvm.minimumnum.ppcf128(ppc_fp128 %Val0, ppc_fp128 %Val1) + +Overview: +" + +The '``llvm.minimumnum.*``' intrinsics return the minimum of the two +arguments, not propagating NaNs and treating -0.0 as less than +0.0. + + +Arguments: +"" + +The arguments and return value are floating-point numbers of the same +type. + +Semantics: +"" +If both operands are NaNs, returns qNaN. Otherwise returns the lesser +of the two arguments. -0.0 is considered to be less than +0.0 for this +intrinsic. Note that these are the semantics specified in IEEE 754-2019. arsenm wrote: Needs to spell out the signaling nan behavior. If we're fixing minnum's snan behavior to match IEEE, this is identical except with the stronger guarantee for signed zero ordering. The documentation should also explicitly state this is the only difference, to help reduce confusion. Alternatively, we could add an immediate bool parameter to minnum/maxnum for whether the ordering of 0 is guaranteed I hate the naming mess we've ended up with here, but I guess C23 has damned us. 
If you're going to match the C23 names, this should be `llvm.minimum.num` with an extra _ https://github.com/llvm/llvm-project/pull/93841 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] Intrinsic: introduce minimumnum and maximumnum (PR #93841)
https://github.com/arsenm edited https://github.com/llvm/llvm-project/pull/93841 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] Intrinsic: introduce minimumnum and maximumnum (PR #93841)
@@ -3636,6 +3648,22 @@ def Fmin : FPMathTemplate, LibBuiltin<"math.h"> { let OnlyBuiltinPrefixedAliasIsConstexpr = 1; } +def FmaximumNum : FPMathTemplate, LibBuiltin<"math.h"> { arsenm wrote: I'd prefer to split the clang changes into a separate change https://github.com/llvm/llvm-project/pull/93841 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] Intrinsic: introduce minimumnum and maximumnum (PR #93841)
https://github.com/arsenm commented: > 3. PowerPC: has some interaction with the behavior of `minnum/maxnum`: need > define `fcanonicalize`. AMDGPU has the same handling. This is to break the signaling nan handling from IEEE to the broken old glibc libm behavior. If we fix the definition to match IEEE, this is no longer necessary and the operation is directly legal https://github.com/llvm/llvm-project/pull/93841 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)
arsenm wrote: > Does this need IR autoupgrade? This type of auto upgrade is free, it just happens https://github.com/llvm/llvm-project/pull/89217 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)
https://github.com/arsenm approved this pull request. https://github.com/llvm/llvm-project/pull/89217 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)
@@ -5461,8 +5461,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper , SmallVector PartialRes; unsigned NumParts = Size / 32; - MachineInstrBuilder Src0Parts, Src2Parts; - Src0Parts = B.buildUnmerge(PartialResTy, Src0); + MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0), Src2Parts; arsenm wrote: Don't also declare Src2Parts on the same line https://github.com/llvm/llvm-project/pull/89217 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)
@@ -1208,7 +1225,7 @@ The AMDGPU backend implements the following LLVM IR intrinsics. the output. llvm.amdgcn.sdot2Provides direct access to v_dot2_i32_i16 across targets which - support such instructions. This performs signed dot product + upport such instructions. This performs signed dot product arsenm wrote: Stray typo introduced https://github.com/llvm/llvm-project/pull/89217 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)
@@ -5387,6 +5387,98 @@ bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper , return true; } +// TODO: Fix pointer type handling +bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper , + MachineInstr , + Intrinsic::ID IID) const { + + MachineIRBuilder = Helper.MIRBuilder; + MachineRegisterInfo = *B.getMRI(); + + auto createLaneOp = [, ](Register Src0, Register Src1, Register Src2, + LLT VT) -> Register { +auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0); +switch (IID) { +case Intrinsic::amdgcn_readfirstlane: + return LaneOp.getReg(0); +case Intrinsic::amdgcn_readlane: + return LaneOp.addUse(Src1).getReg(0); +case Intrinsic::amdgcn_writelane: + return LaneOp.addUse(Src1).addUse(Src2).getReg(0); +default: + llvm_unreachable("unhandled lane op"); +} + }; + + Register DstReg = MI.getOperand(0).getReg(); + Register Src0 = MI.getOperand(2).getReg(); + Register Src1, Src2; + if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) { +Src1 = MI.getOperand(3).getReg(); +if (IID == Intrinsic::amdgcn_writelane) { + Src2 = MI.getOperand(4).getReg(); +} + } + + LLT Ty = MRI.getType(DstReg); + unsigned Size = Ty.getSizeInBits(); + + if (Size == 32) { +// Already legal +return true; + } + + if (Size < 32) { +Src0 = B.buildAnyExt(S32, Src0).getReg(0); +if (Src2.isValid()) + Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0); + +Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32); +B.buildTrunc(DstReg, LaneOpDst); + +MI.eraseFromParent(); +return true; + } + + if (Size % 32 != 0) +return false; + + LLT PartialResTy = S32; + if (Ty.isVector()) { +LLT EltTy = Ty.getElementType(); +switch (EltTy.getSizeInBits()) { +case 16: + PartialResTy = Ty.changeElementCount(ElementCount::getFixed(2)); + break; +case 32: + PartialResTy = EltTy; + break; +default: + // Handle all other cases via S32 pieces; + break; +} + } + + SmallVector PartialRes; + unsigned NumParts = Size / 32; + MachineInstrBuilder Src0Parts, Src2Parts; + Src0Parts = 
B.buildUnmerge(PartialResTy, Src0); arsenm wrote: Fold declare + initialize https://github.com/llvm/llvm-project/pull/89217 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)
@@ -1170,6 +1170,23 @@ The AMDGPU backend implements the following LLVM IR intrinsics. :ref:`llvm.set.fpenv` Sets the floating point environment to the specifies state. + llvm.amdgcn.readfirstlaneProvides direct access to v_readfirstlane_b32. Returns the value in + the lowest active lane of the input operand. Currently implemented + for i16, i32, float, half, bf16, <2 x i16>, <2 x half>, <2 x bfloat>, arsenm wrote: bf16->bfloat https://github.com/llvm/llvm-project/pull/89217 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)
https://github.com/arsenm edited https://github.com/llvm/llvm-project/pull/89217 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)
https://github.com/arsenm commented: lgtm with a few nits https://github.com/llvm/llvm-project/pull/89217 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang][CodeGen] `used` globals && the payloads for global ctors & dtors are globals (PR #93601)
@@ -2047,9 +2047,9 @@ void CodeGenModule::EmitCtorList(CtorList , const char *GlobalName) { llvm::Type *CtorPFTy = llvm::PointerType::get(CtorFTy, TheModule.getDataLayout().getProgramAddressSpace()); - // Get the type of a ctor entry, { i32, void ()*, i8* }. arsenm wrote: This is still one PR that touches both? https://github.com/llvm/llvm-project/pull/93601 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang][CodeGen] `used` globals && the payloads for global ctors & dtors are globals (PR #93601)
@@ -1,6 +1,8 @@ // RUN: %clang_cc1 %s -triple x86_64-apple-darwin -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 %s -triple amdgcn-amd-amdhsa -emit-llvm -o - | FileCheck %s --check-prefix=GLOBALAS // CHECK: @llvm.used = appending global [2 x ptr] [ptr @foo, ptr @X], section "llvm.metadata" +// GLOBALAS: @llvm.compiler.used = appending addrspace(1) global [2 x ptr addrspace(1)] [ptr addrspace(1) addrspacecast (ptr @foo to ptr addrspace(1)), ptr addrspace(1) @X], section "llvm.metadata" arsenm wrote: Used really should use flat. It happens to be better for us that we can legally cast more things to addrspace(0) than to 1, but given how the IR currently works we should just assume used is AS 0 https://github.com/llvm/llvm-project/pull/93601 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang][CodeGen] `used` globals && the payloads for global ctors & dtors are globals (PR #93601)
https://github.com/arsenm edited https://github.com/llvm/llvm-project/pull/93601 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang][CodeGen] `used` globals && the payloads for global ctors & dtors are globals (PR #93601)
@@ -2928,12 +2928,13 @@ static void emitUsed(CodeGenModule , StringRef Name, for (unsigned i = 0, e = List.size(); i != e; ++i) { UsedArray[i] = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( -cast(&*List[i]), CGM.Int8PtrTy); arsenm wrote: > Ok, I'm still struggling to see why it is best create (possibly) broken IR It's not broken. It just needs to provide a use, and whatever casts are there do not matter. We can write whatever rules we want and have the verifier enforce it. Currently we don't have any generic concept of "illegal addrspacecasts" (relatedly we should probably stop throwing codegen errors on the cases we don't handle, and just lower them to poison) > Why not make it a special global that uses AS 42 Behavior of address spaces is target defined. We don't want to just grab random numbers for generic purposes. We've gradually been migrating away from 0 being special in more contexts, but nothing has been done for used (and I don't see a particularly compelling reason to do so; used just needs to be a box that is a use) > IR-level compat between LLVM versions (yes it frequently works but it's not > guaranteed to). We do guarantee forward compatible bitcode https://github.com/llvm/llvm-project/pull/93601 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [Clang][Sanitizers] Add numerical sanitizer (PR #93783)
@@ -285,6 +285,9 @@ def SanitizeHWAddress : EnumAttr<"sanitize_hwaddress", [FnAttr]>; /// MemTagSanitizer is on. def SanitizeMemTag : EnumAttr<"sanitize_memtag", [FnAttr]>; +/// NumericalStabilitySanitizer is on. +def SanitizeNumericalStability : EnumAttr<"sanitize_numericalstability", [FnAttr]>; arsenm wrote: This should be in the other patch https://github.com/llvm/llvm-project/pull/93783 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU][WIP] Extend permlane16, permlanex16 and permlane64 intrinsic lowering for generic types (PR #92725)
@@ -18479,6 +18479,28 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, CGM.getIntrinsic(Intrinsic::amdgcn_update_dpp, Args[0]->getType()); return Builder.CreateCall(F, Args); } + case AMDGPU::BI__builtin_amdgcn_permlane16: + case AMDGPU::BI__builtin_amdgcn_permlanex16: { +llvm::Value *Src0 = EmitScalarExpr(E->getArg(0)); arsenm wrote: If there's really not a helper to just EmitScalarExpr for N arguments, there should be one used here https://github.com/llvm/llvm-project/pull/92725 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)
@@ -6086,6 +6086,63 @@ static SDValue lowerBALLOTIntrinsic(const SITargetLowering , SDNode *N, DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE)); } +static SDValue lowerLaneOp(const SITargetLowering , SDNode *N, + SelectionDAG ) { + EVT VT = N->getValueType(0); + unsigned ValSize = VT.getSizeInBits(); + unsigned IntrinsicID = N->getConstantOperandVal(0); + SDValue Src0 = N->getOperand(1); + SDLoc SL(N); + MVT IntVT = MVT::getIntegerVT(ValSize); + + auto createLaneOp = [, ](SDValue Src0, SDValue Src1, SDValue Src2, + MVT VT) -> SDValue { +return (Src2 ? DAG.getNode(AMDGPUISD::WRITELANE, SL, VT, {Src0, Src1, Src2}) +: Src1 ? DAG.getNode(AMDGPUISD::READLANE, SL, VT, {Src0, Src1}) + : DAG.getNode(AMDGPUISD::READFIRSTLANE, SL, VT, {Src0})); + }; + + SDValue Src1, Src2; + if (IntrinsicID == Intrinsic::amdgcn_readlane || + IntrinsicID == Intrinsic::amdgcn_writelane) { +Src1 = N->getOperand(2); +if (IntrinsicID == Intrinsic::amdgcn_writelane) + Src2 = N->getOperand(3); + } + + if (ValSize == 32) { +// Already legal +return SDValue(); + } + + if (ValSize < 32) { +bool IsFloat = VT.isFloatingPoint(); +Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0, +SL, MVT::i32); +if (Src2.getNode()) { + Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2, + SL, MVT::i32); +} +SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32); +SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT); +return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc; + } + + if ((ValSize % 32) == 0) { +MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32); arsenm wrote: Yes, you need bitcast to get from FP to the 32-bit scalars. you are only trying to preserve the element types of the 32-bit legal pieces. i.e. v4f16 -> v2f16, v2f16. v4f32 -> f32, f32, f32, f32. v2f64 -> bitcast v4i32 https://github.com/llvm/llvm-project/pull/89217 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang][CodeGen] `used` globals && the payloads for global ctors & dtors are globals (PR #93601)
@@ -2928,12 +2928,13 @@ static void emitUsed(CodeGenModule , StringRef Name, for (unsigned i = 0, e = List.size(); i != e; ++i) { UsedArray[i] = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( -cast(&*List[i]), CGM.Int8PtrTy); arsenm wrote: For the purpose of used/compiler_used, I think it's best to ignore the language address space map (reiterating that the IR address space has little to do with the language). Just treat it as a special global that always uses addrspace(0). The required addrspacecasts will be inserted to 0, so it just leaves this as the status quo. It doesn't really matter if the target considers the casts valid or not, the used variables are never codegened. If we really had to go out of our way to avoid illegal constant expression casts, we'd probably have to have multiple used intrinsic variables per address space. In a better future we wouldn't allow constantexpr addrspacecasts, and then we'd also need to come up with another solution https://github.com/llvm/llvm-project/pull/93601 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)
@@ -6086,6 +6086,63 @@ static SDValue lowerBALLOTIntrinsic(const SITargetLowering , SDNode *N, DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE)); } +static SDValue lowerLaneOp(const SITargetLowering , SDNode *N, + SelectionDAG ) { + EVT VT = N->getValueType(0); + unsigned ValSize = VT.getSizeInBits(); + unsigned IntrinsicID = N->getConstantOperandVal(0); + SDValue Src0 = N->getOperand(1); + SDLoc SL(N); + MVT IntVT = MVT::getIntegerVT(ValSize); + + auto createLaneOp = [, ](SDValue Src0, SDValue Src1, SDValue Src2, + MVT VT) -> SDValue { +return (Src2 ? DAG.getNode(AMDGPUISD::WRITELANE, SL, VT, {Src0, Src1, Src2}) +: Src1 ? DAG.getNode(AMDGPUISD::READLANE, SL, VT, {Src0, Src1}) + : DAG.getNode(AMDGPUISD::READFIRSTLANE, SL, VT, {Src0})); + }; + + SDValue Src1, Src2; + if (IntrinsicID == Intrinsic::amdgcn_readlane || + IntrinsicID == Intrinsic::amdgcn_writelane) { +Src1 = N->getOperand(2); +if (IntrinsicID == Intrinsic::amdgcn_writelane) + Src2 = N->getOperand(3); + } + + if (ValSize == 32) { +// Already legal +return SDValue(); + } + + if (ValSize < 32) { +bool IsFloat = VT.isFloatingPoint(); +Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0, +SL, MVT::i32); +if (Src2.getNode()) { + Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2, + SL, MVT::i32); +} +SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32); +SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT); +return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc; + } + + if ((ValSize % 32) == 0) { +MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32); +Src0 = DAG.getBitcast(VecVT, Src0); + +if (Src2.getNode()) + Src2 = DAG.getBitcast(VecVT, Src2); + +SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT); +SDValue UnrolledLaneOp = DAG.UnrollVectorOp(LaneOp.getNode()); +return DAG.getBitcast(VT, UnrolledLaneOp); arsenm wrote: You should not be using TLI.getRegClassFor or getTargetExtractSubreg. The GlobalISel equivalent does not use these. 
Stick to the higher level operations unless necessary, which it isn't here https://github.com/llvm/llvm-project/pull/89217 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)
@@ -5387,6 +5387,124 @@ bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper , return true; } +// TODO: Fix pointer type handling +bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper , + MachineInstr , + Intrinsic::ID IID) const { + + MachineIRBuilder = Helper.MIRBuilder; + MachineRegisterInfo = *B.getMRI(); + + Register DstReg = MI.getOperand(0).getReg(); + Register Src0 = MI.getOperand(2).getReg(); + + auto createLaneOp = [&](Register Src0, Register Src1, + Register Src2) -> Register { +auto LaneOp = B.buildIntrinsic(IID, {S32}).addUse(Src0); +switch (IID) { +case Intrinsic::amdgcn_readfirstlane: + return LaneOp.getReg(0); +case Intrinsic::amdgcn_readlane: + return LaneOp.addUse(Src1).getReg(0); +case Intrinsic::amdgcn_writelane: + return LaneOp.addUse(Src1).addUse(Src2).getReg(0); +default: + llvm_unreachable("unhandled lane op"); +} + }; + + Register Src1, Src2; + if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) { +Src1 = MI.getOperand(3).getReg(); +if (IID == Intrinsic::amdgcn_writelane) { + Src2 = MI.getOperand(4).getReg(); +} + } + + LLT Ty = MRI.getType(DstReg); + unsigned Size = Ty.getSizeInBits(); + + if (Size == 32) { +// Already legal +return true; + } + + if (Size < 32) { +Src0 = B.buildAnyExt(S32, Src0).getReg(0); +if (Src2.isValid()) + Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0); + +Register LaneOpDst = createLaneOp(Src0, Src1, Src2); +B.buildTrunc(DstReg, LaneOpDst); + +MI.eraseFromParent(); +return true; + } + + if ((Size % 32) == 0) { +SmallVector PartialRes; +unsigned NumParts = Size / 32; +LLT PartialResTy = +Ty.isVector() && Ty.getElementType() == S16 ? V2S16 : S32; arsenm wrote: This isn't trying to preserve pointer element types, and will also lose FP information in the future. Can you use changeElementCount in the size == 16 case, and try to maintain the element type for the 32-bit case? 
https://github.com/llvm/llvm-project/pull/89217 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)
@@ -1170,6 +1170,23 @@ The AMDGPU backend implements the following LLVM IR intrinsics. :ref:`llvm.set.fpenv` Sets the floating point environment to the specifies state. + llvm.amdgcn.readfirstlaneProvides direct access to v_readfirstlane_b32. Returns the value in + the lowest active lane of the input operand. Currently + implemented for i16, i32, float, half, bf16, v2i16, v2f16 and types + whose sizes are multiples of 32-bit. + + llvm.amdgcn.readlane Provides direct access to v_readlane_b32. Returns the value in the + specified lane of the first input operand. The second operand + specifies the lane to read from. Currently implemented + for i16, i32, float, half, bf16, v2i16, v2f16 and types whose sizes arsenm wrote: Should probably try to avoid repeating all the same information on all 3 intrinsics https://github.com/llvm/llvm-project/pull/89217 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)
@@ -6086,6 +6086,63 @@ static SDValue lowerBALLOTIntrinsic(const SITargetLowering , SDNode *N, DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE)); } +static SDValue lowerLaneOp(const SITargetLowering , SDNode *N, + SelectionDAG ) { + EVT VT = N->getValueType(0); + unsigned ValSize = VT.getSizeInBits(); + unsigned IntrinsicID = N->getConstantOperandVal(0); + SDValue Src0 = N->getOperand(1); + SDLoc SL(N); + MVT IntVT = MVT::getIntegerVT(ValSize); + + auto createLaneOp = [, ](SDValue Src0, SDValue Src1, SDValue Src2, + MVT VT) -> SDValue { +return (Src2 ? DAG.getNode(AMDGPUISD::WRITELANE, SL, VT, {Src0, Src1, Src2}) +: Src1 ? DAG.getNode(AMDGPUISD::READLANE, SL, VT, {Src0, Src1}) + : DAG.getNode(AMDGPUISD::READFIRSTLANE, SL, VT, {Src0})); + }; + + SDValue Src1, Src2; + if (IntrinsicID == Intrinsic::amdgcn_readlane || + IntrinsicID == Intrinsic::amdgcn_writelane) { +Src1 = N->getOperand(2); +if (IntrinsicID == Intrinsic::amdgcn_writelane) + Src2 = N->getOperand(3); + } + + if (ValSize == 32) { +// Already legal +return SDValue(); + } + + if (ValSize < 32) { +bool IsFloat = VT.isFloatingPoint(); +Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0, +SL, MVT::i32); +if (Src2.getNode()) { + Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2, + SL, MVT::i32); +} +SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32); +SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT); +return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc; + } + + if ((ValSize % 32) == 0) { +MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32); arsenm wrote: Same comment, but more important than the globalisel case. Try to maintain the element type (e.g. v2f32->f32) https://github.com/llvm/llvm-project/pull/89217 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)
@@ -5387,6 +5387,124 @@ bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper , return true; } +// TODO: Fix pointer type handling +bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper , + MachineInstr , + Intrinsic::ID IID) const { + + MachineIRBuilder = Helper.MIRBuilder; + MachineRegisterInfo = *B.getMRI(); + + Register DstReg = MI.getOperand(0).getReg(); + Register Src0 = MI.getOperand(2).getReg(); + + auto createLaneOp = [&](Register Src0, Register Src1, + Register Src2) -> Register { +auto LaneOp = B.buildIntrinsic(IID, {S32}).addUse(Src0); +switch (IID) { +case Intrinsic::amdgcn_readfirstlane: + return LaneOp.getReg(0); +case Intrinsic::amdgcn_readlane: + return LaneOp.addUse(Src1).getReg(0); +case Intrinsic::amdgcn_writelane: + return LaneOp.addUse(Src1).addUse(Src2).getReg(0); +default: + llvm_unreachable("unhandled lane op"); +} + }; + + Register Src1, Src2; + if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) { +Src1 = MI.getOperand(3).getReg(); +if (IID == Intrinsic::amdgcn_writelane) { + Src2 = MI.getOperand(4).getReg(); +} + } + + LLT Ty = MRI.getType(DstReg); + unsigned Size = Ty.getSizeInBits(); + + if (Size == 32) { +// Already legal +return true; + } + + if (Size < 32) { +Src0 = B.buildAnyExt(S32, Src0).getReg(0); +if (Src2.isValid()) + Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0); + +Register LaneOpDst = createLaneOp(Src0, Src1, Src2); +B.buildTrunc(DstReg, LaneOpDst); + +MI.eraseFromParent(); +return true; + } + + if ((Size % 32) == 0) { arsenm wrote: Early return instead of indenting everything on this. https://github.com/llvm/llvm-project/pull/89217 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)
@@ -1170,6 +1170,23 @@ The AMDGPU backend implements the following LLVM IR intrinsics. :ref:`llvm.set.fpenv` Sets the floating point environment to the specifies state. + llvm.amdgcn.readfirstlaneProvides direct access to v_readfirstlane_b32. Returns the value in + the lowest active lane of the input operand. Currently + implemented for i16, i32, float, half, bf16, v2i16, v2f16 and types arsenm wrote: This mixes type naming conventions. Probably should stick to the IR convention (and say `bfloat`, `<2 x i16>`, `<2 x half>`, `<2 x bfloat>`). Also, pointers, double, and multiples of the 32-bit vectors should work. https://github.com/llvm/llvm-project/pull/89217 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Implement variadic functions by IR lowering (PR #93362)
@@ -103,19 +104,27 @@ void AMDGPUABIInfo::computeInfo(CGFunctionInfo ) const { if (!getCXXABI().classifyReturnType(FI)) FI.getReturnInfo() = classifyReturnType(FI.getReturnType()); + unsigned ArgumentIndex = 0; + const unsigned numFixedArguments = FI.getNumRequiredArgs(); arsenm wrote: Can you split the clang AMDGPU ABI changes into a separate PR? The tests for this are also missing https://github.com/llvm/llvm-project/pull/93362 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Implement variadic functions by IR lowering (PR #93362)
@@ -0,0 +1,180 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature + +// Simple calls to known variadic functions that are completely elided when +// optimisations are on This is a functional check that the expand-variadic pass +// is consistent with clang's va_arg handling + +// When expand-variadics is added to the default pipeline, clang -O1 will +// suffice here -Wno-varargs avoids warning second argument to 'va_start' is not +// the last named parameter + +// RUN: %clang_cc1 %s -triple wasm32-unknown-unknown -Wno-varargs -O1 -emit-llvm -o - | opt - -S --passes='module(expand-variadics,default)' --expand-variadics-override=optimize -o - | FileCheck %s arsenm wrote: Does this need `REQUIRES: wasm-registered-target`? https://github.com/llvm/llvm-project/pull/93362 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang][CodeGen] `used` globals && the payloads for global ctors & dtors are globals (PR #93601)
@@ -2047,9 +2047,9 @@ void CodeGenModule::EmitCtorList(CtorList , const char *GlobalName) { llvm::Type *CtorPFTy = llvm::PointerType::get(CtorFTy, TheModule.getDataLayout().getProgramAddressSpace()); - // Get the type of a ctor entry, { i32, void ()*, i8* }. + // Get the type of a ctor entry, { i32, program void ()*, global i8* }. llvm::StructType *CtorStructTy = llvm::StructType::get( - Int32Ty, CtorPFTy, VoidPtrTy); + Int32Ty, CtorPFTy, GlobalsInt8PtrTy); arsenm wrote: What compatibility is broken here? Just bitcode loading? Do we just need to implement an autoupgrade for this? https://github.com/llvm/llvm-project/pull/93601 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang][CodeGen] `used` globals && the payloads for global ctors & dtors are globals (PR #93601)
@@ -2928,12 +2928,13 @@ static void emitUsed(CodeGenModule , StringRef Name, for (unsigned i = 0, e = List.size(); i != e; ++i) { UsedArray[i] = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( -cast(&*List[i]), CGM.Int8PtrTy); arsenm wrote: I got confused on the other thread. Is Int8PtrTy not always AS 0 in this case? I think always emitting used/compiler.used as a hardcoded addrspace 0 (IR 0, having nothing to do with the language or LangAS mapping) would be fine https://github.com/llvm/llvm-project/pull/93601 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang][CodeGen] `used` globals && the payloads for global ctors & dtors are globals (PR #93601)
@@ -2047,9 +2047,9 @@ void CodeGenModule::EmitCtorList(CtorList , const char *GlobalName) { llvm::Type *CtorPFTy = llvm::PointerType::get(CtorFTy, TheModule.getDataLayout().getProgramAddressSpace()); - // Get the type of a ctor entry, { i32, void ()*, i8* }. arsenm wrote: Probably should handle the ctor / dtor case separately from the used one, since that may be more meaningful https://github.com/llvm/llvm-project/pull/93601 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [mlir] [Clang][CodeGen] Start migrating away from assuming the Default AS is 0 (PR #88182)
@@ -368,7 +368,8 @@ CodeGenModule::CodeGenModule(ASTContext , IntTy = llvm::IntegerType::get(LLVMContext, C.getTargetInfo().getIntWidth()); IntPtrTy = llvm::IntegerType::get(LLVMContext, C.getTargetInfo().getMaxPointerWidth()); - Int8PtrTy = llvm::PointerType::get(LLVMContext, 0); + Int8PtrTy = llvm::PointerType::get(LLVMContext, arsenm wrote: > I don't think mixing languages with different LangAS maps is sound It is entirely sound, and required for this entire system to work (e.g. we implement the libraries in OpenCL used by all the languages). The point of the LangAS is to map to the target address space, which does not care what the language is. Different languages should be trying to be ABI compatible with each other, which includes emitting the correct IR address space https://github.com/llvm/llvm-project/pull/88182 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [mlir] [Clang][CodeGen] Start migrating away from assuming the Default AS is 0 (PR #88182)
https://github.com/arsenm edited https://github.com/llvm/llvm-project/pull/88182 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [mlir] [Clang][CodeGen] Start migrating away from assuming the Default AS is 0 (PR #88182)
https://github.com/arsenm edited https://github.com/llvm/llvm-project/pull/88182 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [mlir] [Clang][CodeGen] Start migrating away from assuming the Default AS is 0 (PR #88182)
@@ -368,7 +368,8 @@ CodeGenModule::CodeGenModule(ASTContext , IntTy = llvm::IntegerType::get(LLVMContext, C.getTargetInfo().getIntWidth()); IntPtrTy = llvm::IntegerType::get(LLVMContext, C.getTargetInfo().getMaxPointerWidth()); - Int8PtrTy = llvm::PointerType::get(LLVMContext, 0); + Int8PtrTy = llvm::PointerType::get(LLVMContext, arsenm wrote: llvm.used should be using the default globals addrspace. I believe there is special case linker code to handle mismatched address spaces for the intrinsic globals, so I don't think fixing this would break anything https://github.com/llvm/llvm-project/pull/88182 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [mlir] [Clang][CodeGen] Start migrating away from assuming the Default AS is 0 (PR #88182)
@@ -368,7 +368,8 @@ CodeGenModule::CodeGenModule(ASTContext , IntTy = llvm::IntegerType::get(LLVMContext, C.getTargetInfo().getIntWidth()); IntPtrTy = llvm::IntegerType::get(LLVMContext, C.getTargetInfo().getMaxPointerWidth()); - Int8PtrTy = llvm::PointerType::get(LLVMContext, 0); + Int8PtrTy = llvm::PointerType::get(LLVMContext, arsenm wrote: The IR address space is a pure target concept. Any address space error would be on the frontend emitting the wrong IR for the target https://github.com/llvm/llvm-project/pull/88182 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU][Clang] Add check of size for __builtin_amdgcn_global_load_lds (PR #93064)
https://github.com/arsenm approved this pull request. https://github.com/llvm/llvm-project/pull/93064 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU][WIP] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)
https://github.com/arsenm commented: Should lose the [WIP] in the title https://github.com/llvm/llvm-project/pull/89217 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU][Clang] Add check of size for __builtin_amdgcn_global_load_lds (PR #93064)
@@ -12385,4 +12385,8 @@ def err_acc_reduction_composite_type def err_acc_reduction_composite_member_type :Error< "OpenACC 'reduction' composite variable must not have non-scalar field">; def note_acc_reduction_composite_member_loc : Note<"invalid field is here">; + +// AMDGCN builtins diagnostics +def err_amdgcn_global_load_lds_size_invalid_value : Error<"invalid size value">; +def note_amdgcn_global_load_lds_size_valid_value : Note<"size must be 1/2/4">; arsenm wrote: Not sure what the message phrasing guidelines are here, but probably should spell out 1, 2, or 4 rather than using / https://github.com/llvm/llvm-project/pull/93064 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU][Clang] Add check of size for __builtin_amdgcn_global_load_lds (PR #93064)
@@ -0,0 +1,13 @@ +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx940 -S -verify -o - %s +// REQUIRES: amdgpu-registered-target + +typedef unsigned int u32; + +void test_global_load_lds_unsupported_size(global u32* src, local u32 *dst, u32 size) { + __builtin_amdgcn_global_load_lds(src, dst, size, /*offset=*/0, /*aux=*/0); // expected-error{{expression is not an integer constant expression}} + __builtin_amdgcn_global_load_lds(src, dst, /*size=*/5, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} expected-note {{size must be 1/2/4}} + __builtin_amdgcn_global_load_lds(src, dst, /*size=*/0, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} expected-note {{size must be 1/2/4}} arsenm wrote: Didn't add negative value test https://github.com/llvm/llvm-project/pull/93064 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang] Introduce target-specific `Sema` components (PR #93179)
https://github.com/arsenm edited https://github.com/llvm/llvm-project/pull/93179 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang] Introduce target-specific `Sema` components (PR #93179)
https://github.com/arsenm commented: Should update the GitHub autolabeler paths for the targets https://github.com/llvm/llvm-project/pull/93179 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU][WIP] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)
@@ -6086,6 +6086,62 @@ static SDValue lowerBALLOTIntrinsic(const SITargetLowering , SDNode *N, DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE)); } +static SDValue lowerLaneOp(const SITargetLowering , SDNode *N, + SelectionDAG ) { + EVT VT = N->getValueType(0); + unsigned ValSize = VT.getSizeInBits(); + unsigned IntrinsicID = N->getConstantOperandVal(0); + SDValue Src0 = N->getOperand(1); + SDLoc SL(N); + MVT IntVT = MVT::getIntegerVT(ValSize); + + auto createLaneOp = [, ](SDValue Src0, SDValue Src1, SDValue Src2, + MVT VT) -> SDValue { +return (Src2 ? DAG.getNode(AMDGPUISD::WRITELANE, SL, VT, {Src0, Src1, Src2}) +: Src1 ? DAG.getNode(AMDGPUISD::READLANE, SL, VT, {Src0, Src1}) + : DAG.getNode(AMDGPUISD::READFIRSTLANE, SL, VT, {Src0})); + }; + + SDValue Src1, Src2; + if (IntrinsicID == Intrinsic::amdgcn_readlane || + IntrinsicID == Intrinsic::amdgcn_writelane) { +Src1 = N->getOperand(2); +if (IntrinsicID == Intrinsic::amdgcn_writelane) + Src2 = N->getOperand(3); + } + + if (ValSize == 32) { +// Already legal +return SDValue(); + } + + if (ValSize < 32) { +SDValue InitBitCast = DAG.getBitcast(IntVT, Src0); +Src0 = DAG.getAnyExtOrTrunc(InitBitCast, SL, MVT::i32); +if (Src2.getNode()) { + SDValue Src2Cast = DAG.getBitcast(IntVT, Src2); arsenm wrote: Yes, bitcast for the f16/bf16 case to get to the int https://github.com/llvm/llvm-project/pull/89217 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU][WIP] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)
@@ -5456,43 +5444,32 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper , if ((Size % 32) == 0) { SmallVector PartialRes; unsigned NumParts = Size / 32; -auto IsS16Vec = Ty.isVector() && Ty.getElementType() == S16; +bool IsS16Vec = Ty.isVector() && Ty.getElementType() == S16; arsenm wrote: Better to track this as the LLT to use for the pieces, rather than making it this conditional thing. This will simplify improved pointer handling in the future https://github.com/llvm/llvm-project/pull/89217 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits