https://github.com/hekota created https://github.com/llvm/llvm-project/pull/169090
Storing to individual elements of a vector through vector swizzle components needs to be handled as separate stores. We need to avoid the load/modify/store of the whole vector to prevent overwriting other vector elements that might be getting updated in parallel. Fixes #152815 >From 3fe6c761f3fc4edbdcff194a39d2d35f9196eb2a Mon Sep 17 00:00:00 2001 From: Helena Kotas <[email protected]> Date: Fri, 21 Nov 2025 11:59:10 -0800 Subject: [PATCH] [HLSL] Update vector elements individually Storing to individual elements of a vector through vector swizzle components needs to be handled as separate stores. We need to avoid the load/modify/store of the whole vector to prevent overwriting other vector elements that might be getting updated in parallel. Fixes #152815 --- clang/lib/CodeGen/CGExpr.cpp | 47 +++++++-- .../BasicFeatures/OutputArguments.hlsl | 12 ++- .../CodeGenHLSL/builtins/ScalarSwizzles.hlsl | 25 +++-- .../CodeGenHLSL/builtins/VectorSwizzles.hlsl | 96 +++++++++++++++++++ 4 files changed, 159 insertions(+), 21 deletions(-) create mode 100644 clang/test/CodeGenHLSL/builtins/VectorSwizzles.hlsl diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index f2451b16e78be..45c8f4df03ab0 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -2801,18 +2801,50 @@ void CodeGenFunction::EmitStoreThroughExtVectorComponentLValue(RValue Src, LValue Dst) { llvm::Value *SrcVal = Src.getScalarVal(); Address DstAddr = Dst.getExtVectorAddress(); + const llvm::Constant *Elts = Dst.getExtVectorElts(); if (DstAddr.getElementType()->getScalarSizeInBits() > SrcVal->getType()->getScalarSizeInBits()) SrcVal = Builder.CreateZExt( SrcVal, convertTypeForLoadStore(Dst.getType(), SrcVal->getType())); - // HLSL allows storing to scalar values through ExtVector component LValues. - // To support this we need to handle the case where the destination address is - // a scalar. - if (!DstAddr.getElementType()->isVectorTy()) { - assert(!Dst.getType()->isVectorType() && - "this should only occur for non-vector l-values"); - Builder.CreateStore(SrcVal, DstAddr, Dst.isVolatileQualified()); + if (getLangOpts().HLSL) { + llvm::Type *DestAddrTy = DstAddr.getElementType(); + // HLSL allows storing to scalar values through ExtVector component LValues. + // To support this we need to handle the case where the destination address + // is a scalar. + if (!DestAddrTy->isVectorTy()) { + assert(!Dst.getType()->isVectorType() && + "this should only occur for non-vector l-values"); + Builder.CreateStore(SrcVal, DstAddr, Dst.isVolatileQualified()); + return; + } + + // In HLSL, storing to individual elements of a vector through ExtVector + // components needs to be handled as separate store instructions. We need to + // avoid the load/modify/store sequence to prevent overwriting other + // elements that might be getting updated in parallel. + // If we are updating multiple elements, Dst and Src are vectors; for + // a single element update they are scalars. + const VectorType *VTy = Dst.getType()->getAs<VectorType>(); + unsigned NumSrcElts = VTy ? VTy->getNumElements() : 1; + CharUnits ElemAlign = CharUnits::fromQuantity( + CGM.getDataLayout().getPrefTypeAlign(DestAddrTy->getScalarType())); + llvm::Value *Zero = llvm::ConstantInt::get(Int32Ty, 0); + + for (unsigned I = 0; I != NumSrcElts; ++I) { + llvm::Value *Val = VTy ? Builder.CreateExtractElement( + SrcVal, llvm::ConstantInt::get(Int32Ty, I)) + : SrcVal; + unsigned FieldNo = getAccessedFieldNo(I, Elts); + Address DstElemAddr = Address::invalid(); + if (FieldNo == 0) + DstElemAddr = DstAddr.withAlignment(ElemAlign); + else + DstElemAddr = Builder.CreateGEP( + DstAddr, {Zero, llvm::ConstantInt::get(Int32Ty, FieldNo)}, + DestAddrTy, ElemAlign); + Builder.CreateStore(Val, DstElemAddr, Dst.isVolatileQualified()); + } return; } @@ -2820,7 +2852,6 @@ void CodeGenFunction::EmitStoreThroughExtVectorComponentLValue(RValue Src, // value now. llvm::Value *Vec = Builder.CreateLoad(DstAddr, Dst.isVolatileQualified()); llvm::Type *VecTy = Vec->getType(); - const llvm::Constant *Elts = Dst.getExtVectorElts(); if (const VectorType *VTy = Dst.getType()->getAs<VectorType>()) { unsigned NumSrcElts = VTy->getNumElements(); diff --git a/clang/test/CodeGenHLSL/BasicFeatures/OutputArguments.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/OutputArguments.hlsl index d0ba8f447b732..ec03804ad1a4c 100644 --- a/clang/test/CodeGenHLSL/BasicFeatures/OutputArguments.hlsl +++ b/clang/test/CodeGenHLSL/BasicFeatures/OutputArguments.hlsl @@ -101,10 +101,16 @@ void funky(inout int3 X) { // Call the function with the temporary. // CHECK: call void {{.*}}funky{{.*}}(ptr noalias noundef nonnull align 16 dereferenceable(16) [[ArgTmp]]) -// Shuffle it back. +// Write it back. // CHECK: [[RetVal:%.*]] = load <3 x i32>, ptr [[ArgTmp]] -// CHECK: [[Vxyz:%.*]] = shufflevector <3 x i32> [[RetVal]], <3 x i32> poison, <3 x i32> <i32 2, i32 0, i32 1> -// CHECK: store <3 x i32> [[Vxyz]], ptr [[V]] +// CHECK: [[Src0:%.*]] = extractelement <3 x i32> [[RetVal]], i32 0 +// CHECK: [[PtrY:%.*]] = getelementptr <3 x i32>, ptr %V, i32 0, i32 1 +// CHECK: store i32 [[Src0]], ptr [[PtrY]], align 4 +// CHECK: [[Src1:%.*]] = extractelement <3 x i32> [[RetVal]], i32 1 +// CHECK: [[PtrZ:%.*]] = getelementptr <3 x i32>, ptr %V, i32 0, i32 2 +// CHECK: store i32 [[Src1]], ptr [[PtrZ]], align 4 +// CHECK: [[Src2:%.*]] = extractelement <3 x i32> [[RetVal]], i32 2 +// CHECK: store i32 [[Src2]], ptr %V, align 4 // OPT: ret <3 x i32> <i32 3, i32 1, i32 2> export int3 case4() { diff --git a/clang/test/CodeGenHLSL/builtins/ScalarSwizzles.hlsl b/clang/test/CodeGenHLSL/builtins/ScalarSwizzles.hlsl index 7804239edccae..270598265c660 100644 --- a/clang/test/CodeGenHLSL/builtins/ScalarSwizzles.hlsl +++ b/clang/test/CodeGenHLSL/builtins/ScalarSwizzles.hlsl @@ -259,9 +259,8 @@ bool AssignBool(bool V) { // CHECK-NEXT: [[B:%.*]] = load i32, ptr [[VAddr]], align 4 // CHECK-NEXT: [[LV1:%.*]] = trunc i32 [[B]] to i1 // CHECK-NEXT: [[D:%.*]] = zext i1 [[LV1]] to i32 -// CHECK-NEXT: [[C:%.*]] = load <2 x i32>, ptr [[X]], align 8 -// CHECK-NEXT: [[E:%.*]] = insertelement <2 x i32> [[C]], i32 [[D]], i32 1 -// CHECK-NEXT: store <2 x i32> [[E]], ptr [[X]], align 8 +// CHECK-NEXT: [[C:%.*]] = getelementptr <2 x i32>, ptr [[X]], i32 0, i32 1 +// CHECK-NEXT: store i32 [[D]], ptr [[C]], align 4 // CHECK-NEXT: ret void void AssignBool2(bool V) { bool2 X = true.xx; @@ -277,10 +276,13 @@ void AssignBool2(bool V) { // CHECK-NEXT: [[Z:%.*]] = load <2 x i32>, ptr [[VAddr]], align 8 // CHECK-NEXT: [[LV:%.*]] = trunc <2 x i32> [[Z]] to <2 x i1> // CHECK-NEXT: [[B:%.*]] = zext <2 x i1> [[LV]] to <2 x i32> -// CHECK-NEXT: [[A:%.*]] = load <2 x i32>, ptr [[X]], align 8 -// CHECK-NEXT: [[C:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> <i32 0, i32 1> -// CHECK-NEXT: store <2 x i32> [[C]], ptr [[X]], align 8 +// CHECK-NEXT: [[V1:%.*]] = extractelement <2 x i32> [[B]], i32 0 +// CHECK-NEXT: store i32 [[V1]], ptr [[X]], align 4 +// CHECK-NEXT: [[V2:%.*]] = extractelement <2 x i32> [[B]], i32 1 +// CHECK-NEXT: [[X2:%.*]] = getelementptr <2 x i32>, ptr [[X]], i32 0, i32 1 +// CHECK-NEXT: store i32 [[V2]], ptr [[X2]], align 4 // CHECK-NEXT: ret void + void AssignBool3(bool2 V) { bool2 X = {true,true}; X.xy = V; @@ -313,10 +315,13 @@ bool2 AccessBools() { // CHECK-NEXT: [[L1:%.*]] = shufflevector <1 x i32> [[L0]], <1 x i32> poison, <3 x i32> zeroinitializer // CHECK-NEXT: [[TruncV:%.*]] = trunc <3 x i32> [[L1]] to <3 x i1> // CHECK-NEXT: [[L2:%.*]] = zext <3 x i1> [[TruncV]] to <3 x i32> -// CHECK-NEXT: [[L3:%.*]] = load <4 x i32>, ptr [[B]], align 16 -// CHECK-NEXT: [[L4:%.*]] = shufflevector <3 x i32> [[L2]], <3 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> -// CHECK-NEXT: [[L5:%.*]] = shufflevector <4 x i32> [[L3]], <4 x i32> [[L4]], <4 x i32> <i32 4, i32 5, i32 6, i32 3> -// CHECK-NEXT: store <4 x i32> [[L5]], ptr [[B]], align 16 +// CHECK-NEXT: [[V1:%.*]] = extractelement <3 x i32> [[L2]], i32 0 +// CHECK-NEXT: store i32 [[V1]], ptr %B, align 4 +// CHECK-NEXT: [[V2:%.*]] = extractelement <3 x i32> [[L2]], i32 1 +// CHECK-NEXT: [[B2:%.*]] = getelementptr <4 x i32>, ptr %B, i32 0, i32 1 +// CHECK-NEXT: store i32 [[V2]], ptr [[B2]], align 4 +// CHECK-NEXT: [[V3:%.*]] = extractelement <3 x i32> [[L2]], i32 2 +// CHECK-NEXT: [[B3:%.*]] = getelementptr <4 x i32>, ptr %B, i32 0, i32 2 void BoolSizeMismatch() { bool4 B = {true,true,true,true}; B.xyz = false.xxx; diff --git a/clang/test/CodeGenHLSL/builtins/VectorSwizzles.hlsl b/clang/test/CodeGenHLSL/builtins/VectorSwizzles.hlsl new file mode 100644 index 0000000000000..c632e795098ea --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/VectorSwizzles.hlsl @@ -0,0 +1,96 @@ +// RUN: %clang_cc1 -finclude-default-header -fnative-half-type \ +// RUN: -triple dxil-pc-shadermodel6.3-library %s -disable-llvm-passes \ +// RUN: -emit-llvm -o - | FileCheck %s + +// CHECK-LABEL: Single + +// Setup local vars. +// CHECK: [[VecAddr:%.*]] = alloca <3 x i64>, align 32 +// CHECK-NEXT: [[AAddr:%.*]] = alloca i64, align 8 +// CHECK-NEXT: store <3 x i64> %vec, ptr [[VecAddr]], align 32 +// CHECK-NEXT: store i64 %a, ptr [[AAddr]], align 8 + +// Update single element of the vector. +// CHECK-NEXT: [[A:%.*]] = load i64, ptr [[AAddr]], align 8 +// CHECK-NEXT: [[Vy:%.*]] = getelementptr <3 x i64>, ptr [[VecAddr]], i32 0, i32 1 +// CHECK-NEXT: store i64 [[A]], ptr [[Vy]], align 8 + +// Return. +// CHECK-NEXT: [[RetVal:%.*]] = load <3 x i64>, ptr [[VecAddr]], align 32 +// CHECK-NEXT: ret <3 x i64> [[RetVal]] +uint64_t3 Single(uint64_t3 vec, uint64_t a){ + vec.y = a; + return vec; +} + +// CHECK-LABEL: Double + +// Setup local vars. +// CHECK: [[VecAddr:%.*]] = alloca <3 x float>, align 16 +// CHECK-NEXT: [[AAddr:%.*]] = alloca float, align 4 +// CHECK-NEXT: [[BAddr:%.*]] = alloca float, align 4 +// CHECK-NEXT: store <3 x float> %vec, ptr [[VecAddr]], align 16 +// CHECK-NEXT: store float %a, ptr [[AAddr]], align 4 +// CHECK-NEXT: store float %b, ptr [[BAddr]], align 4 + +// Create temporary vector {a, b}. +// CHECK-NEXT: [[A:%.*]] = load float, ptr [[AAddr]], align 4 +// CHECK-NEXT: [[TmpVec0:%.*]] = insertelement <2 x float> poison, float [[A]], i32 0 +// CHECK-NEXT: [[B:%.*]] = load float, ptr [[BAddr]], align 4 +// CHECK-NEXT: [[TmpVec1:%.*]] = insertelement <2 x float> [[TmpVec0]], float [[B]], i32 1 + +// Update two elements of the vector from temporary vector. +// CHECK-NEXT: [[TmpX:%.*]] = extractelement <2 x float> [[TmpVec1]], i32 0 +// CHECK-NEXT: [[VecZ:%.*]] = getelementptr <3 x float>, ptr [[VecAddr]], i32 0, i32 2 +// CHECK-NEXT: store float [[TmpX]], ptr [[VecZ]], align 4 +// CHECK-NEXT: [[TmpY:%.*]] = extractelement <2 x float> [[TmpVec1]], i32 1 +// CHECK-NEXT: [[VecY:%.*]] = getelementptr <3 x float>, ptr [[VecAddr]], i32 0, i32 1 +// CHECK-NEXT: store float [[TmpY]], ptr [[VecY]], align 4 + +// Return. +// CHECK-NEXT: [[RetVal:%.*]] = load <3 x float>, ptr [[VecAddr]], align 16 +// CHECK-NEXT: ret <3 x float> [[RetVal]] +float3 Double(float3 vec, float a, float b) { + vec.zy = {a, b}; + return vec; +} + +// CHECK-LABEL: Shuffle + +// Setup local vars. +// CHECK: [[VecAddr:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[AAddr:%.*]] = alloca half, align 2 +// CHECK-NEXT: [[BAddr:%.*]] = alloca half, align 2 +// CHECK-NEXT: store <4 x half> %vec, ptr [[VecAddr]], align 8 +// CHECK-NEXT: store half %a, ptr [[AAddr]], align 2 +// CHECK-NEXT: store half %b, ptr [[BAddr]], align 2 + +// Create temporary vector {a, b, 13.74, a}. +// CHECK-NEXT: [[A:%.*]] = load half, ptr [[AAddr]], align 2 +// CHECK-NEXT: [[TmpVec0:%.*]] = insertelement <4 x half> poison, half [[A]], i32 0 +// CHECK-NEXT: [[B:%.*]] = load half, ptr [[BAddr]], align 2 +// CHECK-NEXT: [[TmpVec1:%.*]] = insertelement <4 x half> [[TmpVec0]], half [[B]], i32 1 +// CHECK-NEXT: [[TmpVec2:%.*]] = insertelement <4 x half> %vecinit1, half 0xH4ADF, i32 2 +// CHECK-NEXT: [[A:%.*]] = load half, ptr [[AAddr]], align 2 +// CHECK-NEXT: [[TmpVec3:%.*]] = insertelement <4 x half> [[TmpVec2]], half [[A]], i32 3 + +// Update four elements of the vector via mixed up swizzle from the temporary vector. +// CHECK-NEXT: [[TmpX:%.*]] = extractelement <4 x half> [[TmpVec3]], i32 0 +// CHECK-NEXT: [[VecZ:%.*]] = getelementptr <4 x half>, ptr [[VecAddr]], i32 0, i32 2 +// CHECK-NEXT: store half [[TmpX]], ptr [[VecZ]], align 2 +// CHECK-NEXT: [[TmpY:%.*]] = extractelement <4 x half> [[TmpVec3]], i32 1 +// CHECK-NEXT: [[VecW:%.*]] = getelementptr <4 x half>, ptr [[VecAddr]], i32 0, i32 3 +// CHECK-NEXT: store half [[TmpY]], ptr [[VecW]], align 2 +// CHECK-NEXT: [[TmpZ:%.*]] = extractelement <4 x half> [[TmpVec3]], i32 2 +// CHECK-NEXT: store half [[TmpZ]], ptr [[VecAddr]], align 2 +// CHECK-NEXT: [[TmpW:%.*]] = extractelement <4 x half> [[TmpVec3]], i32 3 +// CHECK-NEXT: [[VecY:%.*]] = getelementptr <4 x half>, ptr [[VecAddr]], i32 0, i32 1 +// CHECK-NEXT: store half [[TmpW]], ptr [[VecY]], align 2 + +// Return. +// CHECK-NEXT: [[RetVal:%.*]] = load <4 x half>, ptr [[VecAddr]], align 8 +// CHECK-NEXT: ret <4 x half> [[RetVal]] +half4 Shuffle(half4 vec, half a, half b) { + vec.zwxy = {a, b, 13.74, a}; + return vec; +} _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
