https://github.com/VigneshwarJ updated 
https://github.com/llvm/llvm-project/pull/183639

>From c91d4e3d461ca477131997e22729d6b45f3b53bb Mon Sep 17 00:00:00 2001
From: vigneshwar jayakumar <[email protected]>
Date: Thu, 26 Feb 2026 16:37:30 -0600
Subject: [PATCH 1/8] [Clang] Fix invalid sret addrspacecast for 'new' on AMDGPU

When a HIP kernel uses placement new with a function returning an
aggregate via sret (e.g. `new (out) T(make_t())`), and the placement
destination is in global memory (addrspace 1), CGCall's EmitCall would
addrspacecast the pointer to the callee's expected sret address space
(addrspace 5 / private). The addrspacecast produces an invalid pointer
that faults at runtime.

Instead of casting the caller's pointer directly, materialise a
temporary alloca in the callee's expected address space, pass that as
the sret argument, and copy the result back to the original destination
after the call.
---
 clang/lib/CodeGen/CGCall.cpp                  | 34 +++++++--
 .../CodeGenHIP/placement-new-addrspace.hip    | 71 +++++++++++++++++++
 2 files changed, 100 insertions(+), 5 deletions(-)
 create mode 100644 clang/test/CodeGenHIP/placement-new-addrspace.hip

diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 1d950ffed8a0b..91fac02a667f8 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -5427,7 +5427,9 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo 
&CallInfo,
   // If the call returns a temporary with struct return, create a temporary
   // alloca to hold the result, unless one is given to us.
   Address SRetPtr = Address::invalid();
+  Address OriginalSRetPtr = Address::invalid();
   bool NeedSRetLifetimeEnd = false;
+  bool NeedSRetCopyBack = false;
   if (RetAI.isIndirect() || RetAI.isInAlloca() || RetAI.isCoerceAndExpand()) {
     // For virtual function pointer thunks and musttail calls, we must always
     // forward an incoming SRet pointer to the callee, because a local alloca
@@ -5439,6 +5441,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo 
&CallInfo,
                                              RetTy, 
CharUnits::fromQuantity(1));
     } else if (!ReturnValue.isNull()) {
       SRetPtr = ReturnValue.getAddress();
+      OriginalSRetPtr = SRetPtr;
     } else {
       SRetPtr = CreateMemTempWithoutCast(RetTy, "tmp");
       if (HaveInsertPoint() && ReturnValue.isUnused())
@@ -5450,12 +5453,26 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo 
&CallInfo,
       // a chain involving stores to / loads from the DefaultAS; we address 
this
       // here, symmetrically with the handling we have for normal pointer args.
       if (SRetPtr.getAddressSpace() != RetAI.getIndirectAddrSpace()) {
-        llvm::Value *V = SRetPtr.getBasePointer();
-        llvm::Type *Ty = llvm::PointerType::get(getLLVMContext(),
-                                                RetAI.getIndirectAddrSpace());
+        // If the caller supplied a destination in a different address space,
+        // materialize the call result in a temporary with the callee's 
expected
+        // sret address space and copy back after the call.
+        if (OriginalSRetPtr.isValid() && !IsVirtualFunctionPointerThunk &&
+            !IsMustTail) {
+          Address TmpSRetPtr = CreateMemTempWithoutCast(RetTy, "tmp.sret");
+          if (TmpSRetPtr.getAddressSpace() == RetAI.getIndirectAddrSpace()) {
+            SRetPtr = TmpSRetPtr;
+            NeedSRetCopyBack = true;
+          }
+        }
+
+        if (!NeedSRetCopyBack) {
+          llvm::Value *V = SRetPtr.getBasePointer();
+          llvm::Type *Ty = llvm::PointerType::get(getLLVMContext(),
+                                                  
RetAI.getIndirectAddrSpace());
 
-        SRetPtr = SRetPtr.withPointer(performAddrSpaceCast(V, Ty),
-                                      SRetPtr.isKnownNonNull());
+          SRetPtr = SRetPtr.withPointer(performAddrSpaceCast(V, Ty),
+                                        SRetPtr.isKnownNonNull());
+        }
       }
       IRCallArgs[IRFunctionArgs.getSRetArgNo()] =
           getAsNaturalPointerTo(SRetPtr, RetTy);
@@ -6271,6 +6288,13 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo 
&CallInfo,
   // lexical order, so deactivate it and run it manually here.
   CallArgs.freeArgumentMemory(*this);
 
+  if (NeedSRetCopyBack) {
+    LValue DstLV = MakeAddrLValue(OriginalSRetPtr, RetTy);
+    LValue SrcLV = MakeAddrLValue(SRetPtr, RetTy);
+    EmitAggregateCopy(DstLV, SrcLV, RetTy, AggValueSlot::MayOverlap);
+    SRetPtr = OriginalSRetPtr;
+  }
+
   // Extract the return value.
   RValue Ret;
 
diff --git a/clang/test/CodeGenHIP/placement-new-addrspace.hip 
b/clang/test/CodeGenHIP/placement-new-addrspace.hip
new file mode 100644
index 0000000000000..106d9ca4867a4
--- /dev/null
+++ b/clang/test/CodeGenHIP/placement-new-addrspace.hip
@@ -0,0 +1,71 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --functions "make_big|kernel" --version 5
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -x 
hip -std=c++17 %s -o - | FileCheck %s
+// REQUIRES: amdgpu-registered-target
+
+// Verify that when a function returning an aggregate via sret is called with a
+// destination in a different address space (e.g. global pointer from kernel
+// arg), the compiler materialises a temporary in the alloca AS and copies 
back,
+// rather than emitting an invalid addrspacecast of the destination pointer.
+
+typedef __SIZE_TYPE__ size_t;
+__attribute__((device)) void *operator new(size_t, void *p) noexcept { return 
p; }
+
+struct Big {
+  int v[32];
+  __attribute__((device)) Big(int x) {
+    for (int i = 0; i < 32; ++i)
+      v[i] = x + i;
+  }
+};
+
+// CHECK-LABEL: define dso_local void @_Z8make_bigv(
+// CHECK-SAME: ptr addrspace(5) dead_on_unwind noalias writable 
sret([[STRUCT_BIG:%.*]]) align 4 [[AGG_RESULT:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[AGG_RESULT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[AGG_RESULT]] to ptr
+// CHECK-NEXT:    call void @_ZN3BigC1Ei(ptr noundef nonnull align 4 
dereferenceable(128) [[AGG_RESULT_ASCAST]], i32 noundef 7) #[[ATTR3:[0-9]+]]
+// CHECK-NEXT:    ret void
+//
+__attribute__((device)) Big make_big() { return Big(7); }
+
+// CHECK-LABEL: define dso_local amdgpu_kernel void @_Z6kernelP3Big(
+// CHECK-SAME: ptr addrspace(1) noundef [[OUT_COERCE:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[OUT:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:    [[TMP_SRET:%.*]] = alloca [[STRUCT_BIG:%.*]], align 4, 
addrspace(5)
+// CHECK-NEXT:    [[OUT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT]] 
to ptr
+// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT_ADDR]] to ptr
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT_COERCE]], ptr [[OUT_ASCAST]], 
align 8
+// CHECK-NEXT:    [[OUT1:%.*]] = load ptr, ptr [[OUT_ASCAST]], align 8
+// CHECK-NEXT:    store ptr [[OUT1]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    call void @_Z8make_bigv(ptr addrspace(5) dead_on_unwind 
writable sret([[STRUCT_BIG]]) align 4 [[TMP_SRET]]) #[[ATTR3]]
+// CHECK-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP0]], ptr 
addrspace(5) align 4 [[TMP_SRET]], i64 128, i1 false)
+// CHECK-NEXT:    ret void
+//
+__attribute__((global)) void kernel(Big *out) {
+  new (out) Big(make_big());
+}
+
+// CHECK-LABEL: define dso_local amdgpu_kernel void @_Z13kernel_assignP3Big(
+// CHECK-SAME: ptr addrspace(1) noundef [[OUT_COERCE:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[OUT:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:    [[REF_TMP:%.*]] = alloca [[STRUCT_BIG:%.*]], align 4, 
addrspace(5)
+// CHECK-NEXT:    [[TMP_SRET:%.*]] = alloca [[STRUCT_BIG]], align 4, 
addrspace(5)
+// CHECK-NEXT:    [[OUT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT]] 
to ptr
+// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT_ADDR]] to ptr
+// CHECK-NEXT:    [[REF_TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[REF_TMP]] to ptr
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT_COERCE]], ptr [[OUT_ASCAST]], 
align 8
+// CHECK-NEXT:    [[OUT1:%.*]] = load ptr, ptr [[OUT_ASCAST]], align 8
+// CHECK-NEXT:    store ptr [[OUT1]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    call void @_Z8make_bigv(ptr addrspace(5) dead_on_unwind 
writable sret([[STRUCT_BIG]]) align 4 [[TMP_SRET]]) #[[ATTR3]]
+// CHECK-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 4 
[[REF_TMP_ASCAST]], ptr addrspace(5) align 4 [[TMP_SRET]], i64 128, i1 false)
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP0]], ptr 
align 4 [[REF_TMP_ASCAST]], i64 128, i1 false)
+// CHECK-NEXT:    ret void
+//
+__attribute__((global)) void kernel_assign(Big *out) {
+  *out = make_big();
+}

>From f1760c0fa59e9bc664737e939a31936480eed7d5 Mon Sep 17 00:00:00 2001
From: vigneshwar jayakumar <[email protected]>
Date: Thu, 26 Feb 2026 17:07:21 -0600
Subject: [PATCH 2/8] moved implementation to CGExprAgg

---
 clang/lib/CodeGen/CGCall.cpp                  | 34 +++----------------
 clang/lib/CodeGen/CGExprAgg.cpp               | 10 ++++--
 .../CodeGenHIP/placement-new-addrspace.hip    | 29 ++--------------
 3 files changed, 16 insertions(+), 57 deletions(-)

diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 91fac02a667f8..1d950ffed8a0b 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -5427,9 +5427,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo 
&CallInfo,
   // If the call returns a temporary with struct return, create a temporary
   // alloca to hold the result, unless one is given to us.
   Address SRetPtr = Address::invalid();
-  Address OriginalSRetPtr = Address::invalid();
   bool NeedSRetLifetimeEnd = false;
-  bool NeedSRetCopyBack = false;
   if (RetAI.isIndirect() || RetAI.isInAlloca() || RetAI.isCoerceAndExpand()) {
     // For virtual function pointer thunks and musttail calls, we must always
     // forward an incoming SRet pointer to the callee, because a local alloca
@@ -5441,7 +5439,6 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo 
&CallInfo,
                                              RetTy, 
CharUnits::fromQuantity(1));
     } else if (!ReturnValue.isNull()) {
       SRetPtr = ReturnValue.getAddress();
-      OriginalSRetPtr = SRetPtr;
     } else {
       SRetPtr = CreateMemTempWithoutCast(RetTy, "tmp");
       if (HaveInsertPoint() && ReturnValue.isUnused())
@@ -5453,26 +5450,12 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo 
&CallInfo,
       // a chain involving stores to / loads from the DefaultAS; we address 
this
       // here, symmetrically with the handling we have for normal pointer args.
       if (SRetPtr.getAddressSpace() != RetAI.getIndirectAddrSpace()) {
-        // If the caller supplied a destination in a different address space,
-        // materialize the call result in a temporary with the callee's 
expected
-        // sret address space and copy back after the call.
-        if (OriginalSRetPtr.isValid() && !IsVirtualFunctionPointerThunk &&
-            !IsMustTail) {
-          Address TmpSRetPtr = CreateMemTempWithoutCast(RetTy, "tmp.sret");
-          if (TmpSRetPtr.getAddressSpace() == RetAI.getIndirectAddrSpace()) {
-            SRetPtr = TmpSRetPtr;
-            NeedSRetCopyBack = true;
-          }
-        }
-
-        if (!NeedSRetCopyBack) {
-          llvm::Value *V = SRetPtr.getBasePointer();
-          llvm::Type *Ty = llvm::PointerType::get(getLLVMContext(),
-                                                  
RetAI.getIndirectAddrSpace());
+        llvm::Value *V = SRetPtr.getBasePointer();
+        llvm::Type *Ty = llvm::PointerType::get(getLLVMContext(),
+                                                RetAI.getIndirectAddrSpace());
 
-          SRetPtr = SRetPtr.withPointer(performAddrSpaceCast(V, Ty),
-                                        SRetPtr.isKnownNonNull());
-        }
+        SRetPtr = SRetPtr.withPointer(performAddrSpaceCast(V, Ty),
+                                      SRetPtr.isKnownNonNull());
       }
       IRCallArgs[IRFunctionArgs.getSRetArgNo()] =
           getAsNaturalPointerTo(SRetPtr, RetTy);
@@ -6288,13 +6271,6 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo 
&CallInfo,
   // lexical order, so deactivate it and run it manually here.
   CallArgs.freeArgumentMemory(*this);
 
-  if (NeedSRetCopyBack) {
-    LValue DstLV = MakeAddrLValue(OriginalSRetPtr, RetTy);
-    LValue SrcLV = MakeAddrLValue(SRetPtr, RetTy);
-    EmitAggregateCopy(DstLV, SrcLV, RetTy, AggValueSlot::MayOverlap);
-    SRetPtr = OriginalSRetPtr;
-  }
-
   // Extract the return value.
   RValue Ret;
 
diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
index 0ca6af3def57f..3957904314698 100644
--- a/clang/lib/CodeGen/CGExprAgg.cpp
+++ b/clang/lib/CodeGen/CGExprAgg.cpp
@@ -286,8 +286,14 @@ void AggExprEmitter::withReturnValueSlot(
   // We need to always provide our own temporary if destruction is required.
   // Otherwise, EmitCall will emit its own, notice that it's "unused", and end
   // its lifetime before we have the chance to emit a proper destructor call.
-  bool UseTemp = Dest.isPotentiallyAliased() || Dest.requiresGCollection() ||
-                 (RequiresDestruction && Dest.isIgnored());
+  //
+  // We also need a temporary if the destination is in a different address 
space
+  // from the alloca AS, to avoid an invalid addrspacecast on the sret pointer.
+  bool UseTemp =
+      Dest.isPotentiallyAliased() || Dest.requiresGCollection() ||
+      (RequiresDestruction && Dest.isIgnored()) ||
+      (!Dest.isIgnored() && Dest.getAddress().getAddressSpace() !=
+                                CGF.CGM.getDataLayout().getAllocaAddrSpace());
 
   Address RetAddr = Address::invalid();
 
diff --git a/clang/test/CodeGenHIP/placement-new-addrspace.hip 
b/clang/test/CodeGenHIP/placement-new-addrspace.hip
index 106d9ca4867a4..07c7e87c37a60 100644
--- a/clang/test/CodeGenHIP/placement-new-addrspace.hip
+++ b/clang/test/CodeGenHIP/placement-new-addrspace.hip
@@ -32,40 +32,17 @@ __attribute__((device)) Big make_big() { return Big(7); }
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[OUT:%.*]] = alloca ptr, align 8, addrspace(5)
 // CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT:    [[TMP_SRET:%.*]] = alloca [[STRUCT_BIG:%.*]], align 4, 
addrspace(5)
+// CHECK-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_BIG:%.*]], align 4, 
addrspace(5)
 // CHECK-NEXT:    [[OUT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT]] 
to ptr
 // CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT_ADDR]] to ptr
 // CHECK-NEXT:    store ptr addrspace(1) [[OUT_COERCE]], ptr [[OUT_ASCAST]], 
align 8
 // CHECK-NEXT:    [[OUT1:%.*]] = load ptr, ptr [[OUT_ASCAST]], align 8
 // CHECK-NEXT:    store ptr [[OUT1]], ptr [[OUT_ADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    call void @_Z8make_bigv(ptr addrspace(5) dead_on_unwind 
writable sret([[STRUCT_BIG]]) align 4 [[TMP_SRET]]) #[[ATTR3]]
-// CHECK-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP0]], ptr 
addrspace(5) align 4 [[TMP_SRET]], i64 128, i1 false)
+// CHECK-NEXT:    call void @_Z8make_bigv(ptr addrspace(5) dead_on_unwind 
writable sret([[STRUCT_BIG]]) align 4 [[TMP]]) #[[ATTR3]]
+// CHECK-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP0]], ptr 
addrspace(5) align 4 [[TMP]], i64 128, i1 false)
 // CHECK-NEXT:    ret void
 //
 __attribute__((global)) void kernel(Big *out) {
   new (out) Big(make_big());
 }
-
-// CHECK-LABEL: define dso_local amdgpu_kernel void @_Z13kernel_assignP3Big(
-// CHECK-SAME: ptr addrspace(1) noundef [[OUT_COERCE:%.*]]) #[[ATTR1]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[OUT:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT:    [[REF_TMP:%.*]] = alloca [[STRUCT_BIG:%.*]], align 4, 
addrspace(5)
-// CHECK-NEXT:    [[TMP_SRET:%.*]] = alloca [[STRUCT_BIG]], align 4, 
addrspace(5)
-// CHECK-NEXT:    [[OUT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT]] 
to ptr
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[REF_TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[REF_TMP]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT_COERCE]], ptr [[OUT_ASCAST]], 
align 8
-// CHECK-NEXT:    [[OUT1:%.*]] = load ptr, ptr [[OUT_ASCAST]], align 8
-// CHECK-NEXT:    store ptr [[OUT1]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    call void @_Z8make_bigv(ptr addrspace(5) dead_on_unwind 
writable sret([[STRUCT_BIG]]) align 4 [[TMP_SRET]]) #[[ATTR3]]
-// CHECK-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 4 
[[REF_TMP_ASCAST]], ptr addrspace(5) align 4 [[TMP_SRET]], i64 128, i1 false)
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP0]], ptr 
align 4 [[REF_TMP_ASCAST]], i64 128, i1 false)
-// CHECK-NEXT:    ret void
-//
-__attribute__((global)) void kernel_assign(Big *out) {
-  *out = make_big();
-}

>From 0667723cccd2e54cc83d53c1fa9ea5a13d674a1e Mon Sep 17 00:00:00 2001
From: vigneshwar jayakumar <[email protected]>
Date: Thu, 26 Feb 2026 17:49:46 -0600
Subject: [PATCH 3/8] missed test

---
 clang/test/OpenMP/amdgcn_sret_ctor.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/clang/test/OpenMP/amdgcn_sret_ctor.cpp 
b/clang/test/OpenMP/amdgcn_sret_ctor.cpp
index 99ca31b78e1fc..8d2c035e23472 100644
--- a/clang/test/OpenMP/amdgcn_sret_ctor.cpp
+++ b/clang/test/OpenMP/amdgcn_sret_ctor.cpp
@@ -16,15 +16,13 @@ E::E() noexcept : foo(s()) {}
 // CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]]) 
unnamed_addr #[[ATTR0:[0-9]+]] align 2 {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_S:%.*]], align 1, addrspace(5)
 // CHECK-NEXT:    [[THIS_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[THIS_ADDR]] to ptr
 // CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[THIS1_ASCAST:%.*]] = addrspacecast ptr [[THIS1]] to ptr 
addrspace(5)
-// CHECK-NEXT:    call void @_Z1sv(ptr addrspace(5) dead_on_unwind writable 
sret([[STRUCT_S:%.*]]) align 1 [[THIS1_ASCAST]]) #[[ATTR2:[0-9]+]]
+// CHECK-NEXT:    call void @_Z1sv(ptr addrspace(5) dead_on_unwind writable 
sret([[STRUCT_S]]) align 1 [[TMP]]) #[[ATTR2:[0-9]+]]
 // CHECK-NEXT:    ret void
 //
-// CHECK-LABEL: declare void @_Z1sv(
-// CHECK-SAME: ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_S]]) 
align 1) #[[ATTR1:[0-9]+]]
 //
 // CHECK-LABEL: define hidden void @_ZN1EC1Ev(
 // CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]]) 
unnamed_addr #[[ATTR0]] align 2 {

>From d0eb6b1f235e3447c8bd77cd1a95e88ca2697b04 Mon Sep 17 00:00:00 2001
From: vigneshwar jayakumar <[email protected]>
Date: Sun, 1 Mar 2026 18:48:34 -0600
Subject: [PATCH 4/8] review change

---
 clang/lib/CodeGen/CGExprAgg.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
index 3957904314698..efcf570cdda10 100644
--- a/clang/lib/CodeGen/CGExprAgg.cpp
+++ b/clang/lib/CodeGen/CGExprAgg.cpp
@@ -293,7 +293,8 @@ void AggExprEmitter::withReturnValueSlot(
       Dest.isPotentiallyAliased() || Dest.requiresGCollection() ||
       (RequiresDestruction && Dest.isIgnored()) ||
       (!Dest.isIgnored() && Dest.getAddress().getAddressSpace() !=
-                                CGF.CGM.getDataLayout().getAllocaAddrSpace());
+                                CGF.getContext().getTargetAddressSpace(
+                                    CGF.CGM.getASTAllocaAddressSpace()));
 
   Address RetAddr = Address::invalid();
 

>From c3bec8f44acab3c0aac8bd9b8e0ae8f5ef63f6af Mon Sep 17 00:00:00 2001
From: vigneshwar jayakumar <[email protected]>
Date: Tue, 3 Mar 2026 17:09:33 -0600
Subject: [PATCH 5/8] review comments

---
 clang/test/CodeGenHIP/placement-new-addrspace.hip | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/test/CodeGenHIP/placement-new-addrspace.hip 
b/clang/test/CodeGenHIP/placement-new-addrspace.hip
index 07c7e87c37a60..27f799b05193b 100644
--- a/clang/test/CodeGenHIP/placement-new-addrspace.hip
+++ b/clang/test/CodeGenHIP/placement-new-addrspace.hip
@@ -1,5 +1,5 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --functions "make_big|kernel" --version 5
-// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -x 
hip -std=c++17 %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm 
-disable-llvm-passes -x hip -std=c++17 %s -o - | FileCheck %s
 // REQUIRES: amdgpu-registered-target
 
 // Verify that when a function returning an aggregate via sret is called with a

>From ed87a039891c8952e222c7cdb9a5a20c50301ffc Mon Sep 17 00:00:00 2001
From: vigneshwar jayakumar <[email protected]>
Date: Thu, 5 Mar 2026 11:52:47 -0600
Subject: [PATCH 6/8] unnecessary_memcpy

---
 .../CodeGenHIP/placement-new-addrspace.hip     | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/clang/test/CodeGenHIP/placement-new-addrspace.hip 
b/clang/test/CodeGenHIP/placement-new-addrspace.hip
index 27f799b05193b..193660ecfdfe3 100644
--- a/clang/test/CodeGenHIP/placement-new-addrspace.hip
+++ b/clang/test/CodeGenHIP/placement-new-addrspace.hip
@@ -1,4 +1,4 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --functions "make_big|kernel" --version 5
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --functions "make_big|kernel|local_test" --version 5
 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm 
-disable-llvm-passes -x hip -std=c++17 %s -o - | FileCheck %s
 // REQUIRES: amdgpu-registered-target
 
@@ -46,3 +46,19 @@ __attribute__((device)) Big make_big() { return Big(7); }
 __attribute__((global)) void kernel(Big *out) {
   new (out) Big(make_big());
 }
+
+// If the destination is ultimately backed by alloca AS (even through cast
+// chains), we should pass it directly as sret and avoid an extra temp/copy.
+// CHECK-LABEL: define dso_local void @_Z10local_testv(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[LOCAL:%.*]] = alloca [[STRUCT_BIG:%.*]], align 4, 
addrspace(5)
+// CHECK-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_BIG]], align 4, addrspace(5)
+// CHECK-NEXT:    [[LOCAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[LOCAL]] to ptr
+// CHECK-NEXT:    call void @_Z8make_bigv(ptr addrspace(5) dead_on_unwind 
writable sret([[STRUCT_BIG]]) align 4 [[TMP]]) #[[ATTR3]]
+// CHECK-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 4 
[[LOCAL_ASCAST]], ptr addrspace(5) align 4 [[TMP]], i64 128, i1 false)
+// CHECK-NEXT:    ret void
+//
+__attribute__((device)) void local_test() {
+  Big local = make_big();
+}

>From c3d2a5a84ee56b605289b00e527ccce77b7e32da Mon Sep 17 00:00:00 2001
From: vigneshwar jayakumar <[email protected]>
Date: Thu, 5 Mar 2026 12:28:37 -0600
Subject: [PATCH 7/8] remove pointer casts

---
 clang/lib/CodeGen/CGExprAgg.cpp               | 21 +++++++++++++------
 .../CodeGenHIP/placement-new-addrspace.hip    |  5 ++---
 2 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
index efcf570cdda10..b66b540357a39 100644
--- a/clang/lib/CodeGen/CGExprAgg.cpp
+++ b/clang/lib/CodeGen/CGExprAgg.cpp
@@ -289,12 +289,21 @@ void AggExprEmitter::withReturnValueSlot(
   //
   // We also need a temporary if the destination is in a different address 
space
   // from the alloca AS, to avoid an invalid addrspacecast on the sret pointer.
-  bool UseTemp =
-      Dest.isPotentiallyAliased() || Dest.requiresGCollection() ||
-      (RequiresDestruction && Dest.isIgnored()) ||
-      (!Dest.isIgnored() && Dest.getAddress().getAddressSpace() !=
-                                CGF.getContext().getTargetAddressSpace(
-                                    CGF.CGM.getASTAllocaAddressSpace()));
+  // Look through addrspacecasts to avoid unnecessary temps when the
+  // destination is already in the alloca AS.
+  bool DestASMismatch = false;
+  if (!Dest.isIgnored()) {
+    unsigned SRetAS = CGF.getContext().getTargetAddressSpace(
+        CGF.CGM.getASTAllocaAddressSpace());
+    unsigned DestAS = Dest.getAddress()
+                          .getBasePointer()
+                          ->stripPointerCasts()
+                          ->getType()
+                          ->getPointerAddressSpace();
+    DestASMismatch = DestAS != SRetAS;
+  }
+  bool UseTemp = Dest.isPotentiallyAliased() || Dest.requiresGCollection() ||
+                 (RequiresDestruction && Dest.isIgnored()) || DestASMismatch;
 
   Address RetAddr = Address::invalid();
 
diff --git a/clang/test/CodeGenHIP/placement-new-addrspace.hip 
b/clang/test/CodeGenHIP/placement-new-addrspace.hip
index 193660ecfdfe3..48a401baf9a78 100644
--- a/clang/test/CodeGenHIP/placement-new-addrspace.hip
+++ b/clang/test/CodeGenHIP/placement-new-addrspace.hip
@@ -53,10 +53,9 @@ __attribute__((global)) void kernel(Big *out) {
 // CHECK-SAME: ) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[LOCAL:%.*]] = alloca [[STRUCT_BIG:%.*]], align 4, 
addrspace(5)
-// CHECK-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_BIG]], align 4, addrspace(5)
 // CHECK-NEXT:    [[LOCAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[LOCAL]] to ptr
-// CHECK-NEXT:    call void @_Z8make_bigv(ptr addrspace(5) dead_on_unwind 
writable sret([[STRUCT_BIG]]) align 4 [[TMP]]) #[[ATTR3]]
-// CHECK-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 4 
[[LOCAL_ASCAST]], ptr addrspace(5) align 4 [[TMP]], i64 128, i1 false)
+// CHECK-NEXT:    [[LOCAL_ASCAST_ASCAST:%.*]] = addrspacecast ptr 
[[LOCAL_ASCAST]] to ptr addrspace(5)
+// CHECK-NEXT:    call void @_Z8make_bigv(ptr addrspace(5) dead_on_unwind 
writable sret([[STRUCT_BIG]]) align 4 [[LOCAL_ASCAST_ASCAST]]) #[[ATTR3]]
 // CHECK-NEXT:    ret void
 //
 __attribute__((device)) void local_test() {

>From 2e409d2aa819d463b7cd1429d27d661a8647c4c1 Mon Sep 17 00:00:00 2001
From: vigneshwar jayakumar <[email protected]>
Date: Thu, 5 Mar 2026 21:07:18 -0600
Subject: [PATCH 8/8] cast RetAddr

---
 clang/lib/CodeGen/CGExprAgg.cpp | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
index b66b540357a39..0c81fbe74432c 100644
--- a/clang/lib/CodeGen/CGExprAgg.cpp
+++ b/clang/lib/CodeGen/CGExprAgg.cpp
@@ -291,10 +291,10 @@ void AggExprEmitter::withReturnValueSlot(
   // from the alloca AS, to avoid an invalid addrspacecast on the sret pointer.
   // Look through addrspacecasts to avoid unnecessary temps when the
   // destination is already in the alloca AS.
+  unsigned SRetAS = CGF.getContext().getTargetAddressSpace(
+      CGF.CGM.getASTAllocaAddressSpace());
   bool DestASMismatch = false;
   if (!Dest.isIgnored()) {
-    unsigned SRetAS = CGF.getContext().getTargetAddressSpace(
-        CGF.CGM.getASTAllocaAddressSpace());
     unsigned DestAS = Dest.getAddress()
                           .getBasePointer()
                           ->stripPointerCasts()
@@ -311,6 +311,13 @@ void AggExprEmitter::withReturnValueSlot(
   llvm::IntrinsicInst *LifetimeStartInst = nullptr;
   if (!UseTemp) {
     RetAddr = Dest.getAddress();
+    if (RetAddr.isValid() && RetAddr.getAddressSpace() != SRetAS) {
+      llvm::Type *SRetPtrTy =
+          llvm::PointerType::get(CGF.getLLVMContext(), SRetAS);
+      RetAddr = RetAddr.withPointer(
+          CGF.performAddrSpaceCast(RetAddr.getBasePointer(), SRetPtrTy),
+          RetAddr.isKnownNonNull());
+    }
   } else {
     RetAddr = CGF.CreateMemTempWithoutCast(RetTy, "tmp");
     if (CGF.EmitLifetimeStart(RetAddr.getBasePointer())) {

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to