[llvm-branch-commits] [clang] [llvm] [AMDGPU] Add support for `v_permlane16_swap_b32` on gfx1250 (PR #149518)

2025-07-18 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm approved this pull request.


https://github.com/llvm/llvm-project/pull/149518
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [llvm] [AMDGPU] Add support for `v_permlane16_swap_b32` on gfx1250 (PR #149518)

2025-07-18 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-clang

Author: Shilei Tian (shiltian)


Changes

Co-authored-by: Mekhanoshin, Stanislav 

---

Patch is 21.43 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/149518.diff


8 Files Affected:

- (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl (+47) 
- (modified) llvm/lib/Target/AMDGPU/VOP1Instructions.td (+8) 
- (modified) llvm/lib/Target/AMDGPU/VOPInstructions.td (+10-1) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll (+151-2) 
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s (+6) 
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s (+6) 
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s (+21) 
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s (+21) 


```diff
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index d42e51d04ab9d..4c3f308a6cf75 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -5,6 +5,7 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
 typedef unsigned int uint;
+typedef unsigned int __attribute__((ext_vector_type(2))) uint2;
 typedef half __attribute__((ext_vector_type(2))) half2;
 
 // CHECK-LABEL: @test_setprio_inc_wg(
@@ -368,6 +369,52 @@ void test_cvt_pk_f16_bf8(global half2* out, short a)
   out[0] = __builtin_amdgcn_cvt_pk_f16_bf8(a);
 }
 
+// CHECK-LABEL: @test_permlane16_swap(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT_ADDR]] to ptr
+// CHECK-NEXT:[[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OLD_ADDR]] to ptr
+// CHECK-NEXT:[[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRC_ADDR]] to ptr
+// CHECK-NEXT:store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], 
align 8
+// CHECK-NEXT:store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT:store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP0:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP1:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP2:%.*]] = call { i32, i32 } 
@llvm.amdgcn.permlane16.swap(i32 [[TMP0]], i32 [[TMP1]], i1 false, i1 false)
+// CHECK-NEXT:[[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT:[[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT:[[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], 
i64 0
+// CHECK-NEXT:[[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 
[[TMP4]], i64 1
+// CHECK-NEXT:[[TMP7:%.*]] = load ptr addrspace(1), ptr 
[[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8
+// CHECK-NEXT:[[TMP8:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP9:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP10:%.*]] = call { i32, i32 } 
@llvm.amdgcn.permlane16.swap(i32 [[TMP8]], i32 [[TMP9]], i1 true, i1 false)
+// CHECK-NEXT:[[TMP11:%.*]] = extractvalue { i32, i32 } [[TMP10]], 0
+// CHECK-NEXT:[[TMP12:%.*]] = extractvalue { i32, i32 } [[TMP10]], 1
+// CHECK-NEXT:[[TMP13:%.*]] = insertelement <2 x i32> poison, i32 
[[TMP11]], i64 0
+// CHECK-NEXT:[[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 
[[TMP12]], i64 1
+// CHECK-NEXT:[[TMP15:%.*]] = load ptr addrspace(1), ptr 
[[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store <2 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 
8
+// CHECK-NEXT:[[TMP16:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP17:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP18:%.*]] = call { i32, i32 } 
@llvm.amdgcn.permlane16.swap(i32 [[TMP16]], i32 [[TMP17]], i1 false, i1 true)
+// CHECK-NEXT:[[TMP19:%.*]] = extractvalue { i32, i32 } [[TMP18]], 0
+// CHECK-NEXT:[[TMP20:%.*]] = extractvalue { i32, i32 } [[TMP18]], 1
+// CHECK-NEXT:[[TMP21:%.*]] = insertelement <2 x i32> poison, i32 
[[TMP19]], i64 0
+// CHECK-NEXT:[[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 
[[TMP20]], i64 1
+// CHECK-NEXT:[[TMP23:%.*]] = load ptr addrspace(1), ptr 
[[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store <2 x i32> [[TMP22]], ptr addrspace(1) [[TMP23]], align 
8
+// CHECK-NEXT:ret void
+//
+void test_permlane16_swap(global uint2* out, uint old, uint src) {
+  *out = __builtin_amdgcn_permlane16_swap(old, src, false, false);
+  *out = __builtin_amdgcn_permlane16_swap(old, src, true, false);
+  *out = __builtin_amdgcn_permlane16_swap(old, src, fa

[llvm-branch-commits] [clang] [llvm] [AMDGPU] Add support for `v_permlane16_swap_b32` on gfx1250 (PR #149518)

2025-07-18 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Shilei Tian (shiltian)


Changes

Co-authored-by: Mekhanoshin, Stanislav 

---

Patch is 21.43 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/149518.diff


8 Files Affected:

- (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl (+47) 
- (modified) llvm/lib/Target/AMDGPU/VOP1Instructions.td (+8) 
- (modified) llvm/lib/Target/AMDGPU/VOPInstructions.td (+10-1) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll (+151-2) 
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s (+6) 
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s (+6) 
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s (+21) 
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s (+21) 


```diff
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index d42e51d04ab9d..4c3f308a6cf75 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -5,6 +5,7 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
 typedef unsigned int uint;
+typedef unsigned int __attribute__((ext_vector_type(2))) uint2;
 typedef half __attribute__((ext_vector_type(2))) half2;
 
 // CHECK-LABEL: @test_setprio_inc_wg(
@@ -368,6 +369,52 @@ void test_cvt_pk_f16_bf8(global half2* out, short a)
   out[0] = __builtin_amdgcn_cvt_pk_f16_bf8(a);
 }
 
+// CHECK-LABEL: @test_permlane16_swap(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT_ADDR]] to ptr
+// CHECK-NEXT:[[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OLD_ADDR]] to ptr
+// CHECK-NEXT:[[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRC_ADDR]] to ptr
+// CHECK-NEXT:store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], 
align 8
+// CHECK-NEXT:store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT:store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP0:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP1:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP2:%.*]] = call { i32, i32 } 
@llvm.amdgcn.permlane16.swap(i32 [[TMP0]], i32 [[TMP1]], i1 false, i1 false)
+// CHECK-NEXT:[[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT:[[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT:[[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], 
i64 0
+// CHECK-NEXT:[[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 
[[TMP4]], i64 1
+// CHECK-NEXT:[[TMP7:%.*]] = load ptr addrspace(1), ptr 
[[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8
+// CHECK-NEXT:[[TMP8:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP9:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP10:%.*]] = call { i32, i32 } 
@llvm.amdgcn.permlane16.swap(i32 [[TMP8]], i32 [[TMP9]], i1 true, i1 false)
+// CHECK-NEXT:[[TMP11:%.*]] = extractvalue { i32, i32 } [[TMP10]], 0
+// CHECK-NEXT:[[TMP12:%.*]] = extractvalue { i32, i32 } [[TMP10]], 1
+// CHECK-NEXT:[[TMP13:%.*]] = insertelement <2 x i32> poison, i32 
[[TMP11]], i64 0
+// CHECK-NEXT:[[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 
[[TMP12]], i64 1
+// CHECK-NEXT:[[TMP15:%.*]] = load ptr addrspace(1), ptr 
[[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store <2 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 
8
+// CHECK-NEXT:[[TMP16:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP17:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP18:%.*]] = call { i32, i32 } 
@llvm.amdgcn.permlane16.swap(i32 [[TMP16]], i32 [[TMP17]], i1 false, i1 true)
+// CHECK-NEXT:[[TMP19:%.*]] = extractvalue { i32, i32 } [[TMP18]], 0
+// CHECK-NEXT:[[TMP20:%.*]] = extractvalue { i32, i32 } [[TMP18]], 1
+// CHECK-NEXT:[[TMP21:%.*]] = insertelement <2 x i32> poison, i32 
[[TMP19]], i64 0
+// CHECK-NEXT:[[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 
[[TMP20]], i64 1
+// CHECK-NEXT:[[TMP23:%.*]] = load ptr addrspace(1), ptr 
[[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store <2 x i32> [[TMP22]], ptr addrspace(1) [[TMP23]], align 
8
+// CHECK-NEXT:ret void
+//
+void test_permlane16_swap(global uint2* out, uint old, uint src) {
+  *out = __builtin_amdgcn_permlane16_swap(old, src, false, false);
+  *out = __builtin_amdgcn_permlane16_swap(old, src, true, false);
+  *out = __builtin_amdgcn_permlane16_swap(old

[llvm-branch-commits] [clang] [llvm] [AMDGPU] Add support for `v_permlane16_swap_b32` on gfx1250 (PR #149518)

2025-07-18 Thread Shilei Tian via llvm-branch-commits

shiltian wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is 
> open. Once all requirements are satisfied, merge this PR as a stack <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/149518?utm_source=stack-comment-downstack-mergeability-warning">on Graphite</a>.
> <a href="https://graphite.dev/docs/merge-pull-requests">Learn more</a>

* **#149518** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/149518?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> 👈 <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/149518?utm_source=stack-comment-view-in-graphite" target="_blank">(View in Graphite)</a>
* **#149450** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/149450?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#149447** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/149447?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* `main`




This stack of pull requests is managed by <a href="https://graphite.dev?utm-source=stack-comment">Graphite</a>. Learn 
more about <a href="https://stacking.dev/?utm_source=stack-comment">stacking</a>.


https://github.com/llvm/llvm-project/pull/149518
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [llvm] [AMDGPU] Add support for `v_permlane16_swap_b32` on gfx1250 (PR #149518)

2025-07-18 Thread Shilei Tian via llvm-branch-commits

https://github.com/shiltian created 
https://github.com/llvm/llvm-project/pull/149518

Co-authored-by: Mekhanoshin, Stanislav 

From c38e27b115c2fe9255eaaa89e9e83a48bc6edb23 Mon Sep 17 00:00:00 2001
From: Shilei Tian 
Date: Fri, 18 Jul 2025 10:02:30 -0400
Subject: [PATCH] [AMDGPU] Add support for `v_permlane16_swap_b32` on gfx1250

Co-authored-by: Mekhanoshin, Stanislav 
---
 .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl  |  47 ++
 llvm/lib/Target/AMDGPU/VOP1Instructions.td|   8 +
 llvm/lib/Target/AMDGPU/VOPInstructions.td |  11 +-
 .../AMDGPU/llvm.amdgcn.permlane16.swap.ll | 153 +-
 llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s |   6 +
 llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s|   6 +
 .../gfx1250_asm_vop3_from_vop1-fake16.s   |  21 +++
 .../MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s|  21 +++
 8 files changed, 270 insertions(+), 3 deletions(-)

diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index d42e51d04ab9d..4c3f308a6cf75 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -5,6 +5,7 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
 typedef unsigned int uint;
+typedef unsigned int __attribute__((ext_vector_type(2))) uint2;
 typedef half __attribute__((ext_vector_type(2))) half2;
 
 // CHECK-LABEL: @test_setprio_inc_wg(
@@ -368,6 +369,52 @@ void test_cvt_pk_f16_bf8(global half2* out, short a)
   out[0] = __builtin_amdgcn_cvt_pk_f16_bf8(a);
 }
 
+// CHECK-LABEL: @test_permlane16_swap(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT_ADDR]] to ptr
+// CHECK-NEXT:[[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OLD_ADDR]] to ptr
+// CHECK-NEXT:[[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRC_ADDR]] to ptr
+// CHECK-NEXT:store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], 
align 8
+// CHECK-NEXT:store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT:store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP0:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP1:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP2:%.*]] = call { i32, i32 } 
@llvm.amdgcn.permlane16.swap(i32 [[TMP0]], i32 [[TMP1]], i1 false, i1 false)
+// CHECK-NEXT:[[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT:[[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT:[[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], 
i64 0
+// CHECK-NEXT:[[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 
[[TMP4]], i64 1
+// CHECK-NEXT:[[TMP7:%.*]] = load ptr addrspace(1), ptr 
[[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8
+// CHECK-NEXT:[[TMP8:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP9:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP10:%.*]] = call { i32, i32 } 
@llvm.amdgcn.permlane16.swap(i32 [[TMP8]], i32 [[TMP9]], i1 true, i1 false)
+// CHECK-NEXT:[[TMP11:%.*]] = extractvalue { i32, i32 } [[TMP10]], 0
+// CHECK-NEXT:[[TMP12:%.*]] = extractvalue { i32, i32 } [[TMP10]], 1
+// CHECK-NEXT:[[TMP13:%.*]] = insertelement <2 x i32> poison, i32 
[[TMP11]], i64 0
+// CHECK-NEXT:[[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 
[[TMP12]], i64 1
+// CHECK-NEXT:[[TMP15:%.*]] = load ptr addrspace(1), ptr 
[[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store <2 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 
8
+// CHECK-NEXT:[[TMP16:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP17:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP18:%.*]] = call { i32, i32 } 
@llvm.amdgcn.permlane16.swap(i32 [[TMP16]], i32 [[TMP17]], i1 false, i1 true)
+// CHECK-NEXT:[[TMP19:%.*]] = extractvalue { i32, i32 } [[TMP18]], 0
+// CHECK-NEXT:[[TMP20:%.*]] = extractvalue { i32, i32 } [[TMP18]], 1
+// CHECK-NEXT:[[TMP21:%.*]] = insertelement <2 x i32> poison, i32 
[[TMP19]], i64 0
+// CHECK-NEXT:[[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 
[[TMP20]], i64 1
+// CHECK-NEXT:[[TMP23:%.*]] = load ptr addrspace(1), ptr 
[[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store <2 x i32> [[TMP22]], ptr addrspace(1) [[TMP23]], align 
8
+// CHECK-NEXT:ret void
+//
+void test_permlane16_swap(global uint2* out, uint old, uint src) {
+  *out = __builtin_amdgcn_permlane16_swap(old, src, false, false);
+  *out = __builtin_amdgcn_permlane16_swap(old, src, true, false);
+  *out = __builtin_amdg