[llvm-branch-commits] [clang] [llvm] [AMDGPU] Add support for `v_permlane16_swap_b32` on gfx1250 (PR #149518)
https://github.com/arsenm approved this pull request. https://github.com/llvm/llvm-project/pull/149518 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [llvm] [AMDGPU] Add support for `v_permlane16_swap_b32` on gfx1250 (PR #149518)
llvmbot wrote: @llvm/pr-subscribers-clang Author: Shilei Tian (shiltian) Changes Co-authored-by: Mekhanoshin, Stanislav--- Patch is 21.43 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/149518.diff 8 Files Affected: - (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl (+47) - (modified) llvm/lib/Target/AMDGPU/VOP1Instructions.td (+8) - (modified) llvm/lib/Target/AMDGPU/VOPInstructions.td (+10-1) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll (+151-2) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s (+6) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s (+6) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s (+21) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s (+21) ``diff diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index d42e51d04ab9d..4c3f308a6cf75 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -5,6 +5,7 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable typedef unsigned int uint; +typedef unsigned int __attribute__((ext_vector_type(2))) uint2; typedef half __attribute__((ext_vector_type(2))) half2; // CHECK-LABEL: @test_setprio_inc_wg( @@ -368,6 +369,52 @@ void test_cvt_pk_f16_bf8(global half2* out, short a) out[0] = __builtin_amdgcn_cvt_pk_f16_bf8(a); } +// CHECK-LABEL: @test_permlane16_swap( +// CHECK-NEXT: entry: +// CHECK-NEXT:[[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT:[[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT:[[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT:[[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT:[[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr +// CHECK-NEXT:[[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] 
to ptr +// CHECK-NEXT:store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT:store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP0:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP1:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP2:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP0]], i32 [[TMP1]], i1 false, i1 false) +// CHECK-NEXT:[[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0 +// CHECK-NEXT:[[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1 +// CHECK-NEXT:[[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0 +// CHECK-NEXT:[[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1 +// CHECK-NEXT:[[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8 +// CHECK-NEXT:[[TMP8:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP9:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP10:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP8]], i32 [[TMP9]], i1 true, i1 false) +// CHECK-NEXT:[[TMP11:%.*]] = extractvalue { i32, i32 } [[TMP10]], 0 +// CHECK-NEXT:[[TMP12:%.*]] = extractvalue { i32, i32 } [[TMP10]], 1 +// CHECK-NEXT:[[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i64 0 +// CHECK-NEXT:[[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP12]], i64 1 +// CHECK-NEXT:[[TMP15:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store <2 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 8 +// CHECK-NEXT:[[TMP16:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP17:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP18:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP16]], i32 [[TMP17]], i1 false, i1 true) +// 
CHECK-NEXT:[[TMP19:%.*]] = extractvalue { i32, i32 } [[TMP18]], 0 +// CHECK-NEXT:[[TMP20:%.*]] = extractvalue { i32, i32 } [[TMP18]], 1 +// CHECK-NEXT:[[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i64 0 +// CHECK-NEXT:[[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i64 1 +// CHECK-NEXT:[[TMP23:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store <2 x i32> [[TMP22]], ptr addrspace(1) [[TMP23]], align 8 +// CHECK-NEXT:ret void +// +void test_permlane16_swap(global uint2* out, uint old, uint src) { + *out = __builtin_amdgcn_permlane16_swap(old, src, false, false); + *out = __builtin_amdgcn_permlane16_swap(old, src, true, false); + *out = __builtin_amdgcn_permlane16_swap(old, src, fa
[llvm-branch-commits] [clang] [llvm] [AMDGPU] Add support for `v_permlane16_swap_b32` on gfx1250 (PR #149518)
llvmbot wrote: @llvm/pr-subscribers-backend-amdgpu Author: Shilei Tian (shiltian) Changes Co-authored-by: Mekhanoshin, Stanislav--- Patch is 21.43 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/149518.diff 8 Files Affected: - (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl (+47) - (modified) llvm/lib/Target/AMDGPU/VOP1Instructions.td (+8) - (modified) llvm/lib/Target/AMDGPU/VOPInstructions.td (+10-1) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll (+151-2) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s (+6) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s (+6) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s (+21) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s (+21) ``diff diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index d42e51d04ab9d..4c3f308a6cf75 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -5,6 +5,7 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable typedef unsigned int uint; +typedef unsigned int __attribute__((ext_vector_type(2))) uint2; typedef half __attribute__((ext_vector_type(2))) half2; // CHECK-LABEL: @test_setprio_inc_wg( @@ -368,6 +369,52 @@ void test_cvt_pk_f16_bf8(global half2* out, short a) out[0] = __builtin_amdgcn_cvt_pk_f16_bf8(a); } +// CHECK-LABEL: @test_permlane16_swap( +// CHECK-NEXT: entry: +// CHECK-NEXT:[[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT:[[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT:[[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT:[[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT:[[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr +// CHECK-NEXT:[[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRC_ADDR]] to ptr +// CHECK-NEXT:store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT:store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP0:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP1:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP2:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP0]], i32 [[TMP1]], i1 false, i1 false) +// CHECK-NEXT:[[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0 +// CHECK-NEXT:[[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1 +// CHECK-NEXT:[[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0 +// CHECK-NEXT:[[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1 +// CHECK-NEXT:[[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8 +// CHECK-NEXT:[[TMP8:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP9:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP10:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP8]], i32 [[TMP9]], i1 true, i1 false) +// CHECK-NEXT:[[TMP11:%.*]] = extractvalue { i32, i32 } [[TMP10]], 0 +// CHECK-NEXT:[[TMP12:%.*]] = extractvalue { i32, i32 } [[TMP10]], 1 +// CHECK-NEXT:[[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i64 0 +// CHECK-NEXT:[[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP12]], i64 1 +// CHECK-NEXT:[[TMP15:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store <2 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 8 +// CHECK-NEXT:[[TMP16:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP17:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT:[[TMP18:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP16]], i32 [[TMP17]], i1 false, i1 
true) +// CHECK-NEXT:[[TMP19:%.*]] = extractvalue { i32, i32 } [[TMP18]], 0 +// CHECK-NEXT:[[TMP20:%.*]] = extractvalue { i32, i32 } [[TMP18]], 1 +// CHECK-NEXT:[[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i64 0 +// CHECK-NEXT:[[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i64 1 +// CHECK-NEXT:[[TMP23:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store <2 x i32> [[TMP22]], ptr addrspace(1) [[TMP23]], align 8 +// CHECK-NEXT:ret void +// +void test_permlane16_swap(global uint2* out, uint old, uint src) { + *out = __builtin_amdgcn_permlane16_swap(old, src, false, false); + *out = __builtin_amdgcn_permlane16_swap(old, src, true, false); + *out = __builtin_amdgcn_permlane16_swap(old
[llvm-branch-commits] [clang] [llvm] [AMDGPU] Add support for `v_permlane16_swap_b32` on gfx1250 (PR #149518)
shiltian wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/149518?utm_source=stack-comment-downstack-mergeability-warning">on Graphite</a>. > <a href="https://graphite.dev/docs/merge-pull-requests">Learn more</a> * **#149518** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/149518?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> 👈 <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/149518?utm_source=stack-comment-view-in-graphite" target="_blank">(View in Graphite)</a> * **#149450** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/149450?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#149447** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/149447?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * `main` This stack of pull requests is managed by <a href="https://graphite.dev?utm-source=stack-comment">Graphite</a>. Learn more about <a href="https://stacking.dev/?utm_source=stack-comment">stacking</a>. https://github.com/llvm/llvm-project/pull/149518 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [llvm] [AMDGPU] Add support for `v_permlane16_swap_b32` on gfx1250 (PR #149518)
https://github.com/shiltian created
https://github.com/llvm/llvm-project/pull/149518
Co-authored-by: Mekhanoshin, Stanislav
>From c38e27b115c2fe9255eaaa89e9e83a48bc6edb23 Mon Sep 17 00:00:00 2001
From: Shilei Tian
Date: Fri, 18 Jul 2025 10:02:30 -0400
Subject: [PATCH] [AMDGPU] Add support for `v_permlane16_swap_b32` on gfx1250
Co-authored-by: Mekhanoshin, Stanislav
---
.../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl | 47 ++
llvm/lib/Target/AMDGPU/VOP1Instructions.td| 8 +
llvm/lib/Target/AMDGPU/VOPInstructions.td | 11 +-
.../AMDGPU/llvm.amdgcn.permlane16.swap.ll | 153 +-
llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s | 6 +
llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s| 6 +
.../gfx1250_asm_vop3_from_vop1-fake16.s | 21 +++
.../MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s| 21 +++
8 files changed, 270 insertions(+), 3 deletions(-)
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index d42e51d04ab9d..4c3f308a6cf75 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -5,6 +5,7 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
typedef unsigned int uint;
+typedef unsigned int __attribute__((ext_vector_type(2))) uint2;
typedef half __attribute__((ext_vector_type(2))) half2;
// CHECK-LABEL: @test_setprio_inc_wg(
@@ -368,6 +369,52 @@ void test_cvt_pk_f16_bf8(global half2* out, short a)
out[0] = __builtin_amdgcn_cvt_pk_f16_bf8(a);
}
+// CHECK-LABEL: @test_permlane16_swap(
+// CHECK-NEXT: entry:
+// CHECK-NEXT:[[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8,
addrspace(5)
+// CHECK-NEXT:[[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5)
[[OUT_ADDR]] to ptr
+// CHECK-NEXT:[[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5)
[[OLD_ADDR]] to ptr
+// CHECK-NEXT:[[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5)
[[SRC_ADDR]] to ptr
+// CHECK-NEXT:store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]],
align 8
+// CHECK-NEXT:store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT:store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP0:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP1:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP2:%.*]] = call { i32, i32 }
@llvm.amdgcn.permlane16.swap(i32 [[TMP0]], i32 [[TMP1]], i1 false, i1 false)
+// CHECK-NEXT:[[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT:[[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT:[[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]],
i64 0
+// CHECK-NEXT:[[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32
[[TMP4]], i64 1
+// CHECK-NEXT:[[TMP7:%.*]] = load ptr addrspace(1), ptr
[[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8
+// CHECK-NEXT:[[TMP8:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP9:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP10:%.*]] = call { i32, i32 }
@llvm.amdgcn.permlane16.swap(i32 [[TMP8]], i32 [[TMP9]], i1 true, i1 false)
+// CHECK-NEXT:[[TMP11:%.*]] = extractvalue { i32, i32 } [[TMP10]], 0
+// CHECK-NEXT:[[TMP12:%.*]] = extractvalue { i32, i32 } [[TMP10]], 1
+// CHECK-NEXT:[[TMP13:%.*]] = insertelement <2 x i32> poison, i32
[[TMP11]], i64 0
+// CHECK-NEXT:[[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32
[[TMP12]], i64 1
+// CHECK-NEXT:[[TMP15:%.*]] = load ptr addrspace(1), ptr
[[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store <2 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align
8
+// CHECK-NEXT:[[TMP16:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP17:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT:[[TMP18:%.*]] = call { i32, i32 }
@llvm.amdgcn.permlane16.swap(i32 [[TMP16]], i32 [[TMP17]], i1 false, i1 true)
+// CHECK-NEXT:[[TMP19:%.*]] = extractvalue { i32, i32 } [[TMP18]], 0
+// CHECK-NEXT:[[TMP20:%.*]] = extractvalue { i32, i32 } [[TMP18]], 1
+// CHECK-NEXT:[[TMP21:%.*]] = insertelement <2 x i32> poison, i32
[[TMP19]], i64 0
+// CHECK-NEXT:[[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32
[[TMP20]], i64 1
+// CHECK-NEXT:[[TMP23:%.*]] = load ptr addrspace(1), ptr
[[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store <2 x i32> [[TMP22]], ptr addrspace(1) [[TMP23]], align
8
+// CHECK-NEXT:ret void
+//
+void test_permlane16_swap(global uint2* out, uint old, uint src) {
+ *out = __builtin_amdgcn_permlane16_swap(old, src, false, false);
+ *out = __builtin_amdgcn_permlane16_swap(old, src, true, false);
+ *out = __builtin_amdg
