[clang] [llvm] [AMDGPU] Extend __builtin_amdgcn_sched_group_barrier to support rules. (PR #85304)
jrbyrnes wrote:

> We should spend more energy making the scheduler sensible by default, instead
> of creating all of this complexity.

I would also prefer a more sensible default scheduler, but the driving use case for this is global scheduling. The scheduler is doing inefficient things because it is unaware of loop-carried dependencies. A generalized solution is not feasible given the timeline for that feature. We could try adding some sort of ad-hoc heuristic to the scheduler for cases like this, but I don't see how that would reduce complexity relative to this approach, and it would likely not produce the results users expect.

https://github.com/llvm/llvm-project/pull/85304
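For readers without the offline context, a minimal sketch of the kind of kernel being described -- the loop below is hypothetical, not the user's actual workload. The point is that the loop-carried dependence on acc is invisible to a per-region scheduler, so it cannot overlap iteration i+1's loads with iteration i's math on its own:

  // Hypothetical OpenCL kernel with a loop-carried dependence.
  kernel void loop_carried(global float *out, global const float *in, int n)
  {
    float acc = 0.f;
    for (int i = 0; i < n; ++i) {
      float v = in[i];    // this iteration's load
      acc = acc * v + v;  // depends on acc from the previous iteration
      // sched_group_barrier rules would be placed here to pin a pipeline
      // that interleaves the next iteration's loads with this math.
    }
    *out = acc;
  }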
[clang] [llvm] [AMDGPU] Extend __builtin_amdgcn_sched_group_barrier to support rules. (PR #85304)
@@ -437,16 +437,18 @@ void test_sched_group_barrier()
 }

 // CHECK-LABEL: @test_sched_group_barrier_rule
-// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 0, i32 1, i32 2, i32 0)
-// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 1, i32 2, i32 4, i32 0)
-// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 4, i32 8, i32 16, i32 100)
-// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 15, i32 1, i32 -1, i32 -100)
+// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 0, i32 1, i32 2, i64 1)
+// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 1, i32 2, i32 4, i64 1)
+// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 1, i32 2, i32 4, i64 -9223372036854775808)
+// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 2, i32 4, i32 6, i64 255)
+// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 2, i32 4, i32 6, i64 1)
 void test_sched_group_barrier_rule()
 {
   __builtin_amdgcn_sched_group_barrier(0, 1, 2, 0);
   __builtin_amdgcn_sched_group_barrier(1, 2, 4, 0);
-  __builtin_amdgcn_sched_group_barrier(4, 8, 16, 100);
-  __builtin_amdgcn_sched_group_barrier(15, 1, -1, -100);
+  __builtin_amdgcn_sched_group_barrier(1, 2, 4, 63);
+  __builtin_amdgcn_sched_group_barrier(2, 4, 6, 0, 1, 2, 3, 4, 5, 6, 7);

jrbyrnes wrote:

Do you prefer having the latest iteration, wherein users provide a mask instead of the variadic arguments?

https://github.com/llvm/llvm-project/pull/85304
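For reviewers mapping the new CHECK lines back to the builtin arguments: the fourth-and-later arguments appear to be rule IDs, each selecting one bit of the i64 mask. A sketch of that encoding, reconstructed from the checks above (not the actual implementation):

  #include <stdint.h>

  // rule 0      -> 1ull << 0  = 1
  // rule 63     -> 1ull << 63 = 0x8000000000000000, printed as the signed
  //                i64 value -9223372036854775808 in the CHECK line
  // rules 0..7  -> 0xFF = 255
  uint64_t encode_rules(const int *ids, int n) {
    uint64_t mask = 0;
    for (int i = 0; i < n; ++i)
      mask |= 1ull << ids[i];
    return mask;
  }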
[clang] [llvm] [AMDGPU] Extend __builtin_amdgcn_sched_group_barrier to support rules. (PR #85304)
https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/85304 >From 04dc59ff7757dea18e2202d1cbff1d675885fdae Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Tue, 12 Mar 2024 10:22:24 -0700 Subject: [PATCH 1/4] [AMDGPU] Extend __builtin_amdgcn_sched_group_barrier to support rules. Change-Id: Id8460dc42f41575760793c0fc70e0bc0aecc0d5e --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 2 +- clang/lib/CodeGen/CGBuiltin.cpp | 17 +++ clang/test/CodeGenOpenCL/builtins-amdgcn.cl | 14 +++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 15 ++- llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 112 -- llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 14 +++ llvm/lib/Target/AMDGPU/SIInstructions.td | 16 +++ llvm/lib/Target/AMDGPU/SIPostRABundler.cpp| 3 +- .../AMDGPU/llvm.amdgcn.sched.group.barrier.ll | 25 9 files changed, 202 insertions(+), 16 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 61ec8b79bf054d..f7b6a4610bd80a 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -63,7 +63,7 @@ BUILTIN(__builtin_amdgcn_s_sendmsghalt, "vIiUi", "n") BUILTIN(__builtin_amdgcn_s_barrier, "v", "n") BUILTIN(__builtin_amdgcn_wave_barrier, "v", "n") BUILTIN(__builtin_amdgcn_sched_barrier, "vIi", "n") -BUILTIN(__builtin_amdgcn_sched_group_barrier, "vIiIiIi", "n") +BUILTIN(__builtin_amdgcn_sched_group_barrier, "vIiIiIi.", "n") BUILTIN(__builtin_amdgcn_iglp_opt, "vIi", "n") BUILTIN(__builtin_amdgcn_s_dcache_inv, "v", "n") BUILTIN(__builtin_amdgcn_buffer_wbinvl1, "v", "n") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 528a13fb275124..4bf71c7535db63 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18761,6 +18761,23 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_grid_size_z: return EmitAMDGPUGridSize(*this, 2); + // scheduling builtins + case AMDGPU::BI__builtin_amdgcn_sched_group_barrier: { +return E->getNumArgs() == 3 + ? 
Builder.CreateCall( + CGM.getIntrinsic(Intrinsic::amdgcn_sched_group_barrier), + {EmitScalarExpr(E->getArg(0)), + EmitScalarExpr(E->getArg(1)), + EmitScalarExpr(E->getArg(2))}) + : Builder.CreateCall( + CGM.getIntrinsic( + Intrinsic::amdgcn_sched_group_barrier_rule), + {EmitScalarExpr(E->getArg(0)), + EmitScalarExpr(E->getArg(1)), + EmitScalarExpr(E->getArg(2)), + EmitScalarExpr(E->getArg(3))}); + } + // r600 intrinsics case AMDGPU::BI__builtin_r600_recipsqrt_ieee: case AMDGPU::BI__builtin_r600_recipsqrt_ieeef: diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl index 8a4533633706b2..e28e0a6987484b 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -436,6 +436,20 @@ void test_sched_group_barrier() __builtin_amdgcn_sched_group_barrier(15, 1, -1); } +// CHECK-LABEL: @test_sched_group_barrier_rule +// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 0, i32 1, i32 2, i32 0) +// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 1, i32 2, i32 4, i32 0) +// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 4, i32 8, i32 16, i32 100) +// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 15, i32 1, i32 -1, i32 -100) +void test_sched_group_barrier_rule() +{ + __builtin_amdgcn_sched_group_barrier(0, 1, 2, 0); + __builtin_amdgcn_sched_group_barrier(1, 2, 4, 0); + __builtin_amdgcn_sched_group_barrier(4, 8, 16, 100); + __builtin_amdgcn_sched_group_barrier(15, 1, -1, -100); +} + + // CHECK-LABEL: @test_iglp_opt // CHECK: call void @llvm.amdgcn.iglp.opt(i32 0) // CHECK: call void @llvm.amdgcn.iglp.opt(i32 1) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 051e603c0819d2..68fe42a8f04d21 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -297,10 +297,17 @@ def int_amdgcn_sched_barrier : ClangBuiltin<"__builtin_amdgcn_sched_barrier">, // matching instructions that will be associated with this sched_group_barrier. // The third parameter is an identifier which is used to describe what other // sched_group_barriers should be synchronized with. -def int_amdgcn_sched_group_barrier : ClangBuiltin<"__builtin_amdgcn_sched_group_barrier">, - Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [ImmArg>, ImmArg>, ImmArg>, IntrNoMem, IntrHasSideEffects, - IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; +multiclass SCHED_GR
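Per the CGBuiltin.cpp hunk above, the builtin's arity selects the intrinsic. In this first revision the fourth argument is passed through as an i32 rule ID (later revisions fold trailing arguments into an i64 mask). A usage sketch with made-up argument values:

  // 3 arguments: existing behavior, lowers to the existing intrinsic.
  __builtin_amdgcn_sched_group_barrier(8, 4, 0);
  // call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 4, i32 0)

  // 4 arguments: lowers to the new .rule intrinsic.
  __builtin_amdgcn_sched_group_barrier(8, 4, 0, 1);
  // call void @llvm.amdgcn.sched.group.barrier.rule(i32 8, i32 4, i32 0, i32 1)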
[clang] [llvm] [AMDGPU] Extend __builtin_amdgcn_sched_group_barrier to support rules. (PR #85304)
jrbyrnes wrote:

Updated the PR as discussed offline.

The variadic builtin arguments are now supported by combining them into a mask for the intrinsic. This implies a limit of 64 rules, but we can work around that by adding a new intrinsic with two masks (to support rules 65-128), and so on.

For now, the rules in this PR behave as they do in the existing code (that is, they are additional inclusion criteria). Any changes to this will be addressed in a future PR.

https://github.com/llvm/llvm-project/pull/85304
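A sketch of what the described lowering could look like in CGBuiltin.cpp -- structure and names are illustrative (and EvaluateKnownConstInt assumes the rule arguments are constants), not the actual patch:

  case AMDGPU::BI__builtin_amdgcn_sched_group_barrier: {
    if (E->getNumArgs() == 3)
      return Builder.CreateCall(
          CGM.getIntrinsic(Intrinsic::amdgcn_sched_group_barrier),
          {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)),
           EmitScalarExpr(E->getArg(2))});
    // Fold trailing variadic rule IDs (args 3..N) into one i64 mask;
    // this is where the 64-rule limit comes from.
    uint64_t Mask = 0;
    for (unsigned I = 3, N = E->getNumArgs(); I != N; ++I)
      Mask |= uint64_t(1) << E->getArg(I)
                                 ->EvaluateKnownConstInt(getContext())
                                 .getZExtValue();
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::amdgcn_sched_group_barrier_rule),
        {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)),
         EmitScalarExpr(E->getArg(2)), Builder.getInt64(Mask)});
  }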
[clang] [llvm] [AMDGPU] Extend __builtin_amdgcn_sched_group_barrier to support rules. (PR #85304)
https://github.com/jrbyrnes ready_for_review https://github.com/llvm/llvm-project/pull/85304
[clang] [llvm] [AMDGPU] Extend __builtin_amdgcn_sched_group_barrier to support rules. (PR #85304)
https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/85304 >From 04dc59ff7757dea18e2202d1cbff1d675885fdae Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Tue, 12 Mar 2024 10:22:24 -0700 Subject: [PATCH 1/2] [AMDGPU] Extend __builtin_amdgcn_sched_group_barrier to support rules. Change-Id: Id8460dc42f41575760793c0fc70e0bc0aecc0d5e --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 2 +- clang/lib/CodeGen/CGBuiltin.cpp | 17 +++ clang/test/CodeGenOpenCL/builtins-amdgcn.cl | 14 +++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 15 ++- llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 112 -- llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 14 +++ llvm/lib/Target/AMDGPU/SIInstructions.td | 16 +++ llvm/lib/Target/AMDGPU/SIPostRABundler.cpp| 3 +- .../AMDGPU/llvm.amdgcn.sched.group.barrier.ll | 25 9 files changed, 202 insertions(+), 16 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 61ec8b79bf054d..f7b6a4610bd80a 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -63,7 +63,7 @@ BUILTIN(__builtin_amdgcn_s_sendmsghalt, "vIiUi", "n") BUILTIN(__builtin_amdgcn_s_barrier, "v", "n") BUILTIN(__builtin_amdgcn_wave_barrier, "v", "n") BUILTIN(__builtin_amdgcn_sched_barrier, "vIi", "n") -BUILTIN(__builtin_amdgcn_sched_group_barrier, "vIiIiIi", "n") +BUILTIN(__builtin_amdgcn_sched_group_barrier, "vIiIiIi.", "n") BUILTIN(__builtin_amdgcn_iglp_opt, "vIi", "n") BUILTIN(__builtin_amdgcn_s_dcache_inv, "v", "n") BUILTIN(__builtin_amdgcn_buffer_wbinvl1, "v", "n") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 528a13fb275124..4bf71c7535db63 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18761,6 +18761,23 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_grid_size_z: return EmitAMDGPUGridSize(*this, 2); + // scheduling builtins + case AMDGPU::BI__builtin_amdgcn_sched_group_barrier: { +return E->getNumArgs() == 3 + ? 
Builder.CreateCall( + CGM.getIntrinsic(Intrinsic::amdgcn_sched_group_barrier), + {EmitScalarExpr(E->getArg(0)), + EmitScalarExpr(E->getArg(1)), + EmitScalarExpr(E->getArg(2))}) + : Builder.CreateCall( + CGM.getIntrinsic( + Intrinsic::amdgcn_sched_group_barrier_rule), + {EmitScalarExpr(E->getArg(0)), + EmitScalarExpr(E->getArg(1)), + EmitScalarExpr(E->getArg(2)), + EmitScalarExpr(E->getArg(3))}); + } + // r600 intrinsics case AMDGPU::BI__builtin_r600_recipsqrt_ieee: case AMDGPU::BI__builtin_r600_recipsqrt_ieeef: diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl index 8a4533633706b2..e28e0a6987484b 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -436,6 +436,20 @@ void test_sched_group_barrier() __builtin_amdgcn_sched_group_barrier(15, 1, -1); } +// CHECK-LABEL: @test_sched_group_barrier_rule +// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 0, i32 1, i32 2, i32 0) +// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 1, i32 2, i32 4, i32 0) +// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 4, i32 8, i32 16, i32 100) +// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 15, i32 1, i32 -1, i32 -100) +void test_sched_group_barrier_rule() +{ + __builtin_amdgcn_sched_group_barrier(0, 1, 2, 0); + __builtin_amdgcn_sched_group_barrier(1, 2, 4, 0); + __builtin_amdgcn_sched_group_barrier(4, 8, 16, 100); + __builtin_amdgcn_sched_group_barrier(15, 1, -1, -100); +} + + // CHECK-LABEL: @test_iglp_opt // CHECK: call void @llvm.amdgcn.iglp.opt(i32 0) // CHECK: call void @llvm.amdgcn.iglp.opt(i32 1) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 051e603c0819d2..68fe42a8f04d21 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -297,10 +297,17 @@ def int_amdgcn_sched_barrier : ClangBuiltin<"__builtin_amdgcn_sched_barrier">, // matching instructions that will be associated with this sched_group_barrier. // The third parameter is an identifier which is used to describe what other // sched_group_barriers should be synchronized with. -def int_amdgcn_sched_group_barrier : ClangBuiltin<"__builtin_amdgcn_sched_group_barrier">, - Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [ImmArg>, ImmArg>, ImmArg>, IntrNoMem, IntrHasSideEffects, - IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; +multiclass SCHED_GR
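The bulk of the change sits in AMDGPUIGroupLP.cpp (112 lines in the diffstat above). Conceptually, a rule is just an extra inclusion predicate consulted when a SchedGroup tests a candidate instruction -- a sketch with hypothetical names, not the patch itself:

  // Hypothetical: rules narrow the existing SchedGroupMask criteria.
  bool SchedGroup::canAddMI(const MachineInstr &MI) const {
    if (!matchesMask(MI))           // existing instruction-class check
      return false;
    for (const auto &Rule : Rules)  // additional inclusion criteria
      if (!Rule->apply(MI))
        return false;
    return true;
  }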
[clang] [llvm] [AMDGPU]: Add and codegen sched_group_barrier_inst (PR #78775)
https://github.com/jrbyrnes closed https://github.com/llvm/llvm-project/pull/78775
[clang] [llvm] [AMDGPU] Extend __builtin_amdgcn_sched_group_barrier to support rules. (PR #85304)
jrbyrnes wrote:

Supersedes https://github.com/llvm/llvm-project/pull/78775

https://github.com/llvm/llvm-project/pull/85304
[clang] [llvm] [AMDGPU] Extend __builtin_amdgcn_sched_group_barrier to support rules. (PR #85304)
https://github.com/jrbyrnes edited https://github.com/llvm/llvm-project/pull/85304
[clang] [llvm] [AMDGPU] Extend __builtin_amdgcn_sched_group_barrier to support rules. (PR #85304)
https://github.com/jrbyrnes created https://github.com/llvm/llvm-project/pull/85304 I am still working with the user to define the actual rules, so it is still a WIP. However, the current version contains the main machinery of the feature. This helps bridge the gap between sched_group_barrier and iglp_opt, enabling users (with compiler support) more ability to create the pipelines they want. In particular, this is aimed at helping control scheduling in blocks with loop-carried dependencies. Since this is a global scheduling problem, there is no straightforward way to tune the scheduler against these blocks. >From 04dc59ff7757dea18e2202d1cbff1d675885fdae Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Tue, 12 Mar 2024 10:22:24 -0700 Subject: [PATCH] [AMDGPU] Extend __builtin_amdgcn_sched_group_barrier to support rules. Change-Id: Id8460dc42f41575760793c0fc70e0bc0aecc0d5e --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 2 +- clang/lib/CodeGen/CGBuiltin.cpp | 17 +++ clang/test/CodeGenOpenCL/builtins-amdgcn.cl | 14 +++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 15 ++- llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 112 -- llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 14 +++ llvm/lib/Target/AMDGPU/SIInstructions.td | 16 +++ llvm/lib/Target/AMDGPU/SIPostRABundler.cpp| 3 +- .../AMDGPU/llvm.amdgcn.sched.group.barrier.ll | 25 9 files changed, 202 insertions(+), 16 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 61ec8b79bf054d..f7b6a4610bd80a 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -63,7 +63,7 @@ BUILTIN(__builtin_amdgcn_s_sendmsghalt, "vIiUi", "n") BUILTIN(__builtin_amdgcn_s_barrier, "v", "n") BUILTIN(__builtin_amdgcn_wave_barrier, "v", "n") BUILTIN(__builtin_amdgcn_sched_barrier, "vIi", "n") -BUILTIN(__builtin_amdgcn_sched_group_barrier, "vIiIiIi", "n") +BUILTIN(__builtin_amdgcn_sched_group_barrier, "vIiIiIi.", "n") BUILTIN(__builtin_amdgcn_iglp_opt, "vIi", "n") BUILTIN(__builtin_amdgcn_s_dcache_inv, "v", "n") BUILTIN(__builtin_amdgcn_buffer_wbinvl1, "v", "n") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 528a13fb275124..4bf71c7535db63 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18761,6 +18761,23 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_grid_size_z: return EmitAMDGPUGridSize(*this, 2); + // scheduling builtins + case AMDGPU::BI__builtin_amdgcn_sched_group_barrier: { +return E->getNumArgs() == 3 + ? 
Builder.CreateCall( + CGM.getIntrinsic(Intrinsic::amdgcn_sched_group_barrier), + {EmitScalarExpr(E->getArg(0)), + EmitScalarExpr(E->getArg(1)), + EmitScalarExpr(E->getArg(2))}) + : Builder.CreateCall( + CGM.getIntrinsic( + Intrinsic::amdgcn_sched_group_barrier_rule), + {EmitScalarExpr(E->getArg(0)), + EmitScalarExpr(E->getArg(1)), + EmitScalarExpr(E->getArg(2)), + EmitScalarExpr(E->getArg(3))}); + } + // r600 intrinsics case AMDGPU::BI__builtin_r600_recipsqrt_ieee: case AMDGPU::BI__builtin_r600_recipsqrt_ieeef: diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl index 8a4533633706b2..e28e0a6987484b 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -436,6 +436,20 @@ void test_sched_group_barrier() __builtin_amdgcn_sched_group_barrier(15, 1, -1); } +// CHECK-LABEL: @test_sched_group_barrier_rule +// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 0, i32 1, i32 2, i32 0) +// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 1, i32 2, i32 4, i32 0) +// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 4, i32 8, i32 16, i32 100) +// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 15, i32 1, i32 -1, i32 -100) +void test_sched_group_barrier_rule() +{ + __builtin_amdgcn_sched_group_barrier(0, 1, 2, 0); + __builtin_amdgcn_sched_group_barrier(1, 2, 4, 0); + __builtin_amdgcn_sched_group_barrier(4, 8, 16, 100); + __builtin_amdgcn_sched_group_barrier(15, 1, -1, -100); +} + + // CHECK-LABEL: @test_iglp_opt // CHECK: call void @llvm.amdgcn.iglp.opt(i32 0) // CHECK: call void @llvm.amdgcn.iglp.opt(i32 1) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 051e603c0819d2..68fe42a8f04d21 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -297,10 +297,17 @@ def int_amdgcn_sched_barrier : ClangBuiltin<"__builtin_am
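For contrast, the existing three-argument form (SchedGroupMask, group size, sync ID) can already pin pipelines by instruction class; the rules extend the inclusion criteria beyond those coarse masks. A typical existing pipeline, using the mask bits documented for sched_group_barrier (0x8 = MFMA, 0x20 = VMEM read -- values quoted from memory, so double-check against AMDGPUUsage):

  // Interleave 1 VMEM read with 4 MFMAs, twice, within sync group 0.
  __builtin_amdgcn_sched_group_barrier(0x20, 1, 0);
  __builtin_amdgcn_sched_group_barrier(0x8, 4, 0);
  __builtin_amdgcn_sched_group_barrier(0x20, 1, 0);
  __builtin_amdgcn_sched_group_barrier(0x8, 4, 0);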
[clang] [llvm] [AMDGPU]: Add and codegen sched_group_barrier_inst (PR #78775)
https://github.com/jrbyrnes created https://github.com/llvm/llvm-project/pull/78775 As stated, this simply adds and codegens the builtin/intrinsic. A subsequent patch will interface it with IGroupLP. The idea is to give the users more expression by allowing them to create schedgroups which have an inclusion mechanism that compares the char * argument of the builtin to the name of the instruction -- with the argument being handled as a prefix (again, this will be implemented in subsequent patch). There are some peculiarities with handling the char *, so I've created this as a separate review. In particular, I wasn't quite sure the best way to provide the metadata to the MIR passes -- open to ideas. >From 6687ddf4ff756bf15c8e7204e23491322c5b6d8c Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Fri, 19 Jan 2024 11:59:57 -0800 Subject: [PATCH] [AMDGPU]: Add and codegen sched_group_barrier_inst Change-Id: I920b3787a9a2c9f65b02d3d897bfe89573a97e27 --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 1 + clang/lib/CodeGen/CGBuiltin.cpp | 16 + clang/test/CodeGenOpenCL/builtins-amdgcn.cl | 13 +++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 11 ++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 1 + llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 2 ++ llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 15 .../lib/Target/AMDGPU/AMDGPUMachineFunction.h | 25 ++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 18 ++ llvm/lib/Target/AMDGPU/SIInstructions.td | 18 ++ .../llvm.amdgcn.sched.group.barrier.inst.ll | 34 +++ 11 files changed, 154 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.inst.ll diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index e562ef04a30194..bd540bebd37319 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -64,6 +64,7 @@ BUILTIN(__builtin_amdgcn_s_barrier, "v", "n") BUILTIN(__builtin_amdgcn_wave_barrier, "v", "n") BUILTIN(__builtin_amdgcn_sched_barrier, "vIi", "n") BUILTIN(__builtin_amdgcn_sched_group_barrier, "vIiIiIi", "n") +BUILTIN(__builtin_amdgcn_sched_group_barrier_inst, "vcC*IiIi", "n") BUILTIN(__builtin_amdgcn_iglp_opt, "vIi", "n") BUILTIN(__builtin_amdgcn_s_dcache_inv, "v", "n") BUILTIN(__builtin_amdgcn_buffer_wbinvl1, "v", "n") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 1ed35befe1361f..efcb0e80a4eb5c 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18149,6 +18149,22 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, CGM.getIntrinsic(Intrinsic::amdgcn_s_sendmsg_rtn, {ResultType}); return Builder.CreateCall(F, {Arg}); } + case AMDGPU::BI__builtin_amdgcn_sched_group_barrier_inst: { +StringRef InstrStr; +llvm::getConstantStringInfo(EmitScalarExpr(E->getArg(0)), InstrStr); + +llvm::MDBuilder MDHelper(getLLVMContext()); + +MDNode *InfoTuple = +MDTuple::get(getLLVMContext(), {MDHelper.createString(InstrStr)}); +auto MDV = MetadataAsValue::get(getLLVMContext(), InfoTuple); + +Function *F = +CGM.getIntrinsic(Intrinsic::amdgcn_sched_group_barrier_inst, {}); +llvm::Value *Src1 = EmitScalarExpr(E->getArg(1)); +llvm::Value *Src2 = EmitScalarExpr(E->getArg(2)); +return Builder.CreateCall(F, {MDV, Src1, Src2}); + } default: return nullptr; } diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl index 0bc9a54682d3e3..d43a47746cf0df 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ 
b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -436,6 +436,19 @@ void test_sched_group_barrier() __builtin_amdgcn_sched_group_barrier(15, 1, -1); } +// CHECK-LABEL: @test_sched_group_barrier_inst +// CHECK: call void @llvm.amdgcn.sched.group.barrier.inst(metadata !16, i32 1, i32 2) +// CHECK: call void @llvm.amdgcn.sched.group.barrier.inst(metadata !17, i32 3, i32 1) +// CHECK: call void @llvm.amdgcn.sched.group.barrier.inst(metadata !16, i32 1000, i32 -1) +// CHECK: call void @llvm.amdgcn.sched.group.barrier.inst(metadata !18, i32 1, i32 1) +void test_sched_group_barrier_inst() +{ + __builtin_amdgcn_sched_group_barrier_inst("ds_r",1,2); + __builtin_amdgcn_sched_group_barrier_inst("v_cvt",3,1); + __builtin_amdgcn_sched_group_barrier_inst("ds_r",1000,-1); + __builtin_amdgcn_sched_group_barrier_inst("1",1,1); +} + // CHECK-LABEL: @test_iglp_opt // CHECK: call void @llvm.amdgcn.iglp.opt(i32 0) // CHECK: call void @llvm.amdgcn.iglp.opt(i32 1) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index e5596258847f9f..fd8b4581d97c8c 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -302,6 +302,
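On the MIR side, one way the prefix matching described above could look once interfaced with IGroupLP -- a sketch only, since that hook-up is explicitly deferred to the follow-up patch:

  // Hypothetical matcher: treat the builtin's string argument as a prefix
  // of the instruction name. Note MCInstrInfo::getName() returns TableGen
  // def names (e.g. "DS_READ2_B64"), so matching a user-facing prefix like
  // "ds_r" likely needs case normalization along these lines.
  static bool matchesInstRule(const SIInstrInfo *TII, const MachineInstr &MI,
                              StringRef Prefix) {
    std::string Name = StringRef(TII->getName(MI.getOpcode())).lower();
    return StringRef(Name).starts_with(Prefix);
  }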
[libclc] [AMDGPU][MachineScheduler] Alternative way to control excess RP. (PR #68004)
https://github.com/jrbyrnes commented:

Just have a few questions about implementation details. At a higher level, it seems like we are trading one heuristic for another w.r.t. flagging regions as ExcessRP, so I'm curious about the relative performance.

https://github.com/llvm/llvm-project/pull/68004
[libclc] [AMDGPU][MachineScheduler] Alternative way to control excess RP. (PR #68004)
@@ -894,10 +894,22 @@ void GCNSchedStage::setupNewBlock() {

 void GCNSchedStage::finalizeGCNRegion() {
   DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
-  DAG.RescheduleRegions[RegionIdx] = false;

jrbyrnes wrote:

Why was this removed?

https://github.com/llvm/llvm-project/pull/68004
[libclc] [AMDGPU][MachineScheduler] Alternative way to control excess RP. (PR #68004)
@@ -959,16 +970,6 @@ void GCNSchedStage::checkScheduling() {
                       << DAG.MinOccupancy << ".\n");
   }

-  unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
-  unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
-  if (PressureAfter.getVGPRNum(false) > MaxVGPRs ||
-      PressureAfter.getAGPRNum() > MaxVGPRs ||
-      PressureAfter.getSGPRNum() > MaxSGPRs) {
-    DAG.RescheduleRegions[RegionIdx] = true;

jrbyrnes wrote:

Why do we drop the maxNumVGPRs/maxNumSGPRs checks? These are wavesPerEU-aware.

https://github.com/llvm/llvm-project/pull/68004
[clang-tools-extra] [AMDGPU][MachineScheduler] Alternative way to control excess RP. (PR #68004)
@@ -1117,16 +1118,23 @@ bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {

 bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
   // If RP is not reduced in the unclustered reschedule stage, revert to the
   // old schedule.
-  if ((WavesAfter <= PressureBefore.getOccupancy(ST) &&
-       mayCauseSpilling(WavesAfter)) ||
-      GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
-    LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
-    return true;
-  }
+  if (DAG.RegionsWithExcessRP[RegionIdx]) {

jrbyrnes wrote:

Shouldn't we still revert if occupancy has dropped (i.e., keep the GCNSchedStage::shouldRevertScheduling check)?

https://github.com/llvm/llvm-project/pull/68004
[clang] [AMDGPU][MachineScheduler] Alternative way to control excess RP. (PR #68004)
@@ -702,7 +702,7 @@ bool UnclusteredHighRPStage::initGCNSchedStage() {
   if (!GCNSchedStage::initGCNSchedStage())
     return false;

-  if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
+  if (DAG.RegionsWithExcessRP.none())

jrbyrnes wrote:

What about regions that are close to the critical limit?

https://github.com/llvm/llvm-project/pull/68004
[clang] be8a65b - [HIP]: Add -fhip-emit-relocatable to override link job creation for -fno-gpu-rdc
Author: Jeffrey Byrnes Date: 2023-06-29T08:18:28-07:00 New Revision: be8a65b598b3b80f73e862a01c7eaafe84d853a0 URL: https://github.com/llvm/llvm-project/commit/be8a65b598b3b80f73e862a01c7eaafe84d853a0 DIFF: https://github.com/llvm/llvm-project/commit/be8a65b598b3b80f73e862a01c7eaafe84d853a0.diff LOG: [HIP]: Add -fhip-emit-relocatable to override link job creation for -fno-gpu-rdc Differential Revision: https://reviews.llvm.org/D153667 Change-Id: Idcc5c7c25dc350b8dc9a1865fd67982904d06ecd Added: clang/test/Driver/hip-dependent-options.hip Modified: clang/include/clang/Driver/Options.td clang/lib/Driver/Driver.cpp clang/test/Driver/hip-device-compile.hip clang/test/Driver/hip-phases.hip clang/test/Driver/hip-rdc-device-only.hip Removed: diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index a352e20f1f9a0c..dfecea22ea69b0 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1124,6 +1124,10 @@ def gpu_bundle_output : Flag<["--"], "gpu-bundle-output">, Group, HelpText<"Bundle output files of HIP device compilation">; def no_gpu_bundle_output : Flag<["--"], "no-gpu-bundle-output">, Group, HelpText<"Do not bundle output files of HIP device compilation">; +def fhip_emit_relocatable : Flag<["-"], "fhip-emit-relocatable">, Group, + HelpText<"Compile HIP source to relocatable">; +def fno_hip_emit_relocatable : Flag<["-"], "fno-hip-emit-relocatable">, Group, + HelpText<"Do not override toolchain to compile HIP source to relocatable">; def cuid_EQ : Joined<["-"], "cuid=">, Flags<[CC1Option]>, HelpText<"An ID for compilation unit, which should be the same for the same " "compilation unit but diff erent for diff erent compilation units. " diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 1580f092bcde0d..ccdaa5c7eb68bb 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -2946,7 +2946,12 @@ class OffloadingActionBuilder final { CudaActionBuilderBase(Compilation &C, DerivedArgList &Args, const Driver::InputList &Inputs, Action::OffloadKind OFKind) -: DeviceActionBuilder(C, Args, Inputs, OFKind) {} +: DeviceActionBuilder(C, Args, Inputs, OFKind) { + + CompileDeviceOnly = C.getDriver().offloadDeviceOnly(); + Relocatable = Args.hasFlag(options::OPT_fgpu_rdc, + options::OPT_fno_gpu_rdc, /*Default=*/false); +} ActionBuilderReturnCode addDeviceDependences(Action *HostAction) override { // While generating code for CUDA, we only depend on the host input action @@ -3099,9 +3104,6 @@ class OffloadingActionBuilder final { !C.hasOffloadToolChain()) return false; - Relocatable = Args.hasFlag(options::OPT_fgpu_rdc, - options::OPT_fno_gpu_rdc, /*Default=*/false); - const ToolChain *HostTC = C.getSingleOffloadToolChain(); assert(HostTC && "No toolchain for host compilation."); if (HostTC->getTriple().isNVPTX() || @@ -3120,7 +3122,6 @@ class OffloadingActionBuilder final { : C.getSingleOffloadToolChain()); CompileHostOnly = C.getDriver().offloadHostOnly(); - CompileDeviceOnly = C.getDriver().offloadDeviceOnly(); EmitLLVM = Args.getLastArg(options::OPT_emit_llvm); EmitAsm = Args.getLastArg(options::OPT_S); FixedCUID = Args.getLastArgValue(options::OPT_cuid_EQ); @@ -3352,16 +3353,40 @@ class OffloadingActionBuilder final { // only compilation. Bundle other type of output files only if // --gpu-bundle-output is specified for device only compilation. 
std::optional BundleOutput; +std::optional EmitReloc; public: HIPActionBuilder(Compilation &C, DerivedArgList &Args, const Driver::InputList &Inputs) : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_HIP) { + DefaultCudaArch = CudaArch::GFX906; + + if (Args.hasArg(options::OPT_fhip_emit_relocatable, + options::OPT_fno_hip_emit_relocatable)) { +EmitReloc = Args.hasFlag(options::OPT_fhip_emit_relocatable, + options::OPT_fno_hip_emit_relocatable, false); + +if (*EmitReloc) { + if (Relocatable) { +C.getDriver().Diag(diag::err_opt_not_valid_with_opt) +<< "-fhip-emit-relocatable" +<< "-fgpu-rdc"; + } + + if (!CompileDeviceOnly) { +C.getDriver().Diag(diag::err_opt_not_valid_without_opt) +<< "-fhip-emit-relocatable" +<< "--cuda-device-only"; + } +} + } + if (Args.hasArg(options::OPT_gpu_bundle_output, options::OPT_no_gpu_bundle_output))
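Based on the diagnostics added above, the new flag is only meaningful for device-only, non-RDC compiles. Hypothetical invocations illustrating the constraints:

  # Compile HIP device code to a relocatable object (accepted):
  clang -x hip --offload-arch=gfx906 --cuda-device-only -fhip-emit-relocatable -c foo.hip

  # Rejected: conflicts with -fgpu-rdc (err_opt_not_valid_with_opt):
  clang -x hip --offload-arch=gfx906 --cuda-device-only -fgpu-rdc -fhip-emit-relocatable -c foo.hip

  # Rejected: requires --cuda-device-only (err_opt_not_valid_without_opt):
  clang -x hip --offload-arch=gfx906 -fhip-emit-relocatable -c foo.hip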