[clang] [llvm] [AMDGPU] Extend __builtin_amdgcn_sched_group_barrier to support rules. (PR #85304)

2024-05-09 Thread Jeffrey Byrnes via cfe-commits

jrbyrnes wrote:

> We should spend more energy making the scheduler sensible by default, instead
> of creating all of this complexity.

I would also prefer a more sensible default scheduler, but the driving use case 
for this is global scheduling. The scheduler is doing inefficient things because 
it is unaware of loop-carried dependencies. A generalized solution is not 
feasible given the timeline for that feature. We could try adding some sort of 
ad-hoc heuristic to the scheduler for cases like this, but I don't see how that 
would reduce complexity relative to this approach, and it would likely not 
produce the results users expect.
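
For context, the existing builtin already lets users pin a per-region pipeline shape. A minimal sketch (the mask values 0x8 for MFMA and 0x20 for VMEM read follow the sched_group_barrier documentation; the loop body is hypothetical):

    // Interleave one global load with one MFMA, twice per scheduling region,
    // all in sync group 0. Each call creates one scheduling group.
    for (int i = 0; i < n; ++i) {
      // ... MFMAs and global loads with loop-carried dependencies ...
      __builtin_amdgcn_sched_group_barrier(0x20, 1, 0); // 1 VMEM read
      __builtin_amdgcn_sched_group_barrier(0x8, 1, 0);  // 1 MFMA
      __builtin_amdgcn_sched_group_barrier(0x20, 1, 0); // 1 VMEM read
      __builtin_amdgcn_sched_group_barrier(0x8, 1, 0);  // 1 MFMA
    }

The rules added in this PR refine such groups with additional inclusion criteria that the instruction-class mask alone cannot express.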

https://github.com/llvm/llvm-project/pull/85304


[clang] [llvm] [AMDGPU] Extend __builtin_amdgcn_sched_group_barrier to support rules. (PR #85304)

2024-05-01 Thread Jeffrey Byrnes via cfe-commits


@@ -437,16 +437,18 @@ void test_sched_group_barrier()
 }
 
 // CHECK-LABEL: @test_sched_group_barrier_rule
-// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 0, i32 1, i32 2, i32 0)
-// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 1, i32 2, i32 4, i32 0)
-// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 4, i32 8, i32 16, i32 100)
-// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 15, i32 1, i32 -1, i32 -100)
+// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 0, i32 1, i32 2, i64 1)
+// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 1, i32 2, i32 4, i64 1)
+// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 1, i32 2, i32 4, i64 -9223372036854775808)
+// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 2, i32 4, i32 6, i64 255)
+// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 2, i32 4, i32 6, i64 1)
 void test_sched_group_barrier_rule()
 {
   __builtin_amdgcn_sched_group_barrier(0, 1, 2, 0);
   __builtin_amdgcn_sched_group_barrier(1, 2, 4, 0);
-  __builtin_amdgcn_sched_group_barrier(4, 8, 16, 100);
-  __builtin_amdgcn_sched_group_barrier(15, 1, -1, -100);
+  __builtin_amdgcn_sched_group_barrier(1, 2, 4, 63);
+  __builtin_amdgcn_sched_group_barrier(2, 4, 6, 0, 1, 2, 3, 4, 5, 6, 7);

jrbyrnes wrote:

Do you prefer the latest iteration, wherein users provide a mask instead of the 
variadic arguments?
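
For reference, the two surfaces under discussion (the mask variant is hypothetical; in the current iteration the trailing arguments are rule IDs that codegen folds into the intrinsic's i64 mask, so IDs 0-7 become i64 255 in the CHECK lines above):

    // Current variadic form: trailing args are individual rule IDs.
    __builtin_amdgcn_sched_group_barrier(2, 4, 6, 0, 1, 2, 3, 4, 5, 6, 7);
    // Mask alternative (hypothetical surface): pass the folded i64 mask directly.
    __builtin_amdgcn_sched_group_barrier(2, 4, 6, 0xFF);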

https://github.com/llvm/llvm-project/pull/85304


[clang] [llvm] [AMDGPU] Extend __builtin_amdgcn_sched_group_barrier to support rules. (PR #85304)

2024-05-01 Thread Jeffrey Byrnes via cfe-commits

https://github.com/jrbyrnes updated 
https://github.com/llvm/llvm-project/pull/85304

>From 04dc59ff7757dea18e2202d1cbff1d675885fdae Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes 
Date: Tue, 12 Mar 2024 10:22:24 -0700
Subject: [PATCH 1/4] [AMDGPU] Extend __builtin_amdgcn_sched_group_barrier to
 support rules.

Change-Id: Id8460dc42f41575760793c0fc70e0bc0aecc0d5e
---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   2 +-
 clang/lib/CodeGen/CGBuiltin.cpp   |  17 +++
 clang/test/CodeGenOpenCL/builtins-amdgcn.cl   |  14 +++
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  15 ++-
 llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 112 --
 llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp  |  14 +++
 llvm/lib/Target/AMDGPU/SIInstructions.td  |  16 +++
 llvm/lib/Target/AMDGPU/SIPostRABundler.cpp|   3 +-
 .../AMDGPU/llvm.amdgcn.sched.group.barrier.ll |  25 
 9 files changed, 202 insertions(+), 16 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 61ec8b79bf054d..f7b6a4610bd80a 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -63,7 +63,7 @@ BUILTIN(__builtin_amdgcn_s_sendmsghalt, "vIiUi", "n")
 BUILTIN(__builtin_amdgcn_s_barrier, "v", "n")
 BUILTIN(__builtin_amdgcn_wave_barrier, "v", "n")
 BUILTIN(__builtin_amdgcn_sched_barrier, "vIi", "n")
-BUILTIN(__builtin_amdgcn_sched_group_barrier, "vIiIiIi", "n")
+BUILTIN(__builtin_amdgcn_sched_group_barrier, "vIiIiIi.", "n")
 BUILTIN(__builtin_amdgcn_iglp_opt, "vIi", "n")
 BUILTIN(__builtin_amdgcn_s_dcache_inv, "v", "n")
 BUILTIN(__builtin_amdgcn_buffer_wbinvl1, "v", "n")
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 528a13fb275124..4bf71c7535db63 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18761,6 +18761,23 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_grid_size_z:
 return EmitAMDGPUGridSize(*this, 2);
 
+  // scheduling builtins
+  case AMDGPU::BI__builtin_amdgcn_sched_group_barrier: {
+return E->getNumArgs() == 3
+   ? Builder.CreateCall(
+ CGM.getIntrinsic(Intrinsic::amdgcn_sched_group_barrier),
+ {EmitScalarExpr(E->getArg(0)),
+  EmitScalarExpr(E->getArg(1)),
+  EmitScalarExpr(E->getArg(2))})
+   : Builder.CreateCall(
+ CGM.getIntrinsic(
+ Intrinsic::amdgcn_sched_group_barrier_rule),
+ {EmitScalarExpr(E->getArg(0)),
+  EmitScalarExpr(E->getArg(1)),
+  EmitScalarExpr(E->getArg(2)),
+  EmitScalarExpr(E->getArg(3))});
+  }
+
   // r600 intrinsics
   case AMDGPU::BI__builtin_r600_recipsqrt_ieee:
   case AMDGPU::BI__builtin_r600_recipsqrt_ieeef:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index 8a4533633706b2..e28e0a6987484b 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -436,6 +436,20 @@ void test_sched_group_barrier()
   __builtin_amdgcn_sched_group_barrier(15, 1, -1);
 }
 
+// CHECK-LABEL: @test_sched_group_barrier_rule
+// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 0, i32 1, i32 2, i32 0)
+// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 1, i32 2, i32 4, i32 0)
+// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 4, i32 8, i32 16, i32 100)
+// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 15, i32 1, i32 -1, i32 -100)
+void test_sched_group_barrier_rule()
+{
+  __builtin_amdgcn_sched_group_barrier(0, 1, 2, 0);
+  __builtin_amdgcn_sched_group_barrier(1, 2, 4, 0);
+  __builtin_amdgcn_sched_group_barrier(4, 8, 16, 100);
+  __builtin_amdgcn_sched_group_barrier(15, 1, -1, -100);
+}
+
+
 // CHECK-LABEL: @test_iglp_opt
 // CHECK: call void @llvm.amdgcn.iglp.opt(i32 0)
 // CHECK: call void @llvm.amdgcn.iglp.opt(i32 1)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 051e603c0819d2..68fe42a8f04d21 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -297,10 +297,17 @@ def int_amdgcn_sched_barrier : ClangBuiltin<"__builtin_amdgcn_sched_barrier">,
 // matching instructions that will be associated with this sched_group_barrier.
 // The third parameter is an identifier which is used to describe what other
 // sched_group_barriers should be synchronized with.
-def int_amdgcn_sched_group_barrier : ClangBuiltin<"__builtin_amdgcn_sched_group_barrier">,
-  Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, IntrNoMem, IntrHasSideEffects,
-   IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+multiclass SCHED_GR

[clang] [llvm] [AMDGPU] Extend __builtin_amdgcn_sched_group_barrier to support rules. (PR #85304)

2024-04-23 Thread Jeffrey Byrnes via cfe-commits

jrbyrnes wrote:

Updated the PR as discussed offline.

Support the variadic builtin arg by combining the rule IDs into a mask for the 
intrinsic. This implies a limit of 64 rules, but we can work around that by 
adding a new intrinsic with two masks (to support rules 65-128), and so on.

For now, the rules in this PR behave as they do in the existing code (that is, 
they are additional inclusion criteria). Any changes to this will be addressed 
in a future PR.
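
A minimal sketch of that folding, assuming (as the updated tests suggest) each trailing builtin argument is a rule ID mapped to one bit of the intrinsic's i64 mask; the helper name is hypothetical:

    #include <cstdint>
    #include <initializer_list>

    // Fold rule IDs into one i64 mask: bit N set <=> rule N requested.
    std::uint64_t foldRuleMask(std::initializer_list<unsigned> RuleIDs) {
      std::uint64_t Mask = 0;
      for (unsigned ID : RuleIDs)
        if (ID < 64) // one mask holds rules 0-63; a second operand would hold the next 64
          Mask |= std::uint64_t(1) << ID;
      return Mask;
    }

    // foldRuleMask({0, 1, 2, 3, 4, 5, 6, 7}) == 255, matching the i64 255
    // emitted for __builtin_amdgcn_sched_group_barrier(2, 4, 6, 0, ..., 7).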

https://github.com/llvm/llvm-project/pull/85304


[clang] [llvm] [AMDGPU] Extend __builtin_amdgcn_sched_group_barrier to support rules. (PR #85304)

2024-04-23 Thread Jeffrey Byrnes via cfe-commits

https://github.com/jrbyrnes ready_for_review 
https://github.com/llvm/llvm-project/pull/85304


[clang] [llvm] [AMDGPU] Extend __builtin_amdgcn_sched_group_barrier to support rules. (PR #85304)

2024-04-23 Thread Jeffrey Byrnes via cfe-commits

https://github.com/jrbyrnes updated 
https://github.com/llvm/llvm-project/pull/85304


[clang] [llvm] [AMDGPU]: Add and codegen sched_group_barrier_inst (PR #78775)

2024-03-14 Thread Jeffrey Byrnes via cfe-commits

https://github.com/jrbyrnes closed 
https://github.com/llvm/llvm-project/pull/78775


[clang] [llvm] [AMDGPU] Extend __builtin_amdgcn_sched_group_barrier to support rules. (PR #85304)

2024-03-14 Thread Jeffrey Byrnes via cfe-commits

jrbyrnes wrote:

Supersedes https://github.com/llvm/llvm-project/pull/78775

https://github.com/llvm/llvm-project/pull/85304


[clang] [llvm] [AMDGPU] Extend __builtin_amdgcn_sched_group_barrier to support rules. (PR #85304)

2024-03-14 Thread Jeffrey Byrnes via cfe-commits

https://github.com/jrbyrnes edited 
https://github.com/llvm/llvm-project/pull/85304


[clang] [llvm] [AMDGPU] Extend __builtin_amdgcn_sched_group_barrier to support rules. (PR #85304)

2024-03-14 Thread Jeffrey Byrnes via cfe-commits

https://github.com/jrbyrnes created 
https://github.com/llvm/llvm-project/pull/85304

I am still working with the user to define the actual rules, so this is still a 
WIP. However, the current version contains the main machinery of the feature.

This helps bridge the gap between sched_group_barrier and iglp_opt, giving 
users (with compiler support) more power to create the pipelines they want.

In particular, this is aimed at helping control scheduling in blocks with 
loop-carried dependencies. Since this is a global scheduling problem, there is 
no straightforward way to tune the scheduler against these blocks.
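
As a sketch of the intended use (the rule IDs and their semantics are placeholders, since the actual rules are still being defined with the user):

    void pipeline_with_rules(int n) {
      for (int i = 0; i < n; ++i) {
        // ... body whose results feed the next iteration ...
        // Group of 2 MFMA (mask 0x8) in sync group 0, narrowed by rule 0.
        __builtin_amdgcn_sched_group_barrier(0x8, 2, 0, 0);
        // Group of 4 VMEM reads (mask 0x20), same sync group, narrowed by rule 1.
        __builtin_amdgcn_sched_group_barrier(0x20, 4, 0, 1);
      }
    }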


[clang] [llvm] [AMDGPU]: Add and codegen sched_group_barrier_inst (PR #78775)

2024-01-19 Thread Jeffrey Byrnes via cfe-commits

https://github.com/jrbyrnes created 
https://github.com/llvm/llvm-project/pull/78775

As stated, this simply adds and codegens the builtin/intrinsic. A subsequent 
patch will interface it with IGroupLP.

The idea is to give users more expressiveness by allowing them to create sched 
groups whose inclusion mechanism compares the char * argument of the builtin 
against the name of the instruction, with the argument handled as a prefix 
(again, this will be implemented in a subsequent patch).

There are some peculiarities in handling the char *, so I've created this as a 
separate review. In particular, I wasn't quite sure of the best way to provide 
the metadata to the MIR passes -- open to ideas.
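
A minimal sketch of the intended inclusion test, assuming the metadata string is compared as a prefix of the instruction name (the helper is hypothetical; the real matching lands in the follow-up patch):

    #include "llvm/ADT/StringRef.h"

    // True if the instruction's mnemonic begins with the user-supplied prefix,
    // e.g. "ds_r" would match ds_read_b64, ds_read2st64_b32, ...
    static bool matchesSchedGroupInst(llvm::StringRef InstrName,
                                      llvm::StringRef Prefix) {
      return InstrName.starts_with(Prefix);
    }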

>From 6687ddf4ff756bf15c8e7204e23491322c5b6d8c Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes 
Date: Fri, 19 Jan 2024 11:59:57 -0800
Subject: [PATCH] [AMDGPU]: Add and codegen sched_group_barrier_inst

Change-Id: I920b3787a9a2c9f65b02d3d897bfe89573a97e27
---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |  1 +
 clang/lib/CodeGen/CGBuiltin.cpp   | 16 +
 clang/test/CodeGenOpenCL/builtins-amdgcn.cl   | 13 +++
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  | 11 ++
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |  1 +
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h   |  2 ++
 llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp  | 15 
 .../lib/Target/AMDGPU/AMDGPUMachineFunction.h | 25 ++
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 18 ++
 llvm/lib/Target/AMDGPU/SIInstructions.td  | 18 ++
 .../llvm.amdgcn.sched.group.barrier.inst.ll   | 34 +++
 11 files changed, 154 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.inst.ll

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index e562ef04a30194..bd540bebd37319 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -64,6 +64,7 @@ BUILTIN(__builtin_amdgcn_s_barrier, "v", "n")
 BUILTIN(__builtin_amdgcn_wave_barrier, "v", "n")
 BUILTIN(__builtin_amdgcn_sched_barrier, "vIi", "n")
 BUILTIN(__builtin_amdgcn_sched_group_barrier, "vIiIiIi", "n")
+BUILTIN(__builtin_amdgcn_sched_group_barrier_inst, "vcC*IiIi", "n")
 BUILTIN(__builtin_amdgcn_iglp_opt, "vIi", "n")
 BUILTIN(__builtin_amdgcn_s_dcache_inv, "v", "n")
 BUILTIN(__builtin_amdgcn_buffer_wbinvl1, "v", "n")
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 1ed35befe1361f..efcb0e80a4eb5c 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18149,6 +18149,22 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
 CGM.getIntrinsic(Intrinsic::amdgcn_s_sendmsg_rtn, {ResultType});
 return Builder.CreateCall(F, {Arg});
   }
+  case AMDGPU::BI__builtin_amdgcn_sched_group_barrier_inst: {
+StringRef InstrStr;
+llvm::getConstantStringInfo(EmitScalarExpr(E->getArg(0)), InstrStr);
+
+llvm::MDBuilder MDHelper(getLLVMContext());
+
+MDNode *InfoTuple =
+MDTuple::get(getLLVMContext(), {MDHelper.createString(InstrStr)});
+auto MDV = MetadataAsValue::get(getLLVMContext(), InfoTuple);
+
+Function *F =
+CGM.getIntrinsic(Intrinsic::amdgcn_sched_group_barrier_inst, {});
+llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
+llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
+return Builder.CreateCall(F, {MDV, Src1, Src2});
+  }
   default:
 return nullptr;
   }
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index 0bc9a54682d3e3..d43a47746cf0df 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -436,6 +436,19 @@ void test_sched_group_barrier()
   __builtin_amdgcn_sched_group_barrier(15, 1, -1);
 }
 
+// CHECK-LABEL: @test_sched_group_barrier_inst
+// CHECK: call void @llvm.amdgcn.sched.group.barrier.inst(metadata !16, i32 1, i32 2)
+// CHECK: call void @llvm.amdgcn.sched.group.barrier.inst(metadata !17, i32 3, i32 1)
+// CHECK: call void @llvm.amdgcn.sched.group.barrier.inst(metadata !16, i32 1000, i32 -1)
+// CHECK: call void @llvm.amdgcn.sched.group.barrier.inst(metadata !18, i32 1, i32 1)
+void test_sched_group_barrier_inst()
+{
+  __builtin_amdgcn_sched_group_barrier_inst("ds_r",1,2);
+  __builtin_amdgcn_sched_group_barrier_inst("v_cvt",3,1);
+  __builtin_amdgcn_sched_group_barrier_inst("ds_r",1000,-1);
+  __builtin_amdgcn_sched_group_barrier_inst("1",1,1);
+}
+
 // CHECK-LABEL: @test_iglp_opt
 // CHECK: call void @llvm.amdgcn.iglp.opt(i32 0)
 // CHECK: call void @llvm.amdgcn.iglp.opt(i32 1)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index e5596258847f9f..fd8b4581d97c8c 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -302,6 +302,

[libclc] [AMDGPU][MachineScheduler] Alternative way to control excess RP. (PR #68004)

2023-10-20 Thread Jeffrey Byrnes via cfe-commits

https://github.com/jrbyrnes commented:

Just have a few questions about implementation details -- at a higher level, it 
seems like we are trading one heuristic for another w.r.t. flagging regions as 
ExcessRP, so I'm curious about the relative performance.

https://github.com/llvm/llvm-project/pull/68004

[libclc] [AMDGPU][MachineScheduler] Alternative way to control excess RP. (PR #68004)

2023-10-20 Thread Jeffrey Byrnes via cfe-commits


@@ -894,10 +894,22 @@ void GCNSchedStage::setupNewBlock() {
 
 void GCNSchedStage::finalizeGCNRegion() {
   DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
-  DAG.RescheduleRegions[RegionIdx] = false;

jrbyrnes wrote:

Why was this removed?

https://github.com/llvm/llvm-project/pull/68004


[libclc] [AMDGPU][MachineScheduler] Alternative way to control excess RP. (PR #68004)

2023-10-20 Thread Jeffrey Byrnes via cfe-commits


@@ -959,16 +970,6 @@ void GCNSchedStage::checkScheduling() {
   << DAG.MinOccupancy << ".\n");
   }
 
-  unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
-  unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
-  if (PressureAfter.getVGPRNum(false) > MaxVGPRs ||
-  PressureAfter.getAGPRNum() > MaxVGPRs ||
-  PressureAfter.getSGPRNum() > MaxSGPRs) {
-DAG.RescheduleRegions[RegionIdx] = true;

jrbyrnes wrote:

Why do we drop the MaxVGPRs/MaxSGPRs checks -- these are wavesPerEU aware?

https://github.com/llvm/llvm-project/pull/68004


[clang-tools-extra] [AMDGPU][MachineScheduler] Alternative way to control excess RP. (PR #68004)

2023-10-20 Thread Jeffrey Byrnes via cfe-commits


@@ -1117,16 +1118,23 @@ bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
 bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
   // If RP is not reduced in the unclustered reschedule stage, revert to the
   // old schedule.
-  if ((WavesAfter <= PressureBefore.getOccupancy(ST) &&
-   mayCauseSpilling(WavesAfter)) ||
-  GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
-LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
-return true;
-  }
+  if (DAG.RegionsWithExcessRP[RegionIdx]) {

jrbyrnes wrote:

Shouldn't we still revert if occupancy has dropped (per 
GCNSchedStage::shouldRevertScheduling)?

https://github.com/llvm/llvm-project/pull/68004


[clang] [AMDGPU][MachineScheduler] Alternative way to control excess RP. (PR #68004)

2023-10-20 Thread Jeffrey Byrnes via cfe-commits


@@ -702,7 +702,7 @@ bool UnclusteredHighRPStage::initGCNSchedStage() {
   if (!GCNSchedStage::initGCNSchedStage())
 return false;
 
-  if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
+  if (DAG.RegionsWithExcessRP.none())

jrbyrnes wrote:

What about regions that are close to the critical limit?

https://github.com/llvm/llvm-project/pull/68004

[clang] be8a65b - [HIP]: Add -fhip-emit-relocatable to override link job creation for -fno-gpu-rdc

2023-06-29 Thread Jeffrey Byrnes via cfe-commits

Author: Jeffrey Byrnes
Date: 2023-06-29T08:18:28-07:00
New Revision: be8a65b598b3b80f73e862a01c7eaafe84d853a0

URL: https://github.com/llvm/llvm-project/commit/be8a65b598b3b80f73e862a01c7eaafe84d853a0
DIFF: https://github.com/llvm/llvm-project/commit/be8a65b598b3b80f73e862a01c7eaafe84d853a0.diff

LOG: [HIP]: Add -fhip-emit-relocatable to override link job creation for -fno-gpu-rdc

Differential Revision: https://reviews.llvm.org/D153667

Change-Id: Idcc5c7c25dc350b8dc9a1865fd67982904d06ecd
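
For illustration, the new flag only applies to device-only compiles, e.g. something like "clang -x hip --cuda-device-only -fhip-emit-relocatable -c foo.hip" (file name hypothetical); the diagnostics added below reject it in combination with -fgpu-rdc or without --cuda-device-only.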

Added: 
clang/test/Driver/hip-dependent-options.hip

Modified: 
clang/include/clang/Driver/Options.td
clang/lib/Driver/Driver.cpp
clang/test/Driver/hip-device-compile.hip
clang/test/Driver/hip-phases.hip
clang/test/Driver/hip-rdc-device-only.hip

Removed: 




diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index a352e20f1f9a0c..dfecea22ea69b0 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1124,6 +1124,10 @@ def gpu_bundle_output : Flag<["--"], "gpu-bundle-output">,
   Group<f_Group>, HelpText<"Bundle output files of HIP device compilation">;
 def no_gpu_bundle_output : Flag<["--"], "no-gpu-bundle-output">,
   Group<f_Group>, HelpText<"Do not bundle output files of HIP device compilation">;
+def fhip_emit_relocatable : Flag<["-"], "fhip-emit-relocatable">, Group<f_Group>,
+  HelpText<"Compile HIP source to relocatable">;
+def fno_hip_emit_relocatable : Flag<["-"], "fno-hip-emit-relocatable">, Group<f_Group>,
+  HelpText<"Do not override toolchain to compile HIP source to relocatable">;
 def cuid_EQ : Joined<["-"], "cuid=">, Flags<[CC1Option]>,
   HelpText<"An ID for compilation unit, which should be the same for the same "
            "compilation unit but different for different compilation units. "

diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 1580f092bcde0d..ccdaa5c7eb68bb 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -2946,7 +2946,12 @@ class OffloadingActionBuilder final {
 CudaActionBuilderBase(Compilation &C, DerivedArgList &Args,
   const Driver::InputList &Inputs,
   Action::OffloadKind OFKind)
-: DeviceActionBuilder(C, Args, Inputs, OFKind) {}
+: DeviceActionBuilder(C, Args, Inputs, OFKind) {
+
+  CompileDeviceOnly = C.getDriver().offloadDeviceOnly();
+  Relocatable = Args.hasFlag(options::OPT_fgpu_rdc,
+ options::OPT_fno_gpu_rdc, /*Default=*/false);
+}
 
 ActionBuilderReturnCode addDeviceDependences(Action *HostAction) override {
   // While generating code for CUDA, we only depend on the host input action
@@ -3099,9 +3104,6 @@ class OffloadingActionBuilder final {
   !C.hasOffloadToolChain())
 return false;
 
-  Relocatable = Args.hasFlag(options::OPT_fgpu_rdc,
- options::OPT_fno_gpu_rdc, /*Default=*/false);
-
   const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
   assert(HostTC && "No toolchain for host compilation.");
   if (HostTC->getTriple().isNVPTX() ||
@@ -3120,7 +3122,6 @@ class OffloadingActionBuilder final {
   : C.getSingleOffloadToolChain());
 
   CompileHostOnly = C.getDriver().offloadHostOnly();
-  CompileDeviceOnly = C.getDriver().offloadDeviceOnly();
   EmitLLVM = Args.getLastArg(options::OPT_emit_llvm);
   EmitAsm = Args.getLastArg(options::OPT_S);
   FixedCUID = Args.getLastArgValue(options::OPT_cuid_EQ);
@@ -3352,16 +3353,40 @@ class OffloadingActionBuilder final {
 // only compilation. Bundle other type of output files only if
 // --gpu-bundle-output is specified for device only compilation.
 std::optional<bool> BundleOutput;
+std::optional<bool> EmitReloc;
 
   public:
 HIPActionBuilder(Compilation &C, DerivedArgList &Args,
  const Driver::InputList &Inputs)
 : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_HIP) {
+
   DefaultCudaArch = CudaArch::GFX906;
+
+  if (Args.hasArg(options::OPT_fhip_emit_relocatable,
+  options::OPT_fno_hip_emit_relocatable)) {
+EmitReloc = Args.hasFlag(options::OPT_fhip_emit_relocatable,
+ options::OPT_fno_hip_emit_relocatable, false);
+
+if (*EmitReloc) {
+  if (Relocatable) {
+C.getDriver().Diag(diag::err_opt_not_valid_with_opt)
+<< "-fhip-emit-relocatable"
+<< "-fgpu-rdc";
+  }
+
+  if (!CompileDeviceOnly) {
+C.getDriver().Diag(diag::err_opt_not_valid_without_opt)
+<< "-fhip-emit-relocatable"
+<< "--cuda-device-only";
+  }
+}
+  }
+
   if (Args.hasArg(options::OPT_gpu_bundle_output,
   options::OPT_no_gpu_bundle_output))