[PATCH] D71221: [HIP] Add option --gpu-max-threads-per-block=n

2020-01-07 Thread Yaxun Liu via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rG9f2d8b5c0cdb: [HIP] Add option --gpu-max-threads-per-block=n 
(authored by yaxunl).
Herald added a project: clang.

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D71221/new/

https://reviews.llvm.org/D71221

Files:
  clang/include/clang/Basic/LangOptions.def
  clang/include/clang/Driver/Options.td
  clang/lib/CodeGen/TargetInfo.cpp
  clang/lib/Driver/ToolChains/HIP.cpp
  clang/lib/Frontend/CompilerInvocation.cpp
  clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
  clang/test/Driver/hip-options.hip

Index: clang/test/Driver/hip-options.hip
===
--- /dev/null
+++ clang/test/Driver/hip-options.hip
@@ -0,0 +1,10 @@
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang -### -x hip --gpu-max-threads-per-block=1024 %s 2>&1 | FileCheck %s
+
+// Check that there are commands for both host- and device-side compilations.
+//
+// CHECK: clang{{.*}}" "-cc1" {{.*}} "-fcuda-is-device"
+// CHECK-SAME: "--gpu-max-threads-per-block=1024"
Index: clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
===
--- clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
+++ clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
@@ -1,13 +1,21 @@
 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \
-// RUN: -fcuda-is-device -emit-llvm -o - %s | FileCheck %s
+// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \
+// RUN: | FileCheck -check-prefixes=CHECK,DEFAULT %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa --gpu-max-threads-per-block=1024 \
+// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \
+// RUN: | FileCheck -check-prefixes=CHECK,MAX1024 %s
 // RUN: %clang_cc1 -triple nvptx \
 // RUN: -fcuda-is-device -emit-llvm -o - %s | FileCheck %s \
 // RUN: -check-prefix=NAMD
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm \
-// RUN: -verify -o - %s | FileCheck -check-prefix=NAMD %s
+// RUN: -verify -o - -x hip %s | FileCheck -check-prefix=NAMD %s
 
 #include "Inputs/cuda.h"
 
+__global__ void flat_work_group_size_default() {
+// CHECK: define amdgpu_kernel void @_Z28flat_work_group_size_defaultv() [[FLAT_WORK_GROUP_SIZE_DEFAULT:#[0-9]+]]
+}
+
 __attribute__((amdgpu_flat_work_group_size(32, 64))) // expected-no-diagnostics
 __global__ void flat_work_group_size_32_64() {
 // CHECK: define amdgpu_kernel void @_Z26flat_work_group_size_32_64v() [[FLAT_WORK_GROUP_SIZE_32_64:#[0-9]+]]
@@ -31,7 +39,9 @@
 // NAMD-NOT: "amdgpu-num-vgpr"
 // NAMD-NOT: "amdgpu-num-sgpr"
 
-// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = { convergent noinline nounwind optnone "amdgpu-flat-work-group-size"="32,64" 
-// CHECK-DAG: attributes [[WAVES_PER_EU_2]] = { convergent noinline nounwind optnone "amdgpu-waves-per-eu"="2"
-// CHECK-DAG: attributes [[NUM_SGPR_32]] = { convergent noinline nounwind optnone "amdgpu-num-sgpr"="32" 
-// CHECK-DAG: attributes [[NUM_VGPR_64]] = { convergent noinline nounwind optnone "amdgpu-num-vgpr"="64" 
+// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,256"
+// MAX1024-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"
+// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = {{.*}}"amdgpu-flat-work-group-size"="32,64"
+// CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}}"amdgpu-waves-per-eu"="2"
+// CHECK-DAG: attributes [[NUM_SGPR_32]] = {{.*}}"amdgpu-num-sgpr"="32"
+// CHECK-DAG: attributes [[NUM_VGPR_64]] = {{.*}}"amdgpu-num-vgpr"="64"
Index: clang/lib/Frontend/CompilerInvocation.cpp
===
--- clang/lib/Frontend/CompilerInvocation.cpp
+++ clang/lib/Frontend/CompilerInvocation.cpp
@@ -2559,6 +2559,12 @@
   << Args.getLastArg(OPT_fgpu_allow_device_init)->getAsString(Args);
   }
   Opts.HIPUseNewLaunchAPI = Args.hasArg(OPT_fhip_new_launch_api);
+  if (Opts.HIP)
+Opts.GPUMaxThreadsPerBlock = getLastArgIntValue(
+Args, OPT_gpu_max_threads_per_block_EQ, Opts.GPUMaxThreadsPerBlock);
+  else if (Args.hasArg(OPT_gpu_max_threads_per_block_EQ))
+Diags.Report(diag::warn_ignored_hip_only_option)
+<< Args.getLastArg(OPT_gpu_max_threads_per_block_EQ)->getAsString(Args);
 
   if (Opts.ObjC) {
 if (Arg *arg = Args.getLastArg(OPT_fobjc_runtime_EQ)) {
Index: clang/lib/Driver/ToolChains/HIP.cpp
===
--- clang/lib/Driver/ToolChains/HIP.cpp
+++ clang/lib/Driver/ToolChains/HIP.cpp
@@ -307,6 +307,14 @@
  false))
 CC1Args.push_back("-fgpu-rdc");
 
+  StringRef MaxThreadsPerBlock =
+      DriverArgs.getLastArgValue(options::OPT_gpu_max_threads_per_block_EQ);
+  if (!MaxThreadsPerBlock.empty()) {
+    std::string ArgStr =
+        std::string("--gpu-max-threads-per-block=") + MaxThreadsPerBlock.str();
+    CC1Args.push_back(DriverArgs.MakeArgStringRef(ArgStr));
+  }
+

[PATCH] D71221: [HIP] Add option --gpu-max-threads-per-block=n

2020-01-05 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl added a comment.

ping


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D71221/new/

https://reviews.llvm.org/D71221



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D71221: [HIP] Add option --gpu-max-threads-per-block=n

2019-12-21 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl updated this revision to Diff 235025.
yaxunl added a comment.

Revised per Artem's comments.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D71221/new/

https://reviews.llvm.org/D71221

Files:
  clang/include/clang/Basic/LangOptions.def
  clang/include/clang/Driver/Options.td
  clang/lib/CodeGen/TargetInfo.cpp
  clang/lib/Driver/ToolChains/HIP.cpp
  clang/lib/Frontend/CompilerInvocation.cpp
  clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
  clang/test/Driver/hip-options.hip

Index: clang/test/Driver/hip-options.hip
===
--- /dev/null
+++ clang/test/Driver/hip-options.hip
@@ -0,0 +1,10 @@
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang -### -x hip --gpu-max-threads-per-block=1024 %s 2>&1 | FileCheck %s
+
+// Check that there are commands for both host- and device-side compilations.
+//
+// CHECK: clang{{.*}}" "-cc1" {{.*}} "-fcuda-is-device"
+// CHECK-SAME: "--gpu-max-threads-per-block=1024"
Index: clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
===
--- clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
+++ clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
@@ -1,13 +1,21 @@
 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \
-// RUN: -fcuda-is-device -emit-llvm -o - %s | FileCheck %s
+// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \
+// RUN: | FileCheck -check-prefixes=CHECK,DEFAULT %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa --gpu-max-threads-per-block=1024 \
+// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \
+// RUN: | FileCheck -check-prefixes=CHECK,MAX1024 %s
 // RUN: %clang_cc1 -triple nvptx \
 // RUN: -fcuda-is-device -emit-llvm -o - %s | FileCheck %s \
 // RUN: -check-prefix=NAMD
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm \
-// RUN: -verify -o - %s | FileCheck -check-prefix=NAMD %s
+// RUN: -verify -o - -x hip %s | FileCheck -check-prefix=NAMD %s
 
 #include "Inputs/cuda.h"
 
+__global__ void flat_work_group_size_default() {
+// CHECK: define amdgpu_kernel void @_Z28flat_work_group_size_defaultv() [[FLAT_WORK_GROUP_SIZE_DEFAULT:#[0-9]+]]
+}
+
 __attribute__((amdgpu_flat_work_group_size(32, 64))) // expected-no-diagnostics
 __global__ void flat_work_group_size_32_64() {
 // CHECK: define amdgpu_kernel void @_Z26flat_work_group_size_32_64v() [[FLAT_WORK_GROUP_SIZE_32_64:#[0-9]+]]
@@ -31,7 +39,9 @@
 // NAMD-NOT: "amdgpu-num-vgpr"
 // NAMD-NOT: "amdgpu-num-sgpr"
 
-// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = { convergent noinline nounwind optnone "amdgpu-flat-work-group-size"="32,64" 
-// CHECK-DAG: attributes [[WAVES_PER_EU_2]] = { convergent noinline nounwind optnone "amdgpu-waves-per-eu"="2"
-// CHECK-DAG: attributes [[NUM_SGPR_32]] = { convergent noinline nounwind optnone "amdgpu-num-sgpr"="32" 
-// CHECK-DAG: attributes [[NUM_VGPR_64]] = { convergent noinline nounwind optnone "amdgpu-num-vgpr"="64" 
+// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,256"
+// MAX1024-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"
+// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = {{.*}}"amdgpu-flat-work-group-size"="32,64"
+// CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}}"amdgpu-waves-per-eu"="2"
+// CHECK-DAG: attributes [[NUM_SGPR_32]] = {{.*}}"amdgpu-num-sgpr"="32"
+// CHECK-DAG: attributes [[NUM_VGPR_64]] = {{.*}}"amdgpu-num-vgpr"="64"
Index: clang/lib/Frontend/CompilerInvocation.cpp
===
--- clang/lib/Frontend/CompilerInvocation.cpp
+++ clang/lib/Frontend/CompilerInvocation.cpp
@@ -2559,6 +2559,12 @@
   << Args.getLastArg(OPT_fgpu_allow_device_init)->getAsString(Args);
   }
   Opts.HIPUseNewLaunchAPI = Args.hasArg(OPT_fhip_new_launch_api);
+  if (Opts.HIP)
+Opts.GPUMaxThreadsPerBlock = getLastArgIntValue(
+Args, OPT_gpu_max_threads_per_block_EQ, Opts.GPUMaxThreadsPerBlock);
+  else if (Args.hasArg(OPT_gpu_max_threads_per_block_EQ))
+Diags.Report(diag::warn_ignored_hip_only_option)
+<< Args.getLastArg(OPT_gpu_max_threads_per_block_EQ)->getAsString(Args);
 
   if (Opts.ObjC) {
 if (Arg *arg = Args.getLastArg(OPT_fobjc_runtime_EQ)) {
Index: clang/lib/Driver/ToolChains/HIP.cpp
===
--- clang/lib/Driver/ToolChains/HIP.cpp
+++ clang/lib/Driver/ToolChains/HIP.cpp
@@ -307,6 +307,14 @@
  false))
 CC1Args.push_back("-fgpu-rdc");
 
+  StringRef MaxThreadsPerBlock =
+  DriverArgs.getLastArgValue(options::OPT_gpu_max_threads_per_block_EQ);
+  if (!MaxThreadsPerBlock.empty()) {
+std::string ArgStr =
+std::string("--gpu-max-threads-per-block=") + MaxThreadsPerBlock.str();
+CC1Args.push_back(DriverArgs.MakeArgStringRef(ArgStr));
+  }
+
 

[PATCH] D71221: [HIP] Add option --gpu-max-threads-per-block=n

2019-12-20 Thread Artem Belevich via Phabricator via cfe-commits
tra added inline comments.



Comment at: clang/lib/CodeGen/TargetInfo.cpp:8067
+unsigned MaxThreadsPerBlock =
+IsHIPKernel ? M.getLangOpts().GPUMaxThreadsPerBlock : 256;
+std::string AttrVal = std::string("1,") + llvm::utostr(MaxThreadsPerBlock);

yaxunl wrote:
> tra wrote:
> > The magic value of 256 should be defined as a constant or macro somewhere 
> > -- you're using it in multiple places.
> > Alternatively, always set LangOpts.GPUMaxThreadsPerBlock to something and 
> > skip figuring out the default everywhere else.
> For the default value of LangOpts.GPUMaxThreadsPerBlock, it tends to be 
> target dependent. I am thinking probably should add 
> TargetInfo.getDefaultMaxThreadsPerBlock() and use it to set the default value 
> for LangOpts.GPUMaxThreadsPerBlock.
That could be an option. I just want to have an unambiguous source for that 
number.
BTW, does the value need to be hardcoded for OpenCL?

I think it would make sense for --gpu-max-threads-per-block=n to control the 
value for OpenCL, too.
Then you would always get the value from LangOpts.GPUMaxThreadsPerBlock  and 
will have only one place where you set the default, which would be OK until we 
make the default target-specific.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D71221/new/

https://reviews.llvm.org/D71221



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D71221: [HIP] Add option --gpu-max-threads-per-block=n

2019-12-20 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl marked 2 inline comments as done.
yaxunl added a comment.

In D71221#1791802 , @tra wrote:

> What's the use case for this flag?


If a kernel is launched with a block size greater than the default max block 
size, explicit launch bound is required.

Different projects have different block size usages.

If a project mostly uses block size 1024, it prefers to use 1024 as the default 
max block size to avoid adding explicit launch bounds to each kernel.

If a project mostly uses block size 256, it prefers to use 256 as the default 
max block size.

Another situation is that at the initial development stage people prefer a 
default max block size that works for all possible launching block sizes. Then 
they can just let the max block size be 1024 by using this option. Later on, 
they can add launch bounds and choose a different max block size.

On the other hand, we cannot simply let the default max block size be 1024 
since we have large sets of existing projects assuming default max block size 
be 256. Changing the default max block size to 1024 will cause perf degradation 
in the existing projects. Adding this options provides an option for backward 
compatibility in case we want to change the default max block size.




Comment at: clang/lib/CodeGen/TargetInfo.cpp:8067
+unsigned MaxThreadsPerBlock =
+IsHIPKernel ? M.getLangOpts().GPUMaxThreadsPerBlock : 256;
+std::string AttrVal = std::string("1,") + llvm::utostr(MaxThreadsPerBlock);

tra wrote:
> The magic value of 256 should be defined as a constant or macro somewhere -- 
> you're using it in multiple places.
> Alternatively, always set LangOpts.GPUMaxThreadsPerBlock to something and 
> skip figuring out the default everywhere else.
For the default value of LangOpts.GPUMaxThreadsPerBlock, it tends to be target 
dependent. I am thinking probably should add 
TargetInfo.getDefaultMaxThreadsPerBlock() and use it to set the default value 
for LangOpts.GPUMaxThreadsPerBlock.



Comment at: clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu:19
+
 __attribute__((amdgpu_flat_work_group_size(32, 64))) // expected-no-diagnostics
 __global__ void flat_work_group_size_32_64() {

tra wrote:
> Is this the attribute that `__launch_bounds__()` expands to in HIP?
> If __launch_bounds__ is a separate attribute, then, I guess, it should be 
> tested, too.
yes.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D71221/new/

https://reviews.llvm.org/D71221



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D71221: [HIP] Add option --gpu-max-threads-per-block=n

2019-12-19 Thread Artem Belevich via Phabricator via cfe-commits
tra added a comment.

What's the use case for this flag?




Comment at: clang/lib/CodeGen/TargetInfo.cpp:8067
+unsigned MaxThreadsPerBlock =
+IsHIPKernel ? M.getLangOpts().GPUMaxThreadsPerBlock : 256;
+std::string AttrVal = std::string("1,") + llvm::utostr(MaxThreadsPerBlock);

The magic value of 256 should be defined as a constant or macro somewhere -- 
you're using it in multiple places.
Alternatively, always set LangOpts.GPUMaxThreadsPerBlock to something and skip 
figuring out the default everywhere else.



Comment at: clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu:19
+
 __attribute__((amdgpu_flat_work_group_size(32, 64))) // expected-no-diagnostics
 __global__ void flat_work_group_size_32_64() {

Is this the attribute that `__launch_bounds__()` expands to in HIP?
If __launch_bounds__ is a separate attribute, then, I guess, it should be 
tested, too.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D71221/new/

https://reviews.llvm.org/D71221



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D71221: [HIP] Add option --gpu-max-threads-per-block=n

2019-12-19 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl added a comment.

ping


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D71221/new/

https://reviews.llvm.org/D71221



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D71221: [HIP] Add option --gpu-max-threads-per-block=n

2019-12-09 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl created this revision.
yaxunl added a reviewer: tra.
Herald added subscribers: nhaehnle, jvesely.

Add this option to change the default launch bounds.


https://reviews.llvm.org/D71221

Files:
  clang/include/clang/Basic/LangOptions.def
  clang/include/clang/Driver/Options.td
  clang/lib/CodeGen/TargetInfo.cpp
  clang/lib/Driver/ToolChains/HIP.cpp
  clang/lib/Frontend/CompilerInvocation.cpp
  clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
  clang/test/Driver/hip-options.hip

Index: clang/test/Driver/hip-options.hip
===
--- /dev/null
+++ clang/test/Driver/hip-options.hip
@@ -0,0 +1,10 @@
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang -### -x hip --gpu-max-threads-per-block=1024 %s 2>&1 | FileCheck %s
+
+// Check that there are commands for both host- and device-side compilations.
+//
+// CHECK: clang{{.*}}" "-cc1" {{.*}} "-fcuda-is-device"
+// CHECK-SAME: "--gpu-max-threads-per-block=1024"
Index: clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
===
--- clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
+++ clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
@@ -1,13 +1,21 @@
 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \
-// RUN: -fcuda-is-device -emit-llvm -o - %s | FileCheck %s
+// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \
+// RUN: | FileCheck -check-prefixes=CHECK,DEFAULT %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa --gpu-max-threads-per-block=1024 \
+// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \
+// RUN: | FileCheck -check-prefixes=CHECK,MAX1024 %s
 // RUN: %clang_cc1 -triple nvptx \
 // RUN: -fcuda-is-device -emit-llvm -o - %s | FileCheck %s \
 // RUN: -check-prefix=NAMD
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm \
-// RUN: -verify -o - %s | FileCheck -check-prefix=NAMD %s
+// RUN: -verify -o - -x hip %s | FileCheck -check-prefix=NAMD %s
 
 #include "Inputs/cuda.h"
 
+__global__ void flat_work_group_size_default() {
+// CHECK: define amdgpu_kernel void @_Z28flat_work_group_size_defaultv() [[FLAT_WORK_GROUP_SIZE_DEFAULT:#[0-9]+]]
+}
+
 __attribute__((amdgpu_flat_work_group_size(32, 64))) // expected-no-diagnostics
 __global__ void flat_work_group_size_32_64() {
 // CHECK: define amdgpu_kernel void @_Z26flat_work_group_size_32_64v() [[FLAT_WORK_GROUP_SIZE_32_64:#[0-9]+]]
@@ -31,7 +39,9 @@
 // NAMD-NOT: "amdgpu-num-vgpr"
 // NAMD-NOT: "amdgpu-num-sgpr"
 
-// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = { convergent noinline nounwind optnone "amdgpu-flat-work-group-size"="32,64" 
-// CHECK-DAG: attributes [[WAVES_PER_EU_2]] = { convergent noinline nounwind optnone "amdgpu-waves-per-eu"="2"
-// CHECK-DAG: attributes [[NUM_SGPR_32]] = { convergent noinline nounwind optnone "amdgpu-num-sgpr"="32" 
-// CHECK-DAG: attributes [[NUM_VGPR_64]] = { convergent noinline nounwind optnone "amdgpu-num-vgpr"="64" 
+// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,256"
+// MAX1024-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"
+// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = {{.*}}"amdgpu-flat-work-group-size"="32,64"
+// CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}}"amdgpu-waves-per-eu"="2"
+// CHECK-DAG: attributes [[NUM_SGPR_32]] = {{.*}}"amdgpu-num-sgpr"="32"
+// CHECK-DAG: attributes [[NUM_VGPR_64]] = {{.*}}"amdgpu-num-vgpr"="64"
Index: clang/lib/Frontend/CompilerInvocation.cpp
===
--- clang/lib/Frontend/CompilerInvocation.cpp
+++ clang/lib/Frontend/CompilerInvocation.cpp
@@ -2556,6 +2556,12 @@
   << Args.getLastArg(OPT_fgpu_allow_device_init)->getAsString(Args);
   }
   Opts.HIPUseNewLaunchAPI = Args.hasArg(OPT_fhip_new_launch_api);
+  if (Opts.HIP)
+Opts.GPUMaxThreadsPerBlock =
+getLastArgIntValue(Args, OPT_gpu_max_threads_per_block_EQ, 256);
+  else if (Args.hasArg(OPT_gpu_max_threads_per_block_EQ))
+Diags.Report(diag::warn_ignored_hip_only_option)
+<< Args.getLastArg(OPT_gpu_max_threads_per_block_EQ)->getAsString(Args);
 
   if (Opts.ObjC) {
 if (Arg *arg = Args.getLastArg(OPT_fobjc_runtime_EQ)) {
Index: clang/lib/Driver/ToolChains/HIP.cpp
===
--- clang/lib/Driver/ToolChains/HIP.cpp
+++ clang/lib/Driver/ToolChains/HIP.cpp
@@ -292,6 +292,14 @@
  false))
 CC1Args.push_back("-fgpu-rdc");
 
+  StringRef MaxThreadsPerBlock =
+  DriverArgs.getLastArgValue(options::OPT_gpu_max_threads_per_block_EQ);
+  if (!MaxThreadsPerBlock.empty()) {
+std::string ArgStr =
+std::string("--gpu-max-threads-per-block=") + MaxThreadsPerBlock.str();
+CC1Args.push_back(DriverArgs.MakeArgStringRef(ArgStr));
+  }
+
   if