[PATCH] D97198: [OpenMP][Clang][NVPTX] Only build one bitcode library for each SM

2021-03-08 Thread Shilei Tian via Phabricator via cfe-commits
This revision was landed with ongoing or failed builds.
This revision was automatically updated to reflect the committed changes.
Closed by commit rGc41ae246ac67: [OpenMP][Clang][NVPTX] Only build one bitcode 
library for each SM (authored by tianshilei1992).

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D97198/new/

https://reviews.llvm.org/D97198

Files:
  clang/lib/Driver/ToolChains/Cuda.cpp
  clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx-cuda_102-sm_35.bc
  clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx-sm_35.bc
  clang/test/Driver/openmp-offload-gpu.c
  openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
  openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu

Index: openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
===
--- openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -53,46 +53,28 @@
   return (double)nsecs * __kmpc_impl_get_wtick();
 }
 
-// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask().
 DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
-#if CUDA_VERSION < 9020
-  return __nvvm_vote_ballot(1);
-#else
   unsigned int Mask;
   asm volatile("activemask.b32 %0;" : "=r"(Mask));
   return Mask;
-#endif
 }
 
-// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
 DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
  int32_t SrcLane) {
-#if CUDA_VERSION >= 9000
   return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f);
-#else
-  return __nvvm_shfl_idx_i32(Var, SrcLane, 0x1f);
-#endif // CUDA_VERSION
 }
 
 DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
   int32_t Var, uint32_t Delta,
   int32_t Width) {
   int32_t T = ((WARPSIZE - Width) << 8) | 0x1f;
-#if CUDA_VERSION >= 9000
   return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
-#else
-  return __nvvm_shfl_down_i32(Var, Delta, T);
-#endif // CUDA_VERSION
 }
 
 DEVICE void __kmpc_impl_syncthreads() { __syncthreads(); }
 
 DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
-#if CUDA_VERSION >= 9000
   __nvvm_bar_warp_sync(Mask);
-#else
-  // In Cuda < 9.0 no need to sync threads in warps.
-#endif // CUDA_VERSION
 }
 
 // NVPTX specific kernel initialization
Index: openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
===
--- openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
+++ openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@@ -137,6 +137,7 @@
  -Xclang -emit-llvm-bc
  -Xclang -aux-triple -Xclang ${aux_triple}
  -fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device
+ -Xclang -target-feature -Xclang +ptx61
  -D__CUDACC__
  -I${devicertl_base_directory}
  -I${devicertl_nvptx_directory}/src)
@@ -150,81 +151,51 @@
 # Create target to build all Bitcode libraries.
 add_custom_target(omptarget-nvptx-bc)
 
-# This map is from clang/lib/Driver/ToolChains/Cuda.cpp.
-# The last element is the default case.
-set(cuda_version_list 112 111 110 102 101 100 92 91 90 80)
-set(ptx_feature_list 70 70 70 65 64 63 61 61 60 42)
-# The following two lines of ugly code is not needed when the minimal CMake
-# version requirement is 3.17+.
-list(LENGTH cuda_version_list num_version_supported)
-math(EXPR loop_range "${num_version_supported} - 1")
-
-# Generate a Bitcode library for all the compute capabilities the user
-# requested and all PTX version we know for now.
+# Generate a Bitcode library for all the compute capabilities the user requested
 foreach(sm ${nvptx_sm_list})
-  set(sm_flags -Xclang -target-cpu -Xclang sm_${sm} "-D__CUDA_ARCH__=${sm}0")
-
-  # Uncomment the following code and remove those ugly part if the feature
-  # is available.
-  # foreach(cuda_version ptx_num IN ZIP_LISTS cuda_version_list ptx_feature_list)
-  foreach(itr RANGE ${loop_range})
-list(GET cuda_version_list ${itr} cuda_version)
-list(GET ptx_feature_list ${itr} ptx_num)
-set(cuda_flags ${sm_flags})
-list(APPEND cuda_flags -Xclang -target-feature -Xclang +ptx${ptx_num})
-if("${cuda_version}" MATCHES "^([0-9]+)([0-9])$")
-  set(cuda_version_major ${CMAKE_MATCH_1})
-  set(cuda_version_minor ${CMAKE_MATCH_2})
-else()
-  libomptarget_error_say(
-"Unrecognized CUDA version format: ${cuda_version}")
-endif()
-list(APPEND cuda_flags
-  "-DCUDA_VERSION=${cuda_version_major}0${cuda_version_minor}0")
-
-set(bc_files "")
-foreach(src ${cuda_src_files})
-  get_filename_component(infile ${src} ABSOLUTE)
-  get_filename_component(outfile ${src} NAME)
-  set(outfile "${outfile}-cuda_${cuda_version}-sm_${sm}.bc")
-
-  add_custom_command(OUTPUT 

[PATCH] D97198: [OpenMP][Clang][NVPTX] Only build one bitcode library for each SM

2021-03-07 Thread Johannes Doerfert via Phabricator via cfe-commits
jdoerfert accepted this revision.
jdoerfert added a comment.
This revision is now accepted and ready to land.

LGTM


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D97198/new/

https://reviews.llvm.org/D97198

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D97198: [OpenMP][Clang][NVPTX] Only build one bitcode library for each SM

2021-03-07 Thread Shilei Tian via Phabricator via cfe-commits
tianshilei1992 updated this revision to Diff 328926.
tianshilei1992 added a comment.

rebase and ping


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D97198/new/

https://reviews.llvm.org/D97198

Files:
  clang/lib/Driver/ToolChains/Cuda.cpp
  clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx-cuda_102-sm_35.bc
  clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx-sm_35.bc
  clang/test/Driver/openmp-offload-gpu.c
  openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
  openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu

Index: openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
===
--- openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -53,46 +53,28 @@
   return (double)nsecs * __kmpc_impl_get_wtick();
 }
 
-// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask().
 DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
-#if CUDA_VERSION < 9020
-  return __nvvm_vote_ballot(1);
-#else
   unsigned int Mask;
   asm volatile("activemask.b32 %0;" : "=r"(Mask));
   return Mask;
-#endif
 }
 
-// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
 DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
  int32_t SrcLane) {
-#if CUDA_VERSION >= 9000
   return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f);
-#else
-  return __nvvm_shfl_idx_i32(Var, SrcLane, 0x1f);
-#endif // CUDA_VERSION
 }
 
 DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
   int32_t Var, uint32_t Delta,
   int32_t Width) {
   int32_t T = ((WARPSIZE - Width) << 8) | 0x1f;
-#if CUDA_VERSION >= 9000
   return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
-#else
-  return __nvvm_shfl_down_i32(Var, Delta, T);
-#endif // CUDA_VERSION
 }
 
 DEVICE void __kmpc_impl_syncthreads() { __syncthreads(); }
 
 DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
-#if CUDA_VERSION >= 9000
   __nvvm_bar_warp_sync(Mask);
-#else
-  // In Cuda < 9.0 no need to sync threads in warps.
-#endif // CUDA_VERSION
 }
 
 // NVPTX specific kernel initialization
Index: openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
===
--- openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
+++ openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@@ -137,6 +137,7 @@
  -Xclang -emit-llvm-bc
  -Xclang -aux-triple -Xclang ${aux_triple}
  -fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device
+ -Xclang -target-feature -Xclang +ptx61
  -D__CUDACC__
  -I${devicertl_base_directory}
  -I${devicertl_nvptx_directory}/src)
@@ -150,81 +151,51 @@
 # Create target to build all Bitcode libraries.
 add_custom_target(omptarget-nvptx-bc)
 
-# This map is from clang/lib/Driver/ToolChains/Cuda.cpp.
-# The last element is the default case.
-set(cuda_version_list 112 111 110 102 101 100 92 91 90 80)
-set(ptx_feature_list 70 70 70 65 64 63 61 61 60 42)
-# The following two lines of ugly code is not needed when the minimal CMake
-# version requirement is 3.17+.
-list(LENGTH cuda_version_list num_version_supported)
-math(EXPR loop_range "${num_version_supported} - 1")
-
-# Generate a Bitcode library for all the compute capabilities the user
-# requested and all PTX version we know for now.
+# Generate a Bitcode library for all the compute capabilities the user requested
 foreach(sm ${nvptx_sm_list})
-  set(sm_flags -Xclang -target-cpu -Xclang sm_${sm} "-D__CUDA_ARCH__=${sm}0")
-
-  # Uncomment the following code and remove those ugly part if the feature
-  # is available.
-  # foreach(cuda_version ptx_num IN ZIP_LISTS cuda_version_list ptx_feature_list)
-  foreach(itr RANGE ${loop_range})
-list(GET cuda_version_list ${itr} cuda_version)
-list(GET ptx_feature_list ${itr} ptx_num)
-set(cuda_flags ${sm_flags})
-list(APPEND cuda_flags -Xclang -target-feature -Xclang +ptx${ptx_num})
-if("${cuda_version}" MATCHES "^([0-9]+)([0-9])$")
-  set(cuda_version_major ${CMAKE_MATCH_1})
-  set(cuda_version_minor ${CMAKE_MATCH_2})
-else()
-  libomptarget_error_say(
-"Unrecognized CUDA version format: ${cuda_version}")
-endif()
-list(APPEND cuda_flags
-  "-DCUDA_VERSION=${cuda_version_major}0${cuda_version_minor}0")
-
-set(bc_files "")
-foreach(src ${cuda_src_files})
-  get_filename_component(infile ${src} ABSOLUTE)
-  get_filename_component(outfile ${src} NAME)
-  set(outfile "${outfile}-cuda_${cuda_version}-sm_${sm}.bc")
-
-  add_custom_command(OUTPUT ${outfile}
-COMMAND ${cuda_compiler} ${bc_flags}
-  ${cuda_flags} ${MAX_SM_DEFINITION} ${infile} -o ${outfile}
-DEPENDS ${infile}
-  

[PATCH] D97198: [OpenMP][Clang][NVPTX] Only build one bitcode library for each SM

2021-02-22 Thread Shilei Tian via Phabricator via cfe-commits
tianshilei1992 updated this revision to Diff 325492.
tianshilei1992 added a comment.

use `ptx61` instead


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D97198/new/

https://reviews.llvm.org/D97198

Files:
  clang/lib/Driver/ToolChains/Cuda.cpp
  clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx-cuda_102-sm_35.bc
  clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx-sm_35.bc
  clang/test/Driver/openmp-offload-gpu.c
  openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
  openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu

Index: openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
===
--- openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -53,46 +53,28 @@
   return (double)nsecs * __kmpc_impl_get_wtick();
 }
 
-// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask().
 DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
-#if CUDA_VERSION < 9020
-  return __nvvm_vote_ballot(1);
-#else
   unsigned int Mask;
   asm volatile("activemask.b32 %0;" : "=r"(Mask));
   return Mask;
-#endif
 }
 
-// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
 DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
  int32_t SrcLane) {
-#if CUDA_VERSION >= 9000
   return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f);
-#else
-  return __nvvm_shfl_idx_i32(Var, SrcLane, 0x1f);
-#endif // CUDA_VERSION
 }
 
 DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
   int32_t Var, uint32_t Delta,
   int32_t Width) {
   int32_t T = ((WARPSIZE - Width) << 8) | 0x1f;
-#if CUDA_VERSION >= 9000
   return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
-#else
-  return __nvvm_shfl_down_i32(Var, Delta, T);
-#endif // CUDA_VERSION
 }
 
 DEVICE void __kmpc_impl_syncthreads() { __syncthreads(); }
 
 DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
-#if CUDA_VERSION >= 9000
   __nvvm_bar_warp_sync(Mask);
-#else
-  // In Cuda < 9.0 no need to sync threads in warps.
-#endif // CUDA_VERSION
 }
 
 // NVPTX specific kernel initialization
Index: openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
===
--- openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
+++ openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@@ -137,6 +137,7 @@
  -Xclang -emit-llvm-bc
  -Xclang -aux-triple -Xclang ${aux_triple}
  -fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device
+ -Xclang -target-feature -Xclang +ptx61
  -D__CUDACC__
  -I${devicertl_base_directory}
  -I${devicertl_nvptx_directory}/src)
@@ -150,81 +151,51 @@
 # Create target to build all Bitcode libraries.
 add_custom_target(omptarget-nvptx-bc)
 
-# This map is from clang/lib/Driver/ToolChains/Cuda.cpp.
-# The last element is the default case.
-set(cuda_version_list 112 111 110 102 101 100 92 91 90 80)
-set(ptx_feature_list 71 71 70 65 64 63 61 61 60 42)
-# The following two lines of ugly code is not needed when the minimal CMake
-# version requirement is 3.17+.
-list(LENGTH cuda_version_list num_version_supported)
-math(EXPR loop_range "${num_version_supported} - 1")
-
-# Generate a Bitcode library for all the compute capabilities the user
-# requested and all PTX version we know for now.
+# Generate a Bitcode library for all the compute capabilities the user requested
 foreach(sm ${nvptx_sm_list})
-  set(sm_flags -Xclang -target-cpu -Xclang sm_${sm} "-D__CUDA_ARCH__=${sm}0")
-
-  # Uncomment the following code and remove those ugly part if the feature
-  # is available.
-  # foreach(cuda_version ptx_num IN ZIP_LISTS cuda_version_list ptx_feature_list)
-  foreach(itr RANGE ${loop_range})
-list(GET cuda_version_list ${itr} cuda_version)
-list(GET ptx_feature_list ${itr} ptx_num)
-set(cuda_flags ${sm_flags})
-list(APPEND cuda_flags -Xclang -target-feature -Xclang +ptx${ptx_num})
-if("${cuda_version}" MATCHES "^([0-9]+)([0-9])$")
-  set(cuda_version_major ${CMAKE_MATCH_1})
-  set(cuda_version_minor ${CMAKE_MATCH_2})
-else()
-  libomptarget_error_say(
-"Unrecognized CUDA version format: ${cuda_version}")
-endif()
-list(APPEND cuda_flags
-  "-DCUDA_VERSION=${cuda_version_major}0${cuda_version_minor}0")
-
-set(bc_files "")
-foreach(src ${cuda_src_files})
-  get_filename_component(infile ${src} ABSOLUTE)
-  get_filename_component(outfile ${src} NAME)
-  set(outfile "${outfile}-cuda_${cuda_version}-sm_${sm}.bc")
-
-  add_custom_command(OUTPUT ${outfile}
-COMMAND ${cuda_compiler} ${bc_flags}
-  ${cuda_flags} ${MAX_SM_DEFINITION} ${infile} -o ${outfile}
-DEPENDS 

[PATCH] D97198: [OpenMP][Clang][NVPTX] Only build one bitcode library for each SM

2021-02-22 Thread Shilei Tian via Phabricator via cfe-commits
tianshilei1992 created this revision.
tianshilei1992 added reviewers: jdoerfert, JonChesterfield, ABataev, grokos, 
ye-luo.
Herald added subscribers: guansong, yaxunl, mgorny.
tianshilei1992 requested review of this revision.
Herald added subscribers: openmp-commits, cfe-commits, sstefan1.
Herald added projects: clang, OpenMP.

In D97003 , CUDA 9.2 is the minimum 
requirement for OpenMP offloading on
NVPTX target. We don't need to have macros in source code to select right 
functions
based on CUDA version. we don't need to compile multiple bitcode libraries of
different CUDA versions for each SM. We don't need to worry about future
compatibility with newer CUDA version.

`-target-feature +ptx60` is used in this patch, which corresponds to the highest
PTX version that CUDA 9.2 can support.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D97198

Files:
  clang/lib/Driver/ToolChains/Cuda.cpp
  clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx-cuda_102-sm_35.bc
  clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx-sm_35.bc
  clang/test/Driver/openmp-offload-gpu.c
  openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
  openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu

Index: openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
===
--- openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -53,46 +53,28 @@
   return (double)nsecs * __kmpc_impl_get_wtick();
 }
 
-// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask().
 DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
-#if CUDA_VERSION < 9020
-  return __nvvm_vote_ballot(1);
-#else
   unsigned int Mask;
   asm volatile("activemask.b32 %0;" : "=r"(Mask));
   return Mask;
-#endif
 }
 
-// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
 DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
  int32_t SrcLane) {
-#if CUDA_VERSION >= 9000
   return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f);
-#else
-  return __nvvm_shfl_idx_i32(Var, SrcLane, 0x1f);
-#endif // CUDA_VERSION
 }
 
 DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
   int32_t Var, uint32_t Delta,
   int32_t Width) {
   int32_t T = ((WARPSIZE - Width) << 8) | 0x1f;
-#if CUDA_VERSION >= 9000
   return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
-#else
-  return __nvvm_shfl_down_i32(Var, Delta, T);
-#endif // CUDA_VERSION
 }
 
 DEVICE void __kmpc_impl_syncthreads() { __syncthreads(); }
 
 DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
-#if CUDA_VERSION >= 9000
   __nvvm_bar_warp_sync(Mask);
-#else
-  // In Cuda < 9.0 no need to sync threads in warps.
-#endif // CUDA_VERSION
 }
 
 // NVPTX specific kernel initialization
Index: openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
===
--- openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
+++ openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@@ -137,6 +137,7 @@
  -Xclang -emit-llvm-bc
  -Xclang -aux-triple -Xclang ${aux_triple}
  -fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device
+ -Xclang -target-feature -Xclang +ptx60
  -D__CUDACC__
  -I${devicertl_base_directory}
  -I${devicertl_nvptx_directory}/src)
@@ -150,81 +151,51 @@
 # Create target to build all Bitcode libraries.
 add_custom_target(omptarget-nvptx-bc)
 
-# This map is from clang/lib/Driver/ToolChains/Cuda.cpp.
-# The last element is the default case.
-set(cuda_version_list 112 111 110 102 101 100 92 91 90 80)
-set(ptx_feature_list 71 71 70 65 64 63 61 61 60 42)
-# The following two lines of ugly code is not needed when the minimal CMake
-# version requirement is 3.17+.
-list(LENGTH cuda_version_list num_version_supported)
-math(EXPR loop_range "${num_version_supported} - 1")
-
-# Generate a Bitcode library for all the compute capabilities the user
-# requested and all PTX version we know for now.
+# Generate a Bitcode library for all the compute capabilities the user requested
 foreach(sm ${nvptx_sm_list})
-  set(sm_flags -Xclang -target-cpu -Xclang sm_${sm} "-D__CUDA_ARCH__=${sm}0")
-
-  # Uncomment the following code and remove those ugly part if the feature
-  # is available.
-  # foreach(cuda_version ptx_num IN ZIP_LISTS cuda_version_list ptx_feature_list)
-  foreach(itr RANGE ${loop_range})
-list(GET cuda_version_list ${itr} cuda_version)
-list(GET ptx_feature_list ${itr} ptx_num)
-set(cuda_flags ${sm_flags})
-list(APPEND cuda_flags -Xclang -target-feature -Xclang +ptx${ptx_num})
-if("${cuda_version}" MATCHES "^([0-9]+)([0-9])$")
-  set(cuda_version_major ${CMAKE_MATCH_1})
-