From: Andrew Stubbs <[email protected]>

This patch, which was originally applied to the OG11 branch in 2021,
fixes a bug in which testcases using thread_limit larger than the
number of physical threads would crash with a memory fault.  This was
exacerbated in testcases with a lot of register pressure because the
autoscaling reduces the number of physical threads to compensate for
the increased resource usage.  We specifically saw this happen in the
t-reduction testcase in the external omptests testsuite.  With this patch
that testcase now passes, and a couple of other failures are also fixed.

The included test case was greatly reduced from the t-reduction testcase
with c-vise and hand-editing.  The code is nonsensical, but it was
triggering the memory fault with only 13 threads.  It was also checked
with nvidia offloading and on x86_64 without offloading.

libgomp/ChangeLog:

        * config/gcn/bar.h (gomp_barrier_init): Limit thread count to the
        actual physical number.
        * config/gcn/team.c (gomp_team_start): Don't attempt to set up
        threads that do not exist.
        * testsuite/libgomp.c/thread-limit-6.c: New test case.

Co-Authored-by: Sandra Loosemore <[email protected]>
---
 libgomp/config/gcn/bar.h                     |  3 ++
 libgomp/config/gcn/team.c                    |  4 ++
 libgomp/testsuite/libgomp.c/thread-limit-6.c | 53 ++++++++++++++++++++
 3 files changed, 60 insertions(+)
 create mode 100644 libgomp/testsuite/libgomp.c/thread-limit-6.c

diff --git a/libgomp/config/gcn/bar.h b/libgomp/config/gcn/bar.h
index 6e838ff54a8..bb6a17baefa 100644
--- a/libgomp/config/gcn/bar.h
+++ b/libgomp/config/gcn/bar.h
@@ -55,6 +55,9 @@ typedef unsigned int gomp_barrier_state_t;
 
 static inline void gomp_barrier_init (gomp_barrier_t *bar, unsigned count)
 {
+  unsigned actual_thread_count = __builtin_gcn_dim_size (1);
+  if (count > actual_thread_count)
+    count = actual_thread_count;
   bar->total = count;
   bar->awaited = count;
   bar->awaited_final = count;
diff --git a/libgomp/config/gcn/team.c b/libgomp/config/gcn/team.c
index c9c2f3c2419..2c8d653fe9c 100644
--- a/libgomp/config/gcn/team.c
+++ b/libgomp/config/gcn/team.c
@@ -213,6 +213,10 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned 
nthreads,
   if (nthreads == 1)
     return;
 
+  unsigned actual_thread_count = __builtin_gcn_dim_size (1);
+  if (nthreads > actual_thread_count)
+    nthreads = actual_thread_count;
+
   /* Release existing idle threads.  */
   for (unsigned i = 1; i < nthreads; ++i)
     {
diff --git a/libgomp/testsuite/libgomp.c/thread-limit-6.c 
b/libgomp/testsuite/libgomp.c/thread-limit-6.c
new file mode 100644
index 00000000000..77ef266de65
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/thread-limit-6.c
@@ -0,0 +1,53 @@
+/* { dg-additional-options "-O3" }  */
+
+/* This test is reduced from a larger example that used to give a 
+   "Memory access fault" on AMD GCN due to creating too many threads.  */
+
+int main() {
+  char Ac_0;
+  short As[2871];
+  int Ai[2871];
+  int Ci_0;
+  int Bi_0;
+  int Ai_0;
+  long All[2871];
+  long Cll_0;
+  long Bll_0;
+  long All_0;
+  float Af_0;
+  double Cd_0;
+  double Bd_0;
+  double Ad_0;
+  short Rs7, Rs10;
+  int Ri1, Ri2, Ri5, Ri6, Ri7, Ri10;
+  long Rll1, Rll2, Rll5, Rll6, Rll7, Rll10;
+  float Rf1;
+  double Rd2;
+
+  int i = 0;
+  for (int threads = 0; threads < 512; threads++)
+    {
+#pragma omp target teams num_teams(1) thread_limit(1024)
+#pragma omp parallel if (threads > 1) num_threads(threads)
+      {
+       Rs7 |= 1 < 0;
+       Rs10 = Rs10 || As[i] > 0;
+       Ri1 += Ai_0 + 0;
+       Ri2 += Ai_0 + (Bi_0 + Ci_0);
+       Ri5 *= i % 1000 == 0 ? 2 : 1;
+       Ri6 &= 0;
+       Ri7 |= 1 < 0;
+       Ri10 = Ri10 || Ai[i] > 0;
+       Rll1 += 0;
+       Rll2 += All_0 + (Bll_0 + Cll_0);
+       Rll5 *= i % 1000 == 0 ? : 1;
+       Rll6 &= 0;
+       Rll7 |= 1ll < 0;
+       Rll10 = Rll10 || All[i] > 0;
+       Rf1 += Af_0 + 0;
+       Rd2 += Ad_0 + (Bd_0 + Cd_0);
+      }
+    }
+  return 0;
+}
+
-- 
2.39.5

Reply via email to