From: Andrew Stubbs <[email protected]>
This patch, which was originally applied to the OG11 branch in 2021,
fixes a bug in which testcases using thread_limit larger than the
number of physical threads would crash with a memory fault. This was
exacerbated in testcases with a lot of register pressure because the
autoscaling reduces the number of physical threads to compensate for
the increased resource usage. We specifically saw this happen in the
t-reduction testcase in the external omptests testsuite. With this patch
that testcase now passes, and a couple of other failures are also fixed.
The included test case was greatly reduced from the t-reduction testcase
with c-vise and hand-editing. The code is nonsensical, but it was
triggering the memory fault with only 13 threads. It was also checked
with nvidia offloading and on x86_64 without offloading.
libgomp/ChangeLog:
* config/gcn/bar.h (gomp_barrier_init): Limit thread count to the
actual physical number.
* config/gcn/team.c (gomp_team_start): Don't attempt to set up
threads that do not exist.
* testsuite/libgomp.c/thread-limit-6.c: New test case.
Co-Authored-by: Sandra Loosemore <[email protected]>
---
libgomp/config/gcn/bar.h | 3 ++
libgomp/config/gcn/team.c | 4 ++
libgomp/testsuite/libgomp.c/thread-limit-6.c | 53 ++++++++++++++++++++
3 files changed, 60 insertions(+)
create mode 100644 libgomp/testsuite/libgomp.c/thread-limit-6.c
diff --git a/libgomp/config/gcn/bar.h b/libgomp/config/gcn/bar.h
index 6e838ff54a8..bb6a17baefa 100644
--- a/libgomp/config/gcn/bar.h
+++ b/libgomp/config/gcn/bar.h
@@ -55,6 +55,9 @@ typedef unsigned int gomp_barrier_state_t;
static inline void gomp_barrier_init (gomp_barrier_t *bar, unsigned count)
{
+ unsigned actual_thread_count = __builtin_gcn_dim_size (1);
+ if (count > actual_thread_count)
+ count = actual_thread_count;
bar->total = count;
bar->awaited = count;
bar->awaited_final = count;
diff --git a/libgomp/config/gcn/team.c b/libgomp/config/gcn/team.c
index c9c2f3c2419..2c8d653fe9c 100644
--- a/libgomp/config/gcn/team.c
+++ b/libgomp/config/gcn/team.c
@@ -213,6 +213,10 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned
nthreads,
if (nthreads == 1)
return;
+ unsigned actual_thread_count = __builtin_gcn_dim_size (1);
+ if (nthreads > actual_thread_count)
+ nthreads = actual_thread_count;
+
/* Release existing idle threads. */
for (unsigned i = 1; i < nthreads; ++i)
{
diff --git a/libgomp/testsuite/libgomp.c/thread-limit-6.c
b/libgomp/testsuite/libgomp.c/thread-limit-6.c
new file mode 100644
index 00000000000..77ef266de65
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/thread-limit-6.c
@@ -0,0 +1,53 @@
+/* { dg-additional-options "-O3" } */
+
+/* This test is reduced from a larger example that used to give a
+ "Memory access fault" on AMD GCN due to creating too many threads. */
+
+int main() {
+ char Ac_0;
+ short As[2871];
+ int Ai[2871];
+ int Ci_0;
+ int Bi_0;
+ int Ai_0;
+ long All[2871];
+ long Cll_0;
+ long Bll_0;
+ long All_0;
+ float Af_0;
+ double Cd_0;
+ double Bd_0;
+ double Ad_0;
+ short Rs7, Rs10;
+ int Ri1, Ri2, Ri5, Ri6, Ri7, Ri10;
+ long Rll1, Rll2, Rll5, Rll6, Rll7, Rll10;
+ float Rf1;
+ double Rd2;
+
+ int i = 0;
+ for (int threads = 0; threads < 512; threads++)
+ {
+#pragma omp target teams num_teams(1) thread_limit(1024)
+#pragma omp parallel if (threads > 1) num_threads(threads)
+ {
+ Rs7 |= 1 < 0;
+ Rs10 = Rs10 || As[i] > 0;
+ Ri1 += Ai_0 + 0;
+ Ri2 += Ai_0 + (Bi_0 + Ci_0);
+ Ri5 *= i % 1000 == 0 ? 2 : 1;
+ Ri6 &= 0;
+ Ri7 |= 1 < 0;
+ Ri10 = Ri10 || Ai[i] > 0;
+ Rll1 += 0;
+ Rll2 += All_0 + (Bll_0 + Cll_0);
+ Rll5 *= i % 1000 == 0 ? : 1;
+ Rll6 &= 0;
+ Rll7 |= 1ll < 0;
+ Rll10 = Rll10 || All[i] > 0;
+ Rf1 += Af_0 + 0;
+ Rd2 += Ad_0 + (Bd_0 + Cd_0);
+ }
+ }
+ return 0;
+}
+
--
2.39.5