[og12] Miscellaneous clean-up re OpenMP 'ompx_unified_shared_mem_space', 'ompx_host_mem_space' (was: [PATCH 3/5] openmp, nvptx: ompx_unified_shared_mem_alloc)

2023-02-16 Thread Thomas Schwinge
Hi!

On 2023-02-10T15:31:47+, Andrew Stubbs  wrote:
> On 10/02/2023 14:21, Thomas Schwinge wrote:
>> Is the correct fix the following [...]
>
> Yes, [...]

>>> --- a/libgomp/config/nvptx/allocator.c
>>> +++ b/libgomp/config/nvptx/allocator.c
>>> @@ -125,6 +125,8 @@ nvptx_memspace_alloc (omp_memspace_handle_t memspace, 
>>> size_t size)
>>> __atomic_store_n (&__nvptx_lowlat_heap_root, root.raw, 
>>> MEMMODEL_RELEASE);
>>> return result;
>>>   }
>>> +  else if (memspace == ompx_host_mem_space)
>>> +return NULL;
>>> else
>>>   return malloc (size);
>>>   }
>>> @@ -145,6 +147,8 @@ nvptx_memspace_calloc (omp_memspace_handle_t memspace, 
>>> size_t size)
>>>
>>> return result;
>>>   }
>>> +  else if (memspace == ompx_host_mem_space)
>>> +return NULL;
>>> else
>>>   return calloc (1, size);
>>>   }
>>> @@ -354,6 +358,8 @@ nvptx_memspace_realloc (omp_memspace_handle_t memspace, 
>>> void *addr,
>>>}
>>> return result;
>>>   }
>>> +  else if (memspace == ompx_host_mem_space)
>>> +return NULL;
>>> else
>>>   return realloc (addr, size);
>>>   }
>>
>> (I'd have added an explicit no-op (or, 'abort'?) to
>> 'nvptx_memspace_free', but that's maybe just me...)  ;-\
>
> Why? The host memspace is just the regular heap, which can be a thing on
> any device. It's an extension though so we can define it either way.

My point was: for nvptx libgomp, all 'ompx_host_mem_space' allocator
functions (cited above) 'return NULL', and it's a cheap check to verify
that in 'nvptx_memspace_free'.

>>> --- a/libgomp/libgomp.h
>>> +++ b/libgomp/libgomp.h
>>
>>> +extern void * gomp_usm_alloc (size_t size, int device_num);
>>> +extern void gomp_usm_free (void *device_ptr, int device_num);
>>> +extern bool gomp_is_usm_ptr (void *ptr);
>>
>> 'gomp_is_usm_ptr' isn't defined/used anywhere; I'll remove it.
>
> I think I started that and then decided against. Thanks.

These three combined, I've pushed to devel/omp/gcc-12 branch
commit 23f52e49368d7b26a1b1a72d6bb903d31666e961
"Miscellaneous clean-up re OpenMP 'ompx_unified_shared_mem_space', 
'ompx_host_mem_space'",
see attached.


>>> --- a/libgomp/target.c
>>> +++ b/libgomp/target.c
>>
>>> @@ -3740,6 +3807,9 @@ gomp_load_plugin_for_device (struct gomp_device_descr 
>>> *device,
>>> DLSYM (unload_image);
>>> DLSYM (alloc);
>>> DLSYM (free);
>>> +  DLSYM_OPT (usm_alloc, usm_alloc);
>>> +  DLSYM_OPT (usm_free, usm_free);
>>> +  DLSYM_OPT (is_usm_ptr, is_usm_ptr);
>>> DLSYM (dev2host);
>>> DLSYM (host2dev);
>>
>> As a sanity check, shouldn't we check that either none or all three of
>> those are defined, like in the 'if (cuda && cuda != 4) { [error] }' check
>> a bit further down?
>
> This is only going to happen when somebody writes a new plugin, and then
> they'll discover very quickly that there are issues. I've wasted more
> time writing this sentence than it's worth already. :)

Eh.  ;-) OK, outvoted.


Grüße
 Thomas


-
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955
>From 23f52e49368d7b26a1b1a72d6bb903d31666e961 Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Tue, 14 Feb 2023 17:10:57 +0100
Subject: [PATCH] Miscellaneous clean-up re OpenMP
 'ompx_unified_shared_mem_space', 'ompx_host_mem_space'

Clean-up for og12 commit 84914e197d91a67b3d27db0e4c69a433462983a5
"openmp, nvptx: ompx_unified_shared_mem_alloc".  No functional change.

	libgomp/
	* config/linux/allocator.c (linux_memspace_calloc): Elide
	(innocuous) duplicate 'if' condition.
	* config/nvptx/allocator.c (nvptx_memspace_free): Explicitly
	handle 'memspace == ompx_host_mem_space'.
	* libgomp.h (gomp_is_usm_ptr): Remove.
---
 libgomp/ChangeLog.omp| 6 ++
 libgomp/config/linux/allocator.c | 3 +--
 libgomp/config/nvptx/allocator.c | 4 
 libgomp/libgomp.h| 1 -
 4 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/libgomp/ChangeLog.omp b/libgomp/ChangeLog.omp
index b667c72b8ca..1c4b1833c0b 100644
--- a/libgomp/ChangeLog.omp
+++ b/libgomp/ChangeLog.omp
@@ -1,5 +1,11 @@
 2023-02-16  Thomas Schwinge  
 
+	* config/linux/allocator.c (linux_memspace_calloc): Elide
+	(innocuous) duplicate 'if' condition.
+	* config/nvptx/allocator.c (nvptx_memspace_free): Explicitly
+	handle 'memspace == ompx_host_mem_space'.
+	* libgomp.h (gomp_is_usm_ptr): Remove.
+
 	* basic-allocator.c (BASIC_ALLOC_YIELD): instead of '#deine',
 	'#define' it.
 
diff --git a/libgomp/config/linux/allocator.c b/libgomp/config/linux/allocator.c
index 07af3a2821a..8a9171c36df 100644
--- a/libgomp/config/linux/allocator.c
+++ b/libgomp/config/linux/allocator.c
@@ -95,8 +95,7 @@ linux_memspace_calloc (omp_memspace_handle_t memspace, size_t size, int pin)
   memset (ret, 0, size);
   return ret;
 }
- 

Re: [PATCH 3/5] openmp, nvptx: ompx_unified_shared_mem_alloc

2023-02-10 Thread Andrew Stubbs

On 10/02/2023 14:21, Thomas Schwinge wrote:

Is the correct fix the following (conceptually like
'linux_memspace_alloc' cited above), or is there something that I fail to
understand?

  static void *
  linux_memspace_calloc (omp_memspace_handle_t memspace, size_t size, int 
pin)
  {
if (memspace == ompx_unified_shared_mem_space)
  {
void *ret = gomp_usm_alloc (size, GOMP_DEVICE_ICV);
memset (ret, 0, size);
return ret;
  }
 -  else if (memspace == ompx_unified_shared_mem_space
 -  || pin)
 +  else if (pin)
  return linux_memspace_alloc (memspace, size, pin);
else
  return calloc (1, size);


Yes, I think that is what was intended (and what actually happens). You 
can have your memory both unified and pinned (well, maybe it's possible, 
but there's no one Cuda API for that), so the USM takes precedence.



The following ones then again are conceptually like
'linux_memspace_alloc' cited above:


@@ -77,9 +86,9 @@ static void
  linux_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size,
int pin)
  {
-  (void)memspace;
-
-  if (pin)
+  if (memspace == ompx_unified_shared_mem_space)
+gomp_usm_free (addr, GOMP_DEVICE_ICV);
+  else if (pin)
  munmap (addr, size);
else
  free (addr);
@@ -89,7 +98,9 @@ static void *
  linux_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
   size_t oldsize, size_t size, int oldpin, int pin)
  {
-  if (oldpin && pin)
+  if (memspace == ompx_unified_shared_mem_space)
+goto manual_realloc;
+  else if (oldpin && pin)
  {
void *newaddr = mremap (addr, oldsize, size, MREMAP_MAYMOVE);
if (newaddr == MAP_FAILED)
@@ -98,18 +109,19 @@ linux_memspace_realloc (omp_memspace_handle_t memspace, 
void *addr,
[...]


Yes.

..., and similar those here:


--- a/libgomp/config/nvptx/allocator.c
+++ b/libgomp/config/nvptx/allocator.c
@@ -125,6 +125,8 @@ nvptx_memspace_alloc (omp_memspace_handle_t memspace, 
size_t size)
__atomic_store_n (&__nvptx_lowlat_heap_root, root.raw, 
MEMMODEL_RELEASE);
return result;
  }
+  else if (memspace == ompx_host_mem_space)
+return NULL;
else
  return malloc (size);
  }
@@ -145,6 +147,8 @@ nvptx_memspace_calloc (omp_memspace_handle_t memspace, 
size_t size)

return result;
  }
+  else if (memspace == ompx_host_mem_space)
+return NULL;
else
  return calloc (1, size);
  }
@@ -354,6 +358,8 @@ nvptx_memspace_realloc (omp_memspace_handle_t memspace, 
void *addr,
   }
return result;
  }
+  else if (memspace == ompx_host_mem_space)
+return NULL;
else
  return realloc (addr, size);
  }


(I'd have added an explicit no-op (or, 'abort'?) to
'nvptx_memspace_free', but that's maybe just me...)  ;-\


Why? The host memspace is just the regular heap, which can be a thing on 
any device. It's an extension though so we can define it either way.



--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h



+extern void * gomp_usm_alloc (size_t size, int device_num);
+extern void gomp_usm_free (void *device_ptr, int device_num);
+extern bool gomp_is_usm_ptr (void *ptr);


'gomp_is_usm_ptr' isn't defined/used anywhere; I'll remove it.


I think I started that and then decided against. Thanks.


--- a/libgomp/target.c
+++ b/libgomp/target.c



@@ -3740,6 +3807,9 @@ gomp_load_plugin_for_device (struct gomp_device_descr 
*device,
DLSYM (unload_image);
DLSYM (alloc);
DLSYM (free);
+  DLSYM_OPT (usm_alloc, usm_alloc);
+  DLSYM_OPT (usm_free, usm_free);
+  DLSYM_OPT (is_usm_ptr, is_usm_ptr);
DLSYM (dev2host);
DLSYM (host2dev);


As a sanity check, shouldn't we check that either none or all three of
those are defined, like in the 'if (cuda && cuda != 4) { [error] }' check
a bit further down?


This is only going to happen when somebody writes a new plugin, and then 
they'll discover very quickly that there are issues. I've wasted more 
time writing this sentence than it's worth already. :)



Note that these remarks likewise apply to the current upstream
submission:
>
 "openmp, nvptx: ompx_unified_shared_mem_alloc".


I have new patches to heap on top of this set already on OG12, and more 
planned, plus these ones you're working on; the whole patchset is going 
to have to get a rebase, squash, and tidy "soonish".


Andrew


Re: [PATCH 3/5] openmp, nvptx: ompx_unified_shared_mem_alloc

2023-02-10 Thread Thomas Schwinge
Hi Andrew!

On 2022-03-08T11:30:57+, Hafiz Abid Qadeer  wrote:
> From: Andrew Stubbs 
>
> This adds support for using Cuda Managed Memory with omp_alloc.  It will be
> used as the underpinnings for "requires unified_shared_memory" in a later
> patch.
>
> There are two new predefined allocators, ompx_unified_shared_mem_alloc and
> ompx_host_mem_alloc, plus corresponding memory spaces, [...]

> --- a/libgomp/config/linux/allocator.c
> +++ b/libgomp/config/linux/allocator.c
> @@ -42,9 +42,11 @@
>  static void *
>  linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin)
>  {
> -  (void)memspace;
> -
> -  if (pin)
> +  if (memspace == ompx_unified_shared_mem_space)
> +{
> +  return gomp_usm_alloc (size, GOMP_DEVICE_ICV);
> +}
> +  else if (pin)
>  {
>void *addr = mmap (NULL, size, PROT_READ | PROT_WRITE,
>MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

This I understand conceptually, but then:

> @@ -67,7 +69,14 @@ linux_memspace_alloc (omp_memspace_handle_t memspace, 
> size_t size, int pin)
>  static void *
>  linux_memspace_calloc (omp_memspace_handle_t memspace, size_t size, int pin)
>  {
> -  if (pin)
> +  if (memspace == ompx_unified_shared_mem_space)
> +{
> +  void *ret = gomp_usm_alloc (size, GOMP_DEVICE_ICV);
> +  memset (ret, 0, size);
> +  return ret;
> +}
> +  else if (memspace == ompx_unified_shared_mem_space
> +  || pin)
>  return linux_memspace_alloc (memspace, size, pin);
>else
>  return calloc (1, size);

..., here, we've got a duplicated (and thus always-false) expression
'memspace == ompx_unified_shared_mem_space' (..., which
'-Wduplicated-cond' fails to report; 
"'-Wduplicated-cond' doesn't diagnose duplicated subexpressions"...).
Is the correct fix the following (conceptually like
'linux_memspace_alloc' cited above), or is there something that I fail to
understand?

 static void *
 linux_memspace_calloc (omp_memspace_handle_t memspace, size_t size, int 
pin)
 {
   if (memspace == ompx_unified_shared_mem_space)
 {
   void *ret = gomp_usm_alloc (size, GOMP_DEVICE_ICV);
   memset (ret, 0, size);
   return ret;
 }
-  else if (memspace == ompx_unified_shared_mem_space
-  || pin)
+  else if (pin)
 return linux_memspace_alloc (memspace, size, pin);
   else
 return calloc (1, size);

The following ones then again are conceptually like
'linux_memspace_alloc' cited above:

> @@ -77,9 +86,9 @@ static void
>  linux_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size,
>int pin)
>  {
> -  (void)memspace;
> -
> -  if (pin)
> +  if (memspace == ompx_unified_shared_mem_space)
> +gomp_usm_free (addr, GOMP_DEVICE_ICV);
> +  else if (pin)
>  munmap (addr, size);
>else
>  free (addr);
> @@ -89,7 +98,9 @@ static void *
>  linux_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
>   size_t oldsize, size_t size, int oldpin, int pin)
>  {
> -  if (oldpin && pin)
> +  if (memspace == ompx_unified_shared_mem_space)
> +goto manual_realloc;
> +  else if (oldpin && pin)
>  {
>void *newaddr = mremap (addr, oldsize, size, MREMAP_MAYMOVE);
>if (newaddr == MAP_FAILED)
> @@ -98,18 +109,19 @@ linux_memspace_realloc (omp_memspace_handle_t memspace, 
> void *addr,
> [...]

..., and similar those here:

> --- a/libgomp/config/nvptx/allocator.c
> +++ b/libgomp/config/nvptx/allocator.c
> @@ -125,6 +125,8 @@ nvptx_memspace_alloc (omp_memspace_handle_t memspace, 
> size_t size)
>__atomic_store_n (&__nvptx_lowlat_heap_root, root.raw, 
> MEMMODEL_RELEASE);
>return result;
>  }
> +  else if (memspace == ompx_host_mem_space)
> +return NULL;
>else
>  return malloc (size);
>  }
> @@ -145,6 +147,8 @@ nvptx_memspace_calloc (omp_memspace_handle_t memspace, 
> size_t size)
>
>return result;
>  }
> +  else if (memspace == ompx_host_mem_space)
> +return NULL;
>else
>  return calloc (1, size);
>  }
> @@ -354,6 +358,8 @@ nvptx_memspace_realloc (omp_memspace_handle_t memspace, 
> void *addr,
>   }
>return result;
>  }
> +  else if (memspace == ompx_host_mem_space)
> +return NULL;
>else
>  return realloc (addr, size);
>  }

(I'd have added an explicit no-op (or, 'abort'?) to
'nvptx_memspace_free', but that's maybe just me...)  ;-\


> --- a/libgomp/libgomp.h
> +++ b/libgomp/libgomp.h

> +extern void * gomp_usm_alloc (size_t size, int device_num);
> +extern void gomp_usm_free (void *device_ptr, int device_num);
> +extern bool gomp_is_usm_ptr (void *ptr);

'gomp_is_usm_ptr' isn't defined/used anywhere; I'll remove it.


> --- a/libgomp/target.c
> +++ b/libgomp/target.c

> @@ -3740,6 +3807,9 @@ gomp_load_plugin_for_device (struct gomp_device_descr 
> *device,
>DLSYM (unload_image);
>DLSYM (alloc);
>DLSYM (free);
> +  DLSYM_OPT 

[PATCH 3/5] openmp, nvptx: ompx_unified_shared_mem_alloc

2022-03-08 Thread Hafiz Abid Qadeer
From: Andrew Stubbs 

This adds support for using Cuda Managed Memory with omp_alloc.  It will be
used as the underpinnings for "requires unified_shared_memory" in a later
patch.

There are two new predefined allocators, ompx_unified_shared_mem_alloc and
ompx_host_mem_alloc, plus corresponding memory spaces, which can be used to
allocate memory in the "managed" space and explicitly on the host (it is
intended that "malloc" will be intercepted by the compiler).

The nvptx plugin is modified to make the necessary Cuda calls, and libgomp
is modified to switch to shared-memory mode for USM allocated mappings.

libgomp/ChangeLog:

* allocator.c (omp_max_predefined_alloc): Update.
(omp_aligned_alloc): Don't fallback ompx_host_mem_alloc.
(omp_aligned_calloc): Likewise.
(omp_realloc): Likewise.
* config/linux/allocator.c (linux_memspace_alloc): Handle USM.
(linux_memspace_calloc): Handle USM.
(linux_memspace_free): Handle USM.
(linux_memspace_realloc): Handle USM.
* config/nvptx/allocator.c (nvptx_memspace_alloc): Reject
ompx_host_mem_alloc.
(nvptx_memspace_calloc): Likewise.
(nvptx_memspace_realloc): Likewise.
* libgomp-plugin.h (GOMP_OFFLOAD_usm_alloc): New prototype.
(GOMP_OFFLOAD_usm_free): New prototype.
(GOMP_OFFLOAD_is_usm_ptr): New prototype.
* libgomp.h (gomp_usm_alloc): New prototype.
(gomp_usm_free): New prototype.
(gomp_is_usm_ptr): New prototype.
(struct gomp_device_descr): Add USM functions.
* omp.h.in (omp_memspace_handle_t): Add ompx_unified_shared_mem_space
and ompx_host_mem_space.
(omp_allocator_handle_t): Add ompx_unified_shared_mem_alloc and
ompx_host_mem_alloc.
* omp_lib.f90.in: Likewise.
* plugin/plugin-nvptx.c (nvptx_alloc): Add "usm" parameter.
Call cuMemAllocManaged as appropriate.
(GOMP_OFFLOAD_alloc): Move internals to ...
(GOMP_OFFLOAD_alloc_1): ... this, and add usm parameter.
(GOMP_OFFLOAD_usm_alloc): New function.
(GOMP_OFFLOAD_usm_free): New function.
(GOMP_OFFLOAD_is_usm_ptr): New function.
* target.c (gomp_map_vars_internal): Add USM support.
(gomp_usm_alloc): New function.
(gomp_usm_free): New function.
(gomp_load_plugin_for_device): New function.
* testsuite/libgomp.c/usm-1.c: New test.
* testsuite/libgomp.c/usm-2.c: New test.
* testsuite/libgomp.c/usm-3.c: New test.
* testsuite/libgomp.c/usm-4.c: New test.
* testsuite/libgomp.c/usm-5.c: New test.
---
 libgomp/allocator.c | 13 --
 libgomp/config/linux/allocator.c| 48 
 libgomp/config/nvptx/allocator.c|  6 +++
 libgomp/libgomp-plugin.h|  3 ++
 libgomp/libgomp.h   |  6 +++
 libgomp/omp.h.in|  4 ++
 libgomp/omp_lib.f90.in  |  8 
 libgomp/plugin/plugin-nvptx.c   | 45 ---
 libgomp/target.c| 70 +
 libgomp/testsuite/libgomp.c/usm-1.c | 24 ++
 libgomp/testsuite/libgomp.c/usm-2.c | 32 +
 libgomp/testsuite/libgomp.c/usm-3.c | 35 +++
 libgomp/testsuite/libgomp.c/usm-4.c | 36 +++
 libgomp/testsuite/libgomp.c/usm-5.c | 28 
 14 files changed, 330 insertions(+), 28 deletions(-)
 create mode 100644 libgomp/testsuite/libgomp.c/usm-1.c
 create mode 100644 libgomp/testsuite/libgomp.c/usm-2.c
 create mode 100644 libgomp/testsuite/libgomp.c/usm-3.c
 create mode 100644 libgomp/testsuite/libgomp.c/usm-4.c
 create mode 100644 libgomp/testsuite/libgomp.c/usm-5.c

diff --git a/libgomp/allocator.c b/libgomp/allocator.c
index 000ccc2dd9c..18045dbe0c4 100644
--- a/libgomp/allocator.c
+++ b/libgomp/allocator.c
@@ -32,7 +32,7 @@
 #include 
 #include 
 
-#define omp_max_predefined_alloc ompx_pinned_mem_alloc
+#define omp_max_predefined_alloc ompx_host_mem_alloc
 
 /* These macros may be overridden in config//allocator.c.  */
 #ifndef MEMSPACE_ALLOC
@@ -68,6 +68,8 @@ static const omp_memspace_handle_t predefined_alloc_mapping[] 
= {
   omp_low_lat_mem_space,   /* omp_pteam_mem_alloc. */
   omp_low_lat_mem_space,   /* omp_thread_mem_alloc. */
   omp_default_mem_space,   /* ompx_pinned_mem_alloc. */
+  ompx_unified_shared_mem_space,  /* ompx_unified_shared_mem_alloc. */
+  ompx_host_mem_space, /* ompx_host_mem_alloc.  */
 };
 
 struct omp_allocator_data
@@ -367,7 +369,8 @@ fail:
   int fallback = (allocator_data
  ? allocator_data->fallback
  : (allocator == omp_default_mem_alloc
-|| allocator == ompx_pinned_mem_alloc)
+|| allocator == ompx_pinned_mem_alloc
+|| allocator == ompx_host_mem_alloc)
  ? omp_atv_null_fb
  : omp_atv_default_mem_fb);
   switch (fallback)
@@ -597,7 +600,8 @@ fail: