[Bug libgomp/122281] New: libgomp: cuCtxSynchronize error: an illegal memory access was encountered in code that reserves memory correctly.

schulz.benjamin at googlemail dot com via Gcc-bugs Sat, 18 Oct 2025 00:12:45 -0700

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=122281


            Bug ID: 122281
           Summary: libgomp: cuCtxSynchronize error: an illegal memory
                    access was encountered in code that reserves memory
                    correctly.
           Product: gcc
           Version: 15.2.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: libgomp
          Assignee: unassigned at gcc dot gnu.org
          Reporter: schulz.benjamin at googlemail dot com
                CC: jakub at gcc dot gnu.org
  Target Milestone: ---

Created attachment 62557
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=62557&action=edit
sparsetests-xnvptx-none.ii.tar.gz

The following code, which is in datablockcontainer.h on line 227 and is called
by sparsetests.cpp on line 97, produces the errors below when compiled with
gcc-15.2 and -fopenmp -foffload=nvptx-none -fno-stack-protector:




libgomp: cuCtxSynchronize error: an illegal memory access was encountered

libgomp: cuModuleGetFunction (__do_global_dtors__entry) error: an illegal
memory access was encountered

libgomp: cuMemFree_v2 error: an illegal memory access was encountered



Here is the code listing:  



    void build_blocks_rank2(size_t block_rows, size_t block_cols, bool
remove_zeroblocks,bool devptr)
    {
        const size_t nblocks_row = (dblock.dpextents[0] + block_rows - 1) /
block_rows;
        const size_t nblocks_col = (dblock.dpextents[1] + block_cols - 1) /
block_cols;
        const size_t maxblocks   = nblocks_row * nblocks_col;

        pooled_offsets_flat = devptr
                              ? (size_t*)omp_target_alloc(sizeof(size_t) * 2 *
maxblocks, dblock.devptr_devicenum)
                              : new size_t[2 * maxblocks];

        pooled_offsets_starts = devptr
                                ? (size_t*)omp_target_alloc(sizeof(size_t) *
(maxblocks + 1),dblock.devptr_devicenum)
                                : new size_t[maxblocks + 1];

        size_t count  = 0; // block count
        const size_t ext0=dblock.dpextents[0];
        const size_t ext1=dblock.dpextents[1];
        const size_t str0=dblock.dpstrides[0];
        const size_t str1=dblock.dpstrides[1];
        const T* pd=dblock.dpdata;

        if(devptr)
        {
            #pragma omp target teams distribute map(tofrom:count) shared(count)
is_device_ptr(pd,pooled_offsets_flat,pooled_offsets_starts)
device(dblock.devptr_devicenum)
            for (size_t bi = 0; bi < nblocks_row; ++bi)
            {
                #pragma omp parallel for shared(count)
                for (size_t bj = 0; bj < nblocks_col; ++bj)
                {
                    const size_t row_off = bi * block_rows;
                    const size_t diff1   = ext0 - row_off;
                    const size_t tile_rows = (block_rows < diff1) ? block_rows
: diff1;

                    bool keep = true;

                    const size_t col_off = bj * block_cols;
                    const size_t diff2   = ext1 - col_off;
                    const size_t tile_cols = (block_cols < diff2) ? block_cols
: diff2;

                    if (remove_zeroblocks)
                    {
                        keep = false;

                        for (size_t i = 0; i < tile_rows && !keep; ++i)
                            for (size_t j = 0; j < tile_cols && !keep; ++j)
                                if (pd[(row_off + i) * str0 + (col_off + j)
*str1] != T(0))
                                {
                                    keep = true;
                                    goto outofloop3;
                                }
                    }
outofloop3:
                    if (keep)
                    {
                        size_t slot;
                        #pragma omp atomic capture
                        slot = count++;
                        const size_t pos = slot * 2;
                        pooled_offsets_starts[slot] = pos;
                        pooled_offsets_flat[pos]    = row_off;
                        pooled_offsets_flat[pos+1]  = col_off;

                    }

                }
            }



Specifically, the error is triggered by

                        pooled_offsets_starts[slot] = pos;
                        pooled_offsets_flat[pos]    = row_off;
                        pooled_offsets_flat[pos+1]  = col_off;


even though the arrays

pooled_offsets_flat and
pooled_offsets_starts

were allocated with the correct sizes by omp_target_alloc. I have
verified that.


Also, clang compiles the application just fine and the loop runs on device.


Perhaps this is a problem with the new NVIDIA driver?

This is on my system:


Tue Oct 14 13:50:25 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.95.05              Driver Version: 580.95.05      CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA GeForce RTX 5060 Ti     Off |   00000000:2D:00.0  On |                  N/A |
|  0%   46C    P8             12W /  180W |     672MiB /  16311MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+


dev-util/nvidia-cuda-toolkit-12.9.1-r1:0/12.9.1::gentoo  USE="debugger examples
nsight profiler rdma sanitizer -clang" PYTHON_TARGETS="python3_13 -python3_11
-python3_12" 0 KiB

x11-drivers/nvidia-drivers-580.95.05:0/580::gentoo  USE="X dist-kernel
kernel-open modules modules-sign static-libs strip tools wayland
-modules-compress -persistenced -powerd" ABI_X86="(64) -32" 0 KiB

kernel version linux-6.12.52-gentoo-dist *

[Bug libgomp/122281] New: libgomp: cuCtxSynchronize error: an illegal memory access was encountered in code that reserves memory correctly.

Reply via email to