[PyCUDA] [Pycuda] LogicError: cuMemcpyDtoH failed: an illegal memory access was encountered

Jie Liu Tue, 26 Nov 2019 06:02:53 -0800

Hallo,

I have a piece of very simple code written in Pycuda. Can anyone tell me, why shouldn't I set the index of array CC as "c = wA * %(BLOCK_SIZE)d * by + %(BLOCK_SIZE)d * bx"? For example, if I set the index of CC as 1 or 2 or 3, it can get the right value. Thank you very much!



importpycuda.autoinit
frompycuda importdriver, compiler, gpuarray, tools
frompycuda.compiler importSourceModule
importnumpy asnp
kernel_code_template ="""
#include <cuComplex.h>

__device__ void ExtractVector(cuFloatComplex *mul_a, int &idt, cuFloatComplex *A){

for (int i = 0; i < %(MATRIX_SIZE_O)d; i ++)
{
A[i] = make_cuFloatComplex(0,0);

A[i] = make_cuFloatComplex(cuCrealf(mul_a[i*%(tSize)d+ idt]),cuCimagf(mul_a[i*%(tSize)d+ idt]));

}
__syncthreads();
}
__global__ void MatrixMulKernel(cuFloatComplex *A, cuFloatComplex *CC)
{
const uint wA = %(MATRIX_SIZE_O)d;
const uint wB = %(MATRIX_SIZE_O)d;
// Block index
const uint bx = blockIdx.x;
const uint by = blockIdx.y;
// Thread index
const uint tx = threadIdx.x;
const uint ty = threadIdx.y;
cuFloatComplex Csub = make_cuFloatComplex(0,0);
__shared__ cuFloatComplex VectorA[%(MATRIX_SIZE_O)d];
__shared__ cuFloatComplex C1[%(MATRIX_SIZE_O)d];
const uint aBegin = wA * %(BLOCK_SIZE)d* by;
const uint aEnd = aBegin + wA - 1;
const uint aStep = 1;
const int bBegin = %(BLOCK_SIZE)d* bx;
const uint bStep = wB;
const uint c = wA * %(BLOCK_SIZE)d* by + %(BLOCK_SIZE)d* bx ;
for (int index_r = 0; index_r < %(rSize)d; index_r ++)
{
for (int index_t = 0; index_t < %(tSize)d; index_t ++)
{
ExtractVector(A, index_t, VectorA);
CC[c] = make_cuFloatComplex(cuCrealf(VectorA[c]),cuCimagf(VectorA[c]));
}
}
}
"""
theta =(np.arange(-80, 80, 40))/360*2*np.pi
tsize =theta.size
rmax =300
r =np.arange(0, rmax, rmax/3)
rsize =r.size
Mt =1
Mr =6
MATRIX_SIZE_O =Mt*Mr
MATRIX_SIZE_I =1
BLOCK_SIZE =1
BLOCK_SIZE_x =1
BLOCK_SIZE_y =1
GRID_SIZE =MATRIX_SIZE_O
kernel_code =kernel_code_template %{
'MATRIX_SIZE_O': MATRIX_SIZE_O,
'BLOCK_SIZE': BLOCK_SIZE,
'tSize': tsize,
'rSize': rsize,
}
a_cpu =1+1j*1+np.arange(0, tsize *MATRIX_SIZE_O)
a_cpu =a_cpu.reshape(MATRIX_SIZE_O,tsize).astype(np.complex64)
b_cpu =1+1j*2+np.arange(0, MATRIX_SIZE_O *MATRIX_SIZE_O)
b_cpu =b_cpu.reshape(MATRIX_SIZE_O,MATRIX_SIZE_O).astype(np.complex64)
bb_cpu =1+1j*3+np.arange(0, tsize *MATRIX_SIZE_O)
bb_cpu =bb_cpu.reshape(MATRIX_SIZE_O, tsize).astype(np.complex64)
f_cpu =1+1j*4+np.arange(0, rsize *MATRIX_SIZE_O)
f_cpu =f_cpu.reshape(MATRIX_SIZE_O,rsize).astype(np.complex64)

Bcon =np.real(a_cpu).astype(np.float32) +1j*np.imag(a_cpu).astype(np.float32)

a_gpu =gpuarray.to_gpu(Bcon)

Rinv =np.real(b_cpu).astype(np.float32) +1j*np.imag(b_cpu).astype(np.float32)

b_gpu =gpuarray.to_gpu(Rinv)

FFT_cpu =np.real(f_cpu).astype(np.float32) +1j*np.imag(f_cpu).astype(np.float32)

f_gpu =gpuarray.to_gpu(FFT_cpu)
B =np.real(bb_cpu).astype(np.float32) +1j*np.imag(bb_cpu).astype(np.float32)
bb_gpu =gpuarray.to_gpu(B)
beta_gpu =gpuarray.empty((rsize,tsize), np.complex64)
c_gpu =gpuarray.empty((MATRIX_SIZE_O), np.complex64)
beta_cpu =np.zeros(shape=(rsize,tsize)).astype(np.complex64)
forindex_r inrange(0, rsize):
forindex_theta inrange(0, theta.size):
c_cpu =np.dot(a_cpu[:, index_theta], b_cpu)
beta_cpu[index_r, index_theta] =np.dot(c_cpu, f_cpu[:, index_r])
#/ (np.dot(c_cpu, bb_cpu[:, index_theta]))
#beta_cpu[index_r, index_theta] = np.dot(c_cpu, bb_cpu[:, index_theta])
mod =compiler.SourceModule(kernel_code)
matrixmul =mod.get_function("MatrixMulKernel")
matrixmul(
# inputs
a_gpu,
# outputs
c_gpu,
# grid of multiple blocks
grid=(GRID_SIZE, GRID_SIZE),
# block of multiple threads
block=(BLOCK_SIZE_x , BLOCK_SIZE_y , 1)
)
# print the results
print("-"*80)
print("Matrix A (GPU): ")
print(a_gpu.get())
print("-"*80)
print("Matrix B (GPU): ")
print(bb_gpu.get())
print("-"*80)
print("Matrix C (GPU): ")
print(c_gpu.get())
print("-"*80)
print("Matrix C (CPU): ")
print(c_cpu)
print("-"*80)
print("Matrix beta (GPU): ")
print(beta_gpu.get())
print("-"*80)
print("Matrix beta (CPU): ")
print(beta_cpu)
print("-"*80)
print("CPU-GPU Difference: ")
print(beta_cpu/beta_gpu.get())

smime.p7s
Description: S/MIME Cryptographic Signature

_______________________________________________
PyCUDA mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[PyCUDA] [Pycuda] LogicError: cuMemcpyDtoH failed: an illegal memory access was encountered

Reply via email to