Hi,
this is the full code:
import scipy
import scipy.ndimage as nd
import numpy as np
import numpy.ma as ma
import pycuda.autoinit
import pycuda.driver as drv
from pycuda.driver import Context
import pycuda.tools
import pycuda.driver as cuda
import numpy
import os, sys, glob
from pycuda.compiler import SourceModule
import time
from pycuda.gpuarray import to_gpu
a = np.array([0,0,1,1,1,0,0,0,1,1,1,0,0,1,1],dtype=numpy.int32)
kernel_h= np.array([-1,-1,-1,1,1,1],dtype=numpy.int32)
step=len(kernel_h)/2
a1=a[:(step)][::-1]
a2=a[-(step-1):][::-1]
aMod=np.append(a1,np.append(a,a2))
numthreads=len(a)
c=np.zeros(len(a),dtype=numpy.int32)
kernelSize_h=np.zeros(2)
kernelSize_h=kernelSize_h.astype(numpy.int32)
kernelSize_h[0]=len(kernel_h)
kernelSize_h[1]=len(a)
mod=SourceModule("""
__global__ void gpu_kernel(int *corrGpu,int *aMod,int *b,int *kernelSize_h)
{
int j,step1=kernelSize_h[0]/2;
int idx = threadIdx.x+step1;
for(j=0;j<step1;j++)
corrGpu[idx-step1]+=aMod[idx+j-(step1)]*b[j];
}
""")
manipulate_vector=mod.get_function("gpu_kernel")
c_gpu=to_gpu(c)
manipulate_vector(c_gpu,drv.In(aMod),drv.In(kernel_h),drv.In(kernelSize_h),block=(numthreads,1,1),grid=(1,1))
print "Corr. GPU \n"
print c_gpu.get()
corrcpu=nd.correlate1d(a,kernel_h,mode='reflect')
print "Corr CPU= "
print corrcpu
print "Differenza : "
print c_gpu.get()-corrcpu
About your advise: when i do: int idx = threadIdx.x+step, idx doesn't start
from step1? so when j=0 idx-step1+j =0 ? it's wrong?
> Date: Wed, 11 Jul 2012 10:03:33 +1000
> Subject: Re: [PyCUDA] Thread Problem
> From: [email protected]
> To: [email protected]
> CC: [email protected]
>
> Hi Andrea,
>
> Please send the full working script which anyone can save and execute
> without assembling it from the excerpts you provided. In the mean
> time, that's what I can say by looking at the kernel:
>
> On Wed, Jul 11, 2012 at 1:24 AM, Andrea Cesari <[email protected]>
> wrote:
> > __global__ void gpu_kernel(int *corrGpu,int *aMod,int *b,int *kernelSize_h)
> >
> > {
> > int j,step1=kernelSize_h[0]/2;
> > int idx = threadIdx.x+step1;
> > for(j=0;j<step1;j++)
> > corrGpu[idx-step1]+=aMod[idx+j-(step1)]*b[j];
> >
> > }
>
> With the construction like "aMod[idx+j-(step1)]", reads sometimes
> occur outside of the aMod array (consider idx=0 and j=0, for example —
> you will be reading from aMod[-step1]).
_______________________________________________
PyCUDA mailing list
[email protected]
http://lists.tiker.net/listinfo/pycuda