Hi, I think the problem is that you're doing the memory allocation yourself. The cudaMalloc is automatically done by the data manager when you require the buffer pointer (GetCudaDataManager()->GetGPUBufferPointer()), you don't have to do it. Try removing the cudaMalloc calls. Cheers, Simon
On Sun, Mar 3, 2019 at 5:38 PM Kabelitz, Gordian < [email protected]> wrote: > Hi Simon, > > > > I made a workaround by including an explicit copy from device to host > after the kernel is called. The function now looks like this: > > > > void > > CUDA_gradient( > > int size[3], > > float spacing[3], > > float *dev_in, > > float *dev_out_x, > > float *dev_out_y, > > float *dev_out_z) > > { > > > > int3 dev_Size = make_int3(size[0], size[1], size[2]); > > float3 dev_Spacing = make_float3(spacing[0], spacing[1], spacing > [2]); > > > > // Output volume > > long int outputMemorySize = size[0] * size[1] * size[2] * sizeof( > float); > > float * dev_grad_x; > > float * dev_grad_y; > > float * dev_grad_z; > > cudaMalloc((void**)&dev_grad_x, outputMemorySize); > > cudaMalloc((void**)&dev_grad_y, outputMemorySize); > > cudaMalloc((void**)&dev_grad_z, outputMemorySize); > > cudaMemset(dev_grad_x, 2.f, outputMemorySize); > > cudaMemset(dev_grad_y, 3.f, outputMemorySize); > > cudaMemset(dev_grad_z, 4.f, outputMemorySize); > > > > // Thread Block Dimensions > > dim3 dimBlock = dim3(10, 10, 10); > > > > int blocksInX = iDivUp(size[0], dimBlock.x); > > int blocksInY = iDivUp(size[1], dimBlock.y); > > int blocksInZ = iDivUp(size[2], dimBlock.z); > > > > dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ); > > > > gradient_kernel <<< dimGrid, dimBlock >>> (dev_in, dev_grad_x, > dev_grad_y, dev_grad_z, dev_Size, dev_Spacing); > > CUDA_CHECK_ERROR; > > > > cudaMemcpy(dev_out_x, dev_grad_x, outputMemorySize, > cudaMemcpyDeviceToHost); > > CUDA_CHECK_ERROR; > > cudaMemcpy(dev_out_y, dev_grad_y, outputMemorySize, > cudaMemcpyDeviceToHost); > > CUDA_CHECK_ERROR; > > cudaMemcpy(dev_out_z, dev_grad_z, outputMemorySize, > cudaMemcpyDeviceToHost); > > CUDA_CHECK_ERROR; > > } > > > > I am not sure if this violates the intended behavior of the ITKCudaCommon > by explicitly copying the memory (that needed to be allocated before). 
> > > > Still, I cannot work out why the implicit memory copying mechanism does not work. > > Have you looked into this problem, or am I missing any information? > > With best regards, > > Gordian > > > > *Von:* Simon Rit [mailto:[email protected]] > *Gesendet:* Mittwoch, 27. Februar 2019 21:58 > *An:* Kabelitz, Gordian > *Cc:* [email protected] > *Betreff:* Re: [Rtk-users] GPU kernel do not change output variables > > > > Hi, > > Sounds like a challenge. When you say you set fixed numbers, did you check > that you reach the point where you set this number? You can use cuprintf to > check what's going on in the kernel. > > One thing wrong I noticed: you use size.Fill in a loop, which is a bit odd > because it will Fill the size with the last value of the loop. > > I hope this helps, > > Simon > > > > On Wed, Feb 27, 2019 at 9:39 PM Kabelitz, Gordian < > [email protected]> wrote: > > Hi rtk-users, > > I am facing an oddity which I cannot explain. > > I want to implement a new gradient filter. The input is a > CudaImage<float,3> and the output should be a > CudaImage<CovariantVector<float,3>,3>. The filter runs without any cuda > errors but the output (pout_(xyz)) has not changed at all. The kernel > function is accessed and the print out from there seems to be okay. I tried > to explicitly copy the content of the GPUBuffer into the CPUBuffer. Still > no success. Even if I set fixed numbers in the kernel to the output image > nothing changed. I use CUDA 9.0, Visual Studio 2015, ITK 5.0, RTK 2.0 as > remote module, CMake 3.13., Windows 7 64bit. The relevant code snippets are > below. > > Do I miss something obvious? Any recommendations are welcome. 
> > With kind regards, > > Gordian > > > > The GPUGenerateData function: > > GPUGenerateData() > > { > > int inputSize[3]; > > int outputSize[3]; > > float inputSpacing[3]; > > float outputSpacing[3]; > > > > for (int i = 0; i<3; i++) > > { > > inputSize[i] = this > ->GetInput()->GetBufferedRegion().GetSize()[i]; > > outputSize[i] = this > ->GetOutput()->GetBufferedRegion().GetSize()[i]; > > inputSpacing[i] = this->GetInput()->GetSpacing()[i]; > > outputSpacing[i] = this > ->GetOutput()->GetSpacing()[i]; > > > > if ((inputSize[i] != outputSize[i]) || > (inputSpacing[i] != outputSpacing[i])) > > { > > std::cerr << "The CUDA laplacian filter can > only handle input and output regions of equal size and spacing" << std:: > endl; > > exit(1); > > } > > } > > > > float *pin = *(float**)(this > ->GetInput()->GetCudaDataManager()->GetGPUBufferPointer()); > > > > // This is a test area > > typename InputImageType::IndexType index; > > index.Fill(0); > > typename InputImageType::SizeType size; > > for (auto i = 0; i < 3; ++i) > > size.Fill(this > ->GetInput()->GetLargestPossibleRegion().GetSize()[i]); > > typename InputImageType::RegionType region(index, size); > > // images for gradients > > auto grad_x = CudaImage<float, 3>::New(); > > grad_x->SetRegions(region); > > grad_x->Allocate(); > > grad_x->FillBuffer(1); > > auto grad_y = CudaImage<float, 3>::New(); > > grad_y->SetRegions(region); > > grad_y->Allocate(); > > auto grad_z = CudaImage<float, 3>::New(); > > grad_z->SetRegions(region); > > grad_z->Allocate(); > > > > float *pout_x = *(float**)(grad_x->GetCudaDataManager()-> > GetGPUBufferPointer()); > > float *pout_y = *(float**)(grad_y->GetCudaDataManager()-> > GetGPUBufferPointer()); > > float *pout_z = *(float**)(grad_z->GetCudaDataManager()-> > GetGPUBufferPointer()); > > > > CUDA_gradient(inputSize, inputSpacing, pin, pout_x, pout_y, > pout_z); // after this line neither of the pout_(xyz) images have changed. 
> > > > // put the gradient images in a single covariant vector image > > auto CompositeImageFilter = itk::ComposeImageFilter< > CudaImage<float, 3>, CudaImage<CovariantVector<float, 3>,3>>::New(); > > CompositeImageFilter->SetInput1(grad_x); > > CompositeImageFilter->SetInput2(grad_y); > > CompositeImageFilter->SetInput3(grad_z); > > CompositeImageFilter->Update(); > > > > this->GetOutput()->Graft(CompositeImageFilter->GetOutput()); > > } > > > > The cuda/kernel function > > > > __global__ void gradient_kernel(float * in, float * grad_x, float * grad_y, > float * grad_z, int3 c_Size, float3 c_Spacing); > > > > void > > CUDA_gradient( > > int size[3], > > float spacing[3], > > float *dev_in, > > float *dev_out_x, > > float *dev_out_y, > > float *dev_out_z) > > { > > int3 dev_Size = make_int3(size[0], size[1], size[2]); > > float3 dev_Spacing = make_float3(spacing[0], spacing[1], spacing > [2]); > > > > // Output volume > > long int outputMemorySize = size[0] * size[1] * size[2] * sizeof( > float); > > cudaMalloc((void**)&dev_out_x, outputMemorySize); > > cudaMalloc((void**)&dev_out_y, outputMemorySize); > > cudaMalloc((void**)&dev_out_z, outputMemorySize); > > cudaMemset(dev_out_x, 0, outputMemorySize); > > cudaMemset(dev_out_y, 0, outputMemorySize); > > cudaMemset(dev_out_z, 0, outputMemorySize); > > printf("Device Variable Copying:\t%s\n", cudaGetErrorString( > cudaGetLastError())); > > > > // Thread Block Dimensions > > dim3 dimBlock = dim3(16, 4, 4); > > > > int blocksInX = iDivUp(size[0], dimBlock.x); > > int blocksInY = iDivUp(size[1], dimBlock.y); > > int blocksInZ = iDivUp(size[2], dimBlock.z); > > > > dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ); > > > > gradient_kernel <<< dimGrid, dimBlock >>> (dev_in, dev_out_x, > dev_out_y, dev_out_z, dev_Size, dev_Spacing); > > cudaDeviceSynchronize(); > > printf("Device Variable Copying:\t%s\n", cudaGetErrorString( > cudaGetLastError())); > > CUDA_CHECK_ERROR; > > } > > > > __global__ > > void > > 
gradient_kernel(float * in, float * grad_x, float * grad_y, float * grad_z, > int3 c_Size, float3 c_Spacing) > > { > > > > unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; > > unsigned int j = blockIdx.y * blockDim.y + threadIdx.y; > > unsigned int k = blockIdx.z * blockDim.z + threadIdx.z; > > > > if (i >= c_Size.x || j >= c_Size.y || k >= c_Size.z) > > return; > > > > long int id = (k * c_Size.y + j) * c_Size.x + i; > > long int id_x = (k * c_Size.y + j) * c_Size.x + i + 1; > > long int id_y = (k * c_Size.y + j + 1)* c_Size.x + i; > > long int id_z = ((k + 1) * c_Size.y + j) * c_Size.x + i; > > > > if (i == (c_Size.x - 1)) grad_x[id] = 0; > > else grad_x[id] = (in[id_x] - in[id]) / c_Spacing.x; > > > > if (j == (c_Size.y - 1)) grad_y[id] = 0; > > else grad_y[id] = (in[id_y] - in[id]) / c_Spacing.y; > > > > if (k == (c_Size.z - 1)) grad_z[id] = 0; > > else grad_z[id] = (in[id_z] - in[id]) / c_Spacing.z; > > } > > > > > > > > _______________________________________________ > Rtk-users mailing list > [email protected] > https://public.kitware.com/mailman/listinfo/rtk-users > >
_______________________________________________ Rtk-users mailing list [email protected] https://public.kitware.com/mailman/listinfo/rtk-users
