Hi Simon,
I used the printf in the kernel to print the id and the fixed value. Therefore
I know that the kernel is reached. It looked similar to this snippet:
gradient_kernel(float * in, float * grad_x, float * grad_y, float * grad_z,
int3 c_Size, float3 c_Spacing)
{
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int j = blockIdx.y * blockDim.y + threadIdx.y;
unsigned int k = blockIdx.z * blockDim.z + threadIdx.z;
if (i >= c_Size.x || j >= c_Size.y || k >= c_Size.z)
return;
long int id = (k * c_Size.y + j) * c_Size.x + i;
grad_x[id] = 10.f;
printf("ID: %i -> %f\n", id, grad_x[id]);
}
With the expected result of “ID: xxx -> 10.0000”
To get the print out a cudaDeviceSynchronize (); is need ed after the call of
CUDA_gradient(inputSize, inputSpacing, pin, pout_x, pout_y, pout_z);.
I used the file writer to provide the grad_(xyz) CudaImages right after the
call of CUDA_gradient(inputSize, inputSpacing, pin, pout_x, pout_y, pout_z);
Those images are the same I created before.
auto filewriter = itk::ImageFileWriter<CudaImage<float, 3>>::New();
filewriter->SetFileName("outX.nrrd");
filewriter->SetInput(grad_x);
filewriter->Update();
The size.Fill() should be an size[i]. All dimension in my test case have the
same size this wasn’t a problem.
Still no idea. ☹
Von: Simon Rit [mailto:[email protected]]
Gesendet: Mittwoch, 27. Februar 2019 21:58
An: Kabelitz, Gordian
Cc: [email protected]
Betreff: Re: [Rtk-users] GPU kernel do not change output variables
Hi,
Sounds like a challenge. When you say you set fixed numbers, did you check that
you reach the point where you set this number? You can use cuprintf to check
what's going on in the kernel.
One thing wrong I noticed: you use size.Fill in a loop, which is a bit odd
because it will Fill the size with the last value of the loop.
I hope this helps,
Simon
On Wed, Feb 27, 2019 at 9:39 PM Kabelitz, Gordian
<[email protected]<mailto:[email protected]>>
wrote:
Hi rtk-users,
I am facing an oddity which I cannot explain.
I want to implement a new gradient filter. The input is an CudaImage<float,3>
and the output should be an CudaImage<CovariantVector<float,3>,3>. The filter
runs without any cuda errors but the output (pout_(xyz)) is has not changed at
all. The kernel function is accessed and the print out from there seems to be
okay. I tried to explicitly copy the content of the GPUBuffer into the
CPUBuffer. Still no success. Even if I set fixed numbers in the kernel to the
output image nothing changed. I use CUDA 9.0, Visual Studio 2015, ITK 5.0, RTK
2.0 as remote module, CMake 3.13., Windows 7 64bit. The relevant code snippets
are below.
Do I miss something obvious? Any recommendation are welcome.
With kind regards,
Gordian
The GPUGenerateData function:
GPUGenerateData()
{
int inputSize[3];
int outputSize[3];
float inputSpacing[3];
float outputSpacing[3];
for (int i = 0; i<3; i++)
{
inputSize[i] =
this->GetInput()->GetBufferedRegion().GetSize()[i];
outputSize[i] =
this->GetOutput()->GetBufferedRegion().GetSize()[i];
inputSpacing[i] = this->GetInput()->GetSpacing()[i];
outputSpacing[i] = this->GetOutput()->GetSpacing()[i];
if ((inputSize[i] != outputSize[i]) || (inputSpacing[i] !=
outputSpacing[i]))
{
std::cerr << "The CUDA laplacian filter can only
handle input and output regions of equal size and spacing" << std::endl;
exit(1);
}
}
float *pin =
*(float**)(this->GetInput()->GetCudaDataManager()->GetGPUBufferPointer());
// This is a test area
typename InputImageType::IndexType index;
index.Fill(0);
typename InputImageType::SizeType size;
for (auto i = 0; i < 3; ++i)
size.Fill(this->GetInput()->GetLargestPossibleRegion().GetSize()[i]);
typename InputImageType::RegionType region(index, size);
// images for gradients
auto grad_x = CudaImage<float, 3>::New();
grad_x->SetRegions(region);
grad_x->Allocate();
grad_x->FillBuffer(1);
auto grad_y = CudaImage<float, 3>::New();
grad_y->SetRegions(region);
grad_y->Allocate();
auto grad_z = CudaImage<float, 3>::New();
grad_z->SetRegions(region);
grad_z->Allocate();
float *pout_x =
*(float**)(grad_x->GetCudaDataManager()->GetGPUBufferPointer());
float *pout_y =
*(float**)(grad_y->GetCudaDataManager()->GetGPUBufferPointer());
float *pout_z =
*(float**)(grad_z->GetCudaDataManager()->GetGPUBufferPointer());
CUDA_gradient(inputSize, inputSpacing, pin, pout_x, pout_y,
pout_z); // after this line neither of the pout_(xyz) images have changed.
// put the gradient images in a single covariant vector image
auto CompositeImageFilter =
itk::ComposeImageFilter<CudaImage<float, 3>, CudaImage<CovariantVector<float,
3>,3>>::New();
CompositeImageFilter->SetInput1(grad_x);
CompositeImageFilter->SetInput2(grad_y);
CompositeImageFilter->SetInput3(grad_z);
CompositeImageFilter->Update();
this->GetOutput()->Graft(CompositeImageFilter->GetOutput());
}
The cuda/kernel function
__global__ void gradient_kernel(float * in, float * grad_x, float * grad_y,
float * grad_z, int3 c_Size, float3 c_Spacing);
void
CUDA_gradient(
int size[3],
float spacing[3],
float *dev_in,
float *dev_out_x,
float *dev_out_y,
float *dev_out_z)
{
int3 dev_Size = make_int3(size[0], size[1], size[2]);
float3 dev_Spacing = make_float3(spacing[0], spacing[1], spacing[2]);
// Output volume
long int outputMemorySize = size[0] * size[1] * size[2] * sizeof(float);
cudaMalloc((void**)&dev_out_x, outputMemorySize);
cudaMalloc((void**)&dev_out_y, outputMemorySize);
cudaMalloc((void**)&dev_out_z, outputMemorySize);
cudaMemset(dev_out_x, 0, outputMemorySize);
cudaMemset(dev_out_y, 0, outputMemorySize);
cudaMemset(dev_out_z, 0, outputMemorySize);
printf("Device Variable Copying:\t%s\n",
cudaGetErrorString(cudaGetLastError()));
// Thread Block Dimensions
dim3 dimBlock = dim3(16, 4, 4);
int blocksInX = iDivUp(size[0], dimBlock.x);
int blocksInY = iDivUp(size[1], dimBlock.y);
int blocksInZ = iDivUp(size[2], dimBlock.z);
dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ);
gradient_kernel <<< dimGrid, dimBlock >>> (dev_in, dev_out_x, dev_out_y,
dev_out_z, dev_Size, dev_Spacing);
cudaDeviceSynchronize();
printf("Device Variable Copying:\t%s\n",
cudaGetErrorString(cudaGetLastError()));
CUDA_CHECK_ERROR;
}
__global__
void
gradient_kernel(float * in, float * grad_x, float * grad_y, float * grad_z,
int3 c_Size, float3 c_Spacing)
{
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int j = blockIdx.y * blockDim.y + threadIdx.y;
unsigned int k = blockIdx.z * blockDim.z + threadIdx.z;
if (i >= c_Size.x || j >= c_Size.y || k >= c_Size.z)
return;
long int id = (k * c_Size.y + j) * c_Size.x + i;
long int id_x = (k * c_Size.y + j) * c_Size.x + i + 1;
long int id_y = (k * c_Size.y + j + 1)* c_Size.x + i;
long int id_z = ((k + 1) * c_Size.y + j) * c_Size.x + i;
if (i == (c_Size.x - 1)) grad_x[id] = 0;
else grad_x[id] = (in[id_x] - in[id]) / c_Spacing.x;
if (j == (c_Size.y - 1)) grad_y[id] = 0;
else grad_y[id] = (in[id_y] - in[id]) / c_Spacing.y;
if (k == (c_Size.z - 1)) grad_z[id] = 0;
else grad_z[id] = (in[id_z] - in[id]) / c_Spacing.z;
}
_______________________________________________
Rtk-users mailing list
[email protected]<mailto:[email protected]>
https://public.kitware.com/mailman/listinfo/rtk-users
_______________________________________________
Rtk-users mailing list
[email protected]
https://public.kitware.com/mailman/listinfo/rtk-users