Re: [Rtk-users] GPU kernel do not change output variables

Kabelitz, Gordian Wed, 27 Feb 2019 14:31:22 -0800

Hi Simon,

I used the printf in the kernel to print the id and the fixed value. Therefore 
I know that the kernel is reached. It looked similar to this snippet:
gradient_kernel(float * in, float * grad_x, float * grad_y, float * grad_z, 
int3 c_Size, float3 c_Spacing)
{


       unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
       unsigned int j = blockIdx.y * blockDim.y + threadIdx.y;
       unsigned int k = blockIdx.z * blockDim.z + threadIdx.z;

       if (i >= c_Size.x || j >= c_Size.y || k >= c_Size.z)
              return;

       long int id = (k     * c_Size.y + j)    * c_Size.x + i;
       grad_x[id] = 10.f;
       printf("ID: %i -> %f\n", id, grad_x[id]);

}
With the expected result of “ID: xxx -> 10.0000”
To get the print out a cudaDeviceSynchronize (); is need ed after the call of 
CUDA_gradient(inputSize, inputSpacing, pin, pout_x, pout_y, pout_z);.

I used the file writer to provide the grad_(xyz) CudaImages right after the 
call of CUDA_gradient(inputSize, inputSpacing, pin, pout_x, pout_y, pout_z); 
Those images are the same I created before.
auto filewriter = itk::ImageFileWriter<CudaImage<float, 3>>::New();
filewriter->SetFileName("outX.nrrd");
filewriter->SetInput(grad_x);
filewriter->Update();

The size.Fill() should be an size[i]. All dimension in my test case have the 
same size this wasn’t a problem.

Still no idea. ☹

Von: Simon Rit [mailto:[email protected]]
Gesendet: Mittwoch, 27. Februar 2019 21:58
An: Kabelitz, Gordian
Cc: [email protected]
Betreff: Re: [Rtk-users] GPU kernel do not change output variables

Hi,
Sounds like a challenge. When you say you set fixed numbers, did you check that 
you reach the point where you set this number? You can use cuprintf to check 
what's going on in the kernel.
One thing wrong I noticed: you use size.Fill in a loop, which is a bit odd 
because it will Fill the size with the last value of the loop.
I hope this helps,
Simon

On Wed, Feb 27, 2019 at 9:39 PM Kabelitz, Gordian 
<[email protected]<mailto:[email protected]>>
 wrote:
Hi rtk-users,
I am facing an oddity which I cannot explain.
I want to implement a new gradient  filter. The input is an CudaImage<float,3>  
and the output should be an CudaImage<CovariantVector<float,3>,3>. The filter 
runs without any cuda errors but the output (pout_(xyz)) is has not changed at 
all. The kernel function is accessed and the print out from there seems to be 
okay. I tried to explicitly copy the content of the GPUBuffer into the 
CPUBuffer. Still no success.  Even if I set fixed numbers in the kernel to the 
output image nothing changed. I use CUDA 9.0, Visual Studio 2015, ITK 5.0, RTK 
2.0 as remote module, CMake 3.13., Windows 7 64bit. The relevant code snippets 
are below.
Do I miss something obvious? Any recommendation are welcome.
With kind regards,
Gordian

The GPUGenerateData function:
GPUGenerateData()
       {
              int inputSize[3];
              int outputSize[3];
              float inputSpacing[3];
              float outputSpacing[3];

              for (int i = 0; i<3; i++)
              {
                     inputSize[i] = 
this->GetInput()->GetBufferedRegion().GetSize()[i];
                     outputSize[i] = 
this->GetOutput()->GetBufferedRegion().GetSize()[i];
                     inputSpacing[i] = this->GetInput()->GetSpacing()[i];
                     outputSpacing[i] = this->GetOutput()->GetSpacing()[i];

                     if ((inputSize[i] != outputSize[i]) || (inputSpacing[i] != 
outputSpacing[i]))
                     {
                           std::cerr << "The CUDA laplacian filter can only 
handle input and output regions of equal size and spacing" << std::endl;
                           exit(1);
                     }
              }

              float *pin = 
*(float**)(this->GetInput()->GetCudaDataManager()->GetGPUBufferPointer());

              // This is a test area
typename InputImageType::IndexType index;
              index.Fill(0);
              typename InputImageType::SizeType size;
              for (auto i = 0; i < 3; ++i)
                     
size.Fill(this->GetInput()->GetLargestPossibleRegion().GetSize()[i]);
              typename InputImageType::RegionType region(index, size);
              // images for gradients
              auto grad_x = CudaImage<float, 3>::New();
              grad_x->SetRegions(region);
              grad_x->Allocate();
              grad_x->FillBuffer(1);
              auto grad_y = CudaImage<float, 3>::New();
              grad_y->SetRegions(region);
              grad_y->Allocate();
              auto grad_z = CudaImage<float, 3>::New();
              grad_z->SetRegions(region);
              grad_z->Allocate();

              float *pout_x = 
*(float**)(grad_x->GetCudaDataManager()->GetGPUBufferPointer());
              float *pout_y = 
*(float**)(grad_y->GetCudaDataManager()->GetGPUBufferPointer());
              float *pout_z = 
*(float**)(grad_z->GetCudaDataManager()->GetGPUBufferPointer());

              CUDA_gradient(inputSize, inputSpacing, pin, pout_x, pout_y, 
pout_z); // after this line neither of the pout_(xyz) images have changed.

              // put the gradient images in a single covariant vector image
              auto CompositeImageFilter = 
itk::ComposeImageFilter<CudaImage<float, 3>, CudaImage<CovariantVector<float, 
3>,3>>::New();
              CompositeImageFilter->SetInput1(grad_x);
              CompositeImageFilter->SetInput2(grad_y);
              CompositeImageFilter->SetInput3(grad_z);
              CompositeImageFilter->Update();

              this->GetOutput()->Graft(CompositeImageFilter->GetOutput());
       }

The cuda/kernel function

__global__ void gradient_kernel(float * in, float * grad_x, float * grad_y, 
float * grad_z, int3 c_Size, float3 c_Spacing);

void
CUDA_gradient(
       int size[3],
       float spacing[3],
       float *dev_in,
       float *dev_out_x,
       float *dev_out_y,
       float *dev_out_z)
{
       int3 dev_Size = make_int3(size[0], size[1], size[2]);
       float3 dev_Spacing = make_float3(spacing[0], spacing[1], spacing[2]);

       // Output volume
       long int outputMemorySize = size[0] * size[1] * size[2] * sizeof(float);
       cudaMalloc((void**)&dev_out_x, outputMemorySize);
       cudaMalloc((void**)&dev_out_y, outputMemorySize);
       cudaMalloc((void**)&dev_out_z, outputMemorySize);
       cudaMemset(dev_out_x, 0, outputMemorySize);
       cudaMemset(dev_out_y, 0, outputMemorySize);
       cudaMemset(dev_out_z, 0, outputMemorySize);
       printf("Device Variable Copying:\t%s\n", 
cudaGetErrorString(cudaGetLastError()));

       // Thread Block Dimensions
       dim3 dimBlock = dim3(16, 4, 4);

       int blocksInX = iDivUp(size[0], dimBlock.x);
       int blocksInY = iDivUp(size[1], dimBlock.y);
       int blocksInZ = iDivUp(size[2], dimBlock.z);

       dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ);

       gradient_kernel <<< dimGrid, dimBlock >>> (dev_in, dev_out_x, dev_out_y, 
dev_out_z, dev_Size, dev_Spacing);
       cudaDeviceSynchronize();
       printf("Device Variable Copying:\t%s\n", 
cudaGetErrorString(cudaGetLastError()));
       CUDA_CHECK_ERROR;
}

__global__
void
gradient_kernel(float * in, float * grad_x, float * grad_y, float * grad_z, 
int3 c_Size, float3 c_Spacing)
{

       unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
       unsigned int j = blockIdx.y * blockDim.y + threadIdx.y;
       unsigned int k = blockIdx.z * blockDim.z + threadIdx.z;

       if (i >= c_Size.x || j >= c_Size.y || k >= c_Size.z)
              return;

       long int id = (k     * c_Size.y + j)    * c_Size.x + i;
       long int id_x = (k     * c_Size.y + j)    * c_Size.x + i + 1;
       long int id_y = (k     * c_Size.y + j + 1)* c_Size.x + i;
       long int id_z = ((k + 1) * c_Size.y + j)    * c_Size.x + i;

       if (i == (c_Size.x - 1)) grad_x[id] = 0;
       else grad_x[id] = (in[id_x] - in[id]) / c_Spacing.x;

       if (j == (c_Size.y - 1)) grad_y[id] = 0;
       else grad_y[id] = (in[id_y] - in[id]) / c_Spacing.y;

       if (k == (c_Size.z - 1)) grad_z[id] = 0;
       else grad_z[id] = (in[id_z] - in[id]) / c_Spacing.z;
}



_______________________________________________
Rtk-users mailing list
[email protected]<mailto:[email protected]>
https://public.kitware.com/mailman/listinfo/rtk-users

_______________________________________________
Rtk-users mailing list
[email protected]
https://public.kitware.com/mailman/listinfo/rtk-users

Re: [Rtk-users] GPU kernel do not change output variables

Reply via email to