Re: [PyOpenCL] Re3: Very specific bug: round(float) fails on Ti780 with python only

CRV§ADER//KY Fri, 11 Apr 2014 10:14:42 -0700

Resending - forgot to add the list
On 11 Apr 2014 18:12, "CRV§ADER//KY" <[email protected]> wrote:


> I think the key is to look for some difference in the command-line
> parameters passed to the OpenCL compiler between C++ and PyOpenCL. Not sure
> how to trace that, though.
> On 11 Apr 2014 16:56, "Kai Ruhl" <[email protected]> wrote:
>
>> Yes, here it is (sorry for the bad tabbing).
>>
>>
>> #include "CL/cl.h"
>> #include <fcntl.h>
>> #include <stdio.h>
>> #include <stdlib.h>
>> #include <string.h>
>> #include <math.h>
>> #include <unistd.h>
>> #include <sys/types.h>
>> #include <sys/stat.h>
>>
>> // gcc testcl.c -lOpenCL -lm
>>
>> #define CHECK(x) if ((x) != CL_SUCCESS) exit(EXIT_FAILURE);
>> #define CHECK0(x) if (!(x)) exit(EXIT_FAILURE);
>>
>> const char *KERNEL_SOURCE = \
>> "__kernel void square(global float* input, global float* output, const
>> unsigned int count) {" \
>> "  int i = get_global_id(0);" \
>> "  if (i < count) output[i] = round(input[i]);" \
>> "}";
>>
>> const size_t DATA_SIZE = 1024;
>>
>> int main(int argc, char** argv) {
>>     int err;
>>
>>     size_t i = 0;
>>     float data[DATA_SIZE];
>>     for (i = 0; i < DATA_SIZE; ++i) {
>>          data[i] = 100.0 * rand() / (float)RAND_MAX;
>>     }
>>     data[1023] = 88.9f;
>>
>>     cl_platform_id platform_id; CHECK(clGetPlatformIDs(1, &platform_id,
>> NULL));
>>     cl_device_id device_id; CHECK(clGetDeviceIDs(platform_id,
>> CL_DEVICE_TYPE_GPU, 1, &device_id, NULL));
>>     cl_context context = clCreateContext(0, 1, &device_id, NULL, NULL,
>> &err); CHECK(err); CHECK0(context);
>>     cl_command_queue commands = clCreateCommandQueue(context, device_id,
>> 0, &err); CHECK(err); CHECK0(commands);
>>     cl_program program = clCreateProgramWithSource(context, 1, (const
>> char **)&KERNEL_SOURCE, NULL, &err); CHECK(err); CHECK0(program);
>>
>>     err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
>>     if (err != CL_SUCCESS) {
>>         size_t len;
>>         char buffer[2048];
>>         printf("Error: Failed to build program executable!\n");
>>         clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
>> sizeof(buffer), buffer, &len);
>>         printf("%s\n", buffer);
>>         exit(EXIT_FAILURE);
>>     }
>>
>>     cl_kernel kernel = clCreateKernel(program, "square", &err);
>> CHECK(err); CHECK0(kernel);
>>     cl_mem input = clCreateBuffer(context, CL_MEM_READ_ONLY,
>> sizeof(float) * DATA_SIZE, NULL, NULL); CHECK0(input);
>>     cl_mem output = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
>> sizeof(float) * DATA_SIZE, NULL, NULL); CHECK0(output);
>>     CHECK(clEnqueueWriteBuffer(commands, input, CL_TRUE, 0,
>> sizeof(float) * DATA_SIZE, data, 0, NULL, NULL));
>>
>>     CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &input));
>>     CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &output));
>>     CHECK(clSetKernelArg(kernel, 2, sizeof(unsigned int), &DATA_SIZE));
>>
>>     // get the maximum work group size for executing the kernel on the
>> device
>>     size_t global = DATA_SIZE, local;
>>     CHECK(clGetKernelWorkGroupInfo(kernel, device_id,
>> CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL));
>>     CHECK(clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global,
>> &local, 0, NULL, NULL));
>>     clFinish(commands);
>>
>>     float results[DATA_SIZE];
>>     CHECK(clEnqueueReadBuffer(commands, output, CL_TRUE, 0,
>> sizeof(float) * DATA_SIZE, results, 0, NULL, NULL));
>>
>>         err = 0;
>>         for (i = 0; i < DATA_SIZE; ++i) {
>>                 printf("%f -> %f\n", data[i], results[i]);
>>                 if (results[i] != round(results[i])) {
>>                         fprintf(stderr, "%f -> %f != %f\n", data[i],
>> results[i],
>> round(results[i]));
>>                         ++err;
>>                 }
>>         }
>>         printf("%d errors\n", err);
>>
>>     clReleaseMemObject(input);
>>     clReleaseMemObject(output);
>>     clReleaseProgram(program);
>>     clReleaseKernel(kernel);
>>     clReleaseCommandQueue(commands);
>>     clReleaseContext(context);
>>
>>     return 0;
>> }
>>
>>
>>
>>
>>
>> On 11.04.2014 17:47, CRV§ADER//KY wrote:
>> > Could you post the c++ code as well? This doesn't make much sense...
>> >
>> > On 11 Apr 2014 16:21, "Kai Ruhl" <[email protected]
>> > <mailto:[email protected]>> wrote:
>> >
>> >     I have a *very* specific bug in PyOpenCL: when I use round(88.9f)
>> >     with PyOpenCL from git (2014-04-08) on an Nvidia Ti780, it gives me
>> >     88.0 instead of 89.0.
>> >
>> >     - If I change the gfx card to my older GTX590, it will work.
>> >     - If I round doubles instead of floats, it works on both cards.
>> >     - If I write a test in C++, it works on both cards.
>> >
>> >     Can anyone with a Ti780 confirm this bug?
>> >
>> >
>> >
>> >     Side info: I have compiled PyOpenCL with ENABLE_GL=True and
>> >     CL_PRETEND_VERSION="1.1" because clCreateSubDevices is missing from
>> >     Nvidia's OpenCL.
>> >     OS is Ubuntu 14.04.
>> >
>> >     My test is:
>> >
>> >     import pyopencl as cl
>> >     import numpy as np
>> >     ctx = cl.create_some_context()
>> >     que = cl.CommandQueue(ctx)
>> >     cl_prg = cl.Program(ctx, "__kernel void doit(__global float *a) {
>> a[0] =
>> >     round(88.9f); }").build()
>> >     a = np.zeros(1, dtype=np.float32); A = cl.Buffer(ctx,
>> >     cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=a)
>> >     cl_prg.doit(que, [1], None, A); que.finish()
>> >     cl.enqueue_copy(que, a, A)
>> >     print a[0]
>> >
>> >
>> >
>> >     _______________________________________________
>> >     PyOpenCL mailing list
>> >     [email protected] <mailto:[email protected]>
>> >     http://lists.tiker.net/listinfo/pyopencl
>> >
>>
>

_______________________________________________
PyOpenCL mailing list
[email protected]
http://lists.tiker.net/listinfo/pyopencl

Re: [PyOpenCL] Re3: Very specific bug: round(float) fails on Ti780 with python only

Reply via email to