Re: [pocl-devel] NVIDIA device backend for POCL

Peter Colberg Wed, 20 Mar 2013 16:50:34 -0700

On Mon, Mar 18, 2013 at 10:07:26PM +0200, Pekka Jääskeläinen wrote:
> It should be doable with the CUDA API and the LLVM NVPTX backend.
> I took a look at the CUDA API some time ago with this exact idea in
> mind, but didn't have the time to move forward with it.


I succeeded to execute a kernel on an NVIDIA Fermi GPU, containing
get_global_id() and an array copy from source to destination buffer.

Anything more complex will crash the LLVM (3.2) compiler though, which
is probably due to both missing specialized implementations of the CL
library for nvptx, or missing features in the LLVM NVPTX backend.

The work by Tom Stellard on a gallium driver mentioned on the
wiki [1] was very helpful in getting started with a GPU device.

The OpenCL run-time part seems doable, but the OpenCL compiler part
should be implemented by an expert, not by a chemical physicist… ;-)

Regards,
Peter

[1] 
http://sourceforge.net/apps/mediawiki/pocl/index.php?title=Ideas#Integrate_pocl_with_gallium_drivers_for_GPU_support

diff --git a/configure.ac b/configure.ac
index be3142d..6f61f2b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -626,7 +626,7 @@ case $host_cpu in
     AC_MSG_ERROR([unknown/unsupported host $host])
     ;;
 esac
-OCL_TARGETS=$kernel_dir
+OCL_TARGETS="$kernel_dir nvptx64"
 
 
 AC_DEFINE_UNQUOTED([KERNEL_DIR], "$kernel_dir", [Use the libkernel from lib/kernel/$KERNEL_DIR/])
@@ -674,7 +674,7 @@ AC_DEFINE_UNQUOTED([POCL_DEVICE_ADDRESS_BITS],
 ####################################################################
 # Determine which device drivers to build.
 
-OCL_DRIVERS="basic pthreads"
+OCL_DRIVERS="basic pthreads cuda"
 
 # look for the TCE library (needed for the ttasim device driver)
 AC_PATH_PROG([TCE_CONFIG], [tce-config])
@@ -744,6 +744,7 @@ AC_SUBST([OCL_TARGETS])
 AC_CONFIG_FILES([lib/kernel/arm/Makefile
                  lib/kernel/tce/Makefile
                  lib/kernel/cellspu/Makefile
+                 lib/kernel/nvptx64/Makefile
                  lib/kernel/powerpc/Makefile
                  lib/kernel/powerpc64/Makefile
                  lib/kernel/x86_64/Makefile])
@@ -801,6 +802,7 @@ AC_CONFIG_FILES([include/arm/types.h
                  include/powerpc/types.h
                  include/powerpc64/types.h
                  include/cellspu/types.h
+                 include/nvptx64/types.h
                  include/x86_64/types.h])
 
 AC_MSG_NOTICE([Checking target compiler characteristics])
@@ -890,6 +892,7 @@ AC_CONFIG_FILES([Makefile
                  lib/CL/devices/tce/ttasim/Makefile
                  lib/CL/devices/topology/Makefile
                  lib/CL/devices/cellspu/Makefile
+                 lib/CL/devices/cuda/Makefile
                  lib/llvmopencl/Makefile
                  lib/kernel/Makefile
                  lib/poclu/Makefile
diff --git a/include/Makefile.am b/include/Makefile.am
index d2dfef5..c6d94f6 100644
--- a/include/Makefile.am
+++ b/include/Makefile.am
@@ -30,6 +30,7 @@ pkginclude_HEADERS = pocl.h _kernel.h pocl_device.h
 nobase_pkginclude_HEADERS = $(sort 				\
 							arm/types.h			\
                             x86_64/types.h		\
+                            nvptx64/types.h		\
                             tce/types.h)
 include_HEADERS = poclu.h
 
diff --git a/include/_kernel.h b/include/_kernel.h
index e40d791..451d222 100644
--- a/include/_kernel.h
+++ b/include/_kernel.h
@@ -119,11 +119,11 @@
 
    This mess will be cleaned up at latest when SPIR and its standard
    address space numbers gets finished and implemented in Clang. */
-#define __global __attribute__ ((address_space(3))) 
-#define __constant __attribute__ ((address_space(5))) 
+#define __global __attribute__ ((address_space(1)))
+#define __constant __attribute__ ((address_space(2)))
 
-#define global __attribute__ ((address_space(3)))
-#define constant __attribute__ ((address_space(5))) 
+#define global __attribute__ ((address_space(1)))
+#define constant __attribute__ ((address_space(2)))
 
 typedef enum {
   CLK_LOCAL_MEM_FENCE = 0x1,
diff --git a/include/nvptx64/types.h.in b/include/nvptx64/types.h.in
new file mode 100644
index 0000000..5062668
--- /dev/null
+++ b/include/nvptx64/types.h.in
@@ -0,0 +1,12 @@
+#define cles_khr_int64
+#define cl_khr_fp64
+
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+typedef unsigned int uint;
+typedef unsigned long ulong;
+
+typedef ulong size_t;
+typedef long ptrdiff_t;
+typedef long intptr_t;
+typedef ulong uintptr_t;
diff --git a/include/pocl.h b/include/pocl.h
index 53ee39b..1c07c50 100644
--- a/include/pocl.h
+++ b/include/pocl.h
@@ -38,9 +38,9 @@
 #include "pocl_device.h"
 
 #define POCL_ADDRESS_SPACE_PRIVATE 0
-#define POCL_ADDRESS_SPACE_GLOBAL 3
-#define POCL_ADDRESS_SPACE_LOCAL 4
-#define POCL_ADDRESS_SPACE_CONSTANT 5
+#define POCL_ADDRESS_SPACE_GLOBAL 1
+#define POCL_ADDRESS_SPACE_LOCAL 3
+#define POCL_ADDRESS_SPACE_CONSTANT 2
 
 typedef struct _mem_mapping mem_mapping_t;
 /* represents a single buffer to host memory mapping */
diff --git a/include/pocl_device.h b/include/pocl_device.h
index 0ad4bbc..3b32cfd 100644
--- a/include/pocl_device.h
+++ b/include/pocl_device.h
@@ -34,6 +34,7 @@
 struct pocl_context {
   uint32_t work_dim;
   size_t num_groups[3];
+  size_t num_items_in_group[3];
   size_t group_id[3];
   size_t global_offset[3];
 };
diff --git a/lib/CL/Makefile.am b/lib/CL/Makefile.am
index b1617ca..ea43e55 100644
--- a/lib/CL/Makefile.am
+++ b/lib/CL/Makefile.am
@@ -126,7 +126,7 @@ libpocl_la_SOURCES = clCreateContextFromType.c	\
 
 
 libpocl_la_CPPFLAGS = -I$(top_srcdir)/fix-include -I$(top_srcdir)/fix-include/OpenCL -I$(top_srcdir)/include -I$(top_srcdir)/lib/CL/devices $(OCL_ICD_CFLAGS)
-libpocl_la_LDFLAGS = -lltdl @PTHREAD_CFLAGS@ -version-info ${LIB_VERSION}
+libpocl_la_LDFLAGS = -lltdl @PTHREAD_CFLAGS@ -version-info ${LIB_VERSION} -lcuda
 #libpocl_la_LDFLAGS += -Wl,-z,defs 
 
 if TCE_AVAILABLE
diff --git a/lib/CL/clEnqueueNDRangeKernel.c b/lib/CL/clEnqueueNDRangeKernel.c
index 2deb12e..668d0b4 100644
--- a/lib/CL/clEnqueueNDRangeKernel.c
+++ b/lib/CL/clEnqueueNDRangeKernel.c
@@ -274,6 +274,9 @@ POname(clEnqueueNDRangeKernel)(cl_command_queue command_queue,
   pc.num_groups[0] = global_x / local_x;
   pc.num_groups[1] = global_y / local_y;
   pc.num_groups[2] = global_z / local_z;
+  pc.num_items_in_group[0] = local_x;
+  pc.num_items_in_group[1] = local_y;
+  pc.num_items_in_group[2] = local_z;
   pc.global_offset[0] = offset_x;
   pc.global_offset[1] = offset_y;
   pc.global_offset[2] = offset_z;
diff --git a/lib/CL/devices/Makefile.am b/lib/CL/devices/Makefile.am
index d8b4666..e8fad73 100644
--- a/lib/CL/devices/Makefile.am
+++ b/lib/CL/devices/Makefile.am
@@ -23,14 +23,15 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
 
-SUBDIRS = topology pthread basic
+SUBDIRS = topology pthread basic cuda
 
 noinst_LTLIBRARIES = libpocl-devices.la
 
 libpocl_devices_la_SOURCES = devices.h devices.c bufalloc.c dev_image.h \
 	prototypes.inc common.h common.c bufalloc.h cpuinfo.c cpuinfo.h
 libpocl_devices_la_LIBADD = pthread/libpocl-devices-pthread.la \
-	basic/libpocl-devices-basic.la topology/libpocl-devices-topology.la 
+	basic/libpocl-devices-basic.la topology/libpocl-devices-topology.la \
+	cuda/libpocl-devices-cuda.la
 
 libpocl_devices_la_CPPFLAGS = \
   -I$(top_srcdir)/fix-include \
diff --git a/lib/CL/devices/cuda/Makefile.am b/lib/CL/devices/cuda/Makefile.am
new file mode 100644
index 0000000..746e80c
--- /dev/null
+++ b/lib/CL/devices/cuda/Makefile.am
@@ -0,0 +1,30 @@
+# Process this file with automake to produce Makefile.in (in this,
+# and all subdirectories).
+# Makefile.am for pocl/lib/CL/devices/cuda
+# 
+# Copyright (c) 2013 <placeholder>
+# 
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# 
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+noinst_LTLIBRARIES = libpocl-devices-cuda.la
+
+libpocl_devices_cuda_la_SOURCES = pocl-cuda.h cuda.c
+
+libpocl_devices_cuda_la_CPPFLAGS = -I$(top_srcdir)/fix-include -I$(top_srcdir)/include -I$(top_srcdir)/lib/CL/devices -I$(top_srcdir)/lib/CL $(OCL_ICD_CFLAGS)
+libpocl_devices_cuda_la_LDFLAGS = -lltdl @PTHREAD_CFLAGS@ --version-info ${LIB_VERSION}
diff --git a/lib/CL/devices/cuda/cuda.c b/lib/CL/devices/cuda/cuda.c
new file mode 100644
index 0000000..25ca0f3
--- /dev/null
+++ b/lib/CL/devices/cuda/cuda.c
@@ -0,0 +1,185 @@
+/* cuda.c - a pocl device driver for the CUDA driver API.
+
+   Copyright (c) 2013 <placeholder>
+   
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+   
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+   
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include "pocl-cuda.h"
+
+#include <cuda.h>
+#include <stdlib.h>
+
+#define COMMAND_LENGTH 2048
+
+struct data {
+  CUcontext ctx;
+};
+
+void
+pocl_cuda_init (cl_device_id device, const char* parameters)
+{
+  struct data *d;
+  size_t free, total;
+  d = (struct data *) malloc (sizeof (struct data));
+  device->data = d;
+  if (cuInit(0) != CUDA_SUCCESS) { fprintf(stderr, "cuInit failed\n"); abort(); }
+  if (cuCtxCreate(&d->ctx, 0, 0) != CUDA_SUCCESS) { fprintf(stderr, "cuCtxCreate failed\n"); abort(); }
+  if (cuMemGetInfo(&free, &total) != CUDA_SUCCESS) { fprintf(stderr, "cuMemGetInfo failed\n"); abort(); }
+  device->global_mem_size = total;
+  device->max_mem_alloc_size = total;
+  device->available = CL_TRUE;
+}
+
+void
+pocl_cuda_uninit (cl_device_id device)
+{
+  struct data *d = device->data;
+  if (cuCtxDestroy(d->ctx) != CUDA_SUCCESS) { fprintf(stderr, "cuCtxDestroy failed\n"); abort(); }
+}
+
+void *
+pocl_cuda_malloc (void *device_data, cl_mem_flags flags, size_t size, void *host_ptr)
+{
+  void *b;
+  struct data* d = (struct data*)device_data;
+  if (flags & CL_MEM_USE_HOST_PTR && host_ptr != NULL)
+    {
+      return host_ptr;
+    }
+  if (flags & CL_MEM_ALLOC_HOST_PTR)
+    {
+      if (cuMemAllocHost(&b, size) != CUDA_SUCCESS) { fprintf(stderr, "cuMemAllocHost failed\n"); abort(); }
+      if (flags & CL_MEM_COPY_HOST_PTR)
+        {
+          if (cuMemcpyHtoH(b, host_ptr, size) != CUDA_SUCCESS) { fprintf(stderr, "cuMemcpyHtoH failed\n"); abort(); }
+        }
+      return b;
+    }
+  else
+    {
+      if (cuMemAlloc(&b, size) != CUDA_SUCCESS) { fprintf(stderr, "cuMemAllocHost failed\n"); abort(); }
+      if (flags & CL_MEM_COPY_HOST_PTR)
+        {
+          if (cuMemcpyHtoD(b, host_ptr, size) != CUDA_SUCCESS) { fprintf(stderr, "cuMemcpyHtoD failed\n"); abort(); }
+        }
+      return b;
+    }
+  return NULL;
+}
+
+void
+pocl_cuda_free (void *data, cl_mem_flags flags, void *ptr)
+{
+  if (flags & CL_MEM_ALLOC_HOST_PTR)
+    {
+      if (cuMemFreeHost(ptr) != CUDA_SUCCESS) { fprintf(stderr, "cuMemFreeHost failed\n"); abort(); }
+    }
+  else
+    {
+      if (cuMemFree(ptr) != CUDA_SUCCESS) { fprintf(stderr, "cuMemFree failed\n"); abort(); }
+    }
+}
+
+void
+pocl_cuda_run (void *data, _cl_command_node* cmd)
+{
+  struct data *d = data;
+  int error;
+  int i;
+  char bytecode[POCL_FILENAME_LENGTH];
+  char assembly[POCL_FILENAME_LENGTH];
+  char command[COMMAND_LENGTH];
+  char* tmpdir = cmd->command.run.tmp_dir;
+  cl_kernel kernel = cmd->command.run.kernel;
+  struct pocl_context *pc = &cmd->command.run.pc;
+  CUmodule module;
+  CUfunction func;
+
+  error = snprintf (bytecode, POCL_FILENAME_LENGTH,
+                    "%s/%s", tmpdir, POCL_PARALLEL_BC_FILENAME);
+  assert (error >= 0);
+  
+  error = snprintf (assembly, POCL_FILENAME_LENGTH,
+                    "%s/parallel.s",
+                    tmpdir);
+  assert (error >= 0);
+  
+  error = snprintf (command, COMMAND_LENGTH,
+                    LLC " " HOST_LLC_FLAGS " -mcpu=sm_20 -mattr=ptx30" " -o %s %s",
+                    assembly,
+                    bytecode);
+  assert (error >= 0);
+  
+  error = system (command);
+  assert (error == 0);
+
+  if (cuModuleLoad(&module, assembly) != CUDA_SUCCESS) { fprintf(stderr, "cuModuleLoad failed\n"); abort(); }
+  if (cuModuleGetFunction(&func, module, kernel->function_name) != CUDA_SUCCESS) { fprintf(stderr, "cuModuleGetFunction failed\n"); abort(); }
+
+  void *params[10];
+  for (i = 0; i < kernel->num_args + kernel->num_locals; ++i)
+    {
+      cl_mem *mem = kernel->dyn_arguments[i].value;
+      params[i] = &(*mem)->device_ptrs[0];
+    }
+  if (cuLaunchKernel(func, pc->num_groups[0], pc->num_groups[1], pc->num_groups[2], pc->num_items_in_group[0], pc->num_items_in_group[1], pc->num_items_in_group[2], 0, 0, params, 0) != CUDA_SUCCESS) { fprintf(stderr, "cuLauchKernel failed\n"); abort(); }
+  if (cuCtxSynchronize() != CUDA_SUCCESS) { fprintf(stderr, "cuCtxSynchronize failed\n"); abort(); }
+}
+
+void
+pocl_cuda_read (void *data, void *host_ptr, const void *device_ptr, size_t cb)
+{
+  if (host_ptr == device_ptr)
+    return;
+
+  if (cuMemcpyDtoH(host_ptr, device_ptr, cb) != CUDA_SUCCESS) { fprintf(stderr, "cuMemcpyDtoH failed\n"); abort(); }
+}
+
+void
+pocl_cuda_write (void *data, const void *host_ptr, void *device_ptr, size_t cb)
+{
+  if (host_ptr == device_ptr)
+    return;
+
+  if (cuMemcpyHtoD(device_ptr, host_ptr, cb) != CUDA_SUCCESS) { fprintf(stderr, "cuMemcpyHtoD failed\n"); abort(); }
+}
+
+void *
+pocl_cuda_map_mem (void *data, void *buf_ptr,
+                   size_t offset, size_t size,
+                   void *host_ptr)
+{
+  if (host_ptr == NULL)
+    {
+      if (cuMemAllocHost(&host_ptr, size) != CUDA_SUCCESS) { fprintf(stderr, "cuMemAllocHost failed\n"); abort(); }
+    }
+  else
+    {
+      if (cuMemcpyDtoH(host_ptr, buf_ptr, size) != CUDA_SUCCESS) { fprintf(stderr, "cuMemcpyDtoH failed\n"); abort(); }
+    }
+  return host_ptr;
+}
+
+void *
+pocl_cuda_unmap_mem (void *data, void *host_ptr, void *device_start_ptr, size_t size)
+{
+  if (cuMemcpyHtoD(device_start_ptr, host_ptr, size) != CUDA_SUCCESS) { fprintf(stderr, "cuMemcpyHtoD failed\n"); abort(); }
+  if (cuMemFreeHost(host_ptr) != CUDA_SUCCESS) { fprintf(stderr, "cuMemFreeHost failed\n"); abort(); }
+}
diff --git a/lib/CL/devices/cuda/pocl-cuda.h b/lib/CL/devices/cuda/pocl-cuda.h
new file mode 100644
index 0000000..6f11b9e
--- /dev/null
+++ b/lib/CL/devices/cuda/pocl-cuda.h
@@ -0,0 +1,148 @@
+/* cuda.h - a pocl device driver for Cell SPU.
+
+   Copyright (c) 2013 <placeholder>
+   
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+   
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+   
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#ifndef POCL_CUDA_H
+#define POCL_CUDA_H
+
+#include "pocl_cl.h"
+#include "pocl_icd.h"
+#include "bufalloc.h"
+
+#include "prototypes.inc"
+
+/* simplistic linker script: 
+ * this is the SPU local address where 'OpenCL global' memory starts.
+ * (if we merge the spus to a single device, this is the 'OpenCL local' memory
+ * 
+ * The idea is to allocate
+ * 64k (0-64k) for text.
+ * 128k (64k-192k) for Opencl local memory.
+ * 64k (192k-256k) for stack + heap (if any)
+ * 
+ * I was unable to place the stack to start at 0x20000, thus the "unclean" division.
+ */
+#define CUDA_OCL_BUFFERS_START 0x10000
+#define CUDA_OCL_BUFFERS_SIZE  0x20000
+#define CUDA_KERNEL_CMD_ADDR   0x30000
+//#define CUDA_OCL_KERNEL_ADDRESS 0x2000
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+GEN_PROTOTYPES (cuda)
+
+#define POCL_DEVICES_CUDA {						\
+  POCL_DEVICE_ICD_DISPATCH     						\
+  POCL_OBJECT_INIT, \
+  CL_DEVICE_TYPE_GPU, /* type */					\
+  0, /* vendor_id */							\
+  1, /* max_compute_units */						\
+  3, /* max_work_item_dimensions */					\
+  {CL_INT_MAX, CL_INT_MAX, CL_INT_MAX}, /* max_work_item_sizes */		\
+  1024, /* max_work_group_size */					\
+  8, /* preferred_wg_size_multiple */                                \
+  POCL_DEVICES_PREFERRED_VECTOR_WIDTH_CHAR  , /* preferred_vector_width_char */ \
+  POCL_DEVICES_PREFERRED_VECTOR_WIDTH_SHORT , /* preferred_vector_width_short */ \
+  POCL_DEVICES_PREFERRED_VECTOR_WIDTH_INT   , /* preferred_vector_width_int */ \
+  POCL_DEVICES_PREFERRED_VECTOR_WIDTH_LONG  , /* preferred_vector_width_long */ \
+  POCL_DEVICES_PREFERRED_VECTOR_WIDTH_FLOAT , /* preferred_vector_width_float */ \
+  POCL_DEVICES_PREFERRED_VECTOR_WIDTH_DOUBLE, /* preferred_vector_width_double */ \
+  POCL_DEVICES_PREFERRED_VECTOR_WIDTH_HALF  , /* preferred_vector_width_half */ \
+  /* TODO: figure out what the difference between preferred and native widths are. */ \
+  POCL_DEVICES_PREFERRED_VECTOR_WIDTH_CHAR  , /* preferred_vector_width_char */ \
+  POCL_DEVICES_PREFERRED_VECTOR_WIDTH_SHORT , /* preferred_vector_width_short */ \
+  POCL_DEVICES_PREFERRED_VECTOR_WIDTH_INT   , /* preferred_vector_width_int */ \
+  POCL_DEVICES_PREFERRED_VECTOR_WIDTH_LONG  , /* preferred_vector_width_long */ \
+  POCL_DEVICES_PREFERRED_VECTOR_WIDTH_FLOAT , /* preferred_vector_width_float */ \
+  POCL_DEVICES_PREFERRED_VECTOR_WIDTH_DOUBLE, /* preferred_vector_width_double */ \
+  POCL_DEVICES_PREFERRED_VECTOR_WIDTH_HALF  , /* preferred_vector_width_half */ \
+  100, /* max_clock_frequency */						\
+  0, /* address_bits */							\
+  0, /* max_mem_alloc_size */						\
+  CL_FALSE, /* image_support */						\
+  0, /* max_read_image_args */						\
+  0, /* max_write_image_args */						\
+  0, /* image2d_max_width */						\
+  0, /* image2d_max_height */						\
+  0, /* image3d_max_width */						\
+  0, /* image3d_max_height */						\
+  0, /* image3d_max_depth */						\
+  0, /* max_samplers */							\
+  0, /* max_parameter_size */						\
+  0, /* mem_base_addr_align */						\
+  0, /* min_data_type_align_size */					\
+  CL_FP_ROUND_TO_NEAREST | CL_FP_INF_NAN, /* single_fp_config */	\
+  CL_FP_ROUND_TO_NEAREST | CL_FP_INF_NAN, /* double_fp_config */       \
+  CL_NONE, /* global_mem_cache_type */					\
+  0, /* global_mem_cacheline_size */					\
+  0, /* global_mem_cache_size */					\
+  0, /* global_mem_size */						\
+  0, /* max_constant_buffer_size */					\
+  0, /* max_constant_args */						\
+  CL_GLOBAL, /* local_mem_type */					\
+  0, /* local_mem_size */						\
+  CL_FALSE, /* error_correction_support */				\
+  CL_TRUE, /* host_unified_memory */                \
+  0, /* profiling_timer_resolution */					\
+  CL_FALSE, /* endian_little */						\
+  CL_TRUE, /* available */						\
+  CL_TRUE, /* compiler_available */					\
+  CL_EXEC_KERNEL, /*execution_capabilities */				\
+  CL_QUEUE_PROFILING_ENABLE, /* queue_properties */			\
+  0, /* platform */							\
+  "cuda", /* name */							\
+  "STI", /* vendor */							\
+  PACKAGE_VERSION, /* driver_version */						\
+  "EMBEDDED_PROFILE", /* profile */						\
+  "OpenCL 1.2 pocl", /* version */					\
+  "", /* extensions */							\
+  /* implementation */							\
+  pocl_cuda_uninit, /* uninit */                                     \
+  pocl_cuda_init, /* init */                                       \
+  pocl_cuda_malloc, /* malloc */					\
+  NULL, \
+  pocl_cuda_free, /* free */						\
+  pocl_cuda_read, /* read */						\
+  NULL, /* read_rect */				\
+  pocl_cuda_write, /* write */					\
+  NULL, /* write_rect */				\
+  NULL, /* copy */						\
+  NULL, /* copy_rect */				\
+  pocl_cuda_map_mem, /* map_mem */                              \
+  pocl_cuda_unmap_mem, /* unmap_mem */                    \
+  pocl_cuda_run, /* run */                         \
+  NULL,                \
+  NULL, /*pocl_cuda_build_program */	 \
+  NULL, /* data */                               \
+  "nvptx64", /* kernel_lib_target (forced kernel library dir) */    \
+  "nvptx64", /* llvm_target_triplet */               \
+  0 /* dev_id */ \
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* POCL_CUDA_H */
diff --git a/lib/CL/devices/devices.c b/lib/CL/devices/devices.c
index 4e69bad..45adadb 100644
--- a/lib/CL/devices/devices.c
+++ b/lib/CL/devices/devices.c
@@ -30,6 +30,8 @@
 #include "basic/basic.h"
 #include "pthread/pocl-pthread.h"
 
+#include "cuda/pocl-cuda.h"
+
 #if defined(BUILD_SPU)
 #include "cellspu/cellspu.h"
 #endif
@@ -43,9 +45,9 @@ struct _cl_device_id* pocl_devices = NULL;
 int pocl_num_devices = 0;
 
 #ifdef TCE_AVAILABLE
-#define POCL_NUM_DEVICE_TYPES 4
+#define POCL_NUM_DEVICE_TYPES 5
 #else
-#define POCL_NUM_DEVICE_TYPES 3
+#define POCL_NUM_DEVICE_TYPES 4
 #endif
 
 /* All device drivers available to the pocl. */
@@ -58,6 +60,7 @@ static struct _cl_device_id pocl_device_types[POCL_NUM_DEVICE_TYPES] = {
 #if defined(TCE_AVAILABLE)
   POCL_DEVICES_TTASIM,
 #endif
+  POCL_DEVICES_CUDA,
 };
 
 void 
@@ -75,7 +78,7 @@ pocl_init_devices()
     }
   else
     {
-      device_list = "pthread";
+      device_list = "pthread cuda";
     }
   
   ptr = tofree = strdup(device_list);
diff --git a/lib/CL/devices/prototypes.inc b/lib/CL/devices/prototypes.inc
index b044c1f..8b2721d 100644
--- a/lib/CL/devices/prototypes.inc
+++ b/lib/CL/devices/prototypes.inc
@@ -64,6 +64,8 @@
   void*                                                                 \
   pocl_##__DRV__##_map_mem (void *data, void *buf_ptr,                      \
                         size_t offset, size_t size, void *host_ptr); \
+  void*                                                                 \
+  pocl_##__DRV__##_unmap_mem (void *data, void *host_ptr, void *device_start_ptr, size_t size); \
   cl_ulong pocl_##__DRV__##_get_timer_value(void *data); \
   int pocl_##__DRV__##_build_program (void *data, char *source_fn, char *binary_fn, char *default_cmd, char *dev_tmpdir);\
   _Pragma ("GCC visibility pop")                                        \
diff --git a/lib/kernel/Makefile.am b/lib/kernel/Makefile.am
index e6a4895..f2a8059 100644
--- a/lib/kernel/Makefile.am
+++ b/lib/kernel/Makefile.am
@@ -22,7 +22,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
 
-DIST_SUBDIRS = tce x86_64 powerpc powerpc64 cellspu arm
+DIST_SUBDIRS = tce x86_64 powerpc powerpc64 cellspu arm nvptx64
 SUBDIRS = @OCL_TARGETS@ 
 
 # The source files in this dir are not built directly
diff --git a/lib/kernel/nvptx64/Makefile.am b/lib/kernel/nvptx64/Makefile.am
new file mode 100644
index 0000000..d8f2463
--- /dev/null
+++ b/lib/kernel/nvptx64/Makefile.am
@@ -0,0 +1,52 @@
+# Process this file with automake to produce Makefile.in (in this,
+# and all subdirectories).
+# Makefile.am for pocl/lib/kernel/nvptx64. 
+#
+# The nvptx64 target.
+# 
+# Copyright (c) 2013 <placeholder>
+# 
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# 
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#
+# Authors: Tom Stellard <[email protected]>
+#
+
+KERNEL_TARGET=nvptx64
+TARGET_DIR=nvptx64
+
+targetpkglibdir = $(pkglibdir)/nvptx64
+targetpkglib_DATA = kernel-nvptx64.bc
+
+include ../sources.mk
+
+get_group_id.bc: get_group_id.ll
+	@LLVM_AS@ -o $@ $<
+get_local_id.bc: get_local_id.ll
+	@LLVM_AS@ -o $@ $<
+get_local_size.bc: get_local_size.ll
+	@LLVM_AS@ -o $@ $<
+get_num_groups.bc: get_num_groups.ll
+	@LLVM_AS@ -o $@ $<
+get_global_id.bc: get_global_id.ll
+	@LLVM_AS@ -o $@ $<
+get_global_size.bc: get_global_size.ll
+	@LLVM_AS@ -o $@ $<
+
+all: kernel-nvptx64.bc
+
diff --git a/lib/kernel/nvptx64/get_global_id.ll b/lib/kernel/nvptx64/get_global_id.ll
new file mode 100644
index 0000000..b7d8305
--- /dev/null
+++ b/lib/kernel/nvptx64/get_global_id.ll
@@ -0,0 +1,61 @@
+; ModuleID = '/tmp/get_global_id.cl'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64"
+
+define ptx_device i64 @get_global_id(i32 %dim) nounwind readnone alwaysinline {
+entry:
+  switch i32 %dim, label %return [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb1
+    i32 2, label %sw.bb5
+  ]
+
+sw.bb:                                            ; preds = %entry
+  %0 = tail call i32 @llvm.ptx.read.tid.x()
+  %1 = tail call i32 @llvm.ptx.read.ntid.x()
+  %2 = tail call i32 @llvm.ptx.read.ctaid.x()
+  %mul = mul nsw i32 %2, %1
+  %add = add nsw i32 %mul, %0
+  %conv = sext i32 %add to i64
+  br label %return
+
+sw.bb1:                                           ; preds = %entry
+  %3 = tail call i32 @llvm.ptx.read.tid.y()
+  %4 = tail call i32 @llvm.ptx.read.ntid.y()
+  %5 = tail call i32 @llvm.ptx.read.ctaid.y()
+  %mul2 = mul nsw i32 %5, %4
+  %add3 = add nsw i32 %mul2, %3
+  %conv4 = sext i32 %add3 to i64
+  br label %return
+
+sw.bb5:                                           ; preds = %entry
+  %6 = tail call i32 @llvm.ptx.read.tid.z()
+  %7 = tail call i32 @llvm.ptx.read.ntid.z()
+  %8 = tail call i32 @llvm.ptx.read.ctaid.z()
+  %mul6 = mul nsw i32 %8, %7
+  %add7 = add nsw i32 %mul6, %6
+  %conv8 = sext i32 %add7 to i64
+  br label %return
+
+return:                                           ; preds = %entry, %sw.bb5, %sw.bb1, %sw.bb
+  %retval.0 = phi i64 [ %conv8, %sw.bb5 ], [ %conv4, %sw.bb1 ], [ %conv, %sw.bb ], [ 0, %entry ]
+  ret i64 %retval.0
+}
+
+declare i32 @llvm.ptx.read.tid.x() nounwind readnone
+
+declare i32 @llvm.ptx.read.ntid.x() nounwind readnone
+
+declare i32 @llvm.ptx.read.ctaid.x() nounwind readnone
+
+declare i32 @llvm.ptx.read.tid.y() nounwind readnone
+
+declare i32 @llvm.ptx.read.ntid.y() nounwind readnone
+
+declare i32 @llvm.ptx.read.ctaid.y() nounwind readnone
+
+declare i32 @llvm.ptx.read.tid.z() nounwind readnone
+
+declare i32 @llvm.ptx.read.ntid.z() nounwind readnone
+
+declare i32 @llvm.ptx.read.ctaid.z() nounwind readnone
diff --git a/lib/kernel/nvptx64/get_global_size.ll b/lib/kernel/nvptx64/get_global_size.ll
new file mode 100644
index 0000000..fbc710f
--- /dev/null
+++ b/lib/kernel/nvptx64/get_global_size.ll
@@ -0,0 +1,49 @@
+; ModuleID = '/tmp/get_global_size.cl'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64"
+
+define ptx_device i64 @get_global_size(i32 %dim) nounwind readnone alwaysinline {
+entry:
+  switch i32 %dim, label %return [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb1
+    i32 2, label %sw.bb4
+  ]
+
+sw.bb:                                            ; preds = %entry
+  %0 = tail call i32 @llvm.ptx.read.ntid.x()
+  %1 = tail call i32 @llvm.ptx.read.nctaid.x()
+  %mul = mul nsw i32 %1, %0
+  %conv = sext i32 %mul to i64
+  br label %return
+
+sw.bb1:                                           ; preds = %entry
+  %2 = tail call i32 @llvm.ptx.read.ntid.y()
+  %3 = tail call i32 @llvm.ptx.read.nctaid.y()
+  %mul2 = mul nsw i32 %3, %2
+  %conv3 = sext i32 %mul2 to i64
+  br label %return
+
+sw.bb4:                                           ; preds = %entry
+  %4 = tail call i32 @llvm.ptx.read.ntid.z()
+  %5 = tail call i32 @llvm.ptx.read.nctaid.z()
+  %mul5 = mul nsw i32 %5, %4
+  %conv6 = sext i32 %mul5 to i64
+  br label %return
+
+return:                                           ; preds = %entry, %sw.bb4, %sw.bb1, %sw.bb
+  %retval.0 = phi i64 [ %conv6, %sw.bb4 ], [ %conv3, %sw.bb1 ], [ %conv, %sw.bb ], [ 0, %entry ]
+  ret i64 %retval.0
+}
+
+declare i32 @llvm.ptx.read.ntid.x() nounwind readnone
+
+declare i32 @llvm.ptx.read.nctaid.x() nounwind readnone
+
+declare i32 @llvm.ptx.read.ntid.y() nounwind readnone
+
+declare i32 @llvm.ptx.read.nctaid.y() nounwind readnone
+
+declare i32 @llvm.ptx.read.ntid.z() nounwind readnone
+
+declare i32 @llvm.ptx.read.nctaid.z() nounwind readnone
diff --git a/lib/kernel/nvptx64/get_group_id.ll b/lib/kernel/nvptx64/get_group_id.ll
new file mode 100644
index 0000000..97b9fb5
--- /dev/null
+++ b/lib/kernel/nvptx64/get_group_id.ll
@@ -0,0 +1,37 @@
+; ModuleID = 'get_group_id.cl'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64"
+
+define ptx_device i64 @get_group_id(i32 %dim) nounwind readnone alwaysinline {
+entry:
+  switch i32 %dim, label %return [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb1
+    i32 2, label %sw.bb3
+  ]
+
+sw.bb:                                            ; preds = %entry
+  %0 = tail call i32 @llvm.ptx.read.ctaid.x()
+  %conv = sext i32 %0 to i64
+  br label %return
+
+sw.bb1:                                           ; preds = %entry
+  %1 = tail call i32 @llvm.ptx.read.ctaid.y()
+  %conv2 = sext i32 %1 to i64
+  br label %return
+
+sw.bb3:                                           ; preds = %entry
+  %2 = tail call i32 @llvm.ptx.read.ctaid.z()
+  %conv4 = sext i32 %2 to i64
+  br label %return
+
+return:                                           ; preds = %entry, %sw.bb3, %sw.bb1, %sw.bb
+  %retval.0 = phi i64 [ %conv4, %sw.bb3 ], [ %conv2, %sw.bb1 ], [ %conv, %sw.bb ], [ 0, %entry ]
+  ret i64 %retval.0
+}
+
+declare i32 @llvm.ptx.read.ctaid.x() nounwind readnone
+
+declare i32 @llvm.ptx.read.ctaid.y() nounwind readnone
+
+declare i32 @llvm.ptx.read.ctaid.z() nounwind readnone
diff --git a/lib/kernel/nvptx64/get_local_id.ll b/lib/kernel/nvptx64/get_local_id.ll
new file mode 100644
index 0000000..60144e9
--- /dev/null
+++ b/lib/kernel/nvptx64/get_local_id.ll
@@ -0,0 +1,37 @@
+; ModuleID = 'get_local_id.cl'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64"
+
+define ptx_device i64 @get_local_id(i32 %dim) nounwind readnone alwaysinline {
+entry:
+  switch i32 %dim, label %return [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb1
+    i32 2, label %sw.bb3
+  ]
+
+sw.bb:                                            ; preds = %entry
+  %0 = tail call i32 @llvm.ptx.read.tid.x()
+  %conv = sext i32 %0 to i64
+  br label %return
+
+sw.bb1:                                           ; preds = %entry
+  %1 = tail call i32 @llvm.ptx.read.tid.y()
+  %conv2 = sext i32 %1 to i64
+  br label %return
+
+sw.bb3:                                           ; preds = %entry
+  %2 = tail call i32 @llvm.ptx.read.tid.z()
+  %conv4 = sext i32 %2 to i64
+  br label %return
+
+return:                                           ; preds = %entry, %sw.bb3, %sw.bb1, %sw.bb
+  %retval.0 = phi i64 [ %conv4, %sw.bb3 ], [ %conv2, %sw.bb1 ], [ %conv, %sw.bb ], [ 0, %entry ]
+  ret i64 %retval.0
+}
+
+declare i32 @llvm.ptx.read.tid.x() nounwind readnone
+
+declare i32 @llvm.ptx.read.tid.y() nounwind readnone
+
+declare i32 @llvm.ptx.read.tid.z() nounwind readnone
diff --git a/lib/kernel/nvptx64/get_local_size.ll b/lib/kernel/nvptx64/get_local_size.ll
new file mode 100644
index 0000000..3707bfb
--- /dev/null
+++ b/lib/kernel/nvptx64/get_local_size.ll
@@ -0,0 +1,37 @@
+; ModuleID = 'get_local_size.cl'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64"
+
+define ptx_device i64 @get_local_size(i32 %dim) nounwind readnone alwaysinline {
+entry:
+  switch i32 %dim, label %return [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb1
+    i32 2, label %sw.bb3
+  ]
+
+sw.bb:                                            ; preds = %entry
+  %0 = tail call i32 @llvm.ptx.read.ntid.x()
+  %conv = sext i32 %0 to i64
+  br label %return
+
+sw.bb1:                                           ; preds = %entry
+  %1 = tail call i32 @llvm.ptx.read.ntid.y()
+  %conv2 = sext i32 %1 to i64
+  br label %return
+
+sw.bb3:                                           ; preds = %entry
+  %2 = tail call i32 @llvm.ptx.read.ntid.z()
+  %conv4 = sext i32 %2 to i64
+  br label %return
+
+return:                                           ; preds = %entry, %sw.bb3, %sw.bb1, %sw.bb
+  %retval.0 = phi i64 [ %conv4, %sw.bb3 ], [ %conv2, %sw.bb1 ], [ %conv, %sw.bb ], [ 0, %entry ]
+  ret i64 %retval.0
+}
+
+declare i32 @llvm.ptx.read.ntid.x() nounwind readnone
+
+declare i32 @llvm.ptx.read.ntid.y() nounwind readnone
+
+declare i32 @llvm.ptx.read.ntid.z() nounwind readnone
diff --git a/lib/kernel/nvptx64/get_num_groups.ll b/lib/kernel/nvptx64/get_num_groups.ll
new file mode 100644
index 0000000..8b3028a
--- /dev/null
+++ b/lib/kernel/nvptx64/get_num_groups.ll
@@ -0,0 +1,37 @@
+; ModuleID = 'get_num_groups.cl'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64"
+
+define ptx_device i64 @get_num_groups(i32 %dim) nounwind readnone alwaysinline {
+entry:
+  switch i32 %dim, label %return [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb1
+    i32 2, label %sw.bb3
+  ]
+
+sw.bb:                                            ; preds = %entry
+  %0 = tail call i32 @llvm.ptx.read.nctaid.x()
+  %conv = sext i32 %0 to i64
+  br label %return
+
+sw.bb1:                                           ; preds = %entry
+  %1 = tail call i32 @llvm.ptx.read.nctaid.y()
+  %conv2 = sext i32 %1 to i64
+  br label %return
+
+sw.bb3:                                           ; preds = %entry
+  %2 = tail call i32 @llvm.ptx.read.nctaid.z()
+  %conv4 = sext i32 %2 to i64
+  br label %return
+
+return:                                           ; preds = %entry, %sw.bb3, %sw.bb1, %sw.bb
+  %retval.0 = phi i64 [ %conv4, %sw.bb3 ], [ %conv2, %sw.bb1 ], [ %conv, %sw.bb ], [ 0, %entry ]
+  ret i64 %retval.0
+}
+
+declare i32 @llvm.ptx.read.nctaid.x() nounwind readnone
+
+declare i32 @llvm.ptx.read.nctaid.y() nounwind readnone
+
+declare i32 @llvm.ptx.read.nctaid.z() nounwind readnone
diff --git a/scripts/pocl-build.in b/scripts/pocl-build.in
index 5c3363f..3aa30eb 100644
--- a/scripts/pocl-build.in
+++ b/scripts/pocl-build.in
@@ -48,6 +48,7 @@ case $target in
   arm*)        target_dir="arm"
                target="armv7";;
   cellspu-*)   target_dir="cellspu";;
+  nvptx64-*)   target_dir="nvptx64";;
   powerpc-*) target_dir="powerpc";;
   powerpc64-*) target_dir="powerpc64";;
   tce-*)       target_dir="tce";;
diff --git a/scripts/pocl-kernel.in b/scripts/pocl-kernel.in
index c293523..d26cb9e 100644
--- a/scripts/pocl-kernel.in
+++ b/scripts/pocl-kernel.in
@@ -63,6 +63,8 @@ fi
 
 #pure clang doesn't allow "-target tce-tut-llvm"
 case $target in
+  nvptx*)
+    target_flags="" ;;
   tce-*)
     target_flags="" ;;
   *)
diff --git a/scripts/pocl-workgroup.in b/scripts/pocl-workgroup.in
index 7051e14..a654c5f 100644
--- a/scripts/pocl-workgroup.in
+++ b/scripts/pocl-workgroup.in
@@ -74,6 +74,7 @@ target_dir=${target}
 case $target in
   arm*)        target_dir="arm";;
   cellspu-*)   target_dir="cellspu";;
+  nvptx64-*)   target_dir="nvptx64";;
   powerpc-*)   target_dir="powerpc";;
   powerpc64-*) target_dir="powerpc64";;
   tce-*)       target_dir="tce";;
@@ -138,10 +139,19 @@ fi
 
 #set -x
 
-@OPT@ -load=${pocl_lib} -domtree -workitem-handler-chooser -break-constgeps -generate-header -flatten -always-inline \
-    -globaldce -simplifycfg -loop-simplify -phistoallocas -isolate-regions -loop-barriers \
-    -barriertails -barriers -isolate-regions -add-wi-metadata -wi-aa -workitemrepl -workitemloops  \
-    -workgroup -kernel=${kernel} -local-size=${size_x} ${size_y} ${size_z} ${OPT_SWITCH} ${WG_VECTORIZER} \
-    ${EXTRA_OPTS} -instcombine -header=${header} -o ${output_file} ${linked_bc}
+case $target in
+  nvptx*)
+    @OPT@ -internalize-public-api-list=${kernel} -internalize -inline-threshold=1000000000 -inline \
+          -globaldce \
+          -o ${output_file} ${linked_bc}
+    ;;
+  *)
+    @OPT@ -load=${pocl_lib} -domtree -workitem-handler-chooser -break-constgeps -generate-header -flatten -always-inline \
+        -globaldce -simplifycfg -loop-simplify -phistoallocas -isolate-regions -loop-barriers \
+        -barriertails -barriers -isolate-regions -add-wi-metadata -wi-aa -workitemrepl -workitemloops  \
+        -workgroup -kernel=${kernel} -local-size=${size_x} ${size_y} ${size_z} ${OPT_SWITCH} ${WG_VECTORIZER} \
+        ${EXTRA_OPTS} -instcombine -header=${header} -o ${output_file} ${linked_bc}
+    ;;
+esac
 
 #set +x

------------------------------------------------------------------------------
Everyone hates slow websites. So do we.
Make your web apps faster with AppDynamics
Download AppDynamics Lite for free today:
http://p.sf.net/sfu/appdyn_d2d_mar

_______________________________________________
pocl-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/pocl-devel

Re: [pocl-devel] NVIDIA device backend for POCL

Reply via email to