Hi,
I wrote a program which uses pyOpenCL to call clAmdBlas DGEMM . This example
is a recoding of example_sgemm.c in the clAmdBlas distribution, changed to use
Python, Cython, NumPY, pyOpenCL and DGEMM. The reason for doing this is to
show how to access fast matrix multiplication which is also host-tunable, while
at the same time be able to mix with other pyOpenCL calls for example for
simple addition of matrices and for random number generation. I followed this
route for two reasons:
1. It is hard to find self-contained OpenCL kernels on the Internet for
fast matrix multiplication for matrices of any dimension. Most examples assume
a dimension which happens to fit the gadget. More pragmatic matrix
multiplication codes use a C++ program to generate the kernel on the fly based
on the dimensions. This is what clAmdBlas does. I didn't find any Python
codes that replicate this.
2. I couldn't figure out by reading the pyOpenCL documentation how to
multiply two matrices. I guessed I could do it with Reikna, but I wasn't
convinced the result would be dimension-optimised. I looked in the code for
matrix multiplication in the pyOpenCL distribution and it looked at first
glance like an element-wise multiply.
I am sharing this with PyOpenCL mailing list in case PyOpenCL folks want to
incorporate wrappers for clAmdBlas and clAmdRnd (references below). Here is
the Cython header for clAmdBlasDGEMM:
from libc.stdint cimport intptr_t, uintptr_t
cdef extern from "clAmdBlas.h":
enum:
CL_SUCCESS = 0
CL_INVALID_VALUE = -30
CL_INVALID_COMMAND_QUEUE = -36
CL_INVALID_CONTEXT = -34
CL_INVALID_MEM_OBJECT = -38
CL_INVALID_DEVICE = -33
CL_INVALID_EVENT_WAIT_LIST = -57
CL_OUT_OF_RESOURCES = -5
CL_OUT_OF_HOST_MEMORY = -6
CL_INVALID_OPERATION = -59
CL_COMPILER_NOT_AVAILABLE = -3
CL_BUILD_PROGRAM_FAILURE = -11
enum clAmdBlasStatus:
clAmdBlasSuccess = CL_SUCCESS
clAmdBlasInvalidValue = CL_INVALID_VALUE
clAmdBlasInvalidCommandQueue = CL_INVALID_COMMAND_QUEUE
clAmdBlasInvalidContext = CL_INVALID_CONTEXT
clAmdBlasInvalidMemObject = CL_INVALID_MEM_OBJECT
clAmdBlasInvalidDevice = CL_INVALID_DEVICE
clAmdBlasInvalidEventWaitList = CL_INVALID_EVENT_WAIT_LIST
clAmdBlasOutOfResources = CL_OUT_OF_RESOURCES
clAmdBlasOutOfHostMemory = CL_OUT_OF_HOST_MEMORY
clAmdBlasInvalidOperation = CL_INVALID_OPERATION
clAmdBlasCompilerNotAvailable = CL_COMPILER_NOT_AVAILABLE
clAmdBlasBuildProgramFailure = CL_BUILD_PROGRAM_FAILURE
clAmdBlasNotImplemented = -1024
clAmdBlasNotInitialized = -1023
clAmdBlasInvalidMatA
clAmdBlasInvalidMatB
clAmdBlasInvalidMatC
clAmdBlasInvalidVecX
clAmdBlasInvalidVecY
clAmdBlasInvalidDim
clAmdBlasInvalidLeadDimA
clAmdBlasInvalidLeadDimB
clAmdBlasInvalidLeadDimC
clAmdBlasInvalidIncX
clAmdBlasInvalidIncY
clAmdBlasInsufficientMemMatA
clAmdBlasInsufficientMemMatB
clAmdBlasInsufficientMemMatC
clAmdBlasInsufficientMemVecX
clAmdBlasInsufficientMemVecY
enum clAmdBlasOrder:
clAmdBlasRowMajor = 0
clAmdBlasColumnMajor = 1
enum clAmdBlasTranspose:
clAmdBlasNoTrans = 0
clAmdBlasTrans = 1
clAmdBlasConjTrans = 2
ctypedef unsigned int cl_uint
ctypedef signed int cl_int
ctypedef float cl_float
ctypedef double cl_double
ctypedef void* cl_mem
ctypedef void* cl_command_queue
ctypedef void* cl_event
ctypedef void* cl_platform_id
ctypedef void* cl_device_id
ctypedef void* cl_context
ctypedef unsigned long cl_ulong
ctypedef cl_ulong cl_bitfield
ctypedef cl_bitfield cl_device_type
cl_device_type CL_DEVICE_TYPE_GPU = (1 << 2)
ctypedef cl_bitfield cl_command_queue_properties
ctypedef cl_uint cl_bool
cl_bool CL_FALSE = 0
cl_bool CL_TRUE = 1
ctypedef cl_bitfield cl_mem_flags
cl_mem_flags CL_MEM_READ_ONLY = (1 << 2)
cl_mem_flags CL_MEM_READ_WRITE = (1 << 0)
cl_int clGetPlatformIDs(cl_uint num_entries, cl_platform_id *platforms,
cl_uint *num_platforms)
cl_int clGetDeviceIDs(cl_platform_id platform, cl_device_type device_type,
cl_uint num_entries, cl_device_id *devices, cl_uint
*num_devices)
ctypedef intptr_t cl_context_properties
cl_context_properties CL_CONTEXT_PLATFORM = 0x1084
cl_context clCreateContext(cl_context_properties *properties, cl_uint
ndevices, cl_device_id *devices,
void *pfn_notify, void *user_data, cl_int
*errcode_ret)
cl_command_queue clCreateCommandQueue(cl_context context, cl_device_id
device,
cl_command_queue_properties
properties, cl_int *errcode_ret)
cl_int clEnqueueReadBuffer(cl_command_queue command_queue,
cl_mem buffer,
cl_bool blocking_read,
size_t offset,
size_t size,
void * ptr,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event)
cl_int clEnqueueWriteBuffer(cl_command_queue command_queue,
cl_mem buffer,
cl_bool blocking_write,
size_t offset,
size_t size,
const void * ptr,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event)
cl_int clReleaseContext(cl_context context)
cl_int clReleaseCommandQueue(cl_command_queue command_queue)
cl_mem clCreateBuffer(cl_context context,
cl_mem_flags flags,
size_t size,
void * host_ptr,
cl_int * errcode_ret)
cl_int clWaitForEvents(cl_uint num_events, const cl_event * event_list)
cl_int clReleaseMemObject(cl_mem memobj)
clAmdBlasStatus clAmdBlasSetup( )
void clAmdBlasTeardown( )
clAmdBlasStatus clAmdBlasDgemm(clAmdBlasOrder order,
clAmdBlasTranspose transA,
clAmdBlasTranspose transB,
size_t M,
size_t N,
size_t K,
cl_double alpha,
const cl_mem A,
size_t lda,
const cl_mem B,
size_t ldb,
cl_double beta,
cl_mem C,
size_t ldc,
cl_uint numCommandQueues,
cl_command_queue *commandQueues,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *events)
Here is the main program:
import sys
from clAmdBlas cimport *
import ctypes
import numpy as np
cimport numpy as np
import pyopencl as cl
import pyopencl.array as cl_array
cdef class GPU:
cdef cl_platform_id platform
cdef cl_device_id device
cdef cl_event event
cdef cl_command_queue queue
cdef cl_context ctx
cdef cl_int err
cdef public py_platform, py_device, py_context, py_queue
def __dealloc__(self):
clAmdBlasTeardown()
print "clean up and shut down gadget"
def __cinit__(self):
self.event = NULL
cdef cl_context_properties props[3]
props[:] = [CL_CONTEXT_PLATFORM, 0, 0]
self.py_platform = cl.get_platforms()[0]
cdef intptr_t platform_p = self.py_platform.int_ptr
self.platform = <cl_platform_id>platform_p
self.py_device = self.py_platform.get_devices()[0]
cdef intptr_t device_p = self.py_device.int_ptr
self.device = <cl_device_id>device_p
self.py_context = cl.Context(devices=[self.py_device])
cdef intptr_t context_p = self.py_context.int_ptr
self.ctx = <cl_context>context_p
self.py_queue = cl.CommandQueue(self.py_context, device=self.py_device)
cdef intptr_t queue_p = <intptr_t>self.py_queue.int_ptr
self.queue = <cl_command_queue>queue_p
self.err = clAmdBlasSetup()
if self.err != CL_SUCCESS:
raise ValueError("clAmdBlasSetup() failed with %d" % self.err)
else:
print "Blas setup"
DTYPE = np.float64
ctypedef np.float64_t DTYPE_t
def dgemm_test():
gpu = GPU()
cdef size_t M = 4
cdef size_t N = 3
cdef size_t K = 5
cdef size_t A_size = M*K*sizeof(cl_double)
a =
np.ascontiguousarray([11,12,13,14,15,21,22,23,24,25,31,32,33,34,35,41,42,43,44,45],
dtype=DTYPE)
a_d = cl_array.to_device(gpu.py_queue, a)
cdef intptr_t a_d_p = a_d.data.int_ptr
cdef cl_mem bufA = <cl_mem> a_d_p
print "A"
cdef size_t B_size = K*N*sizeof(cl_double)
b = np.ascontiguousarray([11,12,13,21,22,23,31,32,33,41,42,43,51,52,53],
dtype=DTYPE)
b_d = cl_array.to_device(gpu.py_queue, b)
cdef intptr_t b_d_p = b_d.data.int_ptr
cdef cl_mem bufB = <cl_mem> b_d_p
print "B"
cdef size_t C_size = M*N*sizeof(cl_double)
c = np.ascontiguousarray([11,12,13,21,22,23,31,32,33,41,42,43], dtype=DTYPE)
c_d = cl_array.to_device(gpu.py_queue, c)
cdef intptr_t c_d_p = c_d.data.int_ptr
cdef cl_mem bufC = <cl_mem> c_d_p
print "C"
cdef size_t result_size = C_size
cdef np.ndarray[DTYPE_t, ndim=1] r = np.zeros(M*N, dtype=DTYPE)
cdef DTYPE_t[:] r_view = r
cdef cl_double *result = &r_view[0]
print "r"
cdef cl_double alpha = 10
cdef cl_double beta = 20
err =
clAmdBlasDgemm(clAmdBlasRowMajor,clAmdBlasNoTrans,clAmdBlasNoTrans,M,N,K,alpha,bufA,K,bufB,N,beta,bufC,N,1,&gpu.queue,0,NULL,&gpu.event)
if err != CL_SUCCESS:
return "clAmdBlasDgemm() failed with %d" % err
else:
print "DGEMMed"
err = clWaitForEvents(1, &gpu.event)
err = clEnqueueReadBuffer(gpu.queue, bufC, CL_TRUE, 0, result_size, result,
0, NULL, NULL)
return r
This then gets compiled to a Python module by running python
example_dgemm_setup.py build_ext -inplace with this build definition file
example_dgemm_setup.py:
from distutils.core import setup
from distutils.extension import Extension
from Cython.Build import cythonize
import numpy as np
extensions = [
Extension(name = 'example_dgemm',
sources = ['example_dgemm.pyx'],
include_dirs = [ "C:\\Program Files (x86)\\AMD APP
SDK\\2.9-1\\include",
"C:\\Program Files
(x86)\\AMD\\clAmdBlas\\include",
np.get_include() ],
library_dirs = ["C:\\Program Files (x86)\\AMD APP
SDK\\2.9-1\\bin\\x86_64",
"c:\\Program Files (x86)\\AMD\\clAmdBlas\\bin64"],
libraries=['clAmdBlas', 'OpenCL'])
]
extensions = cythonize(extensions)
setup(
ext_modules = extensions
)
This can then be imported into a Python script like this one cytest.py:
import example_dgemm
r = example_dgemm.dgemm_test()
print "expecting 21370 22040 22710 37070 38240 39410 52770 54440 56110 68470
70640 72810"
print r
This can then be run with "python cytest.py", giving this output:
python dytest.py
Blas setup
A
B
C
r
DGEMMed
clean up and shut down gadget
expecting 21370 22040 22710 37070 38240 39410 52770 54440 56110 68470 70640
72810
[ 21370. 22040. 22710. 37070. 38240. 39410. 52770. 54440. 56110.
68470. 70640. 72810.]
I was helped by looking at an initial wrapper from Kent Knox in the clMath
distribution, plus a couple of other clues mentioned in references. Note that
clAmdBlasTune.exe is undocumented but mentioned here and there on the Internet.
The code above isn't long, but involves NumPy, OpenCL, pyOpenCL, Cython and
MemoryViews, ctypes and libc.stdint.intptr_t, and about 10 cups of coffee.
References:
1. NumPy and Ctypes
a.
https://sage.math.washington.edu:8091/hudson/job/cython-docs/doclinks/1/src/userguide/memoryviews.html
b. https://docs.python.org/2/library/ctypes.html
c. http://docs.scipy.org/doc/numpy/user/c-info.python-as-glue.html
2. Cython
a. http://cython.org/
b. http://docs.cython.org/src/userguide/extension_types.html
3. DGEMM and clMath and clAmdBlas
a. http://www.math.utah.edu/software/lapack/lapack-blas/dgemm.html
b.
http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-math-libraries/
c. C:\Program Files (x86)\AMD APP SDK\2.9-1\include\CL\cl.h
d. c:/Program Files (x86)/AMD/clAmdBlas/samples/example_sgemm.c
e. c:/Program Files (x86)/AMD/clAmdBlas/bin64/clAmdBlas.dll
f. c:/Program Files (x86)/AMD/clAmdBlas/bin64/clAmdBlasTune.exe
g. C:\Program Files (x86)\AMD\clAmdBlas\include\clAmdBlas.h
h.
https://github.com/clMathLibraries/clBLAS/tree/master/src/wrappers/python
(seems correct but complete example not provided)
i. http://lists.tiker.net/pipermail/pyopencl/2014-February/001690.html
(incomplete attempt, coredumps maybe due to mixing raw OpenCL and pyOpenCL
deallocation)
j.
http://stackoverflow.com/questions/13396443/pyopencl-difference-between-to-device-and-buffer
k.
http://diffusionht.blogspot.com/2013/08/clamdblastune-ati-radeon-7970-linux.html
4. clRNG (similarly, would be nice to have in pyOpenCL)
http://developer.amd.com/community/blog/2015/05/05/amd-releases-clrng-an-opencl-random-number-library/
Thanks,
Lars Ericson
Quantitative Analytics Consultant
Market & Institutional Risk Management
Wells Fargo Bank, N.A. | 301 S. College St., 4th Floor | Charlotte, NC
28202-6000
MAC D1053-04X
Tel 704-410-2219 | Cell 917-891-1639
[email protected]<mailto:[email protected]>
_______________________________________________
PyOpenCL mailing list
[email protected]
http://lists.tiker.net/listinfo/pyopencl