Hi,

I wrote a program which uses pyOpenCL to call clAmdBlas DGEMM .  This example 
is a recoding of example_sgemm.c in the clAmdBlas distribution, changed to use 
Python, Cython, NumPy, pyOpenCL and DGEMM.  The reason for doing this is to 
show how to access fast matrix multiplication which is also host-tunable, while 
at the same time be able to  mix with other pyOpenCL calls for example for 
simple addition of matrices and for random number generation.   I followed this 
route for two reasons:


1.       It is hard to find self-contained OpenCL kernels on the Internet for 
fast matrix multiplication for matrices of any dimension.  Most examples assume 
a dimension which happens to fit the gadget.   More pragmatic matrix 
multiplication codes use a C++ program to generate the kernel on the fly based 
on the dimensions.  This is what clAmdBlas does.  I didn't find any Python 
codes that replicate this.

2.       I couldn't figure out by reading the pyOpenCL documentation how to 
multiply two matrices.  I guessed I could do it with Reikna, but I wasn't 
convinced the result would be dimension-optimised.  I looked in the code for 
matrix multiplication in the pyOpenCL distribution and it looked at first 
glance like an element-wise multiply.

I am sharing this with PyOpenCL mailing list in case PyOpenCL folks want to 
incorporate wrappers for clAmdBlas and clAmdRnd (references below).  Here is 
the Cython header for clAmdBlasDGEMM:

# Cython declarations for the subset of the OpenCL C API and the clAmdBlas
# API needed to call clAmdBlasDgemm.  Values mirror CL/cl.h and clAmdBlas.h;
# only the entry points used by the example are declared.
from libc.stdint cimport intptr_t, uintptr_t

cdef extern from "clAmdBlas.h":
    # OpenCL error codes (from CL/cl.h) reused by clAmdBlasStatus below.
    enum:
        CL_SUCCESS                      = 0
        CL_INVALID_VALUE                = -30
        CL_INVALID_COMMAND_QUEUE        = -36
        CL_INVALID_CONTEXT              = -34
        CL_INVALID_MEM_OBJECT           = -38
        CL_INVALID_DEVICE               = -33
        CL_INVALID_EVENT_WAIT_LIST      = -57
        CL_OUT_OF_RESOURCES             = -5
        CL_OUT_OF_HOST_MEMORY           = -6
        CL_INVALID_OPERATION            = -59
        CL_COMPILER_NOT_AVAILABLE       = -3
        CL_BUILD_PROGRAM_FAILURE        = -11

    # Status codes returned by clAmdBlas entry points; the first group maps
    # directly onto OpenCL error codes, the rest are BLAS-specific.
    enum clAmdBlasStatus:
        clAmdBlasSuccess               = CL_SUCCESS
        clAmdBlasInvalidValue          = CL_INVALID_VALUE
        clAmdBlasInvalidCommandQueue   = CL_INVALID_COMMAND_QUEUE
        clAmdBlasInvalidContext        = CL_INVALID_CONTEXT
        clAmdBlasInvalidMemObject      = CL_INVALID_MEM_OBJECT
        clAmdBlasInvalidDevice         = CL_INVALID_DEVICE
        clAmdBlasInvalidEventWaitList  = CL_INVALID_EVENT_WAIT_LIST
        clAmdBlasOutOfResources        = CL_OUT_OF_RESOURCES
        clAmdBlasOutOfHostMemory       = CL_OUT_OF_HOST_MEMORY
        clAmdBlasInvalidOperation      = CL_INVALID_OPERATION
        clAmdBlasCompilerNotAvailable  = CL_COMPILER_NOT_AVAILABLE
        clAmdBlasBuildProgramFailure   = CL_BUILD_PROGRAM_FAILURE
        clAmdBlasNotImplemented        = -1024
        clAmdBlasNotInitialized        = -1023
        clAmdBlasInvalidMatA
        clAmdBlasInvalidMatB
        clAmdBlasInvalidMatC
        clAmdBlasInvalidVecX
        clAmdBlasInvalidVecY
        clAmdBlasInvalidDim
        clAmdBlasInvalidLeadDimA
        clAmdBlasInvalidLeadDimB
        clAmdBlasInvalidLeadDimC
        clAmdBlasInvalidIncX
        clAmdBlasInvalidIncY
        clAmdBlasInsufficientMemMatA
        clAmdBlasInsufficientMemMatB
        clAmdBlasInsufficientMemMatC
        clAmdBlasInsufficientMemVecX
        clAmdBlasInsufficientMemVecY

    # Matrix storage order.
    enum clAmdBlasOrder:
        clAmdBlasRowMajor             = 0
        clAmdBlasColumnMajor          = 1

    # Transposition options for the A/B operands.
    enum clAmdBlasTranspose:
        clAmdBlasNoTrans             = 0
        clAmdBlasTrans               = 1
        clAmdBlasConjTrans           = 2

    # Basic OpenCL scalar and opaque handle types.
    ctypedef unsigned int cl_uint
    ctypedef signed int cl_int
    ctypedef float cl_float
    ctypedef double cl_double
    ctypedef void* cl_mem
    ctypedef void* cl_command_queue
    ctypedef void* cl_event
    ctypedef void* cl_platform_id
    ctypedef void* cl_device_id
    ctypedef void* cl_context

    ctypedef unsigned long cl_ulong
    ctypedef cl_ulong cl_bitfield
    ctypedef cl_bitfield cl_device_type

    cl_device_type CL_DEVICE_TYPE_GPU  = (1 << 2)

    ctypedef cl_bitfield cl_command_queue_properties

    ctypedef cl_uint cl_bool
    cl_bool CL_FALSE = 0
    cl_bool CL_TRUE = 1

    ctypedef cl_bitfield cl_mem_flags
    cl_mem_flags CL_MEM_READ_ONLY = (1 << 2)
    cl_mem_flags CL_MEM_READ_WRITE = (1 << 0)

    # Platform/device discovery.
    cl_int clGetPlatformIDs(cl_uint num_entries, cl_platform_id *platforms,
                            cl_uint *num_platforms)
    cl_int clGetDeviceIDs(cl_platform_id platform, cl_device_type device_type,
                          cl_uint num_entries, cl_device_id *devices,
                          cl_uint *num_devices)

    ctypedef intptr_t cl_context_properties
    cl_context_properties CL_CONTEXT_PLATFORM = 0x1084

    # Context and command-queue creation.
    cl_context clCreateContext(cl_context_properties *properties,
                               cl_uint ndevices, cl_device_id *devices,
                               void *pfn_notify, void *user_data,
                               cl_int *errcode_ret)
    cl_command_queue clCreateCommandQueue(cl_context context,
                                          cl_device_id device,
                                          cl_command_queue_properties properties,
                                          cl_int *errcode_ret)

    # Buffer transfers between host and device.
    cl_int clEnqueueReadBuffer(cl_command_queue    command_queue,
                               cl_mem              buffer,
                               cl_bool             blocking_read,
                               size_t              offset,
                               size_t              size,
                               void *              ptr,
                               cl_uint             num_events_in_wait_list,
                               const cl_event *    event_wait_list,
                               cl_event *          event)

    cl_int clEnqueueWriteBuffer(cl_command_queue   command_queue,
                                cl_mem             buffer,
                                cl_bool            blocking_write,
                                size_t             offset,
                                size_t             size,
                                const void *       ptr,
                                cl_uint            num_events_in_wait_list,
                                const cl_event *   event_wait_list,
                                cl_event *         event)

    # Resource management and synchronization.
    cl_int clReleaseContext(cl_context context)
    cl_int clReleaseCommandQueue(cl_command_queue command_queue)
    cl_mem clCreateBuffer(cl_context   context,
                          cl_mem_flags flags,
                          size_t       size,
                          void *       host_ptr,
                          cl_int *     errcode_ret)
    cl_int clWaitForEvents(cl_uint num_events, const cl_event * event_list)
    cl_int clReleaseMemObject(cl_mem memobj)

    # clAmdBlas library lifecycle and the double-precision GEMM entry point:
    # C = alpha * op(A) * op(B) + beta * C.
    clAmdBlasStatus clAmdBlasSetup( )
    void clAmdBlasTeardown( )
    clAmdBlasStatus clAmdBlasDgemm(clAmdBlasOrder order,
                                   clAmdBlasTranspose transA,
                                   clAmdBlasTranspose transB,
                                   size_t M,
                                   size_t N,
                                   size_t K,
                                   cl_double alpha,
                                   const cl_mem A,
                                   size_t lda,
                                   const cl_mem B,
                                   size_t ldb,
                                   cl_double beta,
                                   cl_mem C,
                                   size_t ldc,
                                   cl_uint numCommandQueues,
                                   cl_command_queue *commandQueues,
                                   cl_uint numEventsInWaitList,
                                   const cl_event *eventWaitList,
                                   cl_event *events)

Here is the main program:

import sys
from clAmdBlas cimport *
import ctypes
import numpy as np
cimport numpy as np
import pyopencl as cl
import pyopencl.array as cl_array

cdef class GPU:
    # Bundles pyOpenCL objects with the raw OpenCL handles the clAmdBlas C
    # API needs.  pyOpenCL handles all allocation/deallocation; the raw
    # handles are extracted via int_ptr and stay valid as long as the
    # corresponding py_* attributes (kept public, below) remain alive.
    cdef cl_platform_id platform
    cdef cl_device_id device
    cdef cl_event event
    cdef cl_command_queue queue
    cdef cl_context ctx
    cdef cl_int err
    # Python-side owners of the handles above; do not drop while in use.
    cdef public py_platform, py_device, py_context, py_queue

    def __dealloc__(self):
        # Release clAmdBlas internal state when this wrapper dies.
        clAmdBlasTeardown()
        print "clean up and shut down gadget"

    def __cinit__(self):
        self.event = NULL

        # First platform and first device; the example assumes one GPU setup.
        self.py_platform = cl.get_platforms()[0]
        cdef intptr_t platform_p = self.py_platform.int_ptr
        self.platform = <cl_platform_id>platform_p

        self.py_device = self.py_platform.get_devices()[0]
        cdef intptr_t device_p = self.py_device.int_ptr
        self.device = <cl_device_id>device_p

        # Context and queue are created by pyOpenCL (so pyOpenCL owns and
        # later releases them); only the raw handles are mirrored here.
        self.py_context = cl.Context(devices=[self.py_device])
        cdef intptr_t context_p = self.py_context.int_ptr
        self.ctx = <cl_context>context_p

        self.py_queue = cl.CommandQueue(self.py_context, device=self.py_device)
        cdef intptr_t queue_p = self.py_queue.int_ptr
        self.queue = <cl_command_queue>queue_p

        self.err = clAmdBlasSetup()
        if self.err != CL_SUCCESS:
            raise ValueError("clAmdBlasSetup() failed with %d" % self.err)
        else:
            print "Blas setup"

DTYPE = np.float64           # host/device element type (double precision)
ctypedef np.float64_t DTYPE_t

def dgemm_test():
    """Multiply a 4x5 matrix A by a 5x3 matrix B on the GPU with
    clAmdBlasDgemm, computing C = alpha*A*B + beta*C, and return the
    result as a flat NumPy array of length M*N (row-major).

    On an OpenCL/clAmdBlas failure an error string is returned instead
    of the array.
    """
    # Typed so that &gpu.queue / &gpu.event below access the C attributes.
    cdef GPU gpu = GPU()

    cdef size_t M = 4        # rows of A and C
    cdef size_t N = 3        # columns of B and C
    cdef size_t K = 5        # columns of A == rows of B

    # A is M x K, row-major, uploaded via pyOpenCL; the raw cl_mem handle
    # is extracted through int_ptr for the clAmdBlas call.
    a = np.ascontiguousarray(
        [11, 12, 13, 14, 15,
         21, 22, 23, 24, 25,
         31, 32, 33, 34, 35,
         41, 42, 43, 44, 45], dtype=DTYPE)
    a_d = cl_array.to_device(gpu.py_queue, a)
    cdef intptr_t a_d_p = a_d.data.int_ptr
    cdef cl_mem bufA = <cl_mem> a_d_p
    print "A"

    # B is K x N, row-major.
    b = np.ascontiguousarray(
        [11, 12, 13,
         21, 22, 23,
         31, 32, 33,
         41, 42, 43,
         51, 52, 53], dtype=DTYPE)
    b_d = cl_array.to_device(gpu.py_queue, b)
    cdef intptr_t b_d_p = b_d.data.int_ptr
    cdef cl_mem bufB = <cl_mem> b_d_p
    print "B"

    # C is M x N, row-major; its initial contents participate via beta.
    c = np.ascontiguousarray(
        [11, 12, 13,
         21, 22, 23,
         31, 32, 33,
         41, 42, 43], dtype=DTYPE)
    c_d = cl_array.to_device(gpu.py_queue, c)
    cdef intptr_t c_d_p = c_d.data.int_ptr
    cdef cl_mem bufC = <cl_mem> c_d_p
    print "C"

    # Host-side result buffer; the memoryview yields a raw pointer that
    # clEnqueueReadBuffer can fill directly.
    cdef size_t result_size = M * N * sizeof(cl_double)
    cdef np.ndarray[DTYPE_t, ndim=1] r = np.zeros(M * N, dtype=DTYPE)
    cdef DTYPE_t[:] r_view = r
    cdef cl_double *result = &r_view[0]
    print "r"

    cdef cl_double alpha = 10
    cdef cl_double beta = 20
    # Row-major C = alpha*A*B + beta*C.  Leading dimensions are the row
    # strides: lda = K for A (MxK), ldb = N for B (KxN), ldc = N for C (MxN).
    err = clAmdBlasDgemm(clAmdBlasRowMajor, clAmdBlasNoTrans, clAmdBlasNoTrans,
                         M, N, K, alpha, bufA, K, bufB, N, beta, bufC, N,
                         1, &gpu.queue, 0, NULL, &gpu.event)
    if err != CL_SUCCESS:
        return "clAmdBlasDgemm() failed with %d" % err
    else:
        print "DGEMMed"

    # Wait for the GEMM to finish, then copy C back to the host buffer.
    err = clWaitForEvents(1, &gpu.event)
    if err != CL_SUCCESS:
        return "clWaitForEvents() failed with %d" % err
    err = clEnqueueReadBuffer(gpu.queue, bufC, CL_TRUE, 0, result_size,
                              result, 0, NULL, NULL)
    if err != CL_SUCCESS:
        return "clEnqueueReadBuffer() failed with %d" % err

    return r

This then gets compiled to a Python module by running "python 
example_dgemm_setup.py build_ext --inplace" with this build definition file 
example_dgemm_setup.py:

# Build script for the example_dgemm Cython extension.
# Run as: python example_dgemm_setup.py build_ext --inplace
# Paths assume Windows installs of the AMD APP SDK and clAmdBlas; adjust
# include_dirs/library_dirs for other installations.
from distutils.core import setup
from distutils.extension import Extension
from Cython.Build import cythonize
import numpy as np

extensions = [
    Extension(
        name='example_dgemm',
        sources=['example_dgemm.pyx'],
        include_dirs=[
            "C:\\Program Files (x86)\\AMD APP SDK\\2.9-1\\include",
            "C:\\Program Files (x86)\\AMD\\clAmdBlas\\include",
            np.get_include(),
        ],
        library_dirs=[
            "C:\\Program Files (x86)\\AMD APP SDK\\2.9-1\\bin\\x86_64",
            "c:\\Program Files (x86)\\AMD\\clAmdBlas\\bin64",
        ],
        libraries=['clAmdBlas', 'OpenCL'],
    )
]
extensions = cythonize(extensions)

setup(
    ext_modules=extensions
)

This can then be imported into a Python script like this one cytest.py:

import example_dgemm
r = example_dgemm.dgemm_test()
print "expecting 21370 22040 22710 37070 38240 39410 52770 54440 56110 68470 
70640 72810"
print r

This can then be run with "python cytest.py", giving this output:

python cytest.py
Blas setup
A
B
C
r
DGEMMed
clean up and shut down gadget
expecting 21370 22040 22710 37070 38240 39410 52770 54440 56110 68470 70640 
72810
[ 21370.  22040.  22710.  37070.  38240.  39410.  52770.  54440.  56110.
  68470.  70640.  72810.]

I was helped by looking at an initial wrapper from Kent Knox in the clMath 
distribution, plus a couple of other clues mentioned in references.  Note that 
clAmdBlasTune.exe is undocumented but mentioned here and there on the Internet. 
 The code above isn't long, but involves NumPy, OpenCL, pyOpenCL, Cython and 
MemoryViews, ctypes and libc.stdint.intptr_t, and about 10 cups of coffee.

References:


1.       NumPy and Ctypes

a.       
https://sage.math.washington.edu:8091/hudson/job/cython-docs/doclinks/1/src/userguide/memoryviews.html

b.      https://docs.python.org/2/library/ctypes.html

c.       http://docs.scipy.org/doc/numpy/user/c-info.python-as-glue.html

2.       Cython

a.       http://cython.org/

b.      http://docs.cython.org/src/userguide/extension_types.html

3.       DGEMM and clMath and clAmdBlas

a.       http://www.math.utah.edu/software/lapack/lapack-blas/dgemm.html

b.      
http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-math-libraries/

c.       C:\Program Files (x86)\AMD APP SDK\2.9-1\include\CL\cl.h

d.      c:/Program Files (x86)/AMD/clAmdBlas/samples/example_sgemm.c

e.      c:/Program Files (x86)/AMD/clAmdBlas/bin64/clAmdBlas.dll

f.        c:/Program Files (x86)/AMD/clAmdBlas/bin64/clAmdBlasTune.exe

g.       C:\Program Files (x86)\AMD\clAmdBlas\include\clAmdBlas.h

h.      
https://github.com/clMathLibraries/clBLAS/tree/master/src/wrappers/python 
(seems correct but complete example not provided)

i.         http://lists.tiker.net/pipermail/pyopencl/2014-February/001690.html 
(incomplete attempt, coredumps maybe due to mixing raw OpenCL and pyOpenCL 
deallocation)

j.        
http://stackoverflow.com/questions/13396443/pyopencl-difference-between-to-device-and-buffer

k.       
http://diffusionht.blogspot.com/2013/08/clamdblastune-ati-radeon-7970-linux.html

4.    clRNG (similarly, would be nice to have in pyOpenCL) 
http://developer.amd.com/community/blog/2015/05/05/amd-releases-clrng-an-opencl-random-number-library/



Thanks,

Lars Ericson

Quantitative Analytics Consultant
Market & Institutional Risk Management

Wells Fargo Bank, N.A. | 301 S. College St., 4th Floor | Charlotte, NC 
28202-6000
MAC D1053-04X
Tel  704-410-2219 | Cell 917-891-1639

[email protected]<mailto:[email protected]>

_______________________________________________
PyOpenCL mailing list
[email protected]
http://lists.tiker.net/listinfo/pyopencl

Reply via email to