Hi,

I wrapped AMD's deprecated DGEMM as a form of matrix_multiply in pyOpenCL as 
follows:

python matrix_multiply_setup.py build_ext --inplace

where matrix_multiply_setup.py is

from distutils.core import setup
from distutils.extension import Extension
from Cython.Build import cythonize
import numpy as np

# NOTE: the SDK paths below are machine-specific (64-bit AMD APP SDK 2.9-1
# and the clAmdBlas install); adjust them for your installation.
# The original paste had these string literals broken across lines by the
# mail client, which is a syntax error — they are rejoined here.
AMD_APP_INCLUDE = "C:\\Program Files (x86)\\AMD APP SDK\\2.9-1\\include"
AMD_BLAS_INCLUDE = "C:\\Program Files (x86)\\AMD\\clAmdBlas\\include"
AMD_APP_LIBDIR = "C:\\Program Files (x86)\\AMD APP SDK\\2.9-1\\bin\\x86_64"
AMD_BLAS_LIBDIR = "c:\\Program Files (x86)\\AMD\\clAmdBlas\\bin64"

extensions = [
    Extension(name = 'matrix_multiply',
              sources = ['matrix_multiply.pyx'],
              # numpy headers are needed because the .pyx does `cimport numpy`
              include_dirs = [AMD_APP_INCLUDE,
                              AMD_BLAS_INCLUDE,
                              np.get_include()],
              library_dirs = [AMD_APP_LIBDIR,
                              AMD_BLAS_LIBDIR],
              libraries = ['clAmdBlas', 'OpenCL'])
]
extensions = cythonize(extensions)

setup(
    ext_modules = extensions
)

and matrix_multiply.pyx is:

import numpy as np
cimport numpy as np
import pyopencl as cl
import pyopencl.array as cla
import pyopencl.clrandom as clr
import pyopencl.clmath as clm
from clAmdBlas cimport *

def blas_setup():
  """Initialise the clAmdBlas library.

  Must be called once before any clAmdBlas routine is used.
  Raises RuntimeError if the library reports a non-success status
  (the original version silently discarded the return code).
  """
  status = clAmdBlasSetup()
  if status != clAmdBlasSuccess:
    raise RuntimeError("clAmdBlasSetup failed with status %d" % status)

def blas_teardown():
  """Release all resources held by clAmdBlas.

  Call once after all BLAS work is finished; clAmdBlasTeardown returns
  void, so there is no status to check.
  """
  clAmdBlasTeardown()

def matrix_multiply(A_g,B_g,C_g,queue):
  (M,K)=A_g.shape
  N=B_g.shape[1]
  cdef cl_event event = NULL
  cdef intptr_t queue_p = <intptr_t>queue.int_ptr
  cdef cl_command_queue cq = <cl_command_queue>queue_p
  cdef intptr_t A_g_p = A_g.data.int_ptr
  cdef cl_mem bufA = <cl_mem> A_g_p
  cdef intptr_t B_g_p = B_g.data.int_ptr
  cdef cl_mem bufB = <cl_mem> B_g_p
  cdef intptr_t C_g_p = C_g.data.int_ptr
  cdef cl_mem bufC = <cl_mem> C_g_p
  err = 
clAmdBlasDgemm(clAmdBlasRowMajor,clAmdBlasNoTrans,clAmdBlasNoTrans,M,N,K,1.0,
                       bufA,K,bufB,N,0.0,bufC,N,1,&cq,0,NULL,&event)

where clAmdBlas.pxd is:

from libc.stdint cimport intptr_t, uintptr_t

cdef extern from "clAmdBlas.h":
    enum:
        CL_SUCCESS                      = 0
    enum clAmdBlasStatus:
        clAmdBlasSuccess               = CL_SUCCESS
    enum clAmdBlasOrder:
        clAmdBlasRowMajor             = 0
    enum clAmdBlasTranspose:
        clAmdBlasNoTrans             = 0
    ctypedef unsigned int cl_uint
    ctypedef double cl_double
    ctypedef void* cl_mem
    ctypedef void* cl_command_queue
    ctypedef void* cl_event
    ctypedef void* cl_platform_id
    ctypedef void* cl_device_id
    ctypedef void* cl_context
    clAmdBlasStatus clAmdBlasSetup( )
    void clAmdBlasTeardown( )
    clAmdBlasStatus clAmdBlasDgemm(clAmdBlasOrder order,  clAmdBlasTranspose 
transA,   clAmdBlasTranspose transB,
                                   size_t M,  size_t N,   size_t K,  cl_double 
alpha,   const cl_mem A,    size_t lda,     const cl_mem B,
                                   size_t ldb,    cl_double beta,   cl_mem C,   
size_t ldc,
                                   cl_uint numCommandQueues,    
cl_command_queue *commandQueues,
                                   cl_uint numEventsInWaitList,   const 
cl_event *eventWaitList,    cl_event *events)

Once matrix_multiply.pyd is created, it can be used in a pure Python program 
involving PyOpenCL for example as follows, where queue is a pyOpenCL queue:

import pyopencl.array as cla
import matrix_multiply
import numpy as np
import pyopencl as cl

# Inputs must be C-contiguous float64 arrays; `queue` is an existing
# pyopencl.CommandQueue created elsewhere.
A = np.ascontiguousarray(np.ones((2, 2)))
# Fixed: the original read `np.ones(2,2))` — mismatched parentheses and a
# missing shape tuple.
B = np.ascontiguousarray(np.ones((2, 2)))
bufA = cla.to_device(queue, A)
bufB = cla.to_device(queue, B)
# Use the `cla` alias consistently (the original mixed `cl.array.zeros`).
bufC = cla.zeros(queue, shape=(2, 2), dtype=np.float64)
matrix_multiply.blas_setup()
matrix_multiply.matrix_multiply(bufA, bufB, bufC, queue)
matrix_multiply.blas_teardown()

Note though that clAmdBlas is deprecated in favor of clBLAS on GitHub: 
https://github.com/clMathLibraries/clBLAS, which doesn't have a Windows 
installer, whereas clAmdBlas did.

The reason it doesn't have a Windows installer is that a process of post-build 
host-based tuning which was used on clAmdBlas has been replaced by direct 
access to driver properties in the compilation of clBlas.  This means that each 
user of the package has to compile it on their machine before they can use it, 
which also means that wrappers for the package to pyOpenCL have to be built at 
that time.  In addition, if you have a machine with multiple OpenCL devices 
(for example I have AMD and Intel OpenCL on my workstation, with the Intel CPU 
chip acting as a separate platform and device), I don't know if the build is 
correct and optimal for all devices and platforms on the machine at build time 
or only correct and optimal for the AMD device.

Thanks,

Lars Ericson

Quantitative Analytics Consultant
Market & Institutional Risk Management

Wells Fargo Bank, N.A. | 301 S. College St., 4th Floor | Charlotte, NC 
28202-6000
MAC D1053-04X
Tel  704-410-2219 | Cell 917-891-1639

[email protected]<mailto:[email protected]>

_______________________________________________
PyOpenCL mailing list
[email protected]
http://lists.tiker.net/listinfo/pyopencl

Reply via email to