I don't have the slightest idea what I'm doing, but...
____ file name - the_lib.c ___ #include <stdio.h> #include <time.h> #include <omp.h> #include <math.h> void dists2d( double *a_ps, int na, double *b_ps, int nb, double *dist, int num_threads) { int i, j; int dynamic=0; omp_set_dynamic(dynamic); omp_set_num_threads(num_threads); double ax,ay, dif_x, dif_y; int nx1=2; int nx2=2; #pragma omp parallel for private(j, i,ax,ay, dif_x, dif_y) for(i=0;i<na;i++) { ax=a_ps[i*nx1]; ay=a_ps[i*nx1+1]; for(j=0;j<nb;j++) { dif_x = ax - b_ps[j*nx2]; dif_y = ay - b_ps[j*nx2+1]; dist[2*i+j] = sqrt(dif_x*dif_x+dif_y*dif_y); } } } ________ COMPILE: __________ gcc -c the_lib.c -fPIC -fopenmp -ffast-math gcc -shared -o the_lib.so the_lib.o -lgomp -lm ____ the_python_prog.py _____________ from ctypes import * my_lib=CDLL('the_lib.so') #or full path to lib import numpy as np import time na=329 nb=340 a=np.random.rand(na,2) b=np.random.rand(nb,2) c=np.zeros(na*nb) trials=100 max_threads = 24 for k in range(1,max_threads): n_threads =c_int(k) na2=c_int(na) nb2=c_int(nb) start = time.time() for k1 in range(trials): ret = my_lib.dists2d(a.ctypes.data_as(c_void_p),na2,b.ctypes.data_as(c_void_p),nb2,c.ctypes.data_as(c_void_p),n_threads) print "c_threads",k, " time ", (time.time()-start)/trials ____ Results on my machine, dual xeon, 12 cores na=329 nb=340 ____ 100 trials each: c_threads 1 time 0.00109949827194 c_threads 2 time 0.0005726313591 c_threads 3 time 0.000429179668427 c_threads 4 time 0.000349278450012 c_threads 5 time 0.000287139415741 c_threads 6 time 0.000252468585968 c_threads 7 time 0.000222821235657 c_threads 8 time 0.000206289291382 c_threads 9 time 0.000187981128693 c_threads 10 time 0.000172770023346 c_threads 11 time 0.000164999961853 c_threads 12 time 0.000157740116119 ____ ____ Results on my machine, dual xeon, 12 cores na=3290 nb=3400 ______ 100 trials each: c_threads 1 time 0.10744508028 c_threads 2 time 0.0542239999771 c_threads 3 time 0.037127559185 c_threads 4 time 0.0280736112595 c_threads 5 time 0.0228648614883 
c_threads 6 time 0.0194904088974 c_threads 7 time 0.0165715909004 c_threads 8 time 0.0145838689804 c_threads 9 time 0.0130002498627 c_threads 10 time 0.0116940999031 c_threads 11 time 0.0107557415962 c_threads 12 time 0.00990005016327 (speedup almost 11) _______________________________________________ NumPy-Discussion mailing list NumPy-Discussion@scipy.org http://mail.scipy.org/mailman/listinfo/numpy-discussion