Most of the arguments I have heard are "oh but it's compiled with
-O3" or whatever. Any decent HPC code person will tell you that that
is most definitely not a guaranteed way to a faster system ...
Hey...as I stated above, one would have to be quite silly to claim
-O3 as the all well and all good optimization solution. At least you
can rest assured your solutions will add up correctly with GCC. To get a
Well, sometimes. You still need to be careful with it.
This said, I am not sure icc/pgi/... are uniformly better than gcc. I
did an admittedly tiny study of this http://scalability.org/?p=470
some time ago. What I found was that gcc really held its own. It did
a very good job on a very simple test case.
Very nice post, thanks for that, it so happens I am going through the
exact same steps trying to optimize a very simple piece of code
computing the Euclidean distance and I was a little stumped to find out
the simple C code outperforms BLAS (both GOTO and MKL). If you have
gnuplot, a BLAS library with cblas interface, and icc installed, all you
have to do is run `make` with the three attached files in the same dir
and you'll get nice plots of what's going on. I'm also attaching an
example run with:
icc 10.1.017
gcc 4.3.1
GOTO BLAS 1.24
Eric
PS: regular disclaimers about crappy code writing apply ;)
#include <stdlib.h>
#include <stdio.h>
#include "cblas.h"
#include "math.h"
#include <sys/time.h>
/* Exclusive upper bound on the benchmarked vector length; also the number of
 * elements allocated for each work vector. */
#define MAXT 10000000
/* Number of repetitions averaged into each timing sample. */
#define REPS 30
/*
 * timeadd(a, b): a += b for two struct timeval lvalues.
 * Both operands are expected to be normalised (0 <= tv_usec < 1000000); the
 * result is normalised too.  A microsecond sum of exactly 1000000 carries
 * into tv_sec (hence >=, not >).
 * Wrapped in do { } while (0) so it behaves as a single statement.
 */
#define timeadd(a,b) do {\
    (a).tv_sec += (b).tv_sec;\
    (a).tv_usec += (b).tv_usec;\
    if ((a).tv_usec >= 1000000)\
    {\
        (a).tv_usec -= 1000000;\
        (a).tv_sec++;\
    }\
} while (0)
/* time2double(a): convert a struct timeval to seconds as a double. */
#define time2double(a) ((double) (a).tv_sec + (double) (a).tv_usec / 1000000)
/* TIC_START stores the current time of day into start. */
#define TIC_START(start) do { gettimeofday(&(start), NULL); } while (0)
/*
 * TIC_STOP adds the time elapsed since the last TIC_START to t_store.
 * NOTE: the `start` argument is accepted but not used; the macro relies on
 * variables named t_start, t_stop and t_diff (all struct timeval) being
 * visible in the calling scope.  t_store must not be t_diff itself, or the
 * elapsed time is added to itself and comes out doubled.
 */
#define TIC_STOP(t_store,start) do {\
    gettimeofday(&t_stop, NULL);\
    timeval_subtract(&t_diff, &t_stop, &t_start);\
    timeadd(t_store, t_diff);\
} while (0)
/*
 * Compute *result = *x - *y for two struct timeval values.
 *
 * result: receives the difference (seconds and microseconds).
 * x:      minuend (the later time).
 * y:      subtrahend (the earlier time); it is NOT modified.
 *
 * Returns 1 if the difference is negative, otherwise 0.
 */
int timeval_subtract (struct timeval *result, const struct timeval *x,
                      const struct timeval *y)
{
    /* Work on a copy so the caller's timestamp is left untouched. */
    struct timeval sub = *y;
    int negative;

    /* Perform the carry for the later subtraction by updating the copy. */
    if (x->tv_usec < sub.tv_usec) {
        long nsec = (sub.tv_usec - x->tv_usec) / 1000000 + 1;
        sub.tv_usec -= 1000000 * nsec;
        sub.tv_sec += nsec;
    }
    if (x->tv_usec - sub.tv_usec > 1000000) {
        long nsec = (x->tv_usec - sub.tv_usec) / 1000000;
        sub.tv_usec += 1000000 * nsec;
        sub.tv_sec -= nsec;
    }

    /* Decide the sign before writing, in case result aliases x. */
    negative = x->tv_sec < sub.tv_sec;

    /* After the carry above, the microsecond difference is non-negative. */
    result->tv_usec = x->tv_usec - sub.tv_usec;
    result->tv_sec = x->tv_sec - sub.tv_sec;

    return negative;
}
/* Scratch vector used by the BLAS implementation to hold the difference. */
double *Vdist;

/* The two input vectors whose distance is measured. */
double *X;
double *Y;
/* Receives the result of the most recent distance computation. */
double dist;
/* Element counter; used as the fill-loop index in main(). */
int size;

/* Timing state shared between main() and stats(). */
struct timeval tempo1;
struct timeval tempo2;
struct timezone tzp;
double tempo;
/*
 * Debug helper: print a labelled vector to stdout.
 *
 * vname: label printed on the header line.
 * vect:  values to print.
 * size:  number of values to print.
 *
 * Every value is printed with "%f" followed by a comma (including the last
 * one), and the listing is terminated by a blank line.
 */
void printfvect(const char *vname, const double *vect, int size)
{
    int idx = 0;

    printf("%s contains: \n", vname);
    while (idx < size) {
        printf("%f,", vect[idx]);
        ++idx;
    }
    printf("\n\n");
}
/*
 * Euclidean distance between v1 and v2 computed with three CBLAS calls.
 *
 * v1, v2: input vectors of at least `size` elements.
 * size:   number of elements to compare.
 *
 * The global scratch vector Vdist is overwritten with the difference, so it
 * must hold at least `size` elements.
 */
double blas_ed(const double *v1, const double *v2, int size){
    double norm;

    /* Vdist = v1 */
    cblas_dcopy(size, v1, 1, Vdist, 1);
    /* Vdist = -1 * v2 + Vdist */
    cblas_daxpy(size, -1.0, v2, 1, Vdist, 1);
    /* norm = sqrt(sum(Vdist .^ 2)) */
    norm = cblas_dnrm2(size, Vdist, 1);

    return norm;
}
double ed(const double *v1, const double *v2, int size){
register double sum=0.0;
register int i;
for(i=size; i>0; i--)
sum+=pow((v1[i]-v2[i]),2);
return sqrt(sum);
}
/*
 * Run the benchmark: for every vector length i = 1, 2, 4, ... below MAXT,
 * time REPS calls of blas_ed() and REPS calls of ed() on the global vectors
 * X and Y, and print one line "<length>\t<blas seconds>\t<C seconds>" giving
 * the average wall-clock time of a single call.
 *
 * Side effects: overwrites the globals dist and tempo, and writes to stdout.
 */
void stats(void){
    unsigned int j,i;
    double btime,ctime;
    struct timeval t_start, t_stop, t_diff;
    // printf("size,blas,c\n");
    for (i=1;i<MAXT;i=i*2)
    {
        // using blas
        gettimeofday(&t_start,NULL);
        for (j=0;j<REPS;j++) dist=blas_ed(X,Y,i);
        gettimeofday(&t_stop,NULL);
        /* The elapsed time is taken directly from the subtraction.  Feeding
         * t_diff back through TIC_STOP as its own accumulator added it to
         * itself, so every reported time was twice the measured one. */
        timeval_subtract(&t_diff,&t_stop,&t_start);
        tempo = time2double(t_diff);
        btime = tempo/(double)REPS;
        // using C
        /* Nothing but the kernel loop sits between the two timestamps, so
         * both implementations are measured the same way. */
        gettimeofday(&t_start,NULL);
        for (j=0;j<REPS;j++) dist=ed(X,Y,i);
        gettimeofday(&t_stop,NULL);
        timeval_subtract(&t_diff,&t_stop,&t_start);
        tempo = time2double(t_diff);
        ctime = tempo/(double)REPS;
        /* i is unsigned, so it is printed with %u. */
        printf("%u\t%E\t%E\n",i,btime,ctime);
    }
}
/*
 * Entry point: allocate the three MAXT-element work vectors, fill X with 1.0
 * and Y with 3.0, run the benchmark via stats(), then release the vectors.
 *
 * Returns 0 on success, EXIT_FAILURE if an allocation fails.
 */
int main (void)
{
    gettimeofday(&tempo1,&tzp);
    X = malloc(MAXT * sizeof *X);
    Y = malloc(MAXT * sizeof *Y);
    Vdist = malloc(MAXT * sizeof *Vdist);
    gettimeofday(&tempo2,&tzp);

    if (X == NULL || Y == NULL || Vdist == NULL) {
        fprintf(stderr, "EuclideanDist: out of memory allocating work vectors\n");
        free(X);
        free(Y);
        free(Vdist);
        return EXIT_FAILURE;
    }

    /* Allocation time in microseconds; the seconds field is included so the
     * value stays correct when the clock crosses a second boundary. */
    tempo = (double)(tempo2.tv_sec - tempo1.tv_sec) * 1000000.0
          + (double)(tempo2.tv_usec - tempo1.tv_usec);
    // printf("Malloc time:%f\n",tempo);

    for (size=0;size<MAXT;size++)
    {
        X[size] = 1.0;
        Y[size] = 3.0;
    }

    stats();

    // Uncomment the following to understand the process
    // (T is a small element count you choose, e.g. 8; the vectors are
    // double precision, so the d-prefixed BLAS routines are used):
    /*
    printfvect("X",X,T);
    printfvect("Y",Y,T);
    // Y = -1*X+Y
    cblas_daxpy(T,-1.0,X,1,Y,1);
    printf("Y after daxpy:\n");
    printfvect("Y",Y,T);
    // ED = sqrt(SUM(Y.^2)) (Euclidean distance)
    printf("Euclidean Distance is: %f\n",cblas_dnrm2(T,Y,1));
    */

    free(X);
    free(Y);
    free(Vdist);
    X = NULL;
    Y = NULL;
    Vdist = NULL;
    return 0;
}
### Global, generic variables ###
SHELL = /bin/bash
ARCH = $(shell uname -m)
GCC = gcc-4.3.1
# Aggressive flag set, kept for reference (disabled):
#GCCFLAGS = -Wall -march=native -mfpmath=sse,387 -O3 -fomit-frame-pointer
#	-fkeep-inline-functions -funsafe-loop-optimizations
#	-freorder-blocks-and-partition -fno-math-errno -ffinite-math-only
#	-fno-trapping-math -fno-signaling-nans -fwhole-program --param
#	l1-cache-line-size=1 --param l1-cache-size=64 --param l2-cache-size=4096
GCCFLAGS = -Wall -march=native -O3
# For ICC
# on Opteron: -xW
#ICCFLAGS = -xW
# on Core2 Duo -xT
ICCFLAGS = -xT
# Dependent libraries first: cblas needs blas, which needs libm.
LIBS = -lcblas -lblas -lm
LDFLAGS = $(LIBS)

### TAU specific variables ###
TAU_MAKEFILE = ~/TAU/TAU/$(ARCH)/lib/Makefile.tau-pdt
TAU_CXX = tau_cxx.sh
TAU_CC = tau_cc.sh
TAU_OPTS = -optNoRevert -optLinking="$(LIBS)" -optTauCC="$(CC)" \
	-optCPPOpts="$(GCCFLAGS)" -tau_makefile=$(TAU_MAKEFILE)

PROGNAM = EuclideanDist
# name of the executable
PROGRAM = $(PROGNAM)
PROGOUT = $(PROGNAM)_$(ARCH)
TAU_PROG = $(PROGOUT)_TAU
# source files
SRCS = $(PROGNAM).c
# object files
OBJS = $(PROGNAM).o
MKL_LIBS = -liomp5 -lpthread -I/opt/intel/mkl/10.0.3.020/include/ \
	-L/opt/intel/mkl/10.0.3.020/lib/em64t/

# .cpp must be a known suffix for the .cpp.o rule below to take effect.
.SUFFIXES: .c .cpp .o

.cpp.o:
	$(CXX) -c $(GCCFLAGS) $<

.c.o:
	$(CC) -c $(GCCFLAGS) $<

# Targets
# None of these names is a file produced by its recipe.
.PHONY: default all $(PROGRAM) tau set_mkl_blas set_goto_blas icc gcc \
	clean tests icctest gcctest plots

default: all

# all: $(PROGRAM) icc gcc
# The prerequisites are meant to run in the order listed (build, run, plot),
# so use a serial make.
all: $(PROGRAM) icc gcc tests plots

# Libraries are listed after the sources so the linker can resolve the
# symbols the sources reference.
$(PROGRAM):
	$(GCC) $(SRCS) -o $(PROGOUT) $(LDFLAGS)

tau:
	$(TAU_CC) $(TAU_OPTS) $(SRCS) -o $(TAU_PROG)

set_mkl_blas:
	sudo eselect blas set mkl-gfortran

set_goto_blas:
	sudo eselect blas set goto

icc:
	icc $(ICCFLAGS) $(SRCS) -o $(PROGOUT)_ICC $(MKL_LIBS) $(LIBS)

gcc:
	$(GCC) $(GCCFLAGS) $(SRCS) -o $(PROGOUT)_GCC $(LIBS)

# Also removes the binaries the build rules actually produce.
clean:
	/bin/rm -f $(OBJS) $(PROGRAM) $(PROGOUT) $(PROGOUT)_ICC $(PROGOUT)_GCC \
		$(TAU_PROG) *.dat

tests: icctest gcctest

icctest:
	./$(PROGOUT)_ICC > icc.dat

gcctest:
	./$(PROGOUT)_GCC > gcc.dat

plots:
	gnuplot Plot.gp
# Plot the per-call execution time of the BLAS and plain-C implementations
# from icc.dat and gcc.dat (columns: vector length, BLAS time, C time).
set title "BLAS Vs C execution time for\n Euclidean Distance computation"
# Column 1 is the vector length in elements (doubles), not bytes.
set xlabel "Vector Size (elements)"
set ylabel "Time (sec)"
set logscale xy
set grid xtics mxtics
set key top left
set key box
#set term post enh # enhanced PostScript, essentially PostScript
# with bounding boxes
#set term postscript
#set term png
set term postscript enhanced color
set out 'BlasVsC.eps'
plot "icc.dat" using 1:2 title 'icc-BLAS' w l lw 1 , \
"icc.dat" using 1:3 title 'icc' w l lw 1 , \
"gcc.dat" using 1:2 title 'gcc-BLAS' w l lw 1 , \
"gcc.dat" using 1:3 title 'gcc' w l lw 1
# Close the output file so the plot is flushed to disk.
set output
_______________________________________________
Beowulf mailing list, [email protected]
To change your subscription (digest mode or unsubscribe) visit
http://www.beowulf.org/mailman/listinfo/beowulf