Most of the arguments I have heard are "oh, but it's compiled with -O3" or whatever. Any decent HPC code person will tell you that this is most definitely not a guaranteed way to a faster system ...
Hey...as I stated above, one would have to be quite silly to claim -O3 as the all well and all good optimization solution. At least you can rest assured your solutions will add up correctly with GCC. To get a
Well, sometimes.  You still need to be careful with it.

That said, I am not sure icc/pgi/... are uniformly better than gcc. I did an admittedly tiny study of this (http://scalability.org/?p=470) some time ago. What I found was that gcc really held its own: it did a very good job on a very simple test case.
Very nice post, thanks for that. It so happens that I am going through the exact same steps, trying to optimize a very simple piece of code that computes the Euclidean distance, and I was a little stumped to find out that the simple C code outperforms BLAS (both GOTO and MKL). If you have gnuplot, a BLAS library with a cblas interface, and icc installed, all you have to do is run `make` with the three attached files in the same directory and you'll get nice plots of what's going on. I'm also attaching an example run with:

icc 10.1.017
gcc 4.3.1
GOTO BLAS 1.24

Eric
PS: regular disclaimers about crappy code writing apply ;)
#include <stdlib.h>
#include <stdio.h>
#include "cblas.h"
#include "math.h"
#include <sys/time.h>
#define MAXT 10000000
#define REPS 30

// timeadd(a,b): add struct timeval b onto struct timeval a, in place, and
// carry into tv_sec when the microsecond field reaches one full second.
// Both arguments are evaluated more than once, so pass plain lvalues only.
// Wrapped in do { } while (0) so it behaves as a single statement.
#define timeadd(a,b) do {\
                        (a).tv_sec += (b).tv_sec;\
                        (a).tv_usec += (b).tv_usec;\
                        if ((a).tv_usec >= 1000000)\
                        {\
                            (a).tv_usec -= 1000000;\
                            (a).tv_sec++;\
                        }\
                    } while (0)

// time2double(a): struct timeval a converted to seconds as a double.
#define time2double(a) ((double) (a).tv_sec + (double) (a).tv_usec/1000000)

// TIC_START tags timeofday into start
#define TIC_START(start) gettimeofday(&(start),NULL)

// TIC_STOP tags the time difference between the start tag and _now_ and adds
// it to t_store.
// NOTE: the `start` argument is not used. The macro reads the caller's local
// variable t_start and uses the caller's locals t_stop and t_diff as scratch,
// so all three must be declared as struct timeval at the call site.
// NOTE: t_diff is overwritten before it is added, so passing t_diff itself as
// t_store adds the interval to itself (the stored value is twice the interval).
#define TIC_STOP(t_store,start) do {\
                        gettimeofday(&t_stop,NULL);\
                        timeval_subtract(&t_diff,&t_stop,&t_start);\
                        timeadd(t_store,t_diff);\
                    } while (0)

/*
 * Compute *result = *x - *y for two struct timeval values.
 *
 * result  receives the difference; its tv_usec is normalised to be
 *         non-negative by borrowing from / carrying into tv_sec.
 * x       the later time (minuend); read only.
 * y       the earlier time (subtrahend); read only -- the carry is done on a
 *         local copy, so the caller's value is left untouched.
 *
 * Returns 1 if the difference is negative, 0 otherwise.
 */
int timeval_subtract (struct timeval *result, struct timeval *x, struct timeval *y)
{
    struct timeval sub = *y;
    int negative;

    /* Perform the carry for the later subtraction by updating the copy of y. */
    if (x->tv_usec < sub.tv_usec) {
        long carry = (sub.tv_usec - x->tv_usec) / 1000000 + 1;
        sub.tv_usec -= 1000000 * carry;
        sub.tv_sec += carry;
    }
    if (x->tv_usec - sub.tv_usec > 1000000) {
        long borrow = (x->tv_usec - sub.tv_usec) / 1000000;
        sub.tv_usec += 1000000 * borrow;
        sub.tv_sec -= borrow;
    }

    /* Decide the sign before writing result, in case result aliases x. */
    negative = x->tv_sec < sub.tv_sec;

    /* Compute the difference; tv_usec is non-negative after the carry above. */
    result->tv_usec = x->tv_usec - sub.tv_usec;
    result->tv_sec = x->tv_sec - sub.tv_sec;

    return negative;
}

// Scratch vector for the BLAS path: blas_ed() copies v1 into it and then
// subtracts v2 from it.
double *Vdist;

// The two input vectors, the most recent distance result, and the loop
// index main() uses while filling the vectors.
double *X;
double *Y;
double dist;
int size;

// Timing state shared between main() and stats().
struct timeval tempo1;
struct timeval tempo2;
struct timezone tzp;
double tempo;


// Print the name of a vector followed by its first `size` elements on one
// line, each formatted with %f and followed by a comma, then a blank line.
void printfvect(const char *vname, const double *vect, int size)
{
	int idx = 0;

	printf("%s contains: \n",vname);
	while (idx < size) {
		printf("%f,",vect[idx]);
		idx++;
	}
	printf("\n\n");
}

// Euclidean distance between v1 and v2 (both of length `size`) using BLAS.
// The global scratch vector Vdist is overwritten and must hold at least
// `size` doubles.
double blas_ed(const double *v1, const double *v2, int size){
	double norm;

	// Vdist = v1
	cblas_dcopy(size,v1,1,Vdist,1);
	// Vdist = -1*v2 + Vdist
	cblas_daxpy(size,-1.0,v2,1,Vdist,1);
	// norm = sqrt(sum(Vdist.^2))
	norm = cblas_dnrm2(size,Vdist,1);
	return norm;
}

double ed(const double *v1, const double *v2, int size){
    register double sum=0.0;
    register int i;
    for(i=size; i>0; i--)
        sum+=pow((v1[i]-v2[i]),2);
    return sqrt(sum);
}

// Seconds elapsed between two gettimeofday() samples, as a double.
static double stats_elapsed_sec(const struct timeval *start, const struct timeval *stop)
{
	return (double)(stop->tv_sec - start->tv_sec)
	     + (double)(stop->tv_usec - start->tv_usec) / 1000000.0;
}

// Benchmark blas_ed() against ed() on the global vectors X and Y.
// For each vector length i = 1, 2, 4, ... below MAXT, both functions are run
// REPS times and one line is printed to stdout:
//     <length> TAB <mean seconds per blas_ed call> TAB <mean seconds per ed call>
// Side effects: overwrites the globals dist and tempo.
void stats(void){
	unsigned int j,i;
	double blas_avg,c_avg;
	struct timeval t_start, t_stop;

//	printf("size,blas,c\n");
	for (i=1;i<MAXT;i=i*2)
	{
		// using blas
		gettimeofday(&t_start,NULL);
		for (j=0;j<REPS;j++) dist=blas_ed(X,Y,i);
		gettimeofday(&t_stop,NULL);
		// The interval is taken directly from the two samples. The earlier
		// TIC_STOP(t_diff,t_stop) call added the interval to itself, so every
		// reported time was twice the measured one.
		tempo = stats_elapsed_sec(&t_start,&t_stop);
		blas_avg = tempo/(double)REPS;

		// using C (nothing but the loop sits between the two samples)
		gettimeofday(&t_start,NULL);
		for (j=0;j<REPS;j++) dist=ed(X,Y,i);
		gettimeofday(&t_stop,NULL);
		tempo = stats_elapsed_sec(&t_start,&t_stop);
		c_avg = tempo/(double)REPS;

		// i is unsigned, hence %u
		printf("%u\t%E\t%E\n",i,blas_avg,c_avg);
	}
}

// Allocate the three MAXT-element work vectors, fill X with 1.0 and Y with
// 3.0, run the benchmark in stats(), and release the vectors.
// Returns 0 on success, EXIT_FAILURE if an allocation fails.
int main (void)
{
	gettimeofday(&tempo1,NULL);

	X     = (double *) malloc( MAXT * sizeof(double));
	Y     = (double *) malloc( MAXT * sizeof(double));
	Vdist = (double *) malloc( MAXT * sizeof(double));

	gettimeofday(&tempo2,NULL);
	// Allocation time in microseconds; the whole-second part is included so
	// the value stays correct when the two samples straddle a second boundary.
	tempo = (double)(tempo2.tv_sec - tempo1.tv_sec) * 1000000.0
	      + (double)(tempo2.tv_usec - tempo1.tv_usec);

//	printf("Malloc time:%f\n",tempo);

	if (X == NULL || Y == NULL || Vdist == NULL)
	{
		fprintf(stderr,"Could not allocate the work vectors\n");
		free(X);
		free(Y);
		free(Vdist);
		return EXIT_FAILURE;
	}

	for (size=0;size<MAXT;size++)
	{
		X[size] = 1.0;
		Y[size] = 3.0;
	}

	stats();

// Uncomment the following to understand the process (define T as the number
// of elements to show first; X and Y are double, so the d-variants of the
// BLAS routines are the ones to call):
/*
	printfvect("X",X,T);
	printfvect("Y",Y,T);
	// Y = -1*X+Y
	cblas_daxpy(T,-1.0,X,1,Y,1);
	printf("Y after daxpy:\n");
	printfvect("Y",Y,T);
	// ED = sqrt(SUM(Y.^2)) (Euclidean distance)
	printf("Euclidean Distance is: %f\n",cblas_dnrm2(T,Y,1));
*/
	free(X);
	free(Y);
	free(Vdist);
	return 0;
}
### Global, generic variables ###
SHELL     = /bin/bash
# Expanded once (:=) so that `uname` is not re-run on every reference.
ARCH      := $(shell uname -m)
GCC       = gcc-4.3.1
# Aggressive alternative flag set, kept for reference. Every line carries its
# own '#' so that none of it is parsed as make syntax.
#GCCFLAGS  = -Wall -march=native -mfpmath=sse,387 -O3 -fomit-frame-pointer
#            -fkeep-inline-functions -funsafe-loop-optimizations
#            -freorder-blocks-and-partition -fno-math-errno -ffinite-math-only
#            -fno-trapping-math -fno-signaling-nans -fwhole-program
#            --param l1-cache-line-size=1 --param l1-cache-size=64
#            --param l2-cache-size=4096
GCCFLAGS  = -Wall -march=native -O3

# For ICC
# on Opteron: -xW
#ICCFLAGS    = -xW
# on Core2 Duo -xT
ICCFLAGS    = -xT

# Dependents before their dependencies (cblas needs blas, everything may need
# libm) so the link also works with static libraries or --as-needed.
LIBS      = -lcblas -lblas -lm
LDFLAGS   = $(LIBS)

### TAU specific variables ###
# $(HOME) rather than '~': the shell does not expand a tilde that appears in
# the middle of an option such as -tau_makefile=~/...
TAU_MAKEFILE = $(HOME)/TAU/TAU/$(ARCH)/lib/Makefile.tau-pdt
TAU_CXX      = tau_cxx.sh
TAU_CC       = tau_cc.sh
TAU_OPTS     = -optNoRevert -optLinking="$(LIBS)" -optTauCC="$(CC)" \
               -optCPPOpts="$(GCCFLAGS)" -tau_makefile=$(TAU_MAKEFILE)

# Comments sit on their own lines: whitespace between a value and a trailing
# '#' would otherwise become part of the value.
PROGNAM    = EuclideanDist
# name of the executable
PROGRAM    = $(PROGNAM)
PROGOUT    = $(PROGNAM)_$(ARCH)
TAU_PROG   = $(PROGOUT)_TAU
# source files
SRCS       = $(PROGNAM).c
# object files
OBJS       = $(PROGNAM).o

MKL_LIBS = -liomp5 -lpthread -I/opt/intel/mkl/10.0.3.020/include/ \
           -L/opt/intel/mkl/10.0.3.020/lib/em64t/

# Remove a target whose recipe failed part-way, e.g. a truncated .dat file
# left behind by a benchmark run that crashed.
.DELETE_ON_ERROR:

# Command-style targets: none of these names is a file written by its recipe.
.PHONY: default all tau set_mkl_blas set_goto_blas icc gcc clean \
        tests icctest gcctest plots $(PROGRAM)

# Object files (pattern-rule form of the former .cpp.o / .c.o suffix rules).
%.o: %.cpp
	$(CXX) -c $(GCCFLAGS) $<
%.o: %.c
	$(CC) -c $(GCCFLAGS) $<

# Targets
default: all
# all: $(PROGRAM) icc gcc
all: $(PROGRAM) icc gcc tests plots

# Each binary is a real file target that depends on the sources, so it is
# rebuilt only when they change. The short names are kept as aliases.
$(PROGRAM): $(PROGOUT)
$(PROGOUT): $(SRCS)
	$(GCC) $(SRCS) -o $@ $(LDFLAGS)

tau: $(TAU_PROG)
$(TAU_PROG): $(SRCS)
	$(TAU_CC) $(TAU_OPTS) $(SRCS) -o $@

# Switch the system-wide BLAS implementation (needs root; uses eselect).
set_mkl_blas:
	sudo eselect blas set mkl-gfortran

set_goto_blas:
	sudo eselect blas set goto

# Libraries come after the sources so the linker sees them once it knows
# which symbols are undefined.
icc: $(PROGOUT)_ICC
$(PROGOUT)_ICC: $(SRCS)
	icc $(ICCFLAGS) $(SRCS) -o $@ $(MKL_LIBS) $(LIBS)

gcc: $(PROGOUT)_GCC
$(PROGOUT)_GCC: $(SRCS)
	$(GCC) $(GCCFLAGS) $(SRCS) -o $@ $(LIBS)

clean:
	$(RM) $(OBJS) $(PROGRAM) $(PROGOUT) $(PROGOUT)_ICC $(PROGOUT)_GCC $(TAU_PROG) *.dat

# Benchmark results are files that depend on the binary that produces them.
# A result is regenerated when its binary is rebuilt; use `make -B tests` or
# `make clean` first to force a fresh run.
tests: icctest gcctest

icctest: icc.dat
icc.dat: $(PROGOUT)_ICC
	./$(PROGOUT)_ICC > $@

gcctest: gcc.dat
gcc.dat: $(PROGOUT)_GCC
	./$(PROGOUT)_GCC > $@

# Plot.gp reads icc.dat and gcc.dat, so both must exist before gnuplot runs
# (this also keeps `make -j` from plotting before the benchmarks finish).
plots: icc.dat gcc.dat
	gnuplot Plot.gp
# Plots the BLAS and plain-C timings stored in icc.dat and gcc.dat on
# log-log axes and writes the figure to BlasVsC.eps.
# Data columns: 1 = vector length, 2 = BLAS time, 3 = plain-C time.
set title "BLAS Vs C execution time for\n Euclidean Distance computation"
# Column 1 is the number of vector elements handed to the distance routines,
# not a size in bytes.
set xlabel "Vector Size (elements)"
set ylabel "Time (sec)"
set logscale xy
set grid xtics mxtics
set key top left
set key box
#set term post enh         # enhanced PostScript, essentially PostScript
                           # with bounding boxes
#set term postscript
#set term png
set terminal postscript enhanced color

set output 'BlasVsC.eps'

plot "icc.dat" using 1:2 title 'icc-BLAS' with lines linewidth 1 , \
     "icc.dat" using 1:3 title 'icc'      with lines linewidth 1 , \
     "gcc.dat" using 1:2 title 'gcc-BLAS' with lines linewidth 1 , \
     "gcc.dat" using 1:3 title 'gcc'      with lines linewidth 1

# Close the output file so the EPS is completely written.
set output
_______________________________________________
Beowulf mailing list, [email protected]
To change your subscription (digest mode or unsubscribe) visit 
http://www.beowulf.org/mailman/listinfo/beowulf

Reply via email to