Hi, i have implemented a "real" average neighborhood algorithm that runs in parallel using openmp. The source code and the benchmark shell script is attached.
The neighbor program computes the average moving window of arbitrary size. The size of the map rows x cols and the size of the moving window (odd number cols==rows) can be specified. ./neighbor rows cols mw_size IMHO the new program is better for compiler comparison and neighborhood operation performance. This is the benchmark on my 5 year old AMD phenom 4 core computer using 1, 2 and 4 threads: gcc -Wall -fopenmp -lgomp -Ofast main.c -o neighbor export OMP_NUM_THREADS=1 time ./neighbor 5000 5000 23 real 0m37.211s user 0m36.998s sys 0m0.196s export OMP_NUM_THREADS=2 time ./neighbor 5000 5000 23 real 0m19.907s user 0m38.890s sys 0m0.248s export OMP_NUM_THREADS=4 time ./neighbor 5000 5000 23 real 0m10.170s user 0m38.466s sys 0m0.192s Happy hacking, compiling and testing. :) Best regards Soeren 2013/6/29 Markus Metz <markus.metz.gisw...@gmail.com> > On Sat, Jun 29, 2013 at 1:26 PM, Hamish <hamis...@yahoo.com> wrote: > > Markus Metz wrote: > > > >> Some more results with Sören's test program on a Intel(R) Core(TM) i5 > >> CPU M450 @ 2.40GHz (2 real cores, 4 fake cores) with gcc 4.7.2 and > >> clang 3.3 > >> > >> gcc -O3 > >> v is 2.09131e+13 > >> > >> real 2m0.393s > >> user 1m57.610s > >> sys 0m0.003s > >> > >> gcc -Ofast > >> v is 2.09131e+13 > >> > >> real 0m7.218s > >> user 0m7.018s > >> sys 0m0.017s > > > > > > nice. one thing we need to remember though is that it's not entirely > > free, one thing -Ofast turns on is -ffast-math, > > """ > > This option is not turned on by any -O option besides -Ofast since it > can > > result in incorrect output for programs that depend on an exact > > implementation of IEEE or ISO rules/specifications for math functions. > It > > may, however, yield faster code for programs that do not require the > > guarantees of these specifications. > > """ > > > > which may not be fit for our purposes. > > > > > > With the ifort compiler there is '-fp-model precise' which allows only > > optimizations which don't harm the results. Maybe gcc has something > > similar. > > In gcc, you can turn of -ffoo with -fno-foo, maybe this way you can > use -Ofast -fno-fast-math to preserve IEEE specifications. > > > > Glad to see -floop-parallelize-all in gcc 4.7, it will help us identify > > places to focus OpenMP work on. > > > > > > Hamish > > >
benchmark.sh
Description: Bourne shell script
#include <stdio.h> #include <stdlib.h> /* #define DEBUG 1 */ /* Prototypes for gathering and average computation */ static int gather_values(double **input, double *buff, int nrows, int ncols, int mw_size, int col, int row, int dist); static double average(double *values, int size); int main(int argc, char **argv) { int nrows, ncols, mw_size, size, dist; double **input = NULL, **output = NULL; int i, j; /* Check and parse the input parameter */ if(argc != 4) { fprintf(stderr, "Warning!\n"); fprintf(stderr, "Please specifiy the number of rows and columns and the " "\nsize of the moving window (must be an odd number)\n"); fprintf(stderr, "\nUsage: neighbor 5000 5000 51\n"); fprintf(stderr, "\nUsing default values: rows = 5000, cols = 5000, moving window = 51\n"); nrows = 5000; ncols = 5000; mw_size = 51; } else { sscanf(argv[1], "%d", &nrows); sscanf(argv[2], "%d", &ncols); sscanf(argv[3], "%d", &mw_size); if(mw_size%2 == 0) { fprintf(stderr,"The size of the moving window must be odd"); return -1; } } size = mw_size * mw_size; dist = mw_size / 2; /* Allocate input and output */ input = (double**)calloc(nrows, sizeof(double*)); output= (double**)calloc(nrows, sizeof(double*)); if(input == NULL || output == NULL) { fprintf(stderr, "Unable to allocate arrays"); return -1; } for(i = 0; i < nrows; i++) { input[i] = (double*)calloc(ncols, sizeof(double)); output[i]= (double*)calloc(ncols, sizeof(double)); if(input[i] == NULL || output[i] == NULL) { fprintf(stderr, "Unable to allocate arrays"); return -1; } #ifdef DEBUG for(j = 0; j < ncols; j++) input[i][j] = i + j; #endif } #pragma omp parallel for private(i, j) for(i = 0; i < nrows; i++) { for(j = 0; j < ncols; j++) { /* Value buffer with maximum size */ double *buff = NULL; buff = (double*)calloc(size, sizeof(double)); /* Gather value in moving window */ int num = gather_values(input, buff, nrows, ncols, mw_size, i, j, dist); output[i][j] = average(buff, num); free(buff); } } #ifdef DEBUG printf("\nInput\n"); for(i = 0; i < nrows; i++) { for(j = 0; j < ncols; j++) { printf("%.2f ", input[i][j]); } printf("\n"); } printf("\nOutput\n"); for(i = 0; i < nrows; i++) { for(j = 0; j < ncols; j++) { printf("%.2f ", output[i][j]); } printf("\n"); } #endif return 0; } int gather_values(double **input, double *buff, int nrows, int ncols, int mw_size, int col, int row, int dist) { int i, j, k; int start_row = row - dist; int start_col = col - dist; int end_row = start_row + mw_size; int end_col = start_col + mw_size; if(start_row < 0) start_row = 0; if(start_col < 0) start_col = 0; if(end_row > nrows) end_row = nrows; if(end_col > ncols) end_col = ncols; k = 0; for(i = start_row; i < end_row; i++) { for(j = start_col; j < end_col; j++) { buff[k] = input[i][j]; k++; } } return k; } double average(double *values, int size) { int i; double a = 0.0; for(i = 0; i < size; i++) a += values[i]; a = a / (double)size; return a; }
_______________________________________________ grass-user mailing list grass-user@lists.osgeo.org http://lists.osgeo.org/mailman/listinfo/grass-user