On Wed, 2014-11-12 at 21:47 +0200, Juha-Pekka Heikkila wrote: > On 12.11.2014 19:36, Bruno Jimenez wrote: > > On Wed, 2014-11-12 at 14:50 +0200, Juha-Pekka Heikkila wrote: > >> Signed-off-by: Juha-Pekka Heikkila <juhapekka.heikk...@gmail.com> > >> --- > >> src/mesa/Makefile.am | 8 +++ > >> src/mesa/main/sse2_clamping.c | 138 > >> ++++++++++++++++++++++++++++++++++++++++++ > >> src/mesa/main/sse2_clamping.h | 49 +++++++++++++++ > >> 3 files changed, 195 insertions(+) > >> create mode 100644 src/mesa/main/sse2_clamping.c > >> create mode 100644 src/mesa/main/sse2_clamping.h > >> > >> diff --git a/src/mesa/Makefile.am b/src/mesa/Makefile.am > >> index 932db4f..43dbe87 100644 > >> --- a/src/mesa/Makefile.am > >> +++ b/src/mesa/Makefile.am > >> @@ -111,6 +111,10 @@ if SSE41_SUPPORTED > >> ARCH_LIBS += libmesa_sse41.la > >> endif > >> > >> +if SSE2_SUPPORTED > >> +ARCH_LIBS += libmesa_sse2.la > >> +endif > >> + > >> MESA_ASM_FILES_FOR_ARCH = > >> > >> if HAVE_X86_ASM > >> @@ -155,6 +159,10 @@ libmesa_sse41_la_SOURCES = \ > >> main/sse_minmax.c > >> libmesa_sse41_la_CFLAGS = $(AM_CFLAGS) -msse4.1 > >> > >> +libmesa_sse2_la_SOURCES = \ > >> + main/sse2_clamping.c > >> +libmesa_sse2_la_CFLAGS = $(AM_CFLAGS) -msse2 > >> + > >> pkgconfigdir = $(libdir)/pkgconfig > >> pkgconfig_DATA = gl.pc > >> > >> diff --git a/src/mesa/main/sse2_clamping.c b/src/mesa/main/sse2_clamping.c > >> new file mode 100644 > >> index 0000000..66c7dc7 > >> --- /dev/null > >> +++ b/src/mesa/main/sse2_clamping.c > >> @@ -0,0 +1,138 @@ > >> +/* > >> + * Copyright © 2014 Intel Corporation > >> + * > >> + * Permission is hereby granted, free of charge, to any person obtaining a > >> + * copy of this software and associated documentation files (the > >> "Software"), > >> + * to deal in the Software without restriction, including without > >> limitation > >> + * the rights to use, copy, modify, merge, publish, distribute, > >> sublicense, > >> + * and/or sell copies of the Software, and to permit persons to 
whom the > >> + * Software is furnished to do so, subject to the following conditions: > >> + * > >> + * The above copyright notice and this permission notice (including the > >> next > >> + * paragraph) shall be included in all copies or substantial portions of > >> the > >> + * Software. > >> + * > >> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, > >> EXPRESS OR > >> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF > >> MERCHANTABILITY, > >> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT > >> SHALL > >> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR > >> OTHER > >> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING > >> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER > >> DEALINGS > >> + * IN THE SOFTWARE. > >> + * > >> + * Authors: > >> + * Juha-Pekka Heikkila <juhapekka.heikk...@gmail.com> > >> + * > >> + */ > >> + > >> +#ifdef __SSE2__ > >> +#include "main/macros.h" > >> +#include "main/sse2_clamping.h" > >> +#include <emmintrin.h> > >> + > >> +/** > >> + * Clamp four float values to [min,max] > >> + */ > >> +static inline void > >> +_mesa_clamp_float_rgba(GLfloat src[4], GLfloat result[4], const float min, > >> + const float max) > >> +{ > >> + __m128 operand, minval, maxval; > >> + > >> + operand = _mm_loadu_ps(src); > >> + minval = _mm_set1_ps(min); > >> + maxval = _mm_set1_ps(max); > >> + operand = _mm_max_ps(operand, minval); > >> + operand = _mm_min_ps(operand, maxval); > >> + _mm_storeu_ps(result, operand); > >> +} > >> + > >> + > >> +/* Clamp n amount float rgba pixels to [min,max] using SSE2 > >> + */ > >> +__attribute__((optimize("unroll-loops"))) > >> +void > >> +_mesa_streaming_clamp_float_rgba(const GLuint n, GLfloat rgba_src[][4], > >> + GLfloat rgba_dst[][4], const GLfloat min, > >> + const GLfloat max) > >> +{ > >> + int c, prefetch_c; > >> + float* worker = &rgba_src[0][0]; > >> + __m128 operand[2], minval, maxval; 
> >> + > >> + _mm_prefetch((char*) (((unsigned long)worker)|0x1f) + 65, _MM_HINT_T0); > > ^^^^ ^^^ > > > > Hi, > > > > May I ask why precisely these numbers? > > 0x1f, as you note below, is a typo; it should be 0x0f. 65 is the cache line > length (64) plus one; the extra one evens out the |0x1f operation, rounding > the address up to an aligned boundary.
Hi, I supposed that it could be something like that, but I wasn't fully sure. Thanks for the answer. > > > > >> + > >> + minval = _mm_set1_ps(min); > >> + maxval = _mm_set1_ps(max); > >> + > >> + for (c = n*4; c > 0 && (((unsigned long)worker)&0x1f) != 0; c--, > >> worker++) { > > ^^^^^ > > > > I guess that this is for alignment, but you only need to align to a > > 16-byte boundary, not 32. Or maybe I am missing something obvious. > > > > You are correct, 0x1f is a typo; it should be 0x0f. > > >> + operand[0] = _mm_load_ss(worker); > >> + operand[0] = _mm_max_ss(operand[0], minval); > >> + operand[0] = _mm_min_ss(operand[0], maxval); > >> + _mm_store_ss(worker, operand[0]); > >> + } > >> + > >> + while (c >= 8) { > >> + _mm_prefetch((char*) worker + 64, _MM_HINT_T0); > > ^^^ > >> + > >> + for (prefetch_c = 64/8; prefetch_c > 0 && c >= 8; prefetch_c--, > >> c-=8, > > ^^^^ > > > > May I also ask why these numbers? > > > > 64 is the cache line length in bytes; 8 means this loop handles 8 floats in > one go: operand[0] gets 4 floats and the same goes for operand[1]. I found > that interleaving this way gives more performance; adding more operands did > not give any more, thus 2 operands to work with, which turns > into the number 8. Now that you ask, I think 64 has to be divided by 32 > and not 8 (8 elements, each 4 bytes -> 32). > > The idea here is that we send a prefetch for the next cache line before we start to > work on the current line, so on arriving at the next loop iteration we already have > the new cache line ready. I guess that, as the access is serial, the CPU might be able to predict that it will need the next block of data. 
Thanks a lot :) Bruno > > > > >> + worker += 8) { > >> + > >> + operand[0] = _mm_load_ps(worker); > >> + operand[1] = _mm_load_ps(worker+4); > >> + operand[0] = _mm_max_ps(operand[0], minval); > >> + operand[1] = _mm_max_ps(operand[1], minval); > >> + operand[0] = _mm_min_ps(operand[0], maxval); > >> + operand[1] = _mm_min_ps(operand[1], maxval); > >> + > >> + _mm_store_ps(worker, operand[0]); > >> + _mm_store_ps(worker+4, operand[1]); > >> + } > >> + } > >> + > >> + for (; c > 0; c--, worker++) { > >> + operand[0] = _mm_load_ss(worker); > >> + operand[0] = _mm_max_ss(operand[0], minval); > >> + operand[0] = _mm_min_ss(operand[0], maxval); > >> + _mm_store_ss(worker, operand[0]); > >> + } > >> +} > >> + > >> + > >> +/* Clamp n amount float rgba pixels to [min,max] using SSE2 and apply > >> + * scaling and mapping to components. > >> + * > >> + * this replace handling of [RGBA] channels: > >> + * rgba_temp[RCOMP] = CLAMP(rgba[i][RCOMP], 0.0F, 1.0F); > >> + * rgba[i][RCOMP] = rMap[F_TO_I(rgba_temp[RCOMP] * scale[RCOMP])]; > >> + */ > >> +void > >> +_mesa_clamp_float_rgba_scale_and_map(const GLuint n, GLfloat > >> rgba_src[][4], > >> + GLfloat rgba_dst[][4], const GLfloat > >> min, > >> + const GLfloat max, > >> + const GLfloat scale[4], > >> + const GLfloat* rMap, const GLfloat* > >> gMap, > >> + const GLfloat* bMap, const GLfloat* > >> aMap) > >> +{ > >> + int i; > >> + GLfloat __attribute__((aligned(16))) temp[4]; > >> + __m128 *operand = (__m128*) &temp, multiplier, mmove; > >> + __m128i truncated_integers; > >> + > >> + const unsigned int* map_p = (const unsigned int*) &truncated_integers; > >> + > >> + multiplier = _mm_loadu_ps(scale); > >> + > >> + for(i = 0; i < n; i++) { > >> + _mesa_clamp_float_rgba(rgba_src[i], temp, min, max); > >> + > >> + *operand = _mm_mul_ps(multiplier, *operand); > >> + truncated_integers = _mm_cvttps_epi32(*operand); > >> + mmove = _mm_set_ps(aMap[map_p[ACOMP]], bMap[map_p[BCOMP]], > >> + gMap[map_p[GCOMP]], rMap[map_p[RCOMP]] ); > 
>> + > >> + _mm_storeu_ps(rgba_dst[i], mmove); > >> + } > >> +} > >> + > >> +#endif /* __SSE2__ */ > >> diff --git a/src/mesa/main/sse2_clamping.h b/src/mesa/main/sse2_clamping.h > >> new file mode 100644 > >> index 0000000..688fab7 > >> --- /dev/null > >> +++ b/src/mesa/main/sse2_clamping.h > >> @@ -0,0 +1,49 @@ > >> +/* > >> + * Copyright © 2014 Intel Corporation > >> + * > >> + * Permission is hereby granted, free of charge, to any person obtaining a > >> + * copy of this software and associated documentation files (the > >> "Software"), > >> + * to deal in the Software without restriction, including without > >> limitation > >> + * the rights to use, copy, modify, merge, publish, distribute, > >> sublicense, > >> + * and/or sell copies of the Software, and to permit persons to whom the > >> + * Software is furnished to do so, subject to the following conditions: > >> + * > >> + * The above copyright notice and this permission notice (including the > >> next > >> + * paragraph) shall be included in all copies or substantial portions of > >> the > >> + * Software. > >> + * > >> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, > >> EXPRESS OR > >> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF > >> MERCHANTABILITY, > >> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT > >> SHALL > >> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR > >> OTHER > >> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING > >> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER > >> DEALINGS > >> + * IN THE SOFTWARE. 
> >> + * > >> + * Authors: > >> + * Juha-Pekka Heikkila <juhapekka.heikk...@gmail.com> > >> + * > >> + */ > >> + > >> +#ifdef __SSE2__ > >> + > >> +/* Clamp n amount float rgba pixels to [min,max] using SSE2 > >> + */ > >> +void > >> +_mesa_streaming_clamp_float_rgba(const GLuint n, GLfloat rgba_src[][4], > >> + GLfloat rgba_dst[][4], const GLfloat min, > >> + const GLfloat max); > >> + > >> + > >> +/* Clamp n amount float rgba pixels to [min,max] using SSE2 and apply > >> + * scaling and mapping to components. > >> + */ > >> +void > >> +_mesa_clamp_float_rgba_scale_and_map(const GLuint n, GLfloat > >> rgba_src[][4], > >> + GLfloat rgba_dst[][4], const GLfloat > >> min, > >> + const GLfloat max, > >> + const GLfloat scale[4], > >> + const GLfloat* rMap, const GLfloat* > >> gMap, > >> + const GLfloat* bMap, const GLfloat* > >> aMap); > >> + > >> +#endif /* __SSE2__ */ > > > > > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev