Author: post
Date: 2011-04-13 15:29:47 +0200 (Wed, 13 Apr 2011)
New Revision: 3986
Added:
trunk/plugins/lensfun/lensfun-avx.c
Modified:
trunk/plugins/lensfun/Makefile.am
trunk/plugins/lensfun/lensfun.c
Log:
Add AVX compiled version of lensfun correction.
Modified: trunk/plugins/lensfun/Makefile.am
===================================================================
--- trunk/plugins/lensfun/Makefile.am 2011-04-13 13:01:53 UTC (rev 3985)
+++ trunk/plugins/lensfun/Makefile.am 2011-04-13 13:29:47 UTC (rev 3986)
@@ -18,10 +18,10 @@
libdir = $(datadir)/rawstudio/plugins/
-lensfun_la_LIBADD = @PACKAGE_LIBS@ lensfun-sse2.lo lensfun-c.lo
+lensfun_la_LIBADD = @PACKAGE_LIBS@ lensfun-avx.lo lensfun-sse2.lo lensfun-c.lo
lensfun_la_LDFLAGS = -module -avoid-version
lensfun_la_SOURCES = lensfun-version.c lensfun-version.h
-EXTRA_DIST = lensfun-sse2.c lensfun.c
+EXTRA_DIST = lensfun-avx.c lensfun-sse2.c lensfun.c
lensfun-c.lo: lensfun.c
$(LTCOMPILE) -o lensfun-c.lo -c $(top_srcdir)/plugins/lensfun/lensfun.c
@@ -34,3 +34,10 @@
endif
$(LTCOMPILE) $(SSE_FLAG) -c $(top_srcdir)/plugins/lensfun/lensfun-sse2.c
+# Build the AVX variant of lensfun.c: same pattern as the lensfun-sse2.lo
+# rule above — the automake conditional selects -mavx only when the
+# configure check CAN_COMPILE_AVX succeeded.
+lensfun-avx.lo: lensfun-avx.c
+if CAN_COMPILE_AVX
+AVX_FLAG=-mavx
+else
+AVX_FLAG=
+endif
+	$(LTCOMPILE) $(AVX_FLAG) -c $(top_srcdir)/plugins/lensfun/lensfun-avx.c
Added: trunk/plugins/lensfun/lensfun-avx.c
===================================================================
--- trunk/plugins/lensfun/lensfun-avx.c (rev 0)
+++ trunk/plugins/lensfun/lensfun-avx.c 2011-04-13 13:29:47 UTC (rev 3986)
@@ -0,0 +1,370 @@
+/*
+ * Copyright (C) 2006-2011 Anders Brander <[email protected]>,
+ * Anders Kvist <[email protected]> and Klaus Post <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+/* Plugin tmpl version 4 */
+
+#include <rawstudio.h>
+#include <lensfun.h>
+
+#if defined (__AVX__)
+
+#include <emmintrin.h>
+
+/* Scale vector used to convert float source coordinates to 24.8 fixed
+   point. NOTE(review): the name says "twofiftytwo" but the values are
+   256.0f — misnomer, kept for consistency with the SSE2 sibling file. */
+static gfloat twofiftytwo_ps[4] __attribute__ ((aligned (16))) = {256.0f,
256.0f, 256.0f, 0.0f};
+/* Per-channel element offsets (R=0, G=1, B=2) added to pixel offsets. */
+static gint _zero12[4] __attribute__ ((aligned (16))) = {0,1,2,0};
+/* Compared against the stored max coordinates; an update is gated on the
+   slot still holding 65536 — presumably the caller's initial value for
+   min_max_xy (TODO confirm against the init loop in lensfun.c). */
+static gint _max_coord[4] __attribute__ ((aligned (16))) =
{65536,65536,65536,65536};
+
+/* Reports that this translation unit was compiled with AVX support
+   (__AVX__ was defined); lensfun.c uses this to pick the code path. */
+gboolean is_avx_compiled(void)
+{
+	return TRUE;
+}
+
+/*
+ * Bilinearly sample one output pixel from 'in', with R, G and B each read
+ * at its own fractional source position (pos[0..5] = x0,y0,x1,y1,x2,y2 —
+ * see the "y1x1 y0x0" / "y2x2" load comments below), writing out[0..2].
+ * Coordinates are converted to 24.8 fixed point (scale 256), clamped to
+ * the image, and the four neighbours are blended with 8-bit weights
+ * (halved, so the rounded sum is shifted right by 15).
+ *
+ * Additionally records out-of-bounds source coordinates into min_max_xy.
+ * NOTE(review): min_max_xy is declared const but is written through a
+ * cast below — presumably the caller's buffer is writable, but the
+ * signature is misleading; TODO confirm and drop the const.
+ *
+ * Only SSE2 intrinsics are used (<emmintrin.h>); the AVX gain here comes
+ * from compiling this unit with -mavx (VEX encoding).
+ */
+void
+rs_image16_bilinear_full_avx(RS_IMAGE16 *in, gushort *out, gfloat *pos, const
gint *current_xy, const gint* min_max_xy)
+{
+	const gint m_w = (in->w-1);
+	const gint m_h = (in->h-1);
+
+	/* pos may come from an unaligned offset; pick the matching load. */
+	__m128 p0, p1;
+	if ((uintptr_t)pos & 15)
+	{
+		p0 = _mm_loadu_ps(pos); // y1x1 y0x0
+		p1 = _mm_loadu_ps(pos+4); // ---- y2x2
+	} else
+	{
+		p0 = _mm_load_ps(pos); // y1x1 y0x0
+		p1 = _mm_load_ps(pos+4); // ---- y2x2
+	}
+
+	// to x2x2 x1x0
+	__m128 xf = _mm_shuffle_ps(p0, p1, _MM_SHUFFLE(0,0,2,0));
+	// to y2y2 y1y0
+	__m128 yf = _mm_shuffle_ps(p0, p1, _MM_SHUFFLE(1,1,3,1));
+
+	/* Scale by 256 -> 24.8 fixed point (the array holds 256.0f). */
+	__m128 fl256 = _mm_load_ps(twofiftytwo_ps);
+	xf = _mm_mul_ps(xf, fl256);
+	yf = _mm_mul_ps(yf, fl256);
+	__m128i x = _mm_cvttps_epi32(xf);
+	__m128i y = _mm_cvttps_epi32(yf);
+
+	/* Image bounds, also in 24.8 fixed point. */
+	__m128i _m_w = _mm_slli_epi32(_mm_set1_epi32(m_w), 8);
+	__m128i _m_h = _mm_slli_epi32(_mm_set1_epi32(m_h), 8);
+
+	__m128i x_gt, y_gt;
+
+	/* Clamping */
+	x_gt = _mm_cmpgt_epi32(x, _m_w);
+	y_gt = _mm_cmpgt_epi32(y, _m_h);
+
+	x = _mm_or_si128(_mm_andnot_si128(x_gt, x), _mm_and_si128(_m_w, x_gt));
+	y = _mm_or_si128(_mm_andnot_si128(y_gt, y), _mm_and_si128(_m_h, y_gt));
+
+	/* Record the current output x/y where clamping occurred; only slots
+	   still equal to _max_coord (65536) are updated — appears to capture
+	   the first out-of-range position per channel (TODO confirm). */
+	__m128i current_pos = _mm_loadl_epi64((__m128i*)current_xy);
+	__m128i current_x = _mm_shuffle_epi32(current_pos,_MM_SHUFFLE(0,0,0,0));
+	__m128i current_y = _mm_shuffle_epi32(current_pos,_MM_SHUFFLE(1,1,1,1));
+	__m128i max_x = _mm_load_si128((__m128i*)&min_max_xy[8]);
+	__m128i max_y = _mm_load_si128((__m128i*)&min_max_xy[12]);
+	__m128i max_coord = _mm_load_si128((__m128i*)_max_coord);
+	__m128i eq_max_x = _mm_cmpeq_epi32(max_coord, max_x);
+	__m128i eq_max_y = _mm_cmpeq_epi32(max_coord, max_y);
+	x_gt = _mm_and_si128(x_gt, eq_max_x);
+	y_gt = _mm_and_si128(y_gt, eq_max_y);
+	__m128i insert_x = _mm_and_si128(x_gt, current_x);
+	__m128i insert_y = _mm_and_si128(y_gt, current_y);
+	max_x = _mm_or_si128(insert_x, _mm_andnot_si128(x_gt, max_x));
+	max_y = _mm_or_si128(insert_y, _mm_andnot_si128(y_gt, max_y));
+	/* NOTE(review): stores through the const-qualified min_max_xy. */
+	_mm_store_si128((__m128i*)&min_max_xy[8], max_x);
+	_mm_store_si128((__m128i*)&min_max_xy[12], max_y);
+
+	/* Same bookkeeping for coordinates that went negative (clamped to 0). */
+	__m128i zero = _mm_setzero_si128();
+	__m128i x_lt = _mm_cmplt_epi32(x, zero);
+	__m128i y_lt = _mm_cmplt_epi32(y, zero);
+	x = _mm_andnot_si128(x_lt, x);
+	y = _mm_andnot_si128(y_lt, y);
+	__m128i min_x = _mm_load_si128((__m128i*)&min_max_xy[0]);
+	__m128i min_y = _mm_load_si128((__m128i*)&min_max_xy[4]);
+	insert_x = _mm_and_si128(x_lt, current_x);
+	insert_y = _mm_and_si128(y_lt, current_y);
+	min_x = _mm_or_si128(insert_x, _mm_andnot_si128(x_lt, min_x));
+	min_y = _mm_or_si128(insert_y, _mm_andnot_si128(y_lt, min_y));
+	_mm_store_si128((__m128i*)&min_max_xy[0], min_x);
+	_mm_store_si128((__m128i*)&min_max_xy[4], min_y);
+
+	/* 'next' neighbour = integer coordinate + 1. */
+	__m128i one = _mm_set1_epi32(1);
+	__m128i nx = _mm_add_epi32(one, _mm_srai_epi32(x, 8));
+	__m128i ny = _mm_add_epi32(one, _mm_srai_epi32(y, 8));
+
+	/* Check that 'next' pixels are in bounds */
+	_m_w = _mm_srai_epi32(_m_w, 8);
+	_m_h = _mm_srai_epi32(_m_h, 8);
+
+	x_gt = _mm_cmpgt_epi32(nx, _m_w);
+	y_gt = _mm_cmpgt_epi32(ny, _m_h);
+
+	nx = _mm_or_si128(_mm_andnot_si128(x_gt, nx), _mm_and_si128(_m_w,
x_gt));
+	ny = _mm_or_si128(_mm_andnot_si128(y_gt, ny), _mm_and_si128(_m_h,
y_gt));
+
+	int xfer[16] __attribute__ ((aligned (16)));
+
+	/* Pitch as pixels */
+	__m128i pitch = _mm_set1_epi32(in->rowstride >> 2 | ((in->rowstride >>
2)<<16));
+
+	/* Remove remainder */
+	__m128i tx = _mm_srai_epi32(x, 8);
+	__m128i ty = _mm_srai_epi32(y, 8);
+
+	/* Multiply y by pitch: widening 16x16->32 via mullo/mulhi; assumes
+	   rowstride/4 and y fit in 16 bits — TODO confirm image size limits. */
+	ty = _mm_packs_epi32(ty, ty);
+	__m128i ty_lo = _mm_mullo_epi16(ty, pitch);
+	__m128i ty_hi = _mm_mulhi_epi16(ty, pitch);
+	ty = _mm_unpacklo_epi16(ty_lo, ty_hi);
+
+	/* Same to next pixel */
+	ny = _mm_packs_epi32(ny, ny);
+	__m128i ny_lo = _mm_mullo_epi16(ny, pitch);
+	__m128i ny_hi = _mm_mulhi_epi16(ny, pitch);
+	ny = _mm_unpacklo_epi16(ny_lo, ny_hi);
+
+	/* Add pitch and x offset */
+	__m128i a_offset = _mm_add_epi32(tx, ty);
+	__m128i b_offset = _mm_add_epi32(nx, ty);
+	__m128i c_offset = _mm_add_epi32(tx, ny);
+	__m128i d_offset = _mm_add_epi32(nx, ny);
+
+	/* Multiply by pixelsize and add RGB offsets */
+	__m128i zero12 = _mm_load_si128((__m128i*)_zero12);
+	a_offset = _mm_add_epi32(zero12, _mm_slli_epi32(a_offset, 2));
+	b_offset = _mm_add_epi32(zero12, _mm_slli_epi32(b_offset, 2));
+	c_offset = _mm_add_epi32(zero12, _mm_slli_epi32(c_offset, 2));
+	d_offset = _mm_add_epi32(zero12, _mm_slli_epi32(d_offset, 2));
+
+	_mm_store_si128((__m128i*)xfer, a_offset);
+	_mm_store_si128((__m128i*)&xfer[4], b_offset);
+	_mm_store_si128((__m128i*)&xfer[8], c_offset);
+	_mm_store_si128((__m128i*)&xfer[12], d_offset);
+
+	gushort* pixels[12];
+
+	/* Loop unrolled, allows agressive instruction reordering */
+	/* Red, then G & B */
+	pixels[0] = in->pixels + xfer[0]; // a
+	pixels[1] = in->pixels + xfer[4]; // b
+	pixels[2] = in->pixels + xfer[8]; // c
+	pixels[3] = in->pixels + xfer[12]; // d
+
+	pixels[4] = in->pixels + xfer[1+0]; // a
+	pixels[5] = in->pixels + xfer[1+4]; // b
+	pixels[6] = in->pixels + xfer[1+8]; // c
+	pixels[7] = in->pixels + xfer[1+12]; // d
+
+	pixels[8] = in->pixels + xfer[2+0]; // a
+	pixels[9] = in->pixels + xfer[2+4]; // b
+	pixels[10] = in->pixels + xfer[2+8]; // c
+	pixels[11] = in->pixels + xfer[2+12]; // d
+
+	/* Calculate distances: low 8 fractional bits and their complements. */
+	__m128i twofiftyfive = _mm_set1_epi32(255);
+	__m128i diffx = _mm_and_si128(x, twofiftyfive);
+	__m128i diffy = _mm_and_si128(y, twofiftyfive);
+	__m128i inv_diffx = _mm_andnot_si128(diffx, twofiftyfive);
+	__m128i inv_diffy = _mm_andnot_si128(diffy, twofiftyfive);
+
+	/* Calculate weights: products are <= 255*255, halved to fit the
+	   16384-round / >>15 normalization below. */
+	__m128i aw = _mm_srai_epi32(_mm_mullo_epi16(inv_diffx, inv_diffy),1);
+	__m128i bw = _mm_srai_epi32(_mm_mullo_epi16(diffx, inv_diffy),1);
+	__m128i cw = _mm_srai_epi32(_mm_mullo_epi16(inv_diffx, diffy),1);
+	__m128i dw = _mm_srai_epi32(_mm_mullo_epi16(diffx, diffy),1);
+
+	_mm_store_si128((__m128i*)xfer, aw);
+	_mm_store_si128((__m128i*)&xfer[4], bw);
+	_mm_store_si128((__m128i*)&xfer[8], cw);
+	_mm_store_si128((__m128i*)&xfer[12], dw);
+
+	gushort** p = pixels;
+	/* Loop unrolled */
+	out[0] = (gushort) ((xfer[0] * *p[0] + xfer[4] * *p[1] + xfer[8] *
*p[2] + xfer[12] * *p[3] + 16384) >> 15 );
+	p+=4;
+	out[1] = (gushort) ((xfer[1] * *p[0] + xfer[1+4] * *p[1] + xfer[1+8] *
*p[2] + xfer[1+12] * *p[3] + 16384) >> 15 );
+	p+=4;
+	out[2] = (gushort) ((xfer[2] * *p[0] + xfer[2+4] * *p[1] + xfer[2+8] *
*p[2] + xfer[2+12] * *p[3] + 16384) >> 15 );
+}
+
+/*
+ * Same bilinear per-channel sampling as rs_image16_bilinear_full_avx
+ * above, but without the min/max out-of-bounds coordinate bookkeeping:
+ * R, G and B are each read at their own fractional position from
+ * pos[0..5] and blended from four neighbours in 24.8 fixed point,
+ * writing out[0..2]. Only SSE2 intrinsics are used; the AVX gain comes
+ * from compiling this unit with -mavx.
+ */
+void
+rs_image16_bilinear_nomeasure_avx(RS_IMAGE16 *in, gushort *out, gfloat *pos)
+{
+	const gint m_w = (in->w-1);
+	const gint m_h = (in->h-1);
+
+	/* pos may come from an unaligned offset; pick the matching load. */
+	__m128 p0, p1;
+	if ((uintptr_t)pos & 15)
+	{
+		p0 = _mm_loadu_ps(pos); // y1x1 y0x0
+		p1 = _mm_loadu_ps(pos+4); // ---- y2x2
+	} else
+	{
+		p0 = _mm_load_ps(pos); // y1x1 y0x0
+		p1 = _mm_load_ps(pos+4); // ---- y2x2
+	}
+
+	// to x2x2 x1x0
+	__m128 xf = _mm_shuffle_ps(p0, p1, _MM_SHUFFLE(0,0,2,0));
+	// to y2y2 y1y0
+	__m128 yf = _mm_shuffle_ps(p0, p1, _MM_SHUFFLE(1,1,3,1));
+
+	/* Scale by 256 -> 24.8 fixed point (the array holds 256.0f). */
+	__m128 fl256 = _mm_load_ps(twofiftytwo_ps);
+	xf = _mm_mul_ps(xf, fl256);
+	yf = _mm_mul_ps(yf, fl256);
+	__m128i x = _mm_cvttps_epi32(xf);
+	__m128i y = _mm_cvttps_epi32(yf);
+
+	/* Image bounds, also in 24.8 fixed point. */
+	__m128i _m_w = _mm_slli_epi32(_mm_set1_epi32(m_w), 8);
+	__m128i _m_h = _mm_slli_epi32(_mm_set1_epi32(m_h), 8);
+
+	__m128i x_gt, y_gt;
+
+	/* Clamping */
+	x_gt = _mm_cmpgt_epi32(x, _m_w);
+	y_gt = _mm_cmpgt_epi32(y, _m_h);
+
+	x = _mm_or_si128(_mm_andnot_si128(x_gt, x), _mm_and_si128(_m_w, x_gt));
+	y = _mm_or_si128(_mm_andnot_si128(y_gt, y), _mm_and_si128(_m_h, y_gt));
+
+	/* Negative coordinates are clamped to zero. */
+	__m128i zero = _mm_setzero_si128();
+	__m128i x_lt = _mm_cmplt_epi32(x, zero);
+	__m128i y_lt = _mm_cmplt_epi32(y, zero);
+	x = _mm_andnot_si128(x_lt, x);
+	y = _mm_andnot_si128(y_lt, y);
+
+	/* 'next' neighbour = integer coordinate + 1. */
+	__m128i one = _mm_set1_epi32(1);
+	__m128i nx = _mm_add_epi32(one, _mm_srai_epi32(x, 8));
+	__m128i ny = _mm_add_epi32(one, _mm_srai_epi32(y, 8));
+
+	/* Check that 'next' pixels are in bounds */
+	_m_w = _mm_srai_epi32(_m_w, 8);
+	_m_h = _mm_srai_epi32(_m_h, 8);
+
+	x_gt = _mm_cmpgt_epi32(nx, _m_w);
+	y_gt = _mm_cmpgt_epi32(ny, _m_h);
+
+	nx = _mm_or_si128(_mm_andnot_si128(x_gt, nx), _mm_and_si128(_m_w,
x_gt));
+	ny = _mm_or_si128(_mm_andnot_si128(y_gt, ny), _mm_and_si128(_m_h,
y_gt));
+
+	int xfer[16] __attribute__ ((aligned (16)));
+
+	/* Pitch as pixels */
+	__m128i pitch = _mm_set1_epi32(in->rowstride >> 2 | ((in->rowstride >>
2)<<16));
+
+	/* Remove remainder */
+	__m128i tx = _mm_srai_epi32(x, 8);
+	__m128i ty = _mm_srai_epi32(y, 8);
+
+	/* Multiply y by pitch: widening 16x16->32 via mullo/mulhi; assumes
+	   rowstride/4 and y fit in 16 bits — TODO confirm image size limits. */
+	ty = _mm_packs_epi32(ty, ty);
+	__m128i ty_lo = _mm_mullo_epi16(ty, pitch);
+	__m128i ty_hi = _mm_mulhi_epi16(ty, pitch);
+	ty = _mm_unpacklo_epi16(ty_lo, ty_hi);
+
+	/* Same to next pixel */
+	ny = _mm_packs_epi32(ny, ny);
+	__m128i ny_lo = _mm_mullo_epi16(ny, pitch);
+	__m128i ny_hi = _mm_mulhi_epi16(ny, pitch);
+	ny = _mm_unpacklo_epi16(ny_lo, ny_hi);
+
+	/* Add pitch and x offset */
+	__m128i a_offset = _mm_add_epi32(tx, ty);
+	__m128i b_offset = _mm_add_epi32(nx, ty);
+	__m128i c_offset = _mm_add_epi32(tx, ny);
+	__m128i d_offset = _mm_add_epi32(nx, ny);
+
+	/* Multiply by pixelsize and add RGB offsets */
+	__m128i zero12 = _mm_load_si128((__m128i*)_zero12);
+	a_offset = _mm_add_epi32(zero12, _mm_slli_epi32(a_offset, 2));
+	b_offset = _mm_add_epi32(zero12, _mm_slli_epi32(b_offset, 2));
+	c_offset = _mm_add_epi32(zero12, _mm_slli_epi32(c_offset, 2));
+	d_offset = _mm_add_epi32(zero12, _mm_slli_epi32(d_offset, 2));
+
+	_mm_store_si128((__m128i*)xfer, a_offset);
+	_mm_store_si128((__m128i*)&xfer[4], b_offset);
+	_mm_store_si128((__m128i*)&xfer[8], c_offset);
+	_mm_store_si128((__m128i*)&xfer[12], d_offset);
+
+	gushort* pixels[12];
+
+	/* Loop unrolled, allows agressive instruction reordering */
+	/* Red, then G & B */
+	pixels[0] = in->pixels + xfer[0]; // a
+	pixels[1] = in->pixels + xfer[4]; // b
+	pixels[2] = in->pixels + xfer[8]; // c
+	pixels[3] = in->pixels + xfer[12]; // d
+
+	pixels[4] = in->pixels + xfer[1+0]; // a
+	pixels[5] = in->pixels + xfer[1+4]; // b
+	pixels[6] = in->pixels + xfer[1+8]; // c
+	pixels[7] = in->pixels + xfer[1+12]; // d
+
+	pixels[8] = in->pixels + xfer[2+0]; // a
+	pixels[9] = in->pixels + xfer[2+4]; // b
+	pixels[10] = in->pixels + xfer[2+8]; // c
+	pixels[11] = in->pixels + xfer[2+12]; // d
+
+	/* Calculate distances: low 8 fractional bits and their complements. */
+	__m128i twofiftyfive = _mm_set1_epi32(255);
+	__m128i diffx = _mm_and_si128(x, twofiftyfive);
+	__m128i diffy = _mm_and_si128(y, twofiftyfive);
+	__m128i inv_diffx = _mm_andnot_si128(diffx, twofiftyfive);
+	__m128i inv_diffy = _mm_andnot_si128(diffy, twofiftyfive);
+
+	/* Calculate weights: products are <= 255*255, halved to fit the
+	   16384-round / >>15 normalization below. */
+	__m128i aw = _mm_srai_epi32(_mm_mullo_epi16(inv_diffx, inv_diffy),1);
+	__m128i bw = _mm_srai_epi32(_mm_mullo_epi16(diffx, inv_diffy),1);
+	__m128i cw = _mm_srai_epi32(_mm_mullo_epi16(inv_diffx, diffy),1);
+	__m128i dw = _mm_srai_epi32(_mm_mullo_epi16(diffx, diffy),1);
+
+	_mm_store_si128((__m128i*)xfer, aw);
+	_mm_store_si128((__m128i*)&xfer[4], bw);
+	_mm_store_si128((__m128i*)&xfer[8], cw);
+	_mm_store_si128((__m128i*)&xfer[12], dw);
+
+	gushort** p = pixels;
+	/* Loop unrolled */
+	out[0] = (gushort) ((xfer[0] * *p[0] + xfer[4] * *p[1] + xfer[8] *
*p[2] + xfer[12] * *p[3] + 16384) >> 15 );
+	p+=4;
+	out[1] = (gushort) ((xfer[1] * *p[0] + xfer[1+4] * *p[1] + xfer[1+8] *
*p[2] + xfer[1+12] * *p[3] + 16384) >> 15 );
+	p+=4;
+	out[2] = (gushort) ((xfer[2] * *p[0] + xfer[2+4] * *p[1] + xfer[2+8] *
*p[2] + xfer[2+12] * *p[3] + 16384) >> 15 );
+}
+
+#else // NO AVX
+
+/* Fallback when this unit was built without AVX: tells lensfun.c the
+   AVX path is unavailable, so the stubs below are never called. */
+gboolean is_avx_compiled(void)
+{
+	return FALSE;
+}
+
+/* No-AVX stub: does nothing and leaves *out untouched. Safe only because
+   callers gate on is_avx_compiled() returning FALSE (see lensfun.c). */
+void
+rs_image16_bilinear_full_avx(RS_IMAGE16 *in, gushort *out, gfloat *pos,const
gint *current_xy, const gint* min_max_xy)
+{
+}
+
+/* No-AVX stub: does nothing and leaves *out untouched. Safe only because
+   callers gate on is_avx_compiled() returning FALSE (see lensfun.c). */
+void
+rs_image16_bilinear_nomeasure_avx(RS_IMAGE16 *in, gushort *out, gfloat *pos)
+{
+}
+
+#endif // defined (__AVX__)
Modified: trunk/plugins/lensfun/lensfun.c
===================================================================
--- trunk/plugins/lensfun/lensfun.c 2011-04-13 13:01:53 UTC (rev 3985)
+++ trunk/plugins/lensfun/lensfun.c 2011-04-13 13:29:47 UTC (rev 3986)
@@ -89,6 +89,9 @@
extern gboolean is_sse2_compiled(void);
extern void rs_image16_bilinear_full_sse2(RS_IMAGE16 *in, gushort *out, gfloat
*pos, const gint *current_xy, const gint* min_max_xy);
extern void rs_image16_bilinear_nomeasure_sse2(RS_IMAGE16 *in, gushort *out,
gfloat *pos);
+extern gboolean is_avx_compiled(void);
+extern void rs_image16_bilinear_full_avx(RS_IMAGE16 *in, gushort *out, gfloat
*pos, const gint *current_xy, const gint* min_max_xy);
+extern void rs_image16_bilinear_nomeasure_avx(RS_IMAGE16 *in, gushort *out,
gfloat *pos);
static RSFilterClass *rs_lensfun_parent_class = NULL;
G_MODULE_EXPORT void
@@ -320,13 +323,14 @@
}
gboolean sse2_available = !!(rs_detect_cpu_features() &
RS_CPU_FLAG_SSE2) && is_sse2_compiled();
+ gboolean avx_available = !!(rs_detect_cpu_features() & RS_CPU_FLAG_AVX)
&& is_avx_compiled();
gint min_max_xy[16] __attribute__ ((aligned (16)));
gint current_xy[2] __attribute__ ((aligned (16))) = {0,0};
if (t->input->pixelsize != 4)
- sse2_available = FALSE;
+ sse2_available = avx_available = FALSE;
- if (sse2_available)
+ if (sse2_available || avx_available)
{
for (i = 0; i < 8; i++)
{
@@ -356,7 +360,7 @@
target = GET_PIXEL(t->output, t->roi->x, y);
gfloat* l_pos = pos;
- if (sse2_available)
+ if (avx_available)
{
current_xy[1] = y;
for(x = 0; x < t->roi->width ; x++)
@@ -364,6 +368,24 @@
if (t->measure_minmax_coords)
{
current_xy[0] = x;
+
rs_image16_bilinear_full_avx(t->input, target, l_pos, current_xy, min_max_xy);
+ }
+ else
+ {
+
rs_image16_bilinear_nomeasure_avx(t->input, target, l_pos);
+ }
+ target += 4;
+ l_pos += 6;
+ }
+ }
+ else if (sse2_available)
+ {
+ current_xy[1] = y;
+ for(x = 0; x < t->roi->width ; x++)
+ {
+ if (t->measure_minmax_coords)
+ {
+ current_xy[0] = x;
rs_image16_bilinear_full_sse2(t->input, target, l_pos, current_xy, min_max_xy);
}
else
@@ -373,7 +395,8 @@
target += 4;
l_pos += 6;
}
- } else
+ }
+ else
{
for(x = 0; x < t->roi->width ; x++)
{
@@ -398,7 +421,7 @@
}
}
g_free(pos);
- if (sse2_available)
+ if (sse2_available || avx_available)
{
for (i = 1; i < 4; i++)
{
_______________________________________________
Rawstudio-commit mailing list
[email protected]
http://rawstudio.org/cgi-bin/mailman/listinfo/rawstudio-commit