Author: post
Date: 2011-04-13 15:01:53 +0200 (Wed, 13 Apr 2011)
New Revision: 3985
Added:
trunk/plugins/resample/resample-avx.c
trunk/plugins/resample/resample-sse4.c
Modified:
trunk/plugins/resample/Makefile.am
trunk/plugins/resample/resample.c
Log:
Add untested and un-enabled SSE4+AVX code.
Modified: trunk/plugins/resample/Makefile.am
===================================================================
--- trunk/plugins/resample/Makefile.am 2011-04-12 22:32:17 UTC (rev 3984)
+++ trunk/plugins/resample/Makefile.am 2011-04-13 13:01:53 UTC (rev 3985)
@@ -17,11 +17,11 @@
libdir = $(datadir)/rawstudio/plugins/
-resample_la_LIBADD = @PACKAGE_LIBS@ resample-sse2.lo resample-c.lo
+resample_la_LIBADD = @PACKAGE_LIBS@ resample-avx.lo resample-sse2.lo
resample-sse4.lo resample-c.lo
resample_la_LDFLAGS = -module -avoid-version
resample_la_SOURCES =
-EXTRA_DIST = resample-sse2.c resample.c
+EXTRA_DIST = resample-avx.c resample-sse2.c resample-sse4.c resample.c
resample-c.lo: resample.c
$(LTCOMPILE) -o resample-c.o -c
$(top_srcdir)/plugins/resample/resample.c
@@ -34,3 +34,18 @@
endif
$(LTCOMPILE) $(SSE_FLAG) -c
$(top_srcdir)/plugins/resample/resample-sse2.c
+resample-sse4.lo: resample-sse4.c
+if CAN_COMPILE_SSE4_1
+SSE4_FLAG=-msse4.1
+else
+SSE4_FLAG=
+endif
+ $(LTCOMPILE) $(SSE4_FLAG) -c
$(top_srcdir)/plugins/resample/resample-sse4.c
+
+resample-avx.lo: resample-avx.c
+if CAN_COMPILE_AVX
+AVX_FLAG=-mavx
+else
+AVX_FLAG=
+endif
+ $(LTCOMPILE) $(AVX_FLAG) -c
$(top_srcdir)/plugins/resample/resample-avx.c
Added: trunk/plugins/resample/resample-avx.c
===================================================================
--- trunk/plugins/resample/resample-avx.c (rev 0)
+++ trunk/plugins/resample/resample-avx.c 2011-04-13 13:01:53 UTC (rev
3985)
@@ -0,0 +1,418 @@
+/*
+ * * Copyright (C) 2006-2011 Anders Brander <[email protected]>,
+ * * Anders Kvist <[email protected]> and Klaus Post <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
+ */
+
+/* Plugin tmpl version 4 */
+
+#include <rawstudio.h>
+#include <math.h>
+
+
+/* Special Vertical AVX resampler, that has massive parallism.
+ * An important restriction is that "info->dest_offset_other", must result
+ * in a 16 byte aligned memory pointer.
+ */
+
+typedef struct {
+ RS_IMAGE16 *input; /* Input Image to Resampler */
+ RS_IMAGE16 *output; /* Output Image from Resampler
*/
+ guint old_size; /* Old dimension in the
direction of the resampler*/
+ guint new_size; /* New size in the direction of
the resampler */
+ guint dest_offset_other; /* Where in the unchanged direction
should we begin writing? */
+ guint dest_end_other; /* Where in the unchanged direction
should we stop writing? */
+ guint (*resample_support)(void);
+ gfloat (*resample_func)(gfloat);
+ GThread *threadid;
+ gboolean use_compatible; /* Use compatible resampler if
pixelsize != 4 */
+ gboolean use_fast; /* Use nearest neighbour resampler,
also compatible*/
+} ResampleInfo;
+
+extern void ResizeV(ResampleInfo *info);
+extern void ResizeV_fast(ResampleInfo *info);
+extern void ResizeV_SSE4(ResampleInfo *info);
+static inline guint clampbits(gint x, guint n) { guint32 _y_temp; if(
(_y_temp=x>>n) ) x = ~_y_temp >> (32-n); return x;}
+
+static guint
+lanczos_taps(void)
+{
+ return 3;
+}
+
+static gfloat
+sinc(gfloat value)
+{
+ if (value != 0.0f)
+ {
+ value *= M_PI;
+ return sinf(value) / value;
+ }
+ else
+ return 1.0f;
+}
+
+static gfloat
+lanczos_weight(gfloat value)
+{
+ value = fabsf(value);
+ if (value < lanczos_taps())
+ {
+ return (sinc(value) * sinc(value / lanczos_taps()));
+ }
+ else
+ return 0.0f;
+}
+
+const static gint FPScale = 16384; /* fixed point scaler */
+const static gint FPScaleShift = 14; /* fixed point scaler */
+
+
+#if defined (__x86_64__) && defined(__AVX__)
+#include <smmintrin.h>
+
+void
+ResizeV_AVX(ResampleInfo *info)
+{
+ const RS_IMAGE16 *input = info->input;
+ const RS_IMAGE16 *output = info->output;
+ const guint old_size = info->old_size;
+ const guint new_size = info->new_size;
+ const guint start_x = info->dest_offset_other * input->pixelsize;
+ const guint end_x = info->dest_end_other * input->pixelsize;
+
+ gfloat pos_step = ((gfloat) old_size) / ((gfloat)new_size);
+ gfloat filter_step = MIN(1.0f / pos_step, 1.0f);
+ gfloat filter_support = (gfloat) lanczos_taps() / filter_step;
+ gint fir_filter_size = (gint) (ceil(filter_support*2));
+
+ if (old_size <= fir_filter_size)
+ return ResizeV_fast(info);
+
+ gint *weights = g_new(gint, new_size * fir_filter_size);
+ gint *offsets = g_new(gint, new_size);
+
+ gfloat pos = 0.0;
+
+ gint i,j,k;
+
+ for (i=0; i<new_size; ++i)
+ {
+ gint end_pos = (gint) (pos + filter_support);
+ if (end_pos > old_size-1)
+ end_pos = old_size-1;
+
+ gint start_pos = end_pos - fir_filter_size + 1;
+
+ if (start_pos < 0)
+ start_pos = 0;
+
+ offsets[i] = start_pos;
+
+ /* The following code ensures that the coefficients add to
exactly FPScale */
+ gfloat total = 0.0;
+
+ /* Ensure that we have a valid position */
+ gfloat ok_pos = MAX(0.0,MIN(old_size-1,pos));
+
+ for (j=0; j<fir_filter_size; ++j)
+ {
+ /* Accumulate all coefficients */
+ total += lanczos_weight((start_pos+j - ok_pos) *
filter_step);
+ }
+
+ g_assert(total > 0.0f);
+
+ gfloat total2 = 0.0;
+
+ for (k=0; k<fir_filter_size; ++k)
+ {
+ gfloat total3 = total2 + lanczos_weight((start_pos+k -
ok_pos) * filter_step) / total;
+ weights[i*fir_filter_size+k] = ((gint)
(total3*FPScale+0.5) - (gint) (total2*FPScale+0.5)) & 0xffff;
+
+ total2 = total3;
+ }
+ pos += pos_step;
+ }
+
+ guint y,x;
+ gint *wg = weights;
+
+ /* 24 pixels = 48 bytes/loop */
+ gint end_x_sse = (end_x/24)*24;
+
+ /* 0.5 pixel value is lost to rounding times fir_filter_size,
compensate */
+ gint add_round_sub = fir_filter_size * (FPScale >> 1);
+
+ __m128i add_32 = _mm_set_epi32(add_round_sub, add_round_sub,
add_round_sub, add_round_sub);
+
+ for (y = 0; y < new_size ; y++)
+ {
+ gushort *in = GET_PIXEL(input, start_x / input->pixelsize,
offsets[y]);
+ gushort *out = GET_PIXEL(output, 0, y);
+ __m128i zero;
+ zero = _mm_setzero_si128();
+ for (x = start_x; x <= (end_x_sse-24); x+=24)
+ {
+ /* Accumulators, set to 0 */
+ __m128i acc1, acc2, acc3, acc1_h, acc2_h, acc3_h;
+ acc1 = acc2 = acc3 = acc1_h = acc2_h = acc3_h = zero;
+
+ for (i = 0; i < fir_filter_size; i++) {
+ /* Load weight */
+ __m128i w =
_mm_set_epi32(wg[i],wg[i],wg[i],wg[i]);
+
+ /* Load source and prefetch next line */
+ int pos = i * input->rowstride;
+ __m128i src1i, src2i, src3i;
+ __m128i* in_sse = (__m128i*)&in[pos];
+ src1i = _mm_load_si128(in_sse);
+ src2i = _mm_load_si128(in_sse+1);
+ src3i = _mm_load_si128(in_sse+2);
+ _mm_prefetch(&in[pos + 32], _MM_HINT_NTA);
+
+ /* Unpack to dwords */
+ __m128i src1i_h, src2i_h, src3i_h;
+ src1i_h = _mm_unpackhi_epi16(src1i, zero);
+ src2i_h = _mm_unpackhi_epi16(src2i, zero);
+ src3i_h = _mm_unpackhi_epi16(src3i, zero);
+ src1i = _mm_unpacklo_epi16(src1i, zero);
+ src2i = _mm_unpacklo_epi16(src2i, zero);
+ src3i = _mm_unpacklo_epi16(src3i, zero);
+
+ /* Multiply my weight */
+ src1i_h = _mm_mullo_epi32(src1i_h, w);
+ src2i_h = _mm_mullo_epi32(src2i_h, w);
+ src3i_h = _mm_mullo_epi32(src3i_h, w);
+ src1i = _mm_mullo_epi32(src1i, w);
+ src2i = _mm_mullo_epi32(src2i, w);
+ src3i = _mm_mullo_epi32(src3i, w);
+
+ /* Accumulate */
+ acc1_h = _mm_add_epi32(acc1_h, src1i_h);
+ acc2_h = _mm_add_epi32(acc2_h, src2i_h);
+ acc3_h = _mm_add_epi32(acc3_h, src3i_h);
+ acc1 = _mm_add_epi32(acc1, src1i);
+ acc2 = _mm_add_epi32(acc2, src2i);
+ acc3 = _mm_add_epi32(acc3, src3i);
+ }
+
+ /* Add rounder and subtract 32768 */
+ acc1_h = _mm_add_epi32(acc1_h, add_32);
+ acc2_h = _mm_add_epi32(acc2_h, add_32);
+ acc3_h = _mm_add_epi32(acc3_h, add_32);
+ acc1 = _mm_add_epi32(acc1, add_32);
+ acc2 = _mm_add_epi32(acc2, add_32);
+ acc3 = _mm_add_epi32(acc3, add_32);
+
+ /* Shift down */
+ acc1_h = _mm_srai_epi32(acc1_h, FPScaleShift);
+ acc2_h = _mm_srai_epi32(acc2_h, FPScaleShift);
+ acc3_h = _mm_srai_epi32(acc3_h, FPScaleShift);
+ acc1 = _mm_srai_epi32(acc1, FPScaleShift);
+ acc2 = _mm_srai_epi32(acc2, FPScaleShift);
+ acc3 = _mm_srai_epi32(acc3, FPScaleShift);
+
+ /* Pack to signed shorts */
+ acc1 = _mm_packus_epi32(acc1, acc1_h);
+ acc2 = _mm_packus_epi32(acc2, acc2_h);
+ acc3 = _mm_packus_epi32(acc3, acc3_h);
+
+ /* Store result */
+ __m128i* sse_dst = (__m128i*)&out[x];
+ _mm_stream_si128(sse_dst, acc1);
+ _mm_stream_si128(sse_dst + 1, acc2);
+ _mm_stream_si128(sse_dst + 2, acc3);
+ in += 24;
+ }
+
+ /* Process remaining pixels */
+ for (; x < end_x; x++)
+ {
+ gint acc1 = 0;
+ for (i = 0; i < fir_filter_size; i++)
+ {
+ acc1 += in[i * input->rowstride] *
*(gshort*)&wg[i];
+ }
+ out[x] = clampbits((acc1 + (FPScale / 2)) >>
FPScaleShift, 16);
+ in++;
+ }
+ wg += fir_filter_size;
+ }
+ _mm_sfence();
+ g_free(weights);
+ g_free(offsets);
+}
+
+#elif defined (__AVX__)
+#include <smmintrin.h>
+
+void
+ResizeV_AVX(ResampleInfo *info)
+{
+ const RS_IMAGE16 *input = info->input;
+ const RS_IMAGE16 *output = info->output;
+ const guint old_size = info->old_size;
+ const guint new_size = info->new_size;
+ const guint start_x = info->dest_offset_other * input->pixelsize;
+ const guint end_x = info->dest_end_other * input->pixelsize;
+
+ gfloat pos_step = ((gfloat) old_size) / ((gfloat)new_size);
+ gfloat filter_step = MIN(1.0f / pos_step, 1.0f);
+ gfloat filter_support = (gfloat) lanczos_taps() / filter_step;
+ gint fir_filter_size = (gint) (ceil(filter_support*2));
+
+ if (old_size <= fir_filter_size)
+ return ResizeV_fast(info);
+
+ gint *weights = g_new(gint, new_size * fir_filter_size);
+ gint *offsets = g_new(gint, new_size);
+
+ gfloat pos = 0.0;
+
+ gint i,j,k;
+
+ for (i=0; i<new_size; ++i)
+ {
+ gint end_pos = (gint) (pos + filter_support);
+ if (end_pos > old_size-1)
+ end_pos = old_size-1;
+
+ gint start_pos = end_pos - fir_filter_size + 1;
+
+ if (start_pos < 0)
+ start_pos = 0;
+
+ offsets[i] = start_pos;
+
+ /* The following code ensures that the coefficients add to
exactly FPScale */
+ gfloat total = 0.0;
+
+ /* Ensure that we have a valid position */
+ gfloat ok_pos = MAX(0.0f,MIN(old_size-1,pos));
+
+ for (j=0; j<fir_filter_size; ++j)
+ {
+ /* Accumulate all coefficients */
+ total += lanczos_weight((start_pos+j - ok_pos) *
filter_step);
+ }
+
+ g_assert(total > 0.0f);
+
+ gfloat total2 = 0.0;
+
+ for (k=0; k<fir_filter_size; ++k)
+ {
+ gfloat total3 = total2 + lanczos_weight((start_pos+k -
ok_pos) * filter_step) / total;
+ weights[i*fir_filter_size+k] = ((gint)
(total3*FPScale+0.5) - (gint) (total2*FPScale+0.5)) & 0xffff;
+
+ total2 = total3;
+ }
+ pos += pos_step;
+ }
+
+ guint y,x;
+ gint *wg = weights;
+
+ /* 8 pixels = 16 bytes/loop */
+ gint end_x_sse = (end_x/8)*8;
+
+ /* Rounder after accumulation */
+ gint add_round_sub = (FPScale >> 1);
+ /* 0.5 pixel value is lost to rounding times fir_filter_size,
compensate */
+ add_round_sub += fir_filter_size * (FPScale >> 1);
+
+ for (y = 0; y < new_size ; y++)
+ {
+ gushort *in = GET_PIXEL(input, start_x / input->pixelsize,
offsets[y]);
+ gushort *out = GET_PIXEL(output, 0, y);
+ __m128i zero;
+ zero = _mm_setzero_si128();
+ for (x = start_x; x <= (end_x_sse-8); x+=8)
+ {
+ /* Accumulators, set to 0 */
+ __m128i acc1, acc1_h;
+ acc1 = acc1_h = zero;
+
+ for (i = 0; i < fir_filter_size; i++) {
+ /* Load weight */
+ __m128i w =
_mm_set_epi32(wg[i],wg[i],wg[i],wg[i]);
+ /* Load source */
+ __m128i src1i;
+ __m128i* in_sse = (__m128i*)&in[i *
input->rowstride];
+ src1i = _mm_load_si128(in_sse);
+ _mm_prefetch(in_sse+4, _MM_HINT_NTA);
+
+ /* Unpack to dwords */
+ __m128i src1i_h;
+ src1i_h = _mm_unpackhi_epi16(src1i, zero);
+ src1i = _mm_unpacklo_epi16(src1i, zero);
+
+ /* Multiply my weight */
+ src1i_h = _mm_mullo_epi32(src1i_h, w);
+ src1i = _mm_mullo_epi32(src1i, w);
+
+ /* Accumulate */
+ acc1_h = _mm_add_epi32(acc1_h, src1i_h);
+ acc1 = _mm_add_epi32(acc1, src1i);
+ }
+ __m128i add_32 = _mm_set_epi32(add_round_sub,
add_round_sub, add_round_sub, add_round_sub);
+
+ /* Add rounder and subtract 32768 */
+ acc1_h = _mm_add_epi32(acc1_h, add_32);
+ acc1 = _mm_add_epi32(acc1, add_32);
+
+ /* Shift down */
+ acc1_h = _mm_srai_epi32(acc1_h, FPScaleShift);
+ acc1 = _mm_srai_epi32(acc1, FPScaleShift);
+
+ /* Pack to signed shorts */
+ acc1 = _mm_packus_epi32(acc1, acc1_h);
+
+ /* Store result */
+ __m128i* sse_dst = (__m128i*)&out[x];
+ _mm_store_si128(sse_dst, acc1);
+ in += 8;
+ }
+
+ /* Process remaining pixels */
+ for (; x < end_x; x++)
+ {
+ gint acc1 = 0;
+ for (i = 0; i < fir_filter_size; i++)
+ {
+ acc1 += in[i * input->rowstride] *
*(gshort*)&wg[i];
+ }
+ out[x] = clampbits((acc1 + (FPScale / 2)) >>
FPScaleShift, 16);
+ in++;
+ }
+ wg += fir_filter_size;
+ }
+ g_free(weights);
+ g_free(offsets);
+}
+
+#else // not defined (__AVX__)
+
+void
+ResizeV_AVX(ResampleInfo *info)
+{
+ ResizeV_SSE4(info);
+}
+
+#endif // not defined (__x86_64__) and not defined (__AVX__)
+
+
Added: trunk/plugins/resample/resample-sse4.c
===================================================================
--- trunk/plugins/resample/resample-sse4.c (rev 0)
+++ trunk/plugins/resample/resample-sse4.c 2011-04-13 13:01:53 UTC (rev
3985)
@@ -0,0 +1,418 @@
+/*
+ * * Copyright (C) 2006-2011 Anders Brander <[email protected]>,
+ * * Anders Kvist <[email protected]> and Klaus Post <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
+ */
+
+/* Plugin tmpl version 4 */
+
+#include <rawstudio.h>
+#include <math.h>
+
+
+/* Special Vertical SSE4 resampler, that has massive parallism.
+ * An important restriction is that "info->dest_offset_other", must result
+ * in a 16 byte aligned memory pointer.
+ */
+
+typedef struct {
+ RS_IMAGE16 *input; /* Input Image to Resampler */
+ RS_IMAGE16 *output; /* Output Image from Resampler
*/
+ guint old_size; /* Old dimension in the
direction of the resampler*/
+ guint new_size; /* New size in the direction of
the resampler */
+ guint dest_offset_other; /* Where in the unchanged direction
should we begin writing? */
+ guint dest_end_other; /* Where in the unchanged direction
should we stop writing? */
+ guint (*resample_support)(void);
+ gfloat (*resample_func)(gfloat);
+ GThread *threadid;
+ gboolean use_compatible; /* Use compatible resampler if
pixelsize != 4 */
+ gboolean use_fast; /* Use nearest neighbour resampler,
also compatible*/
+} ResampleInfo;
+
+extern void ResizeV_SSE2(ResampleInfo *info);
+extern void ResizeV(ResampleInfo *info);
+extern void ResizeV_fast(ResampleInfo *info);
+static inline guint clampbits(gint x, guint n) { guint32 _y_temp; if(
(_y_temp=x>>n) ) x = ~_y_temp >> (32-n); return x;}
+
+static guint
+lanczos_taps(void)
+{
+ return 3;
+}
+
+static gfloat
+sinc(gfloat value)
+{
+ if (value != 0.0f)
+ {
+ value *= M_PI;
+ return sinf(value) / value;
+ }
+ else
+ return 1.0f;
+}
+
+static gfloat
+lanczos_weight(gfloat value)
+{
+ value = fabsf(value);
+ if (value < lanczos_taps())
+ {
+ return (sinc(value) * sinc(value / lanczos_taps()));
+ }
+ else
+ return 0.0f;
+}
+
+const static gint FPScale = 16384; /* fixed point scaler */
+const static gint FPScaleShift = 14; /* fixed point scaler */
+
+
+#if defined (__x86_64__) && defined(__SSE4_1__)
+#include <smmintrin.h>
+
+void
+ResizeV_SSE4(ResampleInfo *info)
+{
+ const RS_IMAGE16 *input = info->input;
+ const RS_IMAGE16 *output = info->output;
+ const guint old_size = info->old_size;
+ const guint new_size = info->new_size;
+ const guint start_x = info->dest_offset_other * input->pixelsize;
+ const guint end_x = info->dest_end_other * input->pixelsize;
+
+ gfloat pos_step = ((gfloat) old_size) / ((gfloat)new_size);
+ gfloat filter_step = MIN(1.0f / pos_step, 1.0f);
+ gfloat filter_support = (gfloat) lanczos_taps() / filter_step;
+ gint fir_filter_size = (gint) (ceil(filter_support*2));
+
+ if (old_size <= fir_filter_size)
+ return ResizeV_fast(info);
+
+ gint *weights = g_new(gint, new_size * fir_filter_size);
+ gint *offsets = g_new(gint, new_size);
+
+ gfloat pos = 0.0;
+
+ gint i,j,k;
+
+ for (i=0; i<new_size; ++i)
+ {
+ gint end_pos = (gint) (pos + filter_support);
+ if (end_pos > old_size-1)
+ end_pos = old_size-1;
+
+ gint start_pos = end_pos - fir_filter_size + 1;
+
+ if (start_pos < 0)
+ start_pos = 0;
+
+ offsets[i] = start_pos;
+
+ /* The following code ensures that the coefficients add to
exactly FPScale */
+ gfloat total = 0.0;
+
+ /* Ensure that we have a valid position */
+ gfloat ok_pos = MAX(0.0,MIN(old_size-1,pos));
+
+ for (j=0; j<fir_filter_size; ++j)
+ {
+ /* Accumulate all coefficients */
+ total += lanczos_weight((start_pos+j - ok_pos) *
filter_step);
+ }
+
+ g_assert(total > 0.0f);
+
+ gfloat total2 = 0.0;
+
+ for (k=0; k<fir_filter_size; ++k)
+ {
+ gfloat total3 = total2 + lanczos_weight((start_pos+k -
ok_pos) * filter_step) / total;
+ weights[i*fir_filter_size+k] = ((gint)
(total3*FPScale+0.5) - (gint) (total2*FPScale+0.5)) & 0xffff;
+
+ total2 = total3;
+ }
+ pos += pos_step;
+ }
+
+ guint y,x;
+ gint *wg = weights;
+
+ /* 24 pixels = 48 bytes/loop */
+ gint end_x_sse = (end_x/24)*24;
+
+ /* 0.5 pixel value is lost to rounding times fir_filter_size,
compensate */
+ gint add_round_sub = fir_filter_size * (FPScale >> 1);
+
+ __m128i add_32 = _mm_set_epi32(add_round_sub, add_round_sub,
add_round_sub, add_round_sub);
+
+ for (y = 0; y < new_size ; y++)
+ {
+ gushort *in = GET_PIXEL(input, start_x / input->pixelsize,
offsets[y]);
+ gushort *out = GET_PIXEL(output, 0, y);
+ __m128i zero;
+ zero = _mm_setzero_si128();
+ for (x = start_x; x <= (end_x_sse-24); x+=24)
+ {
+ /* Accumulators, set to 0 */
+ __m128i acc1, acc2, acc3, acc1_h, acc2_h, acc3_h;
+ acc1 = acc2 = acc3 = acc1_h = acc2_h = acc3_h = zero;
+
+ for (i = 0; i < fir_filter_size; i++) {
+ /* Load weight */
+ __m128i w =
_mm_set_epi32(wg[i],wg[i],wg[i],wg[i]);
+
+ /* Load source and prefetch next line */
+ int pos = i * input->rowstride;
+ __m128i src1i, src2i, src3i;
+ __m128i* in_sse = (__m128i*)&in[pos];
+ src1i = _mm_load_si128(in_sse);
+ src2i = _mm_load_si128(in_sse+1);
+ src3i = _mm_load_si128(in_sse+2);
+ _mm_prefetch(&in[pos + 32], _MM_HINT_NTA);
+
+ /* Unpack to dwords */
+ __m128i src1i_h, src2i_h, src3i_h;
+ src1i_h = _mm_unpackhi_epi16(src1i, zero);
+ src2i_h = _mm_unpackhi_epi16(src2i, zero);
+ src3i_h = _mm_unpackhi_epi16(src3i, zero);
+ src1i = _mm_unpacklo_epi16(src1i, zero);
+ src2i = _mm_unpacklo_epi16(src2i, zero);
+ src3i = _mm_unpacklo_epi16(src3i, zero);
+
+ /* Multiply my weight */
+ src1i_h = _mm_mullo_epi32(src1i_h, w);
+ src2i_h = _mm_mullo_epi32(src2i_h, w);
+ src3i_h = _mm_mullo_epi32(src3i_h, w);
+ src1i = _mm_mullo_epi32(src1i, w);
+ src2i = _mm_mullo_epi32(src2i, w);
+ src3i = _mm_mullo_epi32(src3i, w);
+
+ /* Accumulate */
+ acc1_h = _mm_add_epi32(acc1_h, src1i_h);
+ acc2_h = _mm_add_epi32(acc2_h, src2i_h);
+ acc3_h = _mm_add_epi32(acc3_h, src3i_h);
+ acc1 = _mm_add_epi32(acc1, src1i);
+ acc2 = _mm_add_epi32(acc2, src2i);
+ acc3 = _mm_add_epi32(acc3, src3i);
+ }
+
+ /* Add rounder and subtract 32768 */
+ acc1_h = _mm_add_epi32(acc1_h, add_32);
+ acc2_h = _mm_add_epi32(acc2_h, add_32);
+ acc3_h = _mm_add_epi32(acc3_h, add_32);
+ acc1 = _mm_add_epi32(acc1, add_32);
+ acc2 = _mm_add_epi32(acc2, add_32);
+ acc3 = _mm_add_epi32(acc3, add_32);
+
+ /* Shift down */
+ acc1_h = _mm_srai_epi32(acc1_h, FPScaleShift);
+ acc2_h = _mm_srai_epi32(acc2_h, FPScaleShift);
+ acc3_h = _mm_srai_epi32(acc3_h, FPScaleShift);
+ acc1 = _mm_srai_epi32(acc1, FPScaleShift);
+ acc2 = _mm_srai_epi32(acc2, FPScaleShift);
+ acc3 = _mm_srai_epi32(acc3, FPScaleShift);
+
+ /* Pack to signed shorts */
+ acc1 = _mm_packus_epi32(acc1, acc1_h);
+ acc2 = _mm_packus_epi32(acc2, acc2_h);
+ acc3 = _mm_packus_epi32(acc3, acc3_h);
+
+ /* Store result */
+ __m128i* sse_dst = (__m128i*)&out[x];
+ _mm_stream_si128(sse_dst, acc1);
+ _mm_stream_si128(sse_dst + 1, acc2);
+ _mm_stream_si128(sse_dst + 2, acc3);
+ in += 24;
+ }
+
+ /* Process remaining pixels */
+ for (; x < end_x; x++)
+ {
+ gint acc1 = 0;
+ for (i = 0; i < fir_filter_size; i++)
+ {
+ acc1 += in[i * input->rowstride] *
*(gshort*)&wg[i];
+ }
+ out[x] = clampbits((acc1 + (FPScale / 2)) >>
FPScaleShift, 16);
+ in++;
+ }
+ wg += fir_filter_size;
+ }
+ _mm_sfence();
+ g_free(weights);
+ g_free(offsets);
+}
+
+#elif defined (__SSE4_1__)
+#include <smmintrin.h>
+
+void
+ResizeV_SSE4(ResampleInfo *info)
+{
+ const RS_IMAGE16 *input = info->input;
+ const RS_IMAGE16 *output = info->output;
+ const guint old_size = info->old_size;
+ const guint new_size = info->new_size;
+ const guint start_x = info->dest_offset_other * input->pixelsize;
+ const guint end_x = info->dest_end_other * input->pixelsize;
+
+ gfloat pos_step = ((gfloat) old_size) / ((gfloat)new_size);
+ gfloat filter_step = MIN(1.0f / pos_step, 1.0f);
+ gfloat filter_support = (gfloat) lanczos_taps() / filter_step;
+ gint fir_filter_size = (gint) (ceil(filter_support*2));
+
+ if (old_size <= fir_filter_size)
+ return ResizeV_fast(info);
+
+ gint *weights = g_new(gint, new_size * fir_filter_size);
+ gint *offsets = g_new(gint, new_size);
+
+ gfloat pos = 0.0;
+
+ gint i,j,k;
+
+ for (i=0; i<new_size; ++i)
+ {
+ gint end_pos = (gint) (pos + filter_support);
+ if (end_pos > old_size-1)
+ end_pos = old_size-1;
+
+ gint start_pos = end_pos - fir_filter_size + 1;
+
+ if (start_pos < 0)
+ start_pos = 0;
+
+ offsets[i] = start_pos;
+
+ /* The following code ensures that the coefficients add to
exactly FPScale */
+ gfloat total = 0.0;
+
+ /* Ensure that we have a valid position */
+ gfloat ok_pos = MAX(0.0f,MIN(old_size-1,pos));
+
+ for (j=0; j<fir_filter_size; ++j)
+ {
+ /* Accumulate all coefficients */
+ total += lanczos_weight((start_pos+j - ok_pos) *
filter_step);
+ }
+
+ g_assert(total > 0.0f);
+
+ gfloat total2 = 0.0;
+
+ for (k=0; k<fir_filter_size; ++k)
+ {
+ gfloat total3 = total2 + lanczos_weight((start_pos+k -
ok_pos) * filter_step) / total;
+ weights[i*fir_filter_size+k] = ((gint)
(total3*FPScale+0.5) - (gint) (total2*FPScale+0.5)) & 0xffff;
+
+ total2 = total3;
+ }
+ pos += pos_step;
+ }
+
+ guint y,x;
+ gint *wg = weights;
+
+ /* 8 pixels = 16 bytes/loop */
+ gint end_x_sse = (end_x/8)*8;
+
+ /* Rounder after accumulation */
+ gint add_round_sub = (FPScale >> 1);
+ /* 0.5 pixel value is lost to rounding times fir_filter_size,
compensate */
+ add_round_sub += fir_filter_size * (FPScale >> 1);
+
+ for (y = 0; y < new_size ; y++)
+ {
+ gushort *in = GET_PIXEL(input, start_x / input->pixelsize,
offsets[y]);
+ gushort *out = GET_PIXEL(output, 0, y);
+ __m128i zero;
+ zero = _mm_setzero_si128();
+ for (x = start_x; x <= (end_x_sse-8); x+=8)
+ {
+ /* Accumulators, set to 0 */
+ __m128i acc1, acc1_h;
+ acc1 = acc1_h = zero;
+
+ for (i = 0; i < fir_filter_size; i++) {
+ /* Load weight */
+ __m128i w =
_mm_set_epi32(wg[i],wg[i],wg[i],wg[i]);
+ /* Load source */
+ __m128i src1i;
+ __m128i* in_sse = (__m128i*)&in[i *
input->rowstride];
+ src1i = _mm_load_si128(in_sse);
+ _mm_prefetch(in_sse+4, _MM_HINT_NTA);
+
+ /* Unpack to dwords */
+ __m128i src1i_h;
+ src1i_h = _mm_unpackhi_epi16(src1i, zero);
+ src1i = _mm_unpacklo_epi16(src1i, zero);
+
+ /* Multiply my weight */
+ src1i_h = _mm_mullo_epi32(src1i_h, w);
+ src1i = _mm_mullo_epi32(src1i, w);
+
+ /* Accumulate */
+ acc1_h = _mm_add_epi32(acc1_h, src1i_h);
+ acc1 = _mm_add_epi32(acc1, src1i);
+ }
+ __m128i add_32 = _mm_set_epi32(add_round_sub,
add_round_sub, add_round_sub, add_round_sub);
+
+ /* Add rounder and subtract 32768 */
+ acc1_h = _mm_add_epi32(acc1_h, add_32);
+ acc1 = _mm_add_epi32(acc1, add_32);
+
+ /* Shift down */
+ acc1_h = _mm_srai_epi32(acc1_h, FPScaleShift);
+ acc1 = _mm_srai_epi32(acc1, FPScaleShift);
+
+ /* Pack to signed shorts */
+ acc1 = _mm_packus_epi32(acc1, acc1_h);
+
+ /* Store result */
+ __m128i* sse_dst = (__m128i*)&out[x];
+ _mm_store_si128(sse_dst, acc1);
+ in += 8;
+ }
+
+ /* Process remaining pixels */
+ for (; x < end_x; x++)
+ {
+ gint acc1 = 0;
+ for (i = 0; i < fir_filter_size; i++)
+ {
+ acc1 += in[i * input->rowstride] *
*(gshort*)&wg[i];
+ }
+ out[x] = clampbits((acc1 + (FPScale / 2)) >>
FPScaleShift, 16);
+ in++;
+ }
+ wg += fir_filter_size;
+ }
+ g_free(weights);
+ g_free(offsets);
+}
+
+#else // not defined (__SSE4__)
+
+void
+ResizeV_SSE4(ResampleInfo *info)
+{
+ ResizeV_SSE2(info);
+}
+
+#endif // not defined (__x86_64__) and not defined (__SSE4__)
+
+
Modified: trunk/plugins/resample/resample.c
===================================================================
--- trunk/plugins/resample/resample.c 2011-04-12 22:32:17 UTC (rev 3984)
+++ trunk/plugins/resample/resample.c 2011-04-13 13:01:53 UTC (rev 3985)
@@ -83,6 +83,8 @@
static void ResizeH(ResampleInfo *info);
void ResizeV(ResampleInfo *info);
extern void ResizeV_SSE2(ResampleInfo *info);
+extern void ResizeV_SSE4(ResampleInfo *info);
+extern void ResizeV_AVX(ResampleInfo *info);
static void ResizeH_compatible(ResampleInfo *info);
static void ResizeV_compatible(ResampleInfo *info);
static void ResizeH_fast(ResampleInfo *info);
@@ -317,10 +319,20 @@
if (t->input->h != t->output->h)
{
gboolean sse2_available = !!(rs_detect_cpu_features() &
RS_CPU_FLAG_SSE2);
+#if 0 // FIXME: Test and enable
+ gboolean sse4_available = !!(rs_detect_cpu_features() &
RS_CPU_FLAG_SSE4_1);
+ gboolean avx_available = !!(rs_detect_cpu_features() &
RS_CPU_FLAG_AVX);
+#endif
if (t->use_fast)
ResizeV_fast(t);
else if (t->use_compatible)
ResizeV_compatible(t);
+#if 0 // FIXME: Test and enable
+ else if (avx_available)
+ ResizeV_AVX(t);
+ else if (sse4_available)
+ ResizeV_SSE4(t);
+#endif
else if (sse2_available)
ResizeV_SSE2(t);
else
_______________________________________________
Rawstudio-commit mailing list
[email protected]
http://rawstudio.org/cgi-bin/mailman/listinfo/rawstudio-commit