Author: post
Date: 2009-12-31 00:41:09 +0100 (Thu, 31 Dec 2009)
New Revision: 2931
Added:
branches/rawstudio-ng-color/plugins/resample/resample-sse2.c
Modified:
branches/rawstudio-ng-color/plugins/resample/Makefile.am
branches/rawstudio-ng-color/plugins/resample/resample.c
Log:
Resampler: Move SSE2 assembler to separate file.
Modified: branches/rawstudio-ng-color/plugins/resample/Makefile.am
===================================================================
--- branches/rawstudio-ng-color/plugins/resample/Makefile.am 2009-12-30
23:14:23 UTC (rev 2930)
+++ branches/rawstudio-ng-color/plugins/resample/Makefile.am 2009-12-30
23:41:09 UTC (rev 2931)
@@ -1,15 +1,8 @@
plugindir = $(libdir)/rawstudio/plugins
-if CAN_COMPILE_SSE2
-SSE_FLAG=-msse2
-else
-SSE_FLAG=
-endif
-
AM_CFLAGS =\
-Wall\
- -O4\
- $(SSE_FLAG)
+ -O4
AM_CXXFLAGS = $(AM_CFLAGS)
@@ -23,6 +16,16 @@
libdir = $(datadir)/rawstudio/plugins/
-resample_la_LIBADD = @PACKAGE_LIBS@
+resample_la_LIBADD = @PACKAGE_LIBS@ resample-sse2.lo
resample_la_LDFLAGS = -module -avoid-version
-resample_la_SOURCES = resample.c
+resample.lo: resample.c
+ $(LTCOMPILE) -DEXIT_CODE=0 -c resample.c
+
+resample-sse2.lo: resample-sse2.c
+if CAN_COMPILE_SSE2
+SSE_FLAG=-msse2
+else
+SSE_FLAG=
+endif
+ $(LTCOMPILE) $(SSE_FLAG) -DEXIT_CODE=1 -c resample-sse2.c
+
Added: branches/rawstudio-ng-color/plugins/resample/resample-sse2.c
===================================================================
--- branches/rawstudio-ng-color/plugins/resample/resample-sse2.c
(rev 0)
+++ branches/rawstudio-ng-color/plugins/resample/resample-sse2.c
2009-12-30 23:41:09 UTC (rev 2931)
@@ -0,0 +1,440 @@
+/*
+ * Copyright (C) 2006-2009 Anders Brander <[email protected]> and
+ * Anders Kvist <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
+ */
+
+/* Plugin tmpl version 4 */
+
+#include <rawstudio.h>
+#include <math.h>
+
+
+/* Special Vertical SSE2 resampler, that has massive parallelism.
+ * An important restriction is that "info->dest_offset_other", must result
+ * in a 16 byte aligned memory pointer.
+ */
+
+typedef struct {
+ RS_IMAGE16 *input; /* Input Image to Resampler */
+ RS_IMAGE16 *output; /* Output Image from Resampler
*/
+ guint old_size; /* Old dimension in the
direction of the resampler*/
+ guint new_size; /* New size in the direction of
the resampler */
+ guint dest_offset_other; /* Where in the unchanged direction
should we begin writing? */
+ guint dest_end_other; /* Where in the unchanged direction
should we stop writing? */
+ guint (*resample_support)();
+ gdouble (*resample_func)(gdouble);
+ GThread *threadid;
+ gboolean use_compatible; /* Use compatible resampler if
pixelsize != 4 */
+ gboolean use_fast; /* Use nearest neighbour resampler,
also compatible*/
+} ResampleInfo;
+
+extern void ResizeV(ResampleInfo *info);
+extern void ResizeV_fast(ResampleInfo *info);
+static inline guint clampbits(gint x, guint n) { guint32 _y_temp; if(
(_y_temp=x>>n) ) x = ~_y_temp >> (32-n); return x;}
+
+static guint
+lanczos_taps()
+{
+ return 3;
+}
+
+static gdouble
+sinc(gdouble value)
+{
+ if (value != 0.0)
+ {
+ value *= M_PI;
+ return sin(value) / value;
+ }
+ else
+ return 1.0;
+}
+
+static gdouble
+lanczos_weight(gdouble value)
+{
+ value = fabs(value);
+ if (value < lanczos_taps())
+ {
+ return (sinc(value) * sinc(value / lanczos_taps()));
+ }
+ else
+ return 0.0;
+}
+
+const static gint FPScale = 16384; /* fixed point scaler */
+const static gint FPScaleShift = 14; /* fixed point scaler */
+
+
+#if defined (__x86_64__)
+#include <emmintrin.h>
+
+void
+ResizeV_SSE2(ResampleInfo *info)
+{
+ const RS_IMAGE16 *input = info->input;
+ const RS_IMAGE16 *output = info->output;
+ const guint old_size = info->old_size;
+ const guint new_size = info->new_size;
+ const guint start_x = info->dest_offset_other * input->pixelsize;
+ const guint end_x = info->dest_end_other * input->pixelsize;
+
+ gdouble pos_step = ((gdouble) old_size) / ((gdouble)new_size);
+ gdouble filter_step = MIN(1.0 / pos_step, 1.0);
+ gdouble filter_support = (gdouble) lanczos_taps() / filter_step;
+ gint fir_filter_size = (gint) (ceil(filter_support*2));
+
+ if (old_size <= fir_filter_size)
+ return ResizeV_fast(info);
+
+ gint *weights = g_new(gint, new_size * fir_filter_size);
+ gint *offsets = g_new(gint, new_size);
+
+ gdouble pos = 0.0;
+
+ gint i,j,k;
+
+ for (i=0; i<new_size; ++i)
+ {
+ gint end_pos = (gint) (pos + filter_support);
+ if (end_pos > old_size-1)
+ end_pos = old_size-1;
+
+ gint start_pos = end_pos - fir_filter_size + 1;
+
+ if (start_pos < 0)
+ start_pos = 0;
+
+ offsets[i] = start_pos;
+
+ /* The following code ensures that the coefficients add to
exactly FPScale */
+ gdouble total = 0.0;
+
+ /* Ensure that we have a valid position */
+ gdouble ok_pos = MAX(0.0,MIN(old_size-1,pos));
+
+ for (j=0; j<fir_filter_size; ++j)
+ {
+ /* Accumulate all coefficients */
+ total += lanczos_weight((start_pos+j - ok_pos) *
filter_step);
+ }
+
+ g_assert(total > 0.0f);
+
+ gdouble total2 = 0.0;
+
+ for (k=0; k<fir_filter_size; ++k)
+ {
+ gdouble total3 = total2 + lanczos_weight((start_pos+k -
ok_pos) * filter_step) / total;
+ weights[i*fir_filter_size+k] = ((gint)
(total3*FPScale+0.5) - (gint) (total2*FPScale+0.5)) & 0xffff;
+
+ total2 = total3;
+ }
+ pos += pos_step;
+ }
+
+ guint y,x;
+ gint *wg = weights;
+
+ /* 24 pixels = 48 bytes/loop */
+ gint end_x_sse = (end_x/24)*24;
+
+ /* Subtract 32768 as it would appear after shift */
+ gint add_round_sub = -(32768 << (FPScaleShift-1));
+ /* 0.5 pixel value is lost to rounding times fir_filter_size,
compensate */
+ add_round_sub += fir_filter_size * (FPScale >> 2);
+
+ __m128i add_32 = _mm_set_epi32(add_round_sub, add_round_sub,
add_round_sub, add_round_sub);
+ __m128i signxor = _mm_set_epi32(0x80008000, 0x80008000, 0x80008000,
0x80008000);
+
+ for (y = 0; y < new_size ; y++)
+ {
+ gushort *in = GET_PIXEL(input, start_x / input->pixelsize,
offsets[y]);
+ gushort *out = GET_PIXEL(output, 0, y);
+ __m128i zero;
+ zero = _mm_setzero_si128();
+ for (x = start_x; x <= (end_x_sse-24); x+=24)
+ {
+ /* Accumulators, set to 0 */
+ __m128i acc1, acc2, acc3, acc1_h, acc2_h, acc3_h;
+ acc1 = acc2 = acc3 = acc1_h = acc2_h = acc3_h = zero;
+
+ for (i = 0; i < fir_filter_size; i++) {
+ /* Load weight */
+ __m128i w =
_mm_set_epi32(wg[i],wg[i],wg[i],wg[i]);
+
+ /* Load source and prefetch next line */
+ int pos = i * input->rowstride;
+ __m128i src1i, src2i, src3i;
+ __m128i* in_sse = (__m128i*)&in[pos];
+ src1i = _mm_load_si128(in_sse);
+ src2i = _mm_load_si128(in_sse+1);
+ src3i = _mm_load_si128(in_sse+2);
+ _mm_prefetch(&in[pos + 32], _MM_HINT_T0);
+
+ /* Unpack to dwords */
+ __m128i src1i_h, src2i_h, src3i_h;
+ src1i_h = _mm_unpackhi_epi16(src1i, zero);
+ src2i_h = _mm_unpackhi_epi16(src2i, zero);
+ src3i_h = _mm_unpackhi_epi16(src3i, zero);
+ src1i = _mm_unpacklo_epi16(src1i, zero);
+ src2i = _mm_unpacklo_epi16(src2i, zero);
+ src3i = _mm_unpacklo_epi16(src3i, zero);
+
+ /*Shift down to 15 bit for multiplication */
+ src1i_h = _mm_srli_epi16(src1i_h, 1);
+ src2i_h = _mm_srli_epi16(src2i_h, 1);
+ src3i_h = _mm_srli_epi16(src3i_h, 1);
+ src1i = _mm_srli_epi16(src1i, 1);
+ src2i = _mm_srli_epi16(src2i, 1);
+ src3i = _mm_srli_epi16(src3i, 1);
+
+                        /* Multiply by weight */
+ src1i_h = _mm_madd_epi16(src1i_h, w);
+ src2i_h = _mm_madd_epi16(src2i_h, w);
+ src3i_h = _mm_madd_epi16(src3i_h, w);
+ src1i = _mm_madd_epi16(src1i, w);
+ src2i = _mm_madd_epi16(src2i, w);
+ src3i = _mm_madd_epi16(src3i, w);
+
+ /* Accumulate */
+ acc1_h = _mm_add_epi32(acc1_h, src1i_h);
+ acc2_h = _mm_add_epi32(acc2_h, src2i_h);
+ acc3_h = _mm_add_epi32(acc3_h, src3i_h);
+ acc1 = _mm_add_epi32(acc1, src1i);
+ acc2 = _mm_add_epi32(acc2, src2i);
+ acc3 = _mm_add_epi32(acc3, src3i);
+ }
+
+ /* Add rounder and subtract 32768 */
+ acc1_h = _mm_add_epi32(acc1_h, add_32);
+ acc2_h = _mm_add_epi32(acc2_h, add_32);
+ acc3_h = _mm_add_epi32(acc3_h, add_32);
+ acc1 = _mm_add_epi32(acc1, add_32);
+ acc2 = _mm_add_epi32(acc2, add_32);
+ acc3 = _mm_add_epi32(acc3, add_32);
+
+ /* Shift down */
+ acc1_h = _mm_srai_epi32(acc1_h, FPScaleShift - 1 );
+ acc2_h = _mm_srai_epi32(acc2_h, FPScaleShift - 1);
+ acc3_h = _mm_srai_epi32(acc3_h, FPScaleShift - 1);
+ acc1 = _mm_srai_epi32(acc1, FPScaleShift - 1);
+ acc2 = _mm_srai_epi32(acc2, FPScaleShift - 1);
+ acc3 = _mm_srai_epi32(acc3, FPScaleShift - 1);
+
+ /* Pack to signed shorts */
+ acc1 = _mm_packs_epi32(acc1, acc1_h);
+ acc2 = _mm_packs_epi32(acc2, acc2_h);
+ acc3 = _mm_packs_epi32(acc3, acc3_h);
+
+                        /* Shift sign to unsigned shorts */
+ acc1 = _mm_xor_si128(acc1, signxor);
+ acc2 = _mm_xor_si128(acc2, signxor);
+ acc3 = _mm_xor_si128(acc3, signxor);
+
+ /* Store result */
+ __m128i* sse_dst = (__m128i*)&out[x];
+ _mm_store_si128(sse_dst, acc1);
+ _mm_store_si128(sse_dst + 1, acc2);
+ _mm_store_si128(sse_dst + 2, acc3);
+ in += 24;
+ }
+
+ /* Process remaining pixels */
+ for (; x < end_x; x++)
+ {
+ gint acc1 = 0;
+ for (i = 0; i < fir_filter_size; i++)
+ {
+ acc1 += in[i * input->rowstride] *
*(gshort*)&wg[i];
+ }
+ out[x] = clampbits((acc1 + (FPScale / 2)) >>
FPScaleShift, 16);
+ in++;
+ }
+ wg += fir_filter_size;
+ }
+ g_free(weights);
+ g_free(offsets);
+}
+
+#elif defined (__SSE2__)
+#include <emmintrin.h>
+
+void
+ResizeV_SSE2(ResampleInfo *info)
+{
+ const RS_IMAGE16 *input = info->input;
+ const RS_IMAGE16 *output = info->output;
+ const guint old_size = info->old_size;
+ const guint new_size = info->new_size;
+ const guint start_x = info->dest_offset_other * input->pixelsize;
+ const guint end_x = info->dest_end_other * input->pixelsize;
+
+ gdouble pos_step = ((gdouble) old_size) / ((gdouble)new_size);
+ gdouble filter_step = MIN(1.0 / pos_step, 1.0);
+ gdouble filter_support = (gdouble) lanczos_taps() / filter_step;
+ gint fir_filter_size = (gint) (ceil(filter_support*2));
+
+ if (old_size <= fir_filter_size)
+ return ResizeV_fast(info);
+
+ gint *weights = g_new(gint, new_size * fir_filter_size);
+ gint *offsets = g_new(gint, new_size);
+
+ gdouble pos = 0.0;
+
+ gint i,j,k;
+
+ for (i=0; i<new_size; ++i)
+ {
+ gint end_pos = (gint) (pos + filter_support);
+ if (end_pos > old_size-1)
+ end_pos = old_size-1;
+
+ gint start_pos = end_pos - fir_filter_size + 1;
+
+ if (start_pos < 0)
+ start_pos = 0;
+
+ offsets[i] = start_pos;
+
+ /* The following code ensures that the coefficients add to
exactly FPScale */
+ gdouble total = 0.0;
+
+ /* Ensure that we have a valid position */
+ gdouble ok_pos = MAX(0.0,MIN(old_size-1,pos));
+
+ for (j=0; j<fir_filter_size; ++j)
+ {
+ /* Accumulate all coefficients */
+ total += lanczos_weight((start_pos+j - ok_pos) *
filter_step);
+ }
+
+ g_assert(total > 0.0f);
+
+ gdouble total2 = 0.0;
+
+ for (k=0; k<fir_filter_size; ++k)
+ {
+ gdouble total3 = total2 + lanczos_weight((start_pos+k -
ok_pos) * filter_step) / total;
+ weights[i*fir_filter_size+k] = ((gint)
(total3*FPScale+0.5) - (gint) (total2*FPScale+0.5)) & 0xffff;
+
+ total2 = total3;
+ }
+ pos += pos_step;
+ }
+
+ guint y,x;
+ gint *wg = weights;
+
+ /* 8 pixels = 16 bytes/loop */
+ gint end_x_sse = (end_x/8)*8;
+
+ /* Rounder after accumulation, half because input is scaled down */
+ gint add_round_sub = (FPScale >> 2);
+ /* Subtract 32768 as it would appear after shift */
+ add_round_sub -= (32768 << (FPScaleShift-1));
+ /* 0.5 pixel value is lost to rounding times fir_filter_size,
compensate */
+ add_round_sub += fir_filter_size * (FPScale >> 2);
+
+ for (y = 0; y < new_size ; y++)
+ {
+ gushort *in = GET_PIXEL(input, start_x / input->pixelsize,
offsets[y]);
+ gushort *out = GET_PIXEL(output, 0, y);
+ __m128i zero;
+ zero = _mm_setzero_si128();
+ for (x = start_x; x <= (end_x_sse-8); x+=8)
+ {
+ /* Accumulators, set to 0 */
+ __m128i acc1, acc1_h;
+ acc1 = acc1_h = zero;
+
+ for (i = 0; i < fir_filter_size; i++) {
+ /* Load weight */
+ __m128i w =
_mm_set_epi32(wg[i],wg[i],wg[i],wg[i]);
+ /* Load source */
+ __m128i src1i;
+ __m128i* in_sse = (__m128i*)&in[i *
input->rowstride];
+ src1i = _mm_load_si128(in_sse);
+ /* Unpack to dwords */
+ __m128i src1i_h;
+ src1i_h = _mm_unpackhi_epi16(src1i, zero);
+ src1i = _mm_unpacklo_epi16(src1i, zero);
+
+ /*Shift down to 15 bit for multiplication */
+ src1i_h = _mm_srli_epi16(src1i_h, 1);
+ src1i = _mm_srli_epi16(src1i, 1);
+
+                                /* Multiply by weight */
+ src1i_h = _mm_madd_epi16(src1i_h, w);
+ src1i = _mm_madd_epi16(src1i, w);
+
+ /* Accumulate */
+ acc1_h = _mm_add_epi32(acc1_h, src1i_h);
+ acc1 = _mm_add_epi32(acc1, src1i);
+ }
+ __m128i add_32 = _mm_set_epi32(add_round_sub,
add_round_sub, add_round_sub, add_round_sub);
+ __m128i signxor = _mm_set_epi32(0x80008000, 0x80008000,
0x80008000, 0x80008000);
+
+ /* Add rounder and subtract 32768 */
+ acc1_h = _mm_add_epi32(acc1_h, add_32);
+ acc1 = _mm_add_epi32(acc1, add_32);
+
+ /* Shift down */
+ acc1_h = _mm_srai_epi32(acc1_h, FPScaleShift - 1 );
+ acc1 = _mm_srai_epi32(acc1, FPScaleShift - 1);
+
+ /* Pack to signed shorts */
+ acc1 = _mm_packs_epi32(acc1, acc1_h);
+
+                        /* Shift sign to unsigned shorts */
+ acc1 = _mm_xor_si128(acc1, signxor);
+
+ /* Store result */
+ __m128i* sse_dst = (__m128i*)&out[x];
+ _mm_store_si128(sse_dst, acc1);
+ in += 8;
+ }
+
+ /* Process remaining pixels */
+ for (; x < end_x; x++)
+ {
+ gint acc1 = 0;
+ for (i = 0; i < fir_filter_size; i++)
+ {
+ acc1 += in[i * input->rowstride] *
*(gshort*)&wg[i];
+ }
+ out[x] = clampbits((acc1 + (FPScale / 2)) >>
FPScaleShift, 16);
+ in++;
+ }
+ wg += fir_filter_size;
+ }
+ g_free(weights);
+ g_free(offsets);
+}
+
+#else // not defined (__SSE2__)
+
+static void
+ResizeV_SSE2(ResampleInfo *info)
+{
+ ResizeV(info);
+}
+
+#endif // not defined (__x86_64__) and not defined (__SSE2__)
+
+
Modified: branches/rawstudio-ng-color/plugins/resample/resample.c
===================================================================
--- branches/rawstudio-ng-color/plugins/resample/resample.c 2009-12-30
23:14:23 UTC (rev 2930)
+++ branches/rawstudio-ng-color/plugins/resample/resample.c 2009-12-30
23:41:09 UTC (rev 2931)
@@ -21,9 +21,6 @@
#include <rawstudio.h>
#include <math.h>
-#if defined (__SSE2__)
-#include <emmintrin.h>
-#endif /* __SSE2__ */
#define RS_TYPE_RESAMPLE (rs_resample_type)
@@ -81,15 +78,15 @@
static gint get_width(RSFilter *filter);
static gint get_height(RSFilter *filter);
static void ResizeH(ResampleInfo *info);
-static void ResizeV(ResampleInfo *info);
-static void ResizeV_SSE2(ResampleInfo *info);
+void ResizeV(ResampleInfo *info);
+extern void ResizeV_SSE2(ResampleInfo *info);
static void ResizeH_compatible(ResampleInfo *info);
static void ResizeV_compatible(ResampleInfo *info);
static void ResizeH_fast(ResampleInfo *info);
-static void ResizeV_fast(ResampleInfo *info);
+void ResizeV_fast(ResampleInfo *info);
static RSFilterClass *rs_resample_parent_class = NULL;
-inline guint clampbits(gint x, guint n) { guint32 _y_temp; if( (_y_temp=x>>n)
) x = ~_y_temp >> (32-n); return x;}
+static inline guint clampbits(gint x, guint n) { guint32 _y_temp; if(
(_y_temp=x>>n) ) x = ~_y_temp >> (32-n); return x;}
G_MODULE_EXPORT void
rs_plugin_load(RSPlugin *plugin)
@@ -561,7 +558,7 @@
}
-static void
+void
ResizeV(ResampleInfo *info)
{
const RS_IMAGE16 *input = info->input;
@@ -658,368 +655,7 @@
}
-/* Special Vertical SSE2 resampler, that has massive parallism.
- * An important restriction is that "info->dest_offset_other", must result
- * in a 16 byte aligned memory pointer.
- */
-
-#if defined (__x86_64__)
-#if defined (__SSE2__)
-
static void
-ResizeV_SSE2(ResampleInfo *info)
-{
- const RS_IMAGE16 *input = info->input;
- const RS_IMAGE16 *output = info->output;
- const guint old_size = info->old_size;
- const guint new_size = info->new_size;
- const guint start_x = info->dest_offset_other * input->pixelsize;
- const guint end_x = info->dest_end_other * input->pixelsize;
-
- gdouble pos_step = ((gdouble) old_size) / ((gdouble)new_size);
- gdouble filter_step = MIN(1.0 / pos_step, 1.0);
- gdouble filter_support = (gdouble) lanczos_taps() / filter_step;
- gint fir_filter_size = (gint) (ceil(filter_support*2));
-
- if (old_size <= fir_filter_size)
- return ResizeV_fast(info);
-
- gint *weights = g_new(gint, new_size * fir_filter_size);
- gint *offsets = g_new(gint, new_size);
-
- gdouble pos = 0.0;
-
- gint i,j,k;
-
- for (i=0; i<new_size; ++i)
- {
- gint end_pos = (gint) (pos + filter_support);
- if (end_pos > old_size-1)
- end_pos = old_size-1;
-
- gint start_pos = end_pos - fir_filter_size + 1;
-
- if (start_pos < 0)
- start_pos = 0;
-
- offsets[i] = start_pos;
-
- /* The following code ensures that the coefficients add to
exactly FPScale */
- gdouble total = 0.0;
-
- /* Ensure that we have a valid position */
- gdouble ok_pos = MAX(0.0,MIN(old_size-1,pos));
-
- for (j=0; j<fir_filter_size; ++j)
- {
- /* Accumulate all coefficients */
- total += lanczos_weight((start_pos+j - ok_pos) *
filter_step);
- }
-
- g_assert(total > 0.0f);
-
- gdouble total2 = 0.0;
-
- for (k=0; k<fir_filter_size; ++k)
- {
- gdouble total3 = total2 + lanczos_weight((start_pos+k -
ok_pos) * filter_step) / total;
- weights[i*fir_filter_size+k] = ((gint)
(total3*FPScale+0.5) - (gint) (total2*FPScale+0.5)) & 0xffff;
-
- total2 = total3;
- }
- pos += pos_step;
- }
-
- guint y,x;
- gint *wg = weights;
-
- /* 24 pixels = 48 bytes/loop */
- gint end_x_sse = (end_x/24)*24;
-
- /* Subtract 32768 as it would appear after shift */
- gint add_round_sub = -(32768 << (FPScaleShift-1));
- /* 0.5 pixel value is lost to rounding times fir_filter_size,
compensate */
- add_round_sub += fir_filter_size * (FPScale >> 2);
-
- __m128i add_32 = _mm_set_epi32(add_round_sub, add_round_sub,
add_round_sub, add_round_sub);
- __m128i signxor = _mm_set_epi32(0x80008000, 0x80008000, 0x80008000,
0x80008000);
-
- for (y = 0; y < new_size ; y++)
- {
- gushort *in = GET_PIXEL(input, start_x / input->pixelsize,
offsets[y]);
- gushort *out = GET_PIXEL(output, 0, y);
- __m128i zero;
- zero = _mm_xor_si128(zero, zero);
- for (x = start_x; x <= (end_x_sse-24); x+=24)
- {
- /* Accumulators, set to 0 */
- __m128i acc1, acc2, acc3, acc1_h, acc2_h, acc3_h;
- acc1 = acc2 = acc3 = acc1_h = acc2_h = acc3_h = zero;
-
- for (i = 0; i < fir_filter_size; i++) {
- /* Load weight */
- __m128i w =
_mm_set_epi32(wg[i],wg[i],wg[i],wg[i]);
-
- /* Load source and prefetch next line */
- int pos = i * input->rowstride;
- __m128i src1i, src2i, src3i;
- __m128i* in_sse = (__m128i*)&in[pos];
- src1i = _mm_load_si128(in_sse);
- src2i = _mm_load_si128(in_sse+1);
- src3i = _mm_load_si128(in_sse+2);
- _mm_prefetch(&in[pos + 32], _MM_HINT_T0);
-
- /* Unpack to dwords */
- __m128i src1i_h, src2i_h, src3i_h;
- src1i_h = _mm_unpackhi_epi16(src1i, zero);
- src2i_h = _mm_unpackhi_epi16(src2i, zero);
- src3i_h = _mm_unpackhi_epi16(src3i, zero);
- src1i = _mm_unpacklo_epi16(src1i, zero);
- src2i = _mm_unpacklo_epi16(src2i, zero);
- src3i = _mm_unpacklo_epi16(src3i, zero);
-
- /*Shift down to 15 bit for multiplication */
- src1i_h = _mm_srli_epi16(src1i_h, 1);
- src2i_h = _mm_srli_epi16(src2i_h, 1);
- src3i_h = _mm_srli_epi16(src3i_h, 1);
- src1i = _mm_srli_epi16(src1i, 1);
- src2i = _mm_srli_epi16(src2i, 1);
- src3i = _mm_srli_epi16(src3i, 1);
-
- /* Multiply my weight */
- src1i_h = _mm_madd_epi16(src1i_h, w);
- src2i_h = _mm_madd_epi16(src2i_h, w);
- src3i_h = _mm_madd_epi16(src3i_h, w);
- src1i = _mm_madd_epi16(src1i, w);
- src2i = _mm_madd_epi16(src2i, w);
- src3i = _mm_madd_epi16(src3i, w);
-
- /* Accumulate */
- acc1_h = _mm_add_epi32(acc1_h, src1i_h);
- acc2_h = _mm_add_epi32(acc2_h, src2i_h);
- acc3_h = _mm_add_epi32(acc3_h, src3i_h);
- acc1 = _mm_add_epi32(acc1, src1i);
- acc2 = _mm_add_epi32(acc2, src2i);
- acc3 = _mm_add_epi32(acc3, src3i);
- }
-
- /* Add rounder and subtract 32768 */
- acc1_h = _mm_add_epi32(acc1_h, add_32);
- acc2_h = _mm_add_epi32(acc2_h, add_32);
- acc3_h = _mm_add_epi32(acc3_h, add_32);
- acc1 = _mm_add_epi32(acc1, add_32);
- acc2 = _mm_add_epi32(acc2, add_32);
- acc3 = _mm_add_epi32(acc3, add_32);
-
- /* Shift down */
- acc1_h = _mm_srai_epi32(acc1_h, FPScaleShift - 1 );
- acc2_h = _mm_srai_epi32(acc2_h, FPScaleShift - 1);
- acc3_h = _mm_srai_epi32(acc3_h, FPScaleShift - 1);
- acc1 = _mm_srai_epi32(acc1, FPScaleShift - 1);
- acc2 = _mm_srai_epi32(acc2, FPScaleShift - 1);
- acc3 = _mm_srai_epi32(acc3, FPScaleShift - 1);
-
- /* Pack to signed shorts */
- acc1 = _mm_packs_epi32(acc1, acc1_h);
- acc2 = _mm_packs_epi32(acc2, acc2_h);
- acc3 = _mm_packs_epi32(acc3, acc3_h);
-
- /* Shift sign to unsinged shorts */
- acc1 = _mm_xor_si128(acc1, signxor);
- acc2 = _mm_xor_si128(acc2, signxor);
- acc3 = _mm_xor_si128(acc3, signxor);
-
- /* Store result */
- __m128i* sse_dst = (__m128i*)&out[x];
- _mm_store_si128(sse_dst, acc1);
- _mm_store_si128(sse_dst + 1, acc2);
- _mm_store_si128(sse_dst + 2, acc3);
- in += 24;
- }
-
- /* Process remaining pixels */
- for (; x < end_x; x++)
- {
- gint acc1 = 0;
- for (i = 0; i < fir_filter_size; i++)
- {
- acc1 += in[i * input->rowstride] *
*(gshort*)&wg[i];
- }
- out[x] = clampbits((acc1 + (FPScale / 2)) >>
FPScaleShift, 16);
- in++;
- }
- wg += fir_filter_size;
- }
- g_free(weights);
- g_free(offsets);
-}
-#endif /* defined (__SSE2__) */
-#elif defined (__SSE2__)
-
-static void
-ResizeV_SSE2(ResampleInfo *info)
-{
- const RS_IMAGE16 *input = info->input;
- const RS_IMAGE16 *output = info->output;
- const guint old_size = info->old_size;
- const guint new_size = info->new_size;
- const guint start_x = info->dest_offset_other * input->pixelsize;
- const guint end_x = info->dest_end_other * input->pixelsize;
-
- gdouble pos_step = ((gdouble) old_size) / ((gdouble)new_size);
- gdouble filter_step = MIN(1.0 / pos_step, 1.0);
- gdouble filter_support = (gdouble) lanczos_taps() / filter_step;
- gint fir_filter_size = (gint) (ceil(filter_support*2));
-
- if (old_size <= fir_filter_size)
- return ResizeV_fast(info);
-
- gint *weights = g_new(gint, new_size * fir_filter_size);
- gint *offsets = g_new(gint, new_size);
-
- gdouble pos = 0.0;
-
- gint i,j,k;
-
- for (i=0; i<new_size; ++i)
- {
- gint end_pos = (gint) (pos + filter_support);
- if (end_pos > old_size-1)
- end_pos = old_size-1;
-
- gint start_pos = end_pos - fir_filter_size + 1;
-
- if (start_pos < 0)
- start_pos = 0;
-
- offsets[i] = start_pos;
-
- /* The following code ensures that the coefficients add to
exactly FPScale */
- gdouble total = 0.0;
-
- /* Ensure that we have a valid position */
- gdouble ok_pos = MAX(0.0,MIN(old_size-1,pos));
-
- for (j=0; j<fir_filter_size; ++j)
- {
- /* Accumulate all coefficients */
- total += lanczos_weight((start_pos+j - ok_pos) *
filter_step);
- }
-
- g_assert(total > 0.0f);
-
- gdouble total2 = 0.0;
-
- for (k=0; k<fir_filter_size; ++k)
- {
- gdouble total3 = total2 + lanczos_weight((start_pos+k -
ok_pos) * filter_step) / total;
- weights[i*fir_filter_size+k] = ((gint)
(total3*FPScale+0.5) - (gint) (total2*FPScale+0.5)) & 0xffff;
-
- total2 = total3;
- }
- pos += pos_step;
- }
-
- guint y,x;
- gint *wg = weights;
-
- /* 8 pixels = 16 bytes/loop */
- gint end_x_sse = (end_x/8)*8;
-
- /* Rounder after accumulation, half because input is scaled down */
- gint add_round_sub = (FPScale >> 2);
- /* Subtract 32768 as it would appear after shift */
- add_round_sub -= (32768 << (FPScaleShift-1));
- /* 0.5 pixel value is lost to rounding times fir_filter_size,
compensate */
- add_round_sub += fir_filter_size * (FPScale >> 2);
-
- for (y = 0; y < new_size ; y++)
- {
- gushort *in = GET_PIXEL(input, start_x / input->pixelsize,
offsets[y]);
- gushort *out = GET_PIXEL(output, 0, y);
- __m128i zero;
- zero = _mm_xor_si128(zero, zero);
- for (x = start_x; x <= (end_x_sse-8); x+=8)
- {
- /* Accumulators, set to 0 */
- __m128i acc1, acc1_h;
- acc1 = acc1_h = zero;
-
- for (i = 0; i < fir_filter_size; i++) {
- /* Load weight */
- __m128i w =
_mm_set_epi32(wg[i],wg[i],wg[i],wg[i]);
- /* Load source */
- __m128i src1i;
- __m128i* in_sse = (__m128i*)&in[i *
input->rowstride];
- src1i = _mm_load_si128(in_sse);
- /* Unpack to dwords */
- __m128i src1i_h;
- src1i_h = _mm_unpackhi_epi16(src1i, zero);
- src1i = _mm_unpacklo_epi16(src1i, zero);
-
- /*Shift down to 15 bit for multiplication */
- src1i_h = _mm_srli_epi16(src1i_h, 1);
- src1i = _mm_srli_epi16(src1i, 1);
-
- /* Multiply my weight */
- src1i_h = _mm_madd_epi16(src1i_h, w);
- src1i = _mm_madd_epi16(src1i, w);
-
- /* Accumulate */
- acc1_h = _mm_add_epi32(acc1_h, src1i_h);
- acc1 = _mm_add_epi32(acc1, src1i);
- }
- __m128i add_32 = _mm_set_epi32(add_round_sub,
add_round_sub, add_round_sub, add_round_sub);
- __m128i signxor = _mm_set_epi32(0x80008000, 0x80008000,
0x80008000, 0x80008000);
-
- /* Add rounder and subtract 32768 */
- acc1_h = _mm_add_epi32(acc1_h, add_32);
- acc1 = _mm_add_epi32(acc1, add_32);
-
- /* Shift down */
- acc1_h = _mm_srai_epi32(acc1_h, FPScaleShift - 1 );
- acc1 = _mm_srai_epi32(acc1, FPScaleShift - 1);
-
- /* Pack to signed shorts */
- acc1 = _mm_packs_epi32(acc1, acc1_h);
-
- /* Shift sign to unsinged shorts */
- acc1 = _mm_xor_si128(acc1, signxor);
-
- /* Store result */
- __m128i* sse_dst = (__m128i*)&out[x];
- _mm_store_si128(sse_dst, acc1);
- in += 8;
- }
-
- /* Process remaining pixels */
- for (; x < end_x; x++)
- {
- gint acc1 = 0;
- for (i = 0; i < fir_filter_size; i++)
- {
- acc1 += in[i * input->rowstride] *
*(gshort*)&wg[i];
- }
- out[x] = clampbits((acc1 + (FPScale / 2)) >>
FPScaleShift, 16);
- in++;
- }
- wg += fir_filter_size;
- }
- g_free(weights);
- g_free(offsets);
-}
-
-#else // not defined (__SSE2__)
-
-static void
-ResizeV_SSE2(ResampleInfo *info)
-{
- ResizeV(info);
-}
-
-#endif // not defined (__x86_64__) and not defined (__SSE2__)
-
-static void
ResizeH_compatible(ResampleInfo *info)
{
const RS_IMAGE16 *input = info->input;
@@ -1207,7 +843,7 @@
g_free(offsets);
}
-static void
+void
ResizeV_fast(ResampleInfo *info)
{
const RS_IMAGE16 *input = info->input;
_______________________________________________
Rawstudio-commit mailing list
[email protected]
http://rawstudio.org/cgi-bin/mailman/listinfo/rawstudio-commit