Author: post
Date: 2009-12-31 00:41:09 +0100 (Thu, 31 Dec 2009)
New Revision: 2931

Added:
   branches/rawstudio-ng-color/plugins/resample/resample-sse2.c
Modified:
   branches/rawstudio-ng-color/plugins/resample/Makefile.am
   branches/rawstudio-ng-color/plugins/resample/resample.c
Log:
Resampler: Move SSE2 assembler to separate file.

Modified: branches/rawstudio-ng-color/plugins/resample/Makefile.am
===================================================================
--- branches/rawstudio-ng-color/plugins/resample/Makefile.am    2009-12-30 
23:14:23 UTC (rev 2930)
+++ branches/rawstudio-ng-color/plugins/resample/Makefile.am    2009-12-30 
23:41:09 UTC (rev 2931)
@@ -1,15 +1,8 @@
 plugindir = $(libdir)/rawstudio/plugins
 
-if CAN_COMPILE_SSE2
-SSE_FLAG=-msse2
-else
-SSE_FLAG=
-endif
-
 AM_CFLAGS =\
        -Wall\
-       -O4\
-       $(SSE_FLAG)
+       -O4
 
 AM_CXXFLAGS = $(AM_CFLAGS)
 
@@ -23,6 +16,16 @@
 
 libdir = $(datadir)/rawstudio/plugins/
 
-resample_la_LIBADD = @PACKAGE_LIBS@
+resample_la_LIBADD = @PACKAGE_LIBS@ resample-sse2.lo
 resample_la_LDFLAGS = -module -avoid-version
-resample_la_SOURCES = resample.c
+resample.lo: resample.c
+       $(LTCOMPILE) -DEXIT_CODE=0 -c resample.c
+
+resample-sse2.lo: resample-sse2.c
+if CAN_COMPILE_SSE2
+SSE_FLAG=-msse2
+else
+SSE_FLAG=
+endif
+       $(LTCOMPILE) $(SSE_FLAG) -DEXIT_CODE=1 -c resample-sse2.c
+

Added: branches/rawstudio-ng-color/plugins/resample/resample-sse2.c
===================================================================
--- branches/rawstudio-ng-color/plugins/resample/resample-sse2.c                
                (rev 0)
+++ branches/rawstudio-ng-color/plugins/resample/resample-sse2.c        
2009-12-30 23:41:09 UTC (rev 2931)
@@ -0,0 +1,440 @@
+/*
+ * Copyright (C) 2006-2009 Anders Brander <[email protected]> and
+ * Anders Kvist <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, 
USA.
+ */
+
+/* Plugin tmpl version 4 */
+
+#include <rawstudio.h>
+#include <math.h>
+
+
+/* Special vertical SSE2 resampler with massive parallelism.
+ * An important restriction is that "info->dest_offset_other" must result
+ * in a 16-byte aligned memory pointer.
+ */
+
+typedef struct { /* Job description consumed by the ResizeV* functions declared/defined in this file */
+       RS_IMAGE16 *input;                      /* Input image to the resampler */
+       RS_IMAGE16 *output;                     /* Output image from the resampler */
+       guint old_size;                         /* Old size in the direction of the resampler */
+       guint new_size;                         /* New size in the direction of the resampler */
+       guint dest_offset_other;        /* Where in the unchanged direction should we begin writing? */
+       guint dest_end_other;           /* Where in the unchanged direction should we stop writing? */
+       guint (*resample_support)();
+       gdouble (*resample_func)(gdouble);
+       GThread *threadid;
+       gboolean use_compatible;        /* Use compatible resampler if pixelsize != 4 */
+       gboolean use_fast;              /* Use nearest neighbour resampler, also compatible */
+} ResampleInfo; /* NOTE(review): presumably has to mirror the ResampleInfo used by resample.c field for field - confirm before changing */
+
+extern void ResizeV(ResampleInfo *info);
+extern void ResizeV_fast(ResampleInfo *info);
+/* Saturate x to the range of an unsigned n-bit integer: values below zero
+ * become 0 and values of 2^n or more become 2^n - 1.
+ */
+static inline guint
+clampbits(gint x, guint n)
+{
+       /* Anything left above bit n means x is out of range (or negative) */
+       const guint32 overflow = x >> n;
+
+       if (overflow == 0)
+               return x;
+
+       /* Negative x has all high bits set, so the inverse shifts down to 0;
+        * too large x has them clear, so the inverse shifts down to all ones. */
+       return ~overflow >> (32 - n);
+}
+
+/* Number of lobes on each side of the Lanczos kernel used by this file. */
+static guint
+lanczos_taps()
+{
+       const guint taps = 3;
+
+       return taps;
+}
+
+/* Normalized sinc: sin(pi * value) / (pi * value), defined as 1.0 at 0. */
+static gdouble
+sinc(gdouble value)
+{
+       /* Avoid the 0/0 at the origin */
+       if (value == 0.0)
+               return 1.0;
+
+       value *= M_PI;
+       return sin(value) / value;
+}
+
+/* Lanczos kernel: sinc(x) * sinc(x / taps) for |x| < taps, otherwise 0.0. */
+static gdouble
+lanczos_weight(gdouble value)
+{
+       const gdouble distance = fabs(value);
+       const guint taps = lanczos_taps();
+
+       /* Written as "inside the window ? weight : 0" so that a NaN input
+        * still falls through to 0.0 */
+       return (distance < taps) ? (sinc(distance) * sinc(distance / taps)) : 0.0;
+}
+
+const static gint FPScale = 16384; /* fixed point scaler: filter weights are stored multiplied by this */
+const static gint FPScaleShift = 14; /* log2(FPScale): right shift that removes the fixed point scale */
+
+
+#if defined (__x86_64__)
+#include <emmintrin.h>
+
+/*
+ * Vertical Lanczos resampler using SSE2 intrinsics (x86-64 variant).
+ *
+ * Resamples info->input from info->old_size rows to info->new_size rows for
+ * the columns dest_offset_other (inclusive) to dest_end_other (exclusive) and
+ * stores the result in info->output. The vector loop consumes 24 16-bit
+ * components (three 128-bit registers) per iteration; whatever is left of a
+ * row is handled by a scalar loop.
+ *
+ * If the source has no more rows than the filter has taps, the work is
+ * handed to ResizeV_fast() instead.
+ *
+ * The aligned load/store intrinsics used here require 16-byte aligned
+ * addresses, so info->dest_offset_other must result in a 16-byte aligned
+ * memory pointer.
+ */
+void
+ResizeV_SSE2(ResampleInfo *info)
+{
+       const RS_IMAGE16 *input = info->input;
+       const RS_IMAGE16 *output = info->output;
+       const guint old_size = info->old_size;
+       const guint new_size = info->new_size;
+       const guint start_x = info->dest_offset_other * input->pixelsize;
+       const guint end_x = info->dest_end_other * input->pixelsize;
+
+       gdouble pos_step = ((gdouble) old_size) / ((gdouble)new_size);
+       gdouble filter_step = MIN(1.0 / pos_step, 1.0);
+       gdouble filter_support = (gdouble) lanczos_taps() / filter_step;
+       gint fir_filter_size = (gint) (ceil(filter_support*2));
+
+       /* Not enough source rows to cover the filter window */
+       if (old_size <= fir_filter_size)
+       {
+               ResizeV_fast(info);
+               return;
+       }
+
+       gint *weights = g_new(gint, new_size * fir_filter_size);
+       gint *offsets = g_new(gint, new_size);
+
+       gdouble pos = 0.0;
+
+       gint i,j,k;
+
+       /* Precalculate the first source row and the filter weights for every output row */
+       for (i=0; i<new_size; ++i)
+       {
+               gint end_pos = (gint) (pos + filter_support);
+               if (end_pos > old_size-1)
+                       end_pos = old_size-1;
+
+               gint start_pos = end_pos - fir_filter_size + 1;
+
+               if (start_pos < 0)
+                       start_pos = 0;
+
+               offsets[i] = start_pos;
+
+               /* The following code ensures that the coefficients add to exactly FPScale */
+               gdouble total = 0.0;
+
+               /* Ensure that we have a valid position */
+               gdouble ok_pos = MAX(0.0,MIN(old_size-1,pos));
+
+               for (j=0; j<fir_filter_size; ++j)
+               {
+                       /* Accumulate all coefficients */
+                       total += lanczos_weight((start_pos+j - ok_pos) * filter_step);
+               }
+
+               g_assert(total > 0.0f);
+
+               gdouble total2 = 0.0;
+
+               for (k=0; k<fir_filter_size; ++k)
+               {
+                       gdouble total3 = total2 + lanczos_weight((start_pos+k - ok_pos) * filter_step) / total;
+                       /* Keep only the low 16 bits: the weight is a signed 16-bit value
+                        * sitting in the low half of each dword, the high half stays zero */
+                       weights[i*fir_filter_size+k] = ((gint) (total3*FPScale+0.5) - (gint) (total2*FPScale+0.5)) & 0xffff;
+
+                       total2 = total3;
+               }
+               pos += pos_step;
+       }
+
+       guint y,x;
+       gint *wg = weights;
+
+       /* 24 components = 48 bytes/loop */
+       const guint end_x_sse = (end_x/24)*24;
+
+       /* Subtract 32768 as it would appear after shift */
+       gint add_round_sub = -(32768 << (FPScaleShift-1));
+       /* 0.5 pixel value is lost to rounding times fir_filter_size, compensate */
+       add_round_sub += fir_filter_size * (FPScale >> 2);
+
+       __m128i add_32 = _mm_set_epi32(add_round_sub, add_round_sub, add_round_sub, add_round_sub);
+       __m128i signxor = _mm_set_epi32(0x80008000, 0x80008000, 0x80008000, 0x80008000);
+
+       for (y = 0; y < new_size ; y++)
+       {
+               gushort *in = GET_PIXEL(input, start_x / input->pixelsize, offsets[y]);
+               gushort *out = GET_PIXEL(output, 0, y);
+               __m128i zero;
+               zero = _mm_setzero_si128();
+
+               /* "x + 24 <= end_x_sse" instead of "x <= end_x_sse - 24": the
+                * subtraction wraps around in unsigned arithmetic when fewer than
+                * 24 components are available, which would run the vector loop
+                * far outside the row. */
+               for (x = start_x; x + 24 <= end_x_sse; x+=24)
+               {
+                       /* Accumulators, set to 0 */
+                       __m128i acc1, acc2,  acc3, acc1_h, acc2_h, acc3_h;
+                       acc1 = acc2 = acc3 = acc1_h = acc2_h = acc3_h = zero;
+
+                       for (i = 0; i < fir_filter_size; i++) {
+                               /* Load weight */
+                               __m128i w = _mm_set_epi32(wg[i],wg[i],wg[i],wg[i]);
+
+                               /* Load source and prefetch data 32 components ahead */
+                               int row_offset = i * input->rowstride;
+                               __m128i src1i, src2i, src3i;
+                               __m128i* in_sse =  (__m128i*)&in[row_offset];
+                               src1i = _mm_load_si128(in_sse);
+                               src2i = _mm_load_si128(in_sse+1);
+                               src3i = _mm_load_si128(in_sse+2);
+                               _mm_prefetch(&in[row_offset + 32], _MM_HINT_T0);
+
+                               /* Unpack to dwords */
+                               __m128i src1i_h, src2i_h, src3i_h;
+                               src1i_h = _mm_unpackhi_epi16(src1i, zero);
+                               src2i_h = _mm_unpackhi_epi16(src2i, zero);
+                               src3i_h = _mm_unpackhi_epi16(src3i, zero);
+                               src1i = _mm_unpacklo_epi16(src1i, zero);
+                               src2i = _mm_unpacklo_epi16(src2i, zero);
+                               src3i = _mm_unpacklo_epi16(src3i, zero);
+
+                               /* Shift down to 15 bit for multiplication */
+                               src1i_h = _mm_srli_epi16(src1i_h, 1);
+                               src2i_h = _mm_srli_epi16(src2i_h, 1);
+                               src3i_h = _mm_srli_epi16(src3i_h, 1);
+                               src1i = _mm_srli_epi16(src1i, 1);
+                               src2i = _mm_srli_epi16(src2i, 1);
+                               src3i = _mm_srli_epi16(src3i, 1);
+
+                               /* Multiply by weight */
+                               src1i_h = _mm_madd_epi16(src1i_h, w);
+                               src2i_h = _mm_madd_epi16(src2i_h, w);
+                               src3i_h = _mm_madd_epi16(src3i_h, w);
+                               src1i = _mm_madd_epi16(src1i, w);
+                               src2i = _mm_madd_epi16(src2i, w);
+                               src3i = _mm_madd_epi16(src3i, w);
+
+                               /* Accumulate */
+                               acc1_h = _mm_add_epi32(acc1_h, src1i_h);
+                               acc2_h = _mm_add_epi32(acc2_h, src2i_h);
+                               acc3_h = _mm_add_epi32(acc3_h, src3i_h);
+                               acc1 = _mm_add_epi32(acc1, src1i);
+                               acc2 = _mm_add_epi32(acc2, src2i);
+                               acc3 = _mm_add_epi32(acc3, src3i);
+                       }
+
+                       /* Add rounder and subtract 32768 */
+                       acc1_h = _mm_add_epi32(acc1_h, add_32);
+                       acc2_h = _mm_add_epi32(acc2_h, add_32);
+                       acc3_h = _mm_add_epi32(acc3_h, add_32);
+                       acc1 = _mm_add_epi32(acc1, add_32);
+                       acc2 = _mm_add_epi32(acc2, add_32);
+                       acc3 = _mm_add_epi32(acc3, add_32);
+
+                       /* Shift down */
+                       acc1_h = _mm_srai_epi32(acc1_h, FPScaleShift - 1);
+                       acc2_h = _mm_srai_epi32(acc2_h, FPScaleShift - 1);
+                       acc3_h = _mm_srai_epi32(acc3_h, FPScaleShift - 1);
+                       acc1 = _mm_srai_epi32(acc1, FPScaleShift - 1);
+                       acc2 = _mm_srai_epi32(acc2, FPScaleShift - 1);
+                       acc3 = _mm_srai_epi32(acc3, FPScaleShift - 1);
+
+                       /* Pack to signed shorts */
+                       acc1 = _mm_packs_epi32(acc1, acc1_h);
+                       acc2 = _mm_packs_epi32(acc2, acc2_h);
+                       acc3 = _mm_packs_epi32(acc3, acc3_h);
+
+                       /* Shift sign to unsigned shorts */
+                       acc1 = _mm_xor_si128(acc1, signxor);
+                       acc2 = _mm_xor_si128(acc2, signxor);
+                       acc3 = _mm_xor_si128(acc3, signxor);
+
+                       /* Store result */
+                       __m128i* sse_dst = (__m128i*)&out[x];
+                       _mm_store_si128(sse_dst, acc1);
+                       _mm_store_si128(sse_dst + 1, acc2);
+                       _mm_store_si128(sse_dst + 2, acc3);
+                       in += 24;
+               }
+
+               /* Process remaining pixels */
+               for (; x < end_x; x++)
+               {
+                       gint acc1 = 0;
+                       for (i = 0; i < fir_filter_size; i++)
+                       {
+                               /* The weight is the signed 16-bit value stored in the low
+                                * half of wg[i]; convert by value rather than reading it
+                                * through a gshort pointer. */
+                               acc1 += in[i * input->rowstride] * (gshort) wg[i];
+                       }
+                       out[x] = clampbits((acc1 + (FPScale / 2)) >> FPScaleShift, 16);
+                       in++;
+               }
+               wg += fir_filter_size;
+       }
+       g_free(weights);
+       g_free(offsets);
+}
+
+#elif defined (__SSE2__)
+#include <emmintrin.h>
+
+/*
+ * Vertical Lanczos resampler using SSE2 intrinsics (32-bit x86 variant).
+ *
+ * Resamples info->input from info->old_size rows to info->new_size rows for
+ * the columns dest_offset_other (inclusive) to dest_end_other (exclusive) and
+ * stores the result in info->output. The vector loop consumes 8 16-bit
+ * components (one 128-bit register) per iteration; whatever is left of a row
+ * is handled by a scalar loop.
+ *
+ * If the source has no more rows than the filter has taps, the work is
+ * handed to ResizeV_fast() instead.
+ *
+ * The aligned load/store intrinsics used here require 16-byte aligned
+ * addresses, so info->dest_offset_other must result in a 16-byte aligned
+ * memory pointer.
+ */
+void
+ResizeV_SSE2(ResampleInfo *info)
+{
+       const RS_IMAGE16 *input = info->input;
+       const RS_IMAGE16 *output = info->output;
+       const guint old_size = info->old_size;
+       const guint new_size = info->new_size;
+       const guint start_x = info->dest_offset_other * input->pixelsize;
+       const guint end_x = info->dest_end_other * input->pixelsize;
+
+       gdouble pos_step = ((gdouble) old_size) / ((gdouble)new_size);
+       gdouble filter_step = MIN(1.0 / pos_step, 1.0);
+       gdouble filter_support = (gdouble) lanczos_taps() / filter_step;
+       gint fir_filter_size = (gint) (ceil(filter_support*2));
+
+       /* Not enough source rows to cover the filter window */
+       if (old_size <= fir_filter_size)
+       {
+               ResizeV_fast(info);
+               return;
+       }
+
+       gint *weights = g_new(gint, new_size * fir_filter_size);
+       gint *offsets = g_new(gint, new_size);
+
+       gdouble pos = 0.0;
+
+       gint i,j,k;
+
+       /* Precalculate the first source row and the filter weights for every output row */
+       for (i=0; i<new_size; ++i)
+       {
+               gint end_pos = (gint) (pos + filter_support);
+               if (end_pos > old_size-1)
+                       end_pos = old_size-1;
+
+               gint start_pos = end_pos - fir_filter_size + 1;
+
+               if (start_pos < 0)
+                       start_pos = 0;
+
+               offsets[i] = start_pos;
+
+               /* The following code ensures that the coefficients add to exactly FPScale */
+               gdouble total = 0.0;
+
+               /* Ensure that we have a valid position */
+               gdouble ok_pos = MAX(0.0,MIN(old_size-1,pos));
+
+               for (j=0; j<fir_filter_size; ++j)
+               {
+                       /* Accumulate all coefficients */
+                       total += lanczos_weight((start_pos+j - ok_pos) * filter_step);
+               }
+
+               g_assert(total > 0.0f);
+
+               gdouble total2 = 0.0;
+
+               for (k=0; k<fir_filter_size; ++k)
+               {
+                       gdouble total3 = total2 + lanczos_weight((start_pos+k - ok_pos) * filter_step) / total;
+                       /* Keep only the low 16 bits: the weight is a signed 16-bit value
+                        * sitting in the low half of each dword, the high half stays zero */
+                       weights[i*fir_filter_size+k] = ((gint) (total3*FPScale+0.5) - (gint) (total2*FPScale+0.5)) & 0xffff;
+
+                       total2 = total3;
+               }
+               pos += pos_step;
+       }
+
+       guint y,x;
+       gint *wg = weights;
+
+       /* 8 components = 16 bytes/loop */
+       const guint end_x_sse = (end_x/8)*8;
+
+       /* Rounder after accumulation, half because input is scaled down.
+        * NOTE(review): the x86_64 variant of this function does not add this
+        * term - confirm which of the two is intended. */
+       gint add_round_sub = (FPScale >> 2);
+       /* Subtract 32768 as it would appear after shift */
+       add_round_sub -= (32768 << (FPScaleShift-1));
+       /* 0.5 pixel value is lost to rounding times fir_filter_size, compensate */
+       add_round_sub += fir_filter_size * (FPScale >> 2);
+
+       /* Loop invariant constants, built once instead of once per 8 components */
+       __m128i add_32 = _mm_set_epi32(add_round_sub, add_round_sub, add_round_sub, add_round_sub);
+       __m128i signxor = _mm_set_epi32(0x80008000, 0x80008000, 0x80008000, 0x80008000);
+
+       for (y = 0; y < new_size ; y++)
+       {
+               gushort *in = GET_PIXEL(input, start_x / input->pixelsize, offsets[y]);
+               gushort *out = GET_PIXEL(output, 0, y);
+               __m128i zero;
+               zero = _mm_setzero_si128();
+
+               /* "x + 8 <= end_x_sse" instead of "x <= end_x_sse - 8": the
+                * subtraction wraps around in unsigned arithmetic when fewer than
+                * 8 components are available, which would run the vector loop
+                * far outside the row. */
+               for (x = start_x; x + 8 <= end_x_sse; x+=8)
+               {
+                       /* Accumulators, set to 0 */
+                       __m128i acc1, acc1_h;
+                       acc1 = acc1_h = zero;
+
+                       for (i = 0; i < fir_filter_size; i++) {
+                               /* Load weight */
+                               __m128i w = _mm_set_epi32(wg[i],wg[i],wg[i],wg[i]);
+                               /* Load source */
+                               __m128i src1i;
+                               __m128i* in_sse =  (__m128i*)&in[i * input->rowstride];
+                               src1i = _mm_load_si128(in_sse);
+                               /* Unpack to dwords */
+                               __m128i src1i_h;
+                               src1i_h = _mm_unpackhi_epi16(src1i, zero);
+                               src1i = _mm_unpacklo_epi16(src1i, zero);
+
+                               /* Shift down to 15 bit for multiplication */
+                               src1i_h = _mm_srli_epi16(src1i_h, 1);
+                               src1i = _mm_srli_epi16(src1i, 1);
+
+                               /* Multiply by weight */
+                               src1i_h = _mm_madd_epi16(src1i_h, w);
+                               src1i = _mm_madd_epi16(src1i, w);
+
+                               /* Accumulate */
+                               acc1_h = _mm_add_epi32(acc1_h, src1i_h);
+                               acc1 = _mm_add_epi32(acc1, src1i);
+                       }
+
+                       /* Add rounder and subtract 32768 */
+                       acc1_h = _mm_add_epi32(acc1_h, add_32);
+                       acc1 = _mm_add_epi32(acc1, add_32);
+
+                       /* Shift down */
+                       acc1_h = _mm_srai_epi32(acc1_h, FPScaleShift - 1);
+                       acc1 = _mm_srai_epi32(acc1, FPScaleShift - 1);
+
+                       /* Pack to signed shorts */
+                       acc1 = _mm_packs_epi32(acc1, acc1_h);
+
+                       /* Shift sign to unsigned shorts */
+                       acc1 = _mm_xor_si128(acc1, signxor);
+
+                       /* Store result */
+                       __m128i* sse_dst = (__m128i*)&out[x];
+                       _mm_store_si128(sse_dst, acc1);
+                       in += 8;
+               }
+
+               /* Process remaining pixels */
+               for (; x < end_x; x++)
+               {
+                       gint acc1 = 0;
+                       for (i = 0; i < fir_filter_size; i++)
+                       {
+                               /* The weight is the signed 16-bit value stored in the low
+                                * half of wg[i]; convert by value rather than reading it
+                                * through a gshort pointer. */
+                               acc1 += in[i * input->rowstride] * (gshort) wg[i];
+                       }
+                       out[x] = clampbits((acc1 + (FPScale / 2)) >> FPScaleShift, 16);
+                       in++;
+               }
+               wg += fir_filter_size;
+       }
+       g_free(weights);
+       g_free(offsets);
+}
+
+#else // not defined (__SSE2__)
+
+/* Fallback for builds without SSE2: hand the job to the generic vertical
+ * resampler. This must have external linkage, since resample.c refers to
+ * ResizeV_SSE2() from a separate translation unit.
+ */
+void
+ResizeV_SSE2(ResampleInfo *info)
+{
+       ResizeV(info);
+}
+
+#endif // not defined (__x86_64__) and not defined (__SSE2__)
+
+

Modified: branches/rawstudio-ng-color/plugins/resample/resample.c
===================================================================
--- branches/rawstudio-ng-color/plugins/resample/resample.c     2009-12-30 
23:14:23 UTC (rev 2930)
+++ branches/rawstudio-ng-color/plugins/resample/resample.c     2009-12-30 
23:41:09 UTC (rev 2931)
@@ -21,9 +21,6 @@
 
 #include <rawstudio.h>
 #include <math.h>
-#if defined (__SSE2__)
-#include <emmintrin.h>
-#endif /* __SSE2__ */
 
 
 #define RS_TYPE_RESAMPLE (rs_resample_type)
@@ -81,15 +78,15 @@
 static gint get_width(RSFilter *filter);
 static gint get_height(RSFilter *filter);
 static void ResizeH(ResampleInfo *info);
-static void ResizeV(ResampleInfo *info);
-static void ResizeV_SSE2(ResampleInfo *info);
+void ResizeV(ResampleInfo *info);
+extern void ResizeV_SSE2(ResampleInfo *info);
 static void ResizeH_compatible(ResampleInfo *info);
 static void ResizeV_compatible(ResampleInfo *info);
 static void ResizeH_fast(ResampleInfo *info);
-static void ResizeV_fast(ResampleInfo *info);
+void ResizeV_fast(ResampleInfo *info);
 
 static RSFilterClass *rs_resample_parent_class = NULL;
-inline guint clampbits(gint x, guint n) { guint32 _y_temp; if( (_y_temp=x>>n) 
) x = ~_y_temp >> (32-n); return x;}
+static inline guint clampbits(gint x, guint n) { guint32 _y_temp; if( 
(_y_temp=x>>n) ) x = ~_y_temp >> (32-n); return x;}
 
 G_MODULE_EXPORT void
 rs_plugin_load(RSPlugin *plugin)
@@ -561,7 +558,7 @@
 
 }
 
-static void
+void
 ResizeV(ResampleInfo *info)
 {
        const RS_IMAGE16 *input = info->input;
@@ -658,368 +655,7 @@
 
 }
 
-/* Special Vertical SSE2 resampler, that has massive parallism.
- * An important restriction is that "info->dest_offset_other", must result
- * in a 16 byte aligned memory pointer.
- */
-
-#if defined (__x86_64__)
-#if defined (__SSE2__)
-
 static void
-ResizeV_SSE2(ResampleInfo *info)
-{
-       const RS_IMAGE16 *input = info->input;
-       const RS_IMAGE16 *output = info->output;
-       const guint old_size = info->old_size;
-       const guint new_size = info->new_size;
-       const guint start_x = info->dest_offset_other * input->pixelsize;
-       const guint end_x = info->dest_end_other * input->pixelsize;
-
-       gdouble pos_step = ((gdouble) old_size) / ((gdouble)new_size);
-       gdouble filter_step = MIN(1.0 / pos_step, 1.0);
-       gdouble filter_support = (gdouble) lanczos_taps() / filter_step;
-       gint fir_filter_size = (gint) (ceil(filter_support*2));
-
-       if (old_size <= fir_filter_size)
-               return ResizeV_fast(info);
-
-       gint *weights = g_new(gint, new_size * fir_filter_size);
-       gint *offsets = g_new(gint, new_size);
-
-       gdouble pos = 0.0;
-
-       gint i,j,k;
-
-       for (i=0; i<new_size; ++i)
-       {
-               gint end_pos = (gint) (pos + filter_support);
-               if (end_pos > old_size-1)
-                       end_pos = old_size-1;
-
-               gint start_pos = end_pos - fir_filter_size + 1;
-
-               if (start_pos < 0)
-                       start_pos = 0;
-
-               offsets[i] = start_pos;
-
-               /* The following code ensures that the coefficients add to 
exactly FPScale */
-               gdouble total = 0.0;
-
-               /* Ensure that we have a valid position */
-               gdouble ok_pos = MAX(0.0,MIN(old_size-1,pos));
-
-               for (j=0; j<fir_filter_size; ++j)
-               {
-                       /* Accumulate all coefficients */
-                       total += lanczos_weight((start_pos+j - ok_pos) * 
filter_step);
-               }
-
-               g_assert(total > 0.0f);
-
-               gdouble total2 = 0.0;
-
-               for (k=0; k<fir_filter_size; ++k)
-               {
-                       gdouble total3 = total2 + lanczos_weight((start_pos+k - 
ok_pos) * filter_step) / total;
-                       weights[i*fir_filter_size+k] = ((gint) 
(total3*FPScale+0.5) - (gint) (total2*FPScale+0.5)) & 0xffff;
-                       
-                       total2 = total3;
-               }
-               pos += pos_step;
-       }
-
-       guint y,x;
-       gint *wg = weights;
-
-       /* 24 pixels = 48 bytes/loop */
-       gint end_x_sse = (end_x/24)*24;
-       
-       /* Subtract 32768 as it would appear after shift */
-       gint add_round_sub = -(32768 << (FPScaleShift-1));
-       /* 0.5 pixel value is lost to rounding times fir_filter_size, 
compensate */
-       add_round_sub += fir_filter_size * (FPScale >> 2);
-       
-       __m128i add_32 = _mm_set_epi32(add_round_sub, add_round_sub, 
add_round_sub, add_round_sub);
-       __m128i signxor = _mm_set_epi32(0x80008000, 0x80008000, 0x80008000, 
0x80008000);
-
-       for (y = 0; y < new_size ; y++)
-       {
-               gushort *in = GET_PIXEL(input, start_x / input->pixelsize, 
offsets[y]);
-               gushort *out = GET_PIXEL(output, 0, y);
-               __m128i zero;
-               zero = _mm_xor_si128(zero, zero);
-               for (x = start_x; x <= (end_x_sse-24); x+=24)
-               {
-                       /* Accumulators, set to 0 */
-                       __m128i acc1, acc2,  acc3, acc1_h, acc2_h, acc3_h;
-                       acc1 = acc2 = acc3 = acc1_h = acc2_h = acc3_h = zero;
-
-                       for (i = 0; i < fir_filter_size; i++) {
-                               /* Load weight */
-                               __m128i w = 
_mm_set_epi32(wg[i],wg[i],wg[i],wg[i]);
-                               
-                               /* Load source and prefetch next line */
-                               int pos = i * input->rowstride;
-                               __m128i src1i, src2i, src3i;
-                               __m128i* in_sse =  (__m128i*)&in[pos];
-                               src1i = _mm_load_si128(in_sse);
-                               src2i = _mm_load_si128(in_sse+1);
-                               src3i = _mm_load_si128(in_sse+2);
-                               _mm_prefetch(&in[pos + 32], _MM_HINT_T0);
-                               
-                               /* Unpack to dwords */
-                               __m128i src1i_h, src2i_h, src3i_h;
-                               src1i_h = _mm_unpackhi_epi16(src1i, zero);
-                               src2i_h = _mm_unpackhi_epi16(src2i, zero);
-                               src3i_h = _mm_unpackhi_epi16(src3i, zero);
-                               src1i = _mm_unpacklo_epi16(src1i, zero);
-                               src2i = _mm_unpacklo_epi16(src2i, zero);
-                               src3i = _mm_unpacklo_epi16(src3i, zero);
-                               
-                               /*Shift down to 15 bit for multiplication */
-                               src1i_h = _mm_srli_epi16(src1i_h, 1);
-                               src2i_h = _mm_srli_epi16(src2i_h, 1);
-                               src3i_h = _mm_srli_epi16(src3i_h, 1);
-                               src1i = _mm_srli_epi16(src1i, 1);
-                               src2i = _mm_srli_epi16(src2i, 1);
-                               src3i = _mm_srli_epi16(src3i, 1);
-                               
-                               /* Multiply my weight */
-                               src1i_h = _mm_madd_epi16(src1i_h, w);
-                               src2i_h = _mm_madd_epi16(src2i_h, w);
-                               src3i_h = _mm_madd_epi16(src3i_h, w);
-                               src1i = _mm_madd_epi16(src1i, w);
-                               src2i = _mm_madd_epi16(src2i, w);
-                               src3i = _mm_madd_epi16(src3i, w);
-
-                               /* Accumulate */
-                               acc1_h = _mm_add_epi32(acc1_h, src1i_h);
-                               acc2_h = _mm_add_epi32(acc2_h, src2i_h);
-                               acc3_h = _mm_add_epi32(acc3_h, src3i_h);
-                               acc1 = _mm_add_epi32(acc1, src1i);
-                               acc2 = _mm_add_epi32(acc2, src2i);
-                               acc3 = _mm_add_epi32(acc3, src3i);
-                       }
-                       
-                       /* Add rounder and subtract 32768 */
-                       acc1_h = _mm_add_epi32(acc1_h, add_32);
-                       acc2_h = _mm_add_epi32(acc2_h, add_32);
-                       acc3_h = _mm_add_epi32(acc3_h, add_32);
-                       acc1 = _mm_add_epi32(acc1, add_32);
-                       acc2 = _mm_add_epi32(acc2, add_32);
-                       acc3 = _mm_add_epi32(acc3, add_32);
-                       
-                       /* Shift down */
-                       acc1_h = _mm_srai_epi32(acc1_h, FPScaleShift - 1 );
-                       acc2_h = _mm_srai_epi32(acc2_h, FPScaleShift - 1);
-                       acc3_h = _mm_srai_epi32(acc3_h, FPScaleShift - 1);
-                       acc1 = _mm_srai_epi32(acc1, FPScaleShift - 1);
-                       acc2 = _mm_srai_epi32(acc2, FPScaleShift - 1);
-                       acc3 = _mm_srai_epi32(acc3, FPScaleShift - 1);
-                       
-                       /* Pack to signed shorts */
-                       acc1 = _mm_packs_epi32(acc1, acc1_h);
-                       acc2 = _mm_packs_epi32(acc2, acc2_h);
-                       acc3 = _mm_packs_epi32(acc3, acc3_h);
-
-                       /* Shift sign to unsinged shorts */
-                       acc1 = _mm_xor_si128(acc1, signxor);
-                       acc2 = _mm_xor_si128(acc2, signxor);
-                       acc3 = _mm_xor_si128(acc3, signxor);
-
-                       /* Store result */
-                       __m128i* sse_dst = (__m128i*)&out[x];
-                       _mm_store_si128(sse_dst, acc1);
-                       _mm_store_si128(sse_dst + 1, acc2);
-                       _mm_store_si128(sse_dst + 2, acc3);
-                       in += 24;
-               }
-               
-               /* Process remaining pixels */
-               for (; x < end_x; x++)
-               {
-                       gint acc1 = 0;
-                       for (i = 0; i < fir_filter_size; i++)
-                       {
-                               acc1 += in[i * input->rowstride] * 
*(gshort*)&wg[i];
-                       }
-                       out[x] = clampbits((acc1 + (FPScale / 2)) >> 
FPScaleShift, 16);
-                       in++;
-               }
-               wg += fir_filter_size;
-       }
-       g_free(weights);
-       g_free(offsets);
-}
-#endif /* defined (__SSE2__) */
-#elif defined (__SSE2__)
-
-static void
-ResizeV_SSE2(ResampleInfo *info)
-{
-       const RS_IMAGE16 *input = info->input;
-       const RS_IMAGE16 *output = info->output;
-       const guint old_size = info->old_size;
-       const guint new_size = info->new_size;
-       const guint start_x = info->dest_offset_other * input->pixelsize;
-       const guint end_x = info->dest_end_other * input->pixelsize;
-
-       gdouble pos_step = ((gdouble) old_size) / ((gdouble)new_size);
-       gdouble filter_step = MIN(1.0 / pos_step, 1.0);
-       gdouble filter_support = (gdouble) lanczos_taps() / filter_step;
-       gint fir_filter_size = (gint) (ceil(filter_support*2));
-
-       if (old_size <= fir_filter_size)
-               return ResizeV_fast(info);
-
-       gint *weights = g_new(gint, new_size * fir_filter_size);
-       gint *offsets = g_new(gint, new_size);
-
-       gdouble pos = 0.0;
-
-       gint i,j,k;
-
-       for (i=0; i<new_size; ++i)
-       {
-               gint end_pos = (gint) (pos + filter_support);
-               if (end_pos > old_size-1)
-                       end_pos = old_size-1;
-
-               gint start_pos = end_pos - fir_filter_size + 1;
-
-               if (start_pos < 0)
-                       start_pos = 0;
-
-               offsets[i] = start_pos;
-
-               /* The following code ensures that the coefficients add to exactly FPScale */
-               gdouble total = 0.0;
-
-               /* Ensure that we have a valid position */
-               gdouble ok_pos = MAX(0.0,MIN(old_size-1,pos));
-
-               for (j=0; j<fir_filter_size; ++j)
-               {
-                       /* Accumulate all coefficients */
-                       total += lanczos_weight((start_pos+j - ok_pos) * filter_step);
-               }
-
-               g_assert(total > 0.0f);
-
-               gdouble total2 = 0.0;
-
-               for (k=0; k<fir_filter_size; ++k)
-               {
-                       gdouble total3 = total2 + lanczos_weight((start_pos+k - ok_pos) * filter_step) / total;
-                       weights[i*fir_filter_size+k] = ((gint) (total3*FPScale+0.5) - (gint) (total2*FPScale+0.5)) & 0xffff;
-                       
-                       total2 = total3;
-               }
-               pos += pos_step;
-       }
-
-       guint y,x;
-       gint *wg = weights;
-
-       /* 8 pixels = 16 bytes/loop */
-       gint end_x_sse = (end_x/8)*8;
-       
-       /* Rounder after accumulation, half because input is scaled down */
-       gint add_round_sub = (FPScale >> 2);
-       /* Subtract 32768 as it would appear after shift */
-       add_round_sub -= (32768 << (FPScaleShift-1));
-       /* 0.5 pixel value is lost to rounding times fir_filter_size, compensate */
-       add_round_sub += fir_filter_size * (FPScale >> 2);
-
-       for (y = 0; y < new_size ; y++)
-       {
-               gushort *in = GET_PIXEL(input, start_x / input->pixelsize, offsets[y]);
-               gushort *out = GET_PIXEL(output, 0, y);
-               __m128i zero;
-               zero = _mm_xor_si128(zero, zero);
-               for (x = start_x; x <= (end_x_sse-8); x+=8)
-               {
-                       /* Accumulators, set to 0 */
-                       __m128i acc1, acc1_h;
-                       acc1 = acc1_h = zero;
-
-                       for (i = 0; i < fir_filter_size; i++) {
-                               /* Load weight */
-                               __m128i w = _mm_set_epi32(wg[i],wg[i],wg[i],wg[i]);
-                               /* Load source */
-                               __m128i src1i;
-                               __m128i* in_sse =  (__m128i*)&in[i * input->rowstride];
-                               src1i = _mm_load_si128(in_sse);
-                               /* Unpack to dwords */
-                               __m128i src1i_h;
-                               src1i_h = _mm_unpackhi_epi16(src1i, zero);
-                               src1i = _mm_unpacklo_epi16(src1i, zero);
-                               
-                               /*Shift down to 15 bit for multiplication */
-                               src1i_h = _mm_srli_epi16(src1i_h, 1);
-                               src1i = _mm_srli_epi16(src1i, 1);
-                               
-                               /* Multiply my weight */
-                               src1i_h = _mm_madd_epi16(src1i_h, w);
-                               src1i = _mm_madd_epi16(src1i, w);
-
-                               /* Accumulate */
-                               acc1_h = _mm_add_epi32(acc1_h, src1i_h);
-                               acc1 = _mm_add_epi32(acc1, src1i);
-                       }
-                       __m128i add_32 = _mm_set_epi32(add_round_sub, add_round_sub, add_round_sub, add_round_sub);
-                       __m128i signxor = _mm_set_epi32(0x80008000, 0x80008000, 0x80008000, 0x80008000);
-                       
-                       /* Add rounder and subtract 32768 */
-                       acc1_h = _mm_add_epi32(acc1_h, add_32);
-                       acc1 = _mm_add_epi32(acc1, add_32);
-                       
-                       /* Shift down */
-                       acc1_h = _mm_srai_epi32(acc1_h, FPScaleShift - 1 );
-                       acc1 = _mm_srai_epi32(acc1, FPScaleShift - 1);
-                       
-                       /* Pack to signed shorts */
-                       acc1 = _mm_packs_epi32(acc1, acc1_h);
-
-                       /* Shift sign to unsinged shorts */
-                       acc1 = _mm_xor_si128(acc1, signxor);
-
-                       /* Store result */
-                       __m128i* sse_dst = (__m128i*)&out[x];
-                       _mm_store_si128(sse_dst, acc1);
-                       in += 8;
-               }
-               
-               /* Process remaining pixels */
-               for (; x < end_x; x++)
-               {
-                       gint acc1 = 0;
-                       for (i = 0; i < fir_filter_size; i++)
-                       {
-                               acc1 += in[i * input->rowstride] * *(gshort*)&wg[i];
-                       }
-                       out[x] = clampbits((acc1 + (FPScale / 2)) >> FPScaleShift, 16);
-                       in++;
-               }
-               wg += fir_filter_size;
-       }
-       g_free(weights);
-       g_free(offsets);
-}
-
-#else // not defined (__SSE2__)
-
-static void
-ResizeV_SSE2(ResampleInfo *info)
-{
-       ResizeV(info);
-}
-
-#endif // not defined (__x86_64__) and not defined (__SSE2__)
-
-static void
 ResizeH_compatible(ResampleInfo *info)
 {
        const RS_IMAGE16 *input = info->input;
@@ -1207,7 +843,7 @@
        g_free(offsets);
 }
 
-static void
+void
 ResizeV_fast(ResampleInfo *info)
 {
        const RS_IMAGE16 *input = info->input;


_______________________________________________
Rawstudio-commit mailing list
[email protected]
http://rawstudio.org/cgi-bin/mailman/listinfo/rawstudio-commit

Reply via email to