# HG changeset patch
# User Alexey Osipov <[email protected]>
# Date 1310760987 -25200
# Branch stabilize_optimize
# Node ID ac09f716b03da55b8a3e5bd47a0f38e377e6ece8
# Parent ca546347d93b572f79923401fad4b90a5334b53c
Use SSE2-optimized code for compareSubImg() and contrastSubImgYUV().
These optimizations are switchable at compile time and honour the
HAVE_ASM_SSE2 define from config.h.
diff -r ca546347d93b -r ac09f716b03d filter/stabilize/filter_stabilize.c
--- a/filter/stabilize/filter_stabilize.c Fri Jul 15 17:10:40 2011 +0700
+++ b/filter/stabilize/filter_stabilize.c Sat Jul 16 03:16:27 2011 +0700
@@ -7,7 +7,7 @@
*
* Copyright (C) Alexey Osipov - Jule 2011
* simba at lerlan dot ru
- * speed optimizations
+ * speed optimizations including SSE2 code
*
* This file is part of transcode, a video stream processing tool
*
@@ -68,6 +68,44 @@
* this is really just for debugging and development */
// #define STABVERBOSE
+#ifdef HAVE_ASM_SSE2
+
+/* use SSE2 for compareSubImg */
+#define USE_SSE2_CMP
+
+/* use SSE2 for the horizontal summation in compareSubImg as well;
+ * sometimes this may be slower,
+ * enabling this also limits SSE2_CMP_SUM_ROWS to 8 */
+#define USE_SSE2_CMP_HOR
+
+/* how many 16-byte rows to sum in SSE2 registers
+ * before flushing them to a regular variable,
+ * from 1 to 255;
+ * bigger values are faster, but may overflow the registers,
+ * which leads to incorrect transformation data.
+ * lower values are not much slower, but safer.
+ * if USE_SSE2_CMP_HOR is enabled, this must not be larger than 8 */
+#define SSE2_CMP_SUM_ROWS 8
+
+/* use SSE2 for contrastSubImg (only YUV version);
+ * may be used without USE_SSE2_CMP */
+#define USE_SSE2_YUV_CONTRAST
+
+
+#ifdef USE_SSE2_CMP
+#define NEED_EMMINTRIN
+#endif
+
+#ifdef USE_SSE2_YUV_CONTRAST
+#define NEED_EMMINTRIN
+#endif
+
+#ifdef NEED_EMMINTRIN
+#include <emmintrin.h>
+#endif
+
+#endif /* HAVE_ASM_SSE2 */
+
#define MAXLONG ((unsigned long int)(-1))
typedef struct _field {
@@ -162,6 +200,9 @@
const Field* field,
int width, int height, int bytesPerPixel,int d_x,int d_y,
unsigned long int treshold);
double contrastSubImgYUV(StabData* sd, const Field* field);
+#ifdef USE_SSE2_YUV_CONTRAST
+double contrastSubImgYUVSSE(unsigned char* const I, const Field* field, int
width, int height);
+#endif
double contrastSubImgRGB(StabData* sd, const Field* field);
double contrastSubImg(unsigned char* const I, const Field* field,
int width, int height, int bytesPerPixel);
@@ -314,26 +355,117 @@
int s2 = field->size / 2;
unsigned long int sum = 0;
+#ifdef USE_SSE2_CMP
+ static unsigned char mask[16] = {0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF,
0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00};
+ unsigned char row = 0;
+#ifndef USE_SSE2_CMP_HOR
+ unsigned char summes[16];
+ int i;
+#endif
+ __m128i xmmsum, xmmmask;
+ xmmsum = _mm_setzero_si128();
+ xmmmask = _mm_loadu_si128(mask);
+#endif
+
p1=I1 + ((field->x - s2) + (field->y - s2)*width)*bytesPerPixel;
p2=I2 + ((field->x - s2 + d_x) + (field->y - s2 +
d_y)*width)*bytesPerPixel;
- // TODO: use some mmx or sse stuff here
for (j = 0; j < field->size; j++){
+#ifdef USE_SSE2_CMP
+ for (k = 0; k < field->size * bytesPerPixel; k+=16){
+ {
+ __m128i xmm0, xmm1, xmm2;
+ xmm0 = _mm_loadu_si128(p1);
+ xmm1 = _mm_loadu_si128(p2);
+
+ xmm2 = _mm_subs_epu8(xmm0, xmm1);
+ xmm0 = _mm_subs_epu8(xmm1, xmm0);
+ xmm0 = _mm_adds_epu8(xmm0, xmm2);
+
+ xmm1 = _mm_and_si128(xmm0, xmmmask);
+ xmm0 = _mm_srli_si128(xmm0, 1);
+ xmm0 = _mm_and_si128(xmm0, xmmmask);
+
+ xmmsum = _mm_adds_epu16(xmmsum, xmm0);
+ xmmsum = _mm_adds_epu16(xmmsum, xmm1);
+ }
+
+ p1+=16;
+ p2+=16;
+
+ row++;
+ if (row == SSE2_CMP_SUM_ROWS) {
+ row = 0;
+#ifdef USE_SSE2_CMP_HOR
+ {
+ __m128i xmm1;
+
+ xmm1 = _mm_srli_si128(xmmsum, 8);
+ xmmsum = _mm_adds_epu16(xmmsum, xmm1);
+
+ xmm1 = _mm_srli_si128(xmmsum, 4);
+ xmmsum = _mm_adds_epu16(xmmsum, xmm1);
+
+ xmm1 = _mm_srli_si128(xmmsum, 2);
+ xmmsum = _mm_adds_epu16(xmmsum, xmm1);
+
+ sum += _mm_extract_epi16(xmmsum, 0);
+ }
+#else
+ _mm_storeu_si128((__m128i*)summes, xmmsum);
+ for(i = 0; i < 16; i+=2)
+ sum += summes[i] + summes[i+1]*256;
+#endif
+ xmmsum = _mm_setzero_si128();
+ }
+#else
for (k = 0; k < field->size * bytesPerPixel; k++) {
sum += abs((int)*p1 - (int)*p2);
p1++;
p2++;
+#endif
}
if (sum > treshold)
break;
p1 += (width - field->size) * bytesPerPixel;
p2 += (width - field->size) * bytesPerPixel;
}
+
+#if defined(USE_SSE2_CMP) && (SSE2_CMP_SUM_ROWS != 1) && (SSE2_CMP_SUM_ROWS != 2) && (SSE2_CMP_SUM_ROWS != 4) && (SSE2_CMP_SUM_ROWS != 8) && (SSE2_CMP_SUM_ROWS != 16)
+    //flush the partial sums still held in xmmsum into sum;
+    //not needed when SSE2_CMP_SUM_ROWS is 1, 2, 4, 8 or 16.
+    //USE_SSE2_CMP is tested first: without it xmmsum is never declared
+#ifdef USE_SSE2_CMP_HOR
+    {
+        __m128i xmm1;
+
+        xmm1 = _mm_srli_si128(xmmsum, 8);
+        xmmsum = _mm_adds_epu16(xmmsum, xmm1);
+
+        xmm1 = _mm_srli_si128(xmmsum, 4);
+        xmmsum = _mm_adds_epu16(xmmsum, xmm1);
+
+        xmm1 = _mm_srli_si128(xmmsum, 2);
+        xmmsum = _mm_adds_epu16(xmmsum, xmm1);
+
+        sum += _mm_extract_epi16(xmmsum, 0);
+    }
+#else
+    _mm_storeu_si128((__m128i*)summes, xmmsum);
+    for(i = 0; i < 16; i+=2)
+        sum += summes[i] + summes[i+1]*256;
+#endif
+#endif
+
return sum;
}
/** \see contrastSubImg called with bytesPerPixel=1*/
double contrastSubImgYUV(StabData* sd, const Field* field){
+#ifdef USE_SSE2_YUV_CONTRAST /* SSE2 build: use the vectorised routine */
+ return contrastSubImgYUVSSE(sd->curr,field,sd->width,sd->height);
+#else /* otherwise: generic routine with bytesPerPixel = 1 */
return contrastSubImg(sd->curr,field,sd->width,sd->height,1);
+#endif /* USE_SSE2_YUV_CONTRAST */
}
/**
@@ -347,6 +479,63 @@
+ contrastSubImg(I+2,field,sd->width,sd->height,3))/3;
}
+
+#ifdef USE_SSE2_YUV_CONTRAST
+/**
+ \see contrastSubImg using SSE2 optimization, YUV only (1 byte per pixel).
+ Scans field->size rows starting at (field->x - size/2, field->y - size/2),
+ 16 bytes per load, and returns (max-min)/(max+min+0.1) of the bytes read.
+ */
+double contrastSubImgYUVSSE(unsigned char* const I, const Field* field,
+                            int width, int height)
+{
+    int k, j;
+    int s2 = field->size / 2;
+    unsigned char* p = I + ((field->x - s2) + (field->y - s2)*width);
+    __m128i xmm1;
+    __m128i mmin = _mm_set1_epi8((char)0xFF); /* per-lane running minimum */
+    __m128i mmax = _mm_setzero_si128();       /* per-lane running maximum */
+
+    (void)height; /* not used by this routine */
+
+    for (j = 0; j < field->size; j++){
+        for (k = 0; k < field->size; k += 16) {
+            /* unaligned load: the window may start at any byte offset */
+            __m128i xmm0 = _mm_loadu_si128((const __m128i*)p);
+            mmin = _mm_min_epu8(mmin, xmm0);
+            mmax = _mm_max_epu8(mmax, xmm0);
+            p += 16;
+        }
+        /* k is the byte count just consumed; step to the next row start */
+        p += (width - k);
+    }
+
+    /* fold the 16 lanes so that byte 0 holds the overall minimum */
+    xmm1 = _mm_srli_si128(mmin, 8);
+    mmin = _mm_min_epu8(mmin, xmm1);
+    xmm1 = _mm_srli_si128(mmin, 4);
+    mmin = _mm_min_epu8(mmin, xmm1);
+    xmm1 = _mm_srli_si128(mmin, 2);
+    mmin = _mm_min_epu8(mmin, xmm1);
+    xmm1 = _mm_srli_si128(mmin, 1);
+    mmin = _mm_min_epu8(mmin, xmm1);
+    unsigned char mini = (unsigned char)_mm_extract_epi16(mmin, 0);
+
+    /* same fold for the maximum */
+    xmm1 = _mm_srli_si128(mmax, 8);
+    mmax = _mm_max_epu8(mmax, xmm1);
+    xmm1 = _mm_srli_si128(mmax, 4);
+    mmax = _mm_max_epu8(mmax, xmm1);
+    xmm1 = _mm_srli_si128(mmax, 2);
+    mmax = _mm_max_epu8(mmax, xmm1);
+    xmm1 = _mm_srli_si128(mmax, 1);
+    mmax = _mm_max_epu8(mmax, xmm1);
+    unsigned char maxi = (unsigned char)_mm_extract_epi16(mmax, 0);
+
+    return (maxi-mini)/(maxi+mini+0.1); // +0.1 to avoid division by 0
+}
+#endif
+
/**
calculates Michelson-contrast in the given small part of the given image
@@ -366,7 +555,7 @@
unsigned char maxi = 0;
p = I + ((field->x - s2) + (field->y - s2)*width)*bytesPerPixel;
- // TODO: use some mmx or sse stuff here
+
for (j = 0; j < field->size; j++){
for (k = 0; k < field->size * bytesPerPixel; k++) {
mini = (mini < *p) ? mini : *p;
@@ -1000,7 +1189,9 @@
// shift and size: shakiness 1: height/40; 10: height/4
sd->maxshift = TC_MAX(4,(TC_MIN(sd->width,
sd->height)*sd->shakiness)/40);
sd->field_size = TC_MAX(4,(TC_MIN(sd->width,
sd->height)*sd->shakiness)/40);
-
+#if defined(USE_SSE2_CMP) || defined(USE_SSE2_YUV_CONTRAST)
+ sd->field_size = (sd->field_size / 16 + 1) * 16; //must be multiple of
16 pixels for SSE2
+#endif
tc_log_info(MOD_NAME, "Fieldsize: %i, Maximal translation: %i pixel",
sd->field_size, sd->maxshift);
if (sd->algo==1) {