Author: post
Date: 2012-02-13 18:08:23 +0100 (Mon, 13 Feb 2012)
New Revision: 4133
Modified:
trunk/plugins/dcp/dcp-sse4.c
Log:
Prefer pinsrq for loading SSE4 quadword when looking up.
Modified: trunk/plugins/dcp/dcp-sse4.c
===================================================================
--- trunk/plugins/dcp/dcp-sse4.c 2012-02-13 16:31:58 UTC (rev 4132)
+++ trunk/plugins/dcp/dcp-sse4.c 2012-02-13 17:08:23 UTC (rev 4133)
@@ -37,6 +37,13 @@
static gfloat _16_bit_ps[4] __attribute__ ((aligned (16))) = {65535.0,
65535.0, 65535.0, 65535.0};
+/* Insert two floats into high qword of register using pinsrq, GCC doesn't
detect this */
+static inline __m128
+_mm_insert_two_high(__m128 low, const gfloat* pos)
+{
+ return _mm_castsi128_ps(_mm_insert_epi64(_mm_castps_si128(low), *(long
long*)pos, 1));
+}
+
static inline __m128
sse_matrix3_mul(float* mul, __m128 a, __m128 b, __m128 c)
{
@@ -232,17 +239,17 @@
{
/* Convert v to lookup values and interpolate */
__m128 mul = _mm_mul_ps(value, _mm_load_ps(_thousand_24_ps));
- __m128i lookup = _mm_cvtps_epi32(mul);
+ __m128i lookup = _mm_slli_epi32(_mm_cvtps_epi32(mul),1);
/* Calculate fractions */
__m128 frac = _mm_sub_ps(mul, _mm_floor_ps(mul));
__m128 inv_frac = _mm_sub_ps(_mm_load_ps(_ones_ps), frac);
/* Load two adjacent curve values and interpolate between them */
- __m128 p0p1 =
_mm_castsi128_ps(_mm_loadl_epi64((__m128i*)&tone_lut[_mm_extract_epi32(lookup,0)*2]));
- __m128 p2p3 =
_mm_castsi128_ps(_mm_loadl_epi64((__m128i*)&tone_lut[_mm_extract_epi32(lookup,2)*2]));
- p0p1 = _mm_loadh_pi(p0p1,
(__m64*)&tone_lut[_mm_extract_epi32(lookup,1)*2]);
- p2p3 = _mm_loadh_pi(p2p3,
(__m64*)&tone_lut[_mm_extract_epi32(lookup,3)*2]);
+ __m128 p0p1 =
_mm_castsi128_ps(_mm_loadl_epi64((__m128i*)&tone_lut[_mm_extract_epi32(lookup,0)]));
+ __m128 p2p3 =
_mm_castsi128_ps(_mm_loadl_epi64((__m128i*)&tone_lut[_mm_extract_epi32(lookup,2)]));
+ p0p1 = _mm_insert_two_high(p0p1,
&tone_lut[_mm_extract_epi32(lookup,1)]);
+ p2p3 = _mm_insert_two_high(p2p3,
&tone_lut[_mm_extract_epi32(lookup,3)]);
/* Pack all lower values in v0, high in v1 and interpolate */
__m128 v0 = _mm_shuffle_ps(p0p1, p2p3, _MM_SHUFFLE(2,0,2,0));
@@ -333,6 +340,7 @@
#define SETFLOAT4_SAME(N, A) float N[4] __attribute__ ((aligned (16))); \
N[0] = A; N[1] = A; N[2] = A; N[3] = A;
+
static gfloat _twofiftysix_ps[4] __attribute__ ((aligned (16))) =
{255.9999f,255.9999f,255.9999f,255.9999f};
@@ -566,8 +574,8 @@
/* Load two adjacent curve values and
interpolate between them */
__m128 p0p1 =
_mm_castsi128_ps(_mm_loadl_epi64((__m128i*)&dcp->curve_samples[_mm_extract_epi32(lookup,0)]));
__m128 p2p3 =
_mm_castsi128_ps(_mm_loadl_epi64((__m128i*)&dcp->curve_samples[_mm_extract_epi32(lookup,2)]));
- p0p1 = _mm_loadh_pi(p0p1,
(__m64*)&dcp->curve_samples[_mm_extract_epi32(lookup,1)]);
- p2p3 = _mm_loadh_pi(p2p3,
(__m64*)&dcp->curve_samples[_mm_extract_epi32(lookup,3)]);
+ p0p1 = _mm_insert_two_high(p0p1,
&dcp->curve_samples[_mm_extract_epi32(lookup,1)]);
+ p2p3 = _mm_insert_two_high(p2p3,
&dcp->curve_samples[_mm_extract_epi32(lookup,3)]);
/* Pack all lower values in v0, high in v1 and
interpolate */
__m128 v0 = _mm_shuffle_ps(p0p1, p2p3,
_MM_SHUFFLE(2,0,2,0));
_______________________________________________
Rawstudio-commit mailing list
[email protected]
http://rawstudio.org/cgi-bin/mailman/listinfo/rawstudio-commit