This gets rid of the variable-length scratch buffer by filtering 16
pixels at a time and writing the result directly to the destination.
The extra loads of the source values this requires are compensated
for by avoiding a round-trip to memory before the shift.
Signed-off-by: Mans Rullgard
---
libswscale/ppc/swscale_altivec.c | 154 +--
1 file changed, 65 insertions(+), 89 deletions(-)
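
[Note for review, not part of the patch.] A minimal scalar sketch of the
per-pixel arithmetic the new 16-pixel AltiVec path vectorises, assuming the
same semantics as the generic C scaler (dither bias shifted left by 12,
arithmetic shift right by 19, clamp to 8 bits); the function name and the
exact indexing convention of the helper are illustrative only:

#include <stdint.h>

/* Illustrative scalar reference for one 16-pixel block (not the patch's
 * code): accumulate the filter taps, shift out the 19 fractional bits,
 * clamp to [0,255] and store straight into dest, with no scratch buffer.
 * dest is assumed to already point at the current 16-pixel block. */
static void yuv2planeX_16_ref(const int16_t *filter, int filterSize,
                              const int16_t **src, uint8_t *dest,
                              const uint8_t *dither, int offset, int x)
{
    int i, j;

    for (i = 0; i < 16; i++) {
        int val = dither[(i + offset) & 7] << 12; /* assumed dither bias */

        for (j = 0; j < filterSize; j++)
            val += src[j][x + i] * filter[j];

        val >>= 19;
        dest[i] = val < 0 ? 0 : (val > 255 ? 255 : val);
    }
}
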
diff --git a/libswscale/ppc/swscale_altivec.c b/libswscale/ppc/swscale_altivec.c
index 0e66ec1..9aba120 100644
--- a/libswscale/ppc/swscale_altivec.c
+++ b/libswscale/ppc/swscale_altivec.c
@@ -32,78 +32,37 @@
#define vzero vec_splat_s32(0)
-static inline void altivec_packIntArrayToCharArray(int *val, uint8_t *dest,
-                                                    int dstW)
+#define yuv2planeX_8(d1, d2, l1, src, x, perm, filter) do { \
+        vector signed short l2 = vec_ld(((x) << 1) + 16, src); \
+        vector signed short ls = vec_perm(l1, l2, perm); \
+        vector signed int i1 = vec_mule(filter, ls); \
+        vector signed int i2 = vec_mulo(filter, ls); \
+        vector signed int vf1 = vec_mergeh(i1, i2); \
+        vector signed int vf2 = vec_mergel(i1, i2); \
+        d1 = vec_add(d1, vf1); \
+        d2 = vec_add(d2, vf2); \
+        l1 = l2; \
+    } while (0)
+
+static void yuv2planeX_16_altivec(const int16_t *filter, int filterSize,
+                                  const int16_t **src, uint8_t *dest,
+                                  const uint8_t *dither, int offset, int x)
 {
-    register int i;
+    register int i, j;
+    DECLARE_ALIGNED(16, int, val)[16];
+    vector signed int vo1, vo2, vo3, vo4;
+    vector unsigned short vs1, vs2;
+    vector unsigned char vf;
     vector unsigned int altivec_vectorShiftInt19 =
         vec_add(vec_splat_u32(10), vec_splat_u32(9));
-    if ((uintptr_t)dest % 16) {
-        /* badly aligned store, we force store alignment */
-        /* and will handle load misalignment on val w/ vec_perm */
-        vector unsigned char perm1;
-        vector signed int v1;
-        for (i = 0; (i < dstW) &&
-             (((uintptr_t)dest + i) % 16); i++) {
-            int t = val[i] >> 19;
-            dest[i] = (t < 0) ? 0 : ((t > 255) ? 255 : t);
-        }
-        perm1 = vec_lvsl(i << 2, val);
-        v1 = vec_ld(i << 2, val);
-        for (; i < (dstW - 15); i += 16) {
-            int offset = i << 2;
-            vector signed int v2 = vec_ld(offset + 16, val);
-            vector signed int v3 = vec_ld(offset + 32, val);
-            vector signed int v4 = vec_ld(offset + 48, val);
-            vector signed int v5 = vec_ld(offset + 64, val);
-            vector signed int v12 = vec_perm(v1, v2, perm1);
-            vector signed int v23 = vec_perm(v2, v3, perm1);
-            vector signed int v34 = vec_perm(v3, v4, perm1);
-            vector signed int v45 = vec_perm(v4, v5, perm1);
-
-            vector signed int vA = vec_sra(v12, altivec_vectorShiftInt19);
-            vector signed int vB = vec_sra(v23, altivec_vectorShiftInt19);
-            vector signed int vC = vec_sra(v34, altivec_vectorShiftInt19);
-            vector signed int vD = vec_sra(v45, altivec_vectorShiftInt19);
-            vector unsigned short vs1 = vec_packsu(vA, vB);
-            vector unsigned short vs2 = vec_packsu(vC, vD);
-            vector unsigned char vf = vec_packsu(vs1, vs2);
-            vec_st(vf, i, dest);
-            v1 = v5;
-        }
-    } else { // dest is properly aligned, great
-        for (i = 0; i < (dstW - 15); i += 16) {
-            int offset = i << 2;
-            vector signed int v1 = vec_ld(offset, val);
-            vector signed int v2 = vec_ld(offset + 16, val);
-            vector signed int v3 = vec_ld(offset + 32, val);
-            vector signed int v4 = vec_ld(offset + 48, val);
-            vector signed int v5 = vec_sra(v1, altivec_vectorShiftInt19);
-            vector signed int v6 = vec_sra(v2, altivec_vectorShiftInt19);
-            vector signed int v7 = vec_sra(v3, altivec_vectorShiftInt19);
-            vector signed int v8 = vec_sra(v4, altivec_vectorShiftInt19);
-            vector unsigned short vs1 = vec_packsu(v5, v6);
-            vector unsigned short vs2 = vec_packsu(v7, v8);
-            vector unsigned char vf = vec_packsu(vs1, vs2);
-            vec_st(vf, i, dest);
-        }
-    }
-    for (; i < dstW; i++) {
-        int t = val[i] >> 19;
-        dest[i] = (t < 0) ? 0 : ((t > 255) ? 255 : t);
-    }
-}
-// FIXME remove the usage of scratch buffers.
-static void yuv2planeX_altivec(const int16_t *filter, int filterSize,
-                               const int16_t **src, uint8_t *dest, int dstW,
-                               const uint8_t *d