diff -Nru --exclude=CVS --exclude=flac.pbproj flac-1.1.0/configure.in flac/configure.in
--- flac-1.1.0/configure.in	Wed Jan 15 14:56:07 2003
+++ flac/configure.in	Tue Mar 30 15:45:37 2004
@@ -208,6 +208,18 @@
 AC_DEFINE(FLAC__USE_3DNOW)
 fi
 
+AC_ARG_ENABLE(altivec,
+[  --disable-altivec              Disable Altivec optimizations],
+[case "${enableval}" in
+	yes) use_altivec=true ;;
+	no)  use_altivec=false ;;
+	*) AC_MSG_ERROR(bad value ${enableval} for --enable-altivec) ;;
+esac],[use_altivec=true])
+AM_CONDITIONAL(FLaC__USE_ALTIVEC, test x$use_altivec = xtrue)
+if test x$use_altivec = xtrue ; then
+AC_DEFINE(FLAC__USE_ALTIVEC)
+fi
+
 AC_ARG_ENABLE(local-xmms-plugin,
 [  --enable-local-xmms-plugin     Install XMMS plugin to ~/.xmms/Plugins instead of system location],
 [case "${enableval}" in
@@ -380,6 +392,7 @@
 AH_TEMPLATE(FLAC__NO_ASM,  [define to disable use of assembly code])
 AH_TEMPLATE(FLAC__SSE_OS,  [define if your operating system supports SSE instructions])
 AH_TEMPLATE(FLAC__USE_3DNOW,  [define to enable use of 3Dnow! instructions])
+AH_TEMPLATE(FLAC__USE_ALTIVEC,  [define to enable use of Altivec instructions])
 AH_TEMPLATE(ID3LIB_MAJOR,  [define to major version number of id3lib])
 AH_TEMPLATE(ID3LIB_MINOR,  [define to minor version number of id3lib])
 AH_TEMPLATE(ID3LIB_PATCH,  [define to patch level of id3lib])
@@ -388,6 +401,7 @@
 	Makefile \
 	src/Makefile \
 	src/libFLAC/Makefile \
+	src/libFLAC/ppc/Makefile \
 	src/libFLAC/ia32/Makefile \
 	src/libFLAC/include/Makefile \
 	src/libFLAC/include/private/Makefile \
diff -Nru --exclude=CVS --exclude=flac.pbproj flac-1.1.0/src/libFLAC/Makefile.am flac/src/libFLAC/Makefile.am
--- flac-1.1.0/src/libFLAC/Makefile.am	Sat Jan 25 12:25:17 2003
+++ flac/src/libFLAC/Makefile.am	Fri Apr  2 11:12:35 2004
@@ -24,6 +24,13 @@
 
 if FLaC__NO_ASM
 else
+if FLaC__CPU_PPC
+ARCH_SUBDIRS = ppc
+# libtool links the objects in twice if the convenience .la below is used.
+# Oh well, just listing the .lo objects directly works well enough.
+# libFLAC_la_LIBADD = ppc/libFLAC-altivec.la
+libFLAC_la_LIBADD = ppc/fixed_altivec.lo ppc/lpc_altivec.lo ppc/cpu.lo
+endif
 if FLaC__CPU_IA32
 if FLaC__HAS_NASM
 ARCH_SUBDIRS = ia32
diff -Nru --exclude=CVS --exclude=flac.pbproj flac-1.1.0/src/libFLAC/cpu.c flac/src/libFLAC/cpu.c
--- flac-1.1.0/src/libFLAC/cpu.c	Thu Jan  2 01:18:54 2003
+++ flac/src/libFLAC/cpu.c	Wed Sep 15 03:20:08 2004
@@ -25,6 +25,8 @@
 #include <config.h>
 #endif
 
+const unsigned FLAC__CPUINFO_PPC_ALTIVEC = 0x00000001;
+
 const unsigned FLAC__CPUINFO_IA32_CPUID_CMOV = 0x00008000;
 const unsigned FLAC__CPUINFO_IA32_CPUID_MMX = 0x00800000;
 const unsigned FLAC__CPUINFO_IA32_CPUID_FXSR = 0x01000000;
@@ -35,9 +37,25 @@
 const unsigned FLAC__CPUINFO_IA32_CPUID_EXTENDED_AMD_EXT3DNOW = 0x40000000;
 const unsigned FLAC__CPUINFO_IA32_CPUID_EXTENDED_AMD_EXTMMX = 0x00400000;
 
-
 void FLAC__cpu_info(FLAC__CPUInfo *info)
 {
+	info->type = FLAC__CPUINFO_TYPE_UNKNOWN;
+	info->use_asm = false;
+
+#ifdef FLAC__CPU_PPC
+	info->type = FLAC__CPUINFO_TYPE_PPC;
+	{
+		unsigned cpuinfo = FLAC__cpu_info_ppc();
+		info->data.ppc.altivec = (cpuinfo & FLAC__CPUINFO_PPC_ALTIVEC) ? true : false;
+#ifndef FLAC__NO_ASM
+		info->use_asm = true;
+#else
+		info->use_asm = false;
+#endif
+	}
+#else
+#endif
+
 #ifdef FLAC__CPU_IA32
 	info->type = FLAC__CPUINFO_TYPE_IA32;
 #if !defined FLAC__NO_ASM && defined FLAC__HAS_NASM
@@ -66,8 +84,5 @@
 #else
 	info->use_asm = false;
 #endif
-#else
-	info->type = FLAC__CPUINFO_TYPE_UNKNOWN;
-	info->use_asm = false;
 #endif
 }
diff -Nru --exclude=CVS --exclude=flac.pbproj flac-1.1.0/src/libFLAC/include/private/cpu.h flac/src/libFLAC/include/private/cpu.h
--- flac-1.1.0/src/libFLAC/include/private/cpu.h	Thu Jan  2 01:18:55 2003
+++ flac/src/libFLAC/include/private/cpu.h	Tue Mar 30 09:27:38 2004
@@ -28,9 +28,17 @@
 
 typedef enum {
 	FLAC__CPUINFO_TYPE_IA32,
+	FLAC__CPUINFO_TYPE_PPC,
 	FLAC__CPUINFO_TYPE_UNKNOWN
 } FLAC__CPUInfo_Type;
 
+
+typedef struct {
+	FLAC__bool altivec;
+} FLAC__CPUInfo_PPC;
+
+extern const unsigned FLAC__CPUINFO_PPC_ALTIVEC;
+
 typedef struct {
 	FLAC__bool cmov;
 	FLAC__bool mmx;
@@ -56,6 +64,7 @@
 	FLAC__bool use_asm;
 	FLAC__CPUInfo_Type type;
 	union {
+		FLAC__CPUInfo_PPC ppc;
 		FLAC__CPUInfo_IA32 ia32;
 	} data;
 } FLAC__CPUInfo;
@@ -63,6 +72,9 @@
 void FLAC__cpu_info(FLAC__CPUInfo *info);
 
 #ifndef FLAC__NO_ASM
+#ifdef FLAC__CPU_PPC
+unsigned FLAC__cpu_info_ppc(void);
+#endif
 #ifdef FLAC__CPU_IA32
 #ifdef FLAC__HAS_NASM
 unsigned FLAC__cpu_info_asm_ia32();
diff -Nru --exclude=CVS --exclude=flac.pbproj flac-1.1.0/src/libFLAC/include/private/fixed.h flac/src/libFLAC/include/private/fixed.h
--- flac-1.1.0/src/libFLAC/include/private/fixed.h	Thu Jan  2 01:18:55 2003
+++ flac/src/libFLAC/include/private/fixed.h	Wed Mar 31 04:58:45 2004
@@ -40,6 +40,9 @@
  */
 unsigned FLAC__fixed_compute_best_predictor(const FLAC__int32 data[], unsigned data_len, FLAC__real residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
 #ifndef FLAC__NO_ASM
+#ifdef FLAC__CPU_PPC
+unsigned FLAC__fixed_compute_best_predictor_altivec(const FLAC__int32 data[], unsigned data_len, FLAC__real residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
+#endif
 #ifdef FLAC__CPU_IA32
 #ifdef FLAC__HAS_NASM
 unsigned FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov(const FLAC__int32 data[], unsigned data_len, FLAC__real residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
diff -Nru --exclude=CVS --exclude=flac.pbproj flac-1.1.0/src/libFLAC/include/private/lpc.h flac/src/libFLAC/include/private/lpc.h
--- flac-1.1.0/src/libFLAC/include/private/lpc.h	Thu Jan  2 01:18:55 2003
+++ flac/src/libFLAC/include/private/lpc.h	Wed Sep 15 03:14:36 2004
@@ -40,6 +40,9 @@
  */
 void FLAC__lpc_compute_autocorrelation(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
 #ifndef FLAC__NO_ASM
+#ifdef FLAC__CPU_PPC
+void FLAC__lpc_compute_autocorrelation_altivec(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
+#endif
 #ifdef FLAC__CPU_IA32
 #ifdef FLAC__HAS_NASM
 void FLAC__lpc_compute_autocorrelation_asm_ia32(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
@@ -109,6 +112,10 @@
 void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 data[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]);
 void FLAC__lpc_compute_residual_from_qlp_coefficients_wide(const FLAC__int32 data[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]);
 #ifndef FLAC__NO_ASM
+#ifdef FLAC__CPU_PPC
+void FLAC__lpc_compute_residual_from_qlp_coefficients_16bit_altivec(const FLAC__int32 data[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]);
+void FLAC__lpc_compute_residual_from_qlp_coefficients_altivec(const FLAC__int32 data[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]);
+#endif
 #ifdef FLAC__CPU_IA32
 #ifdef FLAC__HAS_NASM
 void FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32(const FLAC__int32 data[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]);
@@ -135,6 +142,10 @@
 void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
 void FLAC__lpc_restore_signal_wide(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
 #ifndef FLAC__NO_ASM
+#ifdef FLAC__CPU_PPC
+void FLAC__lpc_restore_signal_altivec(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
+void FLAC__lpc_restore_signal_16bit_altivec(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
+#endif
 #ifdef FLAC__CPU_IA32
 #ifdef FLAC__HAS_NASM
 void FLAC__lpc_restore_signal_asm_ia32(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
diff -Nru --exclude=CVS --exclude=flac.pbproj flac-1.1.0/src/libFLAC/ppc/Makefile.am flac/src/libFLAC/ppc/Makefile.am
--- flac-1.1.0/src/libFLAC/ppc/Makefile.am	Wed Dec 31 18:00:00 1969
+++ flac/src/libFLAC/ppc/Makefile.am	Fri Apr  9 01:06:46 2004
@@ -0,0 +1,11 @@
+
+# -W flag is not good with -faltivec
+CFLAGS = -faltivec `echo @CFLAGS@ | sed -e 's|-W ||'` -Wno-uninitialized -I$(srcdir)/../include
+
+noinst_LTLIBRARIES = libFLAC-altivec.la
+
+libFLAC_altivec_la_SOURCES = \
+	lpc_altivec.c \
+	fixed_altivec.c \
+	cpu.c \
+	altivec.h
diff -Nru --exclude=CVS --exclude=flac.pbproj flac-1.1.0/src/libFLAC/ppc/altivec.h flac/src/libFLAC/ppc/altivec.h
--- flac-1.1.0/src/libFLAC/ppc/altivec.h	Wed Dec 31 18:00:00 1969
+++ flac/src/libFLAC/ppc/altivec.h	Wed Sep 15 03:59:10 2004
@@ -0,0 +1,68 @@
+/*
+ * The defines here are derived from Apple's examples at:
+ * http://developer.apple.com/hardware/ve/algorithms.html
+ */
+#ifndef _ALTIVEC_H_
+#define _ALTIVEC_H_
+
+#include <machine/types.h>
+
+typedef vector signed char vs8;
+typedef vector signed short vs16;
+typedef vector signed int vs32;
+typedef vector unsigned char vu8;
+typedef vector unsigned short vu16;
+typedef vector unsigned int vu32;
+typedef vector float vf32;
+
+/* Check the relative or absolute alignment of vectors.
+ */
+#define VecRelAligned(a, b) (!((uintptr_t)(a) - (uintptr_t)(b) & 15))
+#define VecAligned(a) (!((uintptr_t)(a) & 15))
+
+/* Round addresses to the nearest vector.
+ */
+#define VecRoundUp(a) ((__typeof__(a))(((uintptr_t)(a) + 15) & ~15))
+#define VecRoundDown(a) ((__typeof__(a))((uintptr_t)(a) & ~15))
+
+#define a16 __attribute__ ((aligned (16)))
+
+/* Load the value at address a into every element of vector v.
+ * Note: a must be naturally aligned.
+ */
+#define VecLoad4(v, a) \
+{ \
+	vu8 s = vec_lvsl(0, a); \
+	v = vec_lde(0, a); \
+	s = (vu8)vec_splat((__typeof__(v))s, 0); \
+	v = vec_perm(v, v, s); \
+}
+
+/* Element i of the result r is the sum across the i-th input vector (the inputs are clobbered).
+ */
+#define VecAddAcross4(r, a, b, c, d) \
+{ \
+	__typeof__(r) ac01, ac23, bd01, bd23; \
+	ac01 = vec_mergeh(a, c); \
+	ac23 = vec_mergel(a, c); \
+	bd01 = vec_mergeh(b, d); \
+	bd23 = vec_mergel(b, d); \
+	b = vec_add(ac01, ac23); \
+	d = vec_add(bd01, bd23); \
+	a = vec_mergeh(b, d); \
+	c = vec_mergel(b, d); \
+	r = vec_add(a, c); \
+}
+
+/* Each element of the result vector r is the sum across the vector v.
+ */
+#define VecAddAcross1(r, v) \
+{ \
+	__typeof__(v) t; \
+	t = vec_sld(v, v, 8); \
+	r = vec_add(t, v); \
+	t = vec_sld(r, r, 4); \
+	r = vec_add(t, r); \
+}
+
+#endif
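
Not part of the patch: a minimal usage sketch of the helpers above, assuming Apple-style vector literals and compilation with -faltivec.  sum_u32() and out are hypothetical names; p is assumed to be 16-byte aligned and n a multiple of 4.

#include "altivec.h"

static unsigned int sum_u32(const unsigned int *p, unsigned int n)
{
    vu32 acc = (vu32)(0);
    unsigned int a16 out[4];
    unsigned int i;

    /* accumulate 4 elements per iteration */
    for (i = 0; i < n; i += 4, p += 4)
        acc = vec_add(acc, vec_ld(0, p));
    VecAddAcross1(acc, acc);    /* every element now holds the total */
    vec_st(acc, 0, out);
    return out[0];
}
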
diff -Nru --exclude=CVS --exclude=flac.pbproj flac-1.1.0/src/libFLAC/ppc/cpu.c flac/src/libFLAC/ppc/cpu.c
--- flac-1.1.0/src/libFLAC/ppc/cpu.c	Wed Dec 31 18:00:00 1969
+++ flac/src/libFLAC/ppc/cpu.c	Fri Apr  9 01:03:39 2004
@@ -0,0 +1,41 @@
+/*
+ * This function is derived from Apple's example code at:
+ * http://developer.apple.com/hardware/ve/g3_compatibility.html
+ */
+#include <signal.h>
+#include <setjmp.h>
+#include "private/cpu.h"
+
+static volatile int testing, flags;
+static sigjmp_buf env;
+
+static void sig_ill_handler(int sig)
+{
+	flags &= ~testing;
+	siglongjmp(env, 0);
+}
+
+unsigned FLAC__cpu_info_ppc(void)
+{
+	sigset_t signame;
+	struct sigaction sa_new, sa_old;
+	
+	sigemptyset(&signame);
+	sigaddset(&signame, SIGILL);
+	
+	sa_new.sa_handler = sig_ill_handler;
+	sa_new.sa_mask = signame;
+	sa_new.sa_flags = 0;
+	
+	sigaction(SIGILL, &sa_new, &sa_old);
+	
+	flags = FLAC__CPUINFO_PPC_ALTIVEC;
+	
+	testing = FLAC__CPUINFO_PPC_ALTIVEC;
+	if (!sigsetjmp(env, 0))
+		asm volatile ("vor v0, v0, v0");
+	
+	sigaction(SIGILL, &sa_old, &sa_new);
+	
+	return flags;
+}
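
For reference (not part of the patch), this is how a caller might consume the probe.  FLAC__cpu_info_ppc() and FLAC__CPUINFO_PPC_ALTIVEC are the symbols added by this patch; have_altivec() is a hypothetical helper, and FLAC__CPU_PPC is assumed to be defined.

#include "private/cpu.h"

/* Returns non-zero when the "vor" probe above did not raise SIGILL,
 * i.e. the handler left the AltiVec bit set in flags. */
static int have_altivec(void)
{
    return (FLAC__cpu_info_ppc() & FLAC__CPUINFO_PPC_ALTIVEC) != 0;
}
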
diff -Nru --exclude=CVS --exclude=flac.pbproj flac-1.1.0/src/libFLAC/ppc/fixed_altivec.c flac/src/libFLAC/ppc/fixed_altivec.c
--- flac-1.1.0/src/libFLAC/ppc/fixed_altivec.c	Wed Dec 31 18:00:00 1969
+++ flac/src/libFLAC/ppc/fixed_altivec.c	Wed Sep 15 03:59:10 2004
@@ -0,0 +1,113 @@
+#include <math.h>
+
+#include "FLAC/ordinals.h"
+#include "FLAC/format.h"
+#include "FLAC/assert.h"
+
+#include "private/fixed.h"
+#include "altivec.h"
+
+unsigned FLAC__fixed_compute_best_predictor_altivec(const FLAC__int32 data[], unsigned data_len, FLAC__real residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1])
+{
+    float a16 float_data_len = (float)data_len;
+    unsigned order;
+    vu32 te0, te1;
+    
+    FLAC__ASSERT(VecAligned(data));
+    FLAC__ASSERT(VecAligned(residual_bits_per_sample));
+    FLAC__ASSERT(FLAC__MAX_FIXED_ORDER == 4);
+
+    {
+        vs32 e0, e1, e2, e3, e4, le0, le1, le2, le3;
+        vu32 te2, te3, te4, zero = (vu32)(0);
+        FLAC__int32 a16 last_error[4];
+        const FLAC__int32 *end = data + data_len;
+        
+        last_error[0] = data[-1];
+        last_error[1] = data[-1] - data[-2];
+        last_error[2] = last_error[1] - (data[-2] - data[-3]);
+        last_error[3] = last_error[2] - (data[-2] - 2 * data[-3] + data[-4]);
+        
+        le0 = vec_ld(0, last_error);
+        le3 = vec_splat(le0, 3);
+        le2 = vec_splat(le0, 2);
+        le1 = vec_splat(le0, 1);
+        le0 = vec_splat(le0, 0);
+        
+        te0 = te1 = te2 = te3 = te4 = zero;
+        e0 = vec_ld(0, data);
+        while (data < end) {
+            e1 = vec_sub(e0, vec_sld(le0, e0, 12));
+            e2 = vec_sub(e1, vec_sld(le1, e1, 12));
+            e3 = vec_sub(e2, vec_sld(le2, e2, 12));
+            e4 = vec_sub(e3, vec_sld(le3, e3, 12));
+            te0 = vec_add((vu32)vec_abs(e0), te0);
+            te1 = vec_add((vu32)vec_abs(e1), te1);
+            te2 = vec_add((vu32)vec_abs(e2), te2);
+            te3 = vec_add((vu32)vec_abs(e3), te3);
+            te4 = vec_add((vu32)vec_abs(e4), te4);
+            data += 4;
+            le0 = e0; le1 = e1; le2 = e2; le3 = e3;
+            e0 = vec_ld(0, data);
+        }
+        /* If the end was not vector aligned, some calculations from the final
+         * loop must be undone.  Note that e0 is overwritten above; use le0.
+         */
+        if (data > end) {
+            vs32 m = vec_perm((vs32)(0), (vs32)(-1), vec_lvsr(0, end));
+            
+            te0 = vec_sub(te0, (vu32)vec_abs(vec_and(le0, m)));
+            te1 = vec_sub(te1, (vu32)vec_abs(vec_and(e1, m)));
+            te2 = vec_sub(te2, (vu32)vec_abs(vec_and(e2, m)));
+            te3 = vec_sub(te3, (vu32)vec_abs(vec_and(e3, m)));
+            te4 = vec_sub(te4, (vu32)vec_abs(vec_and(e4, m)));
+        }
+
+        VecAddAcross4(te0, te0, te1, te2, te3);
+        VecAddAcross1(te1, te4); /* leaves result in all elements */
+        if (vec_all_le(te1, te0))
+            order = 4;
+        else if (vec_all_le(vec_splat(te0, 0), te0))
+            order = 0;
+        else if (vec_all_le(vec_splat(te0, 1), te0))
+            order = 1;
+        else if (vec_all_le(vec_splat(te0, 2), te0))
+            order = 2;
+        else
+            order = 3;
+    }    
+#if 1 /* Altivec log estimate should be accurate enough here. */
+    {
+        vf32 te03, te4, r03, r4, zero = (vf32)(0), mln2 = (vf32)(M_LN2);
+
+        /* r4 = reciprocal of data_len, with one refinement */
+        r4 = vec_splat(vec_ld(0, &float_data_len), 0);
+        te4 = vec_re(r4);
+        r4 = vec_madd(vec_nmsub(te4, r4, (vf32)(1)), te4, te4);
+        
+        te03 = vec_ctf(te0, 0);
+        te4 = vec_ctf(te1, 0);
+        
+        r03 = vec_madd(vec_madd(te03, r4, zero), mln2, zero);
+        r03 = vec_sel(zero, vec_loge(r03), vec_cmpgt(te03, zero));
+        vec_st(r03, 0, residual_bits_per_sample);
+        
+        r4 = vec_madd(vec_madd(vec_ctf(te1, 0), r4, zero), mln2, zero);
+        r4 = vec_sel(zero, vec_loge(r4), vec_cmpgt(te4, zero));
+        vec_ste(r4, 16, residual_bits_per_sample);
+    }
+#else /* Use log(). */
+    {
+        FLAC__uint32 a16 te[5];
+        
+        vec_st(te0, 0, te);
+        vec_ste(te1, 16, te);
+        residual_bits_per_sample[0] = (float)((te[0] > 0) ? log(M_LN2 * (double)te[0] / (double)data_len) / M_LN2 : 0.0);
+        residual_bits_per_sample[1] = (float)((te[1] > 0) ? log(M_LN2 * (double)te[1] / (double)data_len) / M_LN2 : 0.0);
+        residual_bits_per_sample[2] = (float)((te[2] > 0) ? log(M_LN2 * (double)te[2] / (double)data_len) / M_LN2 : 0.0);
+        residual_bits_per_sample[3] = (float)((te[3] > 0) ? log(M_LN2 * (double)te[3] / (double)data_len) / M_LN2 : 0.0);
+        residual_bits_per_sample[4] = (float)((te[4] > 0) ? log(M_LN2 * (double)te[4] / (double)data_len) / M_LN2 : 0.0);
+    }
+#endif
+    return(order);
+}
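
As a cross-check (not part of the patch), here is a scalar sketch of the order selection the vector loop above performs, mirroring libFLAC's generic FLAC__fixed_compute_best_predictor(): per-order sums of absolute fixed-predictor errors, smallest sum wins.  best_fixed_order() is a hypothetical name; like the code above it assumes data[-4..-1] are valid warm-up samples and uses 32-bit accumulators.

#include <stdlib.h>

#include "FLAC/ordinals.h"

static unsigned best_fixed_order(const FLAC__int32 *data, unsigned data_len)
{
    FLAC__uint32 total[5] = { 0, 0, 0, 0, 0 };
    const FLAC__int32 *end = data + data_len;
    unsigned o, best = 0;

    for (; data < end; data++) {
        /* e<k> is the k-th order difference of the signal at this sample */
        FLAC__int32 e0 = data[0];
        FLAC__int32 e1 = e0 - data[-1];
        FLAC__int32 e2 = e1 - (data[-1] - data[-2]);
        FLAC__int32 e3 = e2 - (data[-1] - 2*data[-2] + data[-3]);
        FLAC__int32 e4 = e3 - (data[-1] - 3*data[-2] + 3*data[-3] - data[-4]);

        total[0] += abs(e0); total[1] += abs(e1); total[2] += abs(e2);
        total[3] += abs(e3); total[4] += abs(e4);
    }
    for (o = 1; o < 5; o++)
        if (total[o] < total[best])
            best = o;
    return best;
}
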
diff -Nru --exclude=CVS --exclude=flac.pbproj flac-1.1.0/src/libFLAC/ppc/lpc_altivec.c flac/src/libFLAC/ppc/lpc_altivec.c
--- flac-1.1.0/src/libFLAC/ppc/lpc_altivec.c	Wed Dec 31 18:00:00 1969
+++ flac/src/libFLAC/ppc/lpc_altivec.c	Wed Sep 15 03:59:10 2004
@@ -0,0 +1,631 @@
+#include <unistd.h>
+
+#include "FLAC/assert.h"
+#include "FLAC/ordinals.h"
+
+#include "private/lpc.h"
+#include "altivec.h"
+
+void FLAC__lpc_compute_residual_from_qlp_coefficients_16bit_altivec(const FLAC__int32 data[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
+{
+    const FLAC__int32 *h, *scalar_end, *end = data + data_len;
+    int sum, j;
+
+    FLAC__ASSERT(VecRelAligned(data, residual));
+    FLAC__ASSERT(VecAligned(qlp_coeff));
+    FLAC__ASSERT(order > 0);
+
+    scalar_end = (order > 12) ? end : VecRoundUp(data);
+
+    /* This is the scalar algorithm */
+    while (data < scalar_end) {
+        sum = 0; h = data;
+        for (j = 0; j < order; j++)
+            sum += qlp_coeff[j] * *(--h);
+        *(residual++) = *(data++) - (sum >> lp_quantization);
+    }
+    
+    /* If there is still data, proceed with the vector algorithm */
+    if (data < end) {
+        static void *oa[] = { &&o12, &&o34, &&o56, &&o78, &&o9A, &&oBC };
+        register void *o = oa[order - 1 >> 1];
+        vu8 p0, p1;
+        vs16 q10, q32, q54, q76, q98, qBA, hB8, h74, h30, d03, d47;
+        vu32 lpq;
+        vs32 s03, s47;
+
+        VecLoad4(lpq, (unsigned int *)&lp_quantization);
+
+        /* p0 and p1 are used to load the qlp coefficients such that:
+         *     q10 = c1 c0 c1 c0 c1 c0 c1 c0
+         *     q32 = c3 c2 c3 c2 c3 c2 c3 c2
+         *     qNM =  .  .  .  .  .  .  .  .
+         */
+        p0 = (vu8)( 6, 7, 2, 3, 6, 7, 2, 3, 6, 7, 2, 3, 6, 7, 2, 3);
+        p1 = vec_add(p0, (vu8)(8));
+        /*
+         * Load the 4 highest qlp coefficients into q32 for now, and zero the
+         * unused elements.
+         */ 
+        q54 = vec_perm((vs16)(-1), (vs16)(0), vec_lvsl(0, (int *)(-4 * order)));
+        q32 = vec_and((vs16)vec_ld(-4, qlp_coeff + order), q54);
+        /*
+         * This switch loads the necessary qlp coefficients and data history
+         * into the q* and h* vectors.  They are arranged like so:
+         *     d03 = data[ 0] - data[ 3],    d47 = data[ 4] - data[ 7]
+         *     h74 = data[-8] - data[-5],    h30 = data[-4] - data[-1]
+         *     hNM = . . .
+         */
+        switch (order + 3 & ~3) {
+            case 12:
+                q98 = vec_perm(q32, q32, p0);
+                qBA = vec_perm(q32, q32, p1);
+                hB8 = (vs16)vec_ld(-48, data);
+
+                q32 = vec_ld(16, (short *)qlp_coeff);
+            case 8:
+                q54 = vec_perm(q32, q32, p0);
+                q76 = vec_perm(q32, q32, p1);
+                h74 = (vs16)vec_ld(-32, data);
+
+                q32 = vec_ld(0, (short *)qlp_coeff);
+            case 4:
+                q10 = vec_perm(q32, q32, p0);
+                q32 = vec_perm(q32, q32, p1);
+                h30 = (vs16)vec_ld(-16, data);
+
+                d03 = (vs16)vec_ld(0, data);
+                d47 = (vs16)vec_ld(16, data);
+        }
+
+        /* Below, p0 and p1 arrange the data for the following calculation.
+         * This example shows part of the loop processing the vector for
+         * d0 to d3.  Each vec_msum() contributes results for 2 terms of the
+         * sum, hence odd orders require the next qlp coefficient to be zero.
+         * (Actually, each loop processes two adjacent vectors.)
+         *              . . .      . . .      . . .      . . .
+         *                +          +          +          +
+         *      q32 =  c3   c2    c3   c2    c3   c2    c3   c2
+         *              *   *      *   *      *   *      *   *
+         *      (p1)   h4   h3    h3   h2    h2   h1    h1   d0
+         *                +          +          +          +
+         *      q10 =  c1   c0    c1   c0    c1   c0    c1   c0
+         *              *   *      *   *      *   *      *   *
+         *      (p0)   h2   h1    h1   d0    d0   d1    d1   d2
+         */
+        p0 = (vu8)(10,11,14,15,14,15,18,19,18,19,22,23,22,23,26,27);
+        p1 = vec_sub(p0, (vu8)(8));
+        
+        s03 = s47 = (vs32)(0);
+
+        goto *o;
+#define compute(label, q, h0, d0, d1, p) \
+label:  s03 = vec_msum(q, vec_perm(h0, d0, p), s03); \
+        s47 = vec_msum(q, vec_perm(d0, d1, p), s47);
+        compute(oBC, qBA, hB8, h74, h30, p1);
+        compute(o9A, q98, hB8, h74, h30, p0);
+        compute(o78, q76, h74, h30, d03, p1);
+        compute(o56, q54, h74, h30, d03, p0);
+        compute(o34, q32, h30, d03, d47, p1);
+        compute(o12, q10, h30, d03, d47, p0);
+#undef compute
+        s03 = vec_sra(s03, lpq);
+        s03 = vec_sub((vs32)d03, s03);
+        s47 = vec_sra(s47, lpq);
+        s47 = vec_sub((vs32)d47, s47);
+        hB8 = h30; h74 = d03; h30 = d47;
+        vec_st(s03, 0, residual);
+        vec_st(s47, 16, residual);
+        data += 8;
+        residual += 8;
+        d03 = (vs16)vec_ld(0, data);
+        d47 = (vs16)vec_ld(16, data);
+        s03 = s47 = (vs32)(0);
+        if (data < end)
+            goto *o;
+    }
+}
+
+void FLAC__lpc_compute_residual_from_qlp_coefficients_altivec(const FLAC__int32 data[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
+{
+    const FLAC__int32 *h, *scalar_end, *end = data + data_len;
+    int sum, j;
+    
+    FLAC__ASSERT(VecRelAligned(data, residual));
+    FLAC__ASSERT(VecAligned(qlp_coeff));
+    FLAC__ASSERT(order > 0);
+    
+    scalar_end = (order > 12) ? end : VecRoundUp(data);
+    
+    /* This is the scalar algorithm */
+    while (data < scalar_end) {
+        sum = 0; h = data;
+        for (j = 0; j < order; j++)
+            sum += qlp_coeff[j] * *(--h);
+        *(residual++) = *(data++) - (sum >> lp_quantization);
+    }
+    
+    /* If there is still data, proceed with the vector algorithm */
+    if (data < end) {
+        static void *oa[] = { &&o12, &&o34, &&o56, &&o78, &&o9A, &&oBC };
+        register void *o = oa[order - 1 >> 1];
+        vu8 p0H, p0L, p1H, p1L;
+        vu16 qBAH, q98H, q76H, q54H, q32H, q10H;
+        vu16 qBAL, q98L, q76L, q54L, q32L, q10L; 
+        vu16 hB8, h74, h30, d03, t;
+        vu32 lpq, sixteen = vec_splat_u32(-16);
+        vs32 s03, sh, sh2;
+        
+        VecLoad4(lpq, (unsigned int *)&lp_quantization);
+        
+        /* p0X and p1X are used to load the qlp coefficients such that:
+         *     q10X = c1 c0 c1 c0 c1 c0 c1 c0
+         *     q32X = c3 c2 c3 c2 c3 c2 c3 c2
+         *     qNMX =  .  .  .  .  .  .  .  .
+         * where X is H for the high half word, and L for the low half word.
+         */
+        p0H = (vu8)( 4, 5, 0, 1, 4, 5, 0, 1, 4, 5, 0, 1, 4, 5, 0, 1);
+        p0L = vec_add(p0H, (vu8)(2));
+        p1H = vec_add(p0H, (vu8)(8));
+        p1L = vec_add(p1H, (vu8)(2));
+        /*
+         * Load the 4 highest qlp coefficients into t for now, and zero the
+         * unused elements.
+         */ 
+        t = vec_perm((vu16)(-1), (vu16)(0), vec_lvsl(0, (int *)(-4 * order)));
+        t = vec_and((vu16)vec_ld(-4, qlp_coeff + order), t);
+        /*
+         * This switch loads the necessary qlp coefficients, data, and history
+         * into the q*, d*, and h* vectors.  They are arranged like so:
+         *     d03 = data[ 0] - data[ 3],    d47 = data[ 4] - data[ 7]
+         *     h74 = data[-8] - data[-5],    h30 = data[-4] - data[-1]
+         *     hNM = . . .
+         */
+        switch (order + 3 & ~3) {
+            case 12:
+                qBAH = vec_perm(t, t, p1H); qBAL = vec_perm(t, t, p1L);
+                q98H = vec_perm(t, t, p0H); q98L = vec_perm(t, t, p0L);
+                hB8 = (vu16)vec_ld(-48, data);
+                
+                t = (vu16)vec_ld(16, qlp_coeff);
+            case 8:
+                q76H = vec_perm(t, t, p1H); q76L = vec_perm(t, t, p1L);
+                q54H = vec_perm(t, t, p0H); q54L = vec_perm(t, t, p0L);
+                h74 = (vu16)vec_ld(-32, data);
+                
+                t = (vu16)vec_ld(0, qlp_coeff);
+            case 4:
+                q32H = vec_perm(t, t, p1H); q32L = vec_perm(t, t, p1L);
+                q10H = vec_perm(t, t, p0H); q10L = vec_perm(t, t, p0L);
+                h30 = (vu16)vec_ld(-16, data);
+                
+                d03 = (vu16)vec_ld(0, data);
+        }
+
+        /* Below, p0X and p1X arrange the data for the following calculation.
+         * This example shows part of the loop processing the vector for
+         * d0 to d3.  Each vec_msum() contributes results for 2 terms of the
+         * sum, hence odd orders require the next qlp coefficient to be zero.
+         * The below vec_msum()'s compute the sum of terms for this expansion:
+         *
+         *     c1 * h1 + c0 * h0 = c1l * h1l + (c1l * h1h + c1h * h1l) << 16 +
+         *                         c0l * h0l + (c0l * h0h + c0h * h0l) << 16
+         *
+         * Reordering the terms allows them to be computed in 3 vec_msum()'s,
+         * for each 2 orders, and allows for only a single shift and final sum.
+         *                . . .        . . .        . . .        . . .
+         *      q10L =  c1l   c0l    c1l   c0l    c1l   c0l    c1l   c0l
+         * s03:          *     *      *     *      *     *      *     *
+         *      (p0L)   h1l   h0l    h0l   d0l    d0l   d1l    d1l   d2l
+         *
+         *      q10L =  c1l   c0l    c1l   c0l    c1l   c0l    c1l   c0l
+         *  sh:          *     *      *     *      *     *      *     *
+         *      (p0H)   h1h   h0h    h0h   d0h    d0h   d1h    d1h   d2h
+         *                  +            +            +            +
+         *      q10H =  c1h   c0h    c1h   c0h    c1h   c0h    c1h   c0h
+         * sh2:          *     *      *     *      *     *      *     *
+         *      (p0L)   h1l   h0l    h0l   d0l    d0l   d1l    d1l   d2l
+         */
+        p0H = (vu8)( 8, 9,12,13,12,13,16,17,16,17,20,21,20,21,24,25);
+        p0L = vec_add(p0H, (vu8)(2));
+        p1H = vec_sub(p0H, (vu8)(8));
+        p1L = vec_add(p1H, (vu8)(2));
+        
+        s03 = sh = sh2 = (vs32)(0);
+
+        goto *o;
+#define compute(label, q, h, d, p) \
+label:      sh = (vs32)vec_msum(q##L, vec_perm(h, d, p##H), (vu32)sh);         \
+            s03 = (vs32)vec_msum(q##L, t = vec_perm(h, d, p##L), (vu32)s03);   \
+            sh2 = (vs32)vec_msum(q##H, t, (vu32)sh2);
+        compute(oBC, qBA, hB8, h74, p1);
+        compute(o9A, q98, hB8, h74, p0);
+        compute(o78, q76, h74, h30, p1);
+        compute(o56, q54, h74, h30, p0);
+        compute(o34, q32, h30, d03, p1);
+        compute(o12, q10, h30, d03, p0);
+#undef compute
+        s03 = vec_add(vec_sl(vec_add(sh, sh2), sixteen), s03);
+
+        s03 = vec_sra(s03, lpq);
+        s03 = vec_sub((vs32)d03, s03);
+        hB8 = h74; h74 = h30; h30 = d03;
+        vec_stl(s03, 0, residual);
+        data += 4;
+        residual += 4;
+        d03 = (vu16)vec_ld(0, data);
+        s03 = sh = sh2 = (vs32)(0);
+        if (data < end)
+            goto *o;
+    }
+}
+
+void FLAC__lpc_compute_autocorrelation_altivec(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
+{
+    FLAC__real *end = (FLAC__real *)data + data_len;
+
+    FLAC__ASSERT(lag > 0);
+    FLAC__ASSERT(lag <= data_len);
+    FLAC__ASSERT(VecAligned(data));
+    FLAC__ASSERT(VecAligned(autoc));
+
+    if (lag > 13) {
+        int left, i, j;
+
+        /* Scalar algorithm */
+        for (i = 0; i < lag; i++)
+            autoc[i] = 0.0;
+        for (left = data_len - lag; left--; data++) {
+            for(i = 0; i < lag; i++)
+                autoc[i] += data[0] * data[i];
+        }
+        for (left = end - data; left--; data++) {
+            j = (lag < left) ? lag : left;
+            for (i = 0; i < j; i++)
+                autoc[i] += data[0] * data[i];
+        }
+    } else {
+        static void *la[] = { NULL, &&l1, &&l2, &&l3, &&l4, &&l5, &&l6,
+            &&l7, &&l8, &&l9, &&lA, &&lB, &&lC, &&lD };
+        register void *l = la[lag];
+        vf32 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, rA, rB, rC;
+        vf32 d03, d47, d8B, dCF, zero = (vf32)(0);
+
+        d03 = vec_ld(0, data);
+        d47 = vec_ld(16, data);
+        r0 = r1 = r2 = r3 = r4 = zero;
+
+        /* The vector algorithm may read past the end.  If that tail is zeroed
+         * the results are unaffected, so we zero it here even though data is
+         * const; stream_encoder.c pads the real signal for these accesses.
+         */
+        if (!VecAligned(end)) {
+            /* The end is not vector aligned. Partially zero the final vector.
+             * Note, these loads and stores are rounded down.
+             */
+            dCF = vec_ld(0, end);
+            dCF = vec_perm(dCF, zero, vec_lvsl(0, end));
+            vec_st(dCF, 0, end);
+        }
+        /* The offset 15 for this store will cause it to round up one vector if
+         * end was not aligned.  Otherwise, it will zero from end.
+         */
+        vec_st(zero, 15, end);
+
+        if (lag > 5) {
+            d8B = vec_ld(32, data);
+            dCF = vec_ld(48, data);
+            r5 = r6 = r7 = r8 = r9 = rA = rB = rC = zero;
+            vec_st(zero, 31, end);
+            vec_st(zero, 47, end);
+        }
+
+        goto *l;
+lD:     rC = vec_madd(d03, dCF, rC);
+lC:     rB = vec_madd(d03, vec_sld(d8B, dCF, 12), rB);
+lB:     rA = vec_madd(d03, vec_sld(d8B, dCF,  8), rA);
+lA:     r9 = vec_madd(d03, vec_sld(d8B, dCF,  4), r9);
+l9:     r8 = vec_madd(d03, d8B, r8);
+l8:     r7 = vec_madd(d03, vec_sld(d47, d8B, 12), r7);
+l7:     r6 = vec_madd(d03, vec_sld(d47, d8B,  8), r6);
+l6:     r5 = vec_madd(d03, vec_sld(d47, d8B,  4), r5);
+        r4 = vec_madd(d03, d47, r4);
+        r3 = vec_madd(d03, vec_sld(d03, d47, 12), r3);
+        r2 = vec_madd(d03, vec_sld(d03, d47,  8), r2);
+        r1 = vec_madd(d03, vec_sld(d03, d47,  4), r1);
+        r0 = vec_madd(d03, d03, r0);
+        data += 4;
+        d03 = d47; d47 = d8B; d8B = dCF; dCF = vec_ld(48, data);
+        if (data < end)
+            goto *l;
+
+        goto done;
+l5:     r4 = vec_madd(d03, d47, r4);
+l4:     r3 = vec_madd(d03, vec_sld(d03, d47, 12), r3);
+l3:     r2 = vec_madd(d03, vec_sld(d03, d47,  8), r2);
+l2:     r1 = vec_madd(d03, vec_sld(d03, d47,  4), r1);
+l1:     r0 = vec_madd(d03, d03, r0);
+        data += 4;
+        d03 = d47; d47 = vec_ld(16, data);
+        if (data < end)
+            goto *l;
+done:
+        /* Sum across result vectors and store. */
+        switch (lag + 3 & ~3) {
+            case 16:
+                VecAddAcross1(rC, rC);
+                vec_ste(rC, 0, autoc + 12);
+            case 12:
+                VecAddAcross4(r8, r8, r9, rA, rB);
+                vec_st(r8, 0, autoc + 8);
+            case 8:
+                VecAddAcross4(r4, r4, r5, r6, r7);
+                vec_st(r4, 0, autoc + 4);
+            case 4:
+                VecAddAcross4(r0, r0, r1, r2, r3);
+                vec_st(r0, 0, autoc);
+        }
+    }
+}
+
+void FLAC__lpc_restore_signal_16bit_altivec(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
+{
+    FLAC__int32 *h, *scalar_end, *end = data + data_len;
+    int sum, j;
+
+    FLAC__ASSERT(order > 0);
+    FLAC__ASSERT(VecRelAligned(data, residual));
+
+    scalar_end = (order < 2 || order > 16) ? end : VecRoundUp(data);
+
+    /* This is the scalar algorithm */
+    while (data < scalar_end) {
+        sum = 0; h = data;
+        for (j = 0; j < order; j++)
+            sum += qlp_coeff[j] * *(--h);
+        *(data++) = *(residual++) + (sum >> lp_quantization);
+    }
+
+    /* If there is data remaining, proceed with the vector algorithm */
+    if (data < end) {
+        vu8 p;
+        vs16 qF8, q70, hF8, h70;
+        vs32 r, s, zero = (vs32)(0);
+        vu32 lpq;
+        FLAC__int32 a16 qc[16];
+        
+        data_len = (end - data + 3) >> 2; /* vector data length rounded up */
+
+        VecLoad4(lpq, (unsigned int *)&lp_quantization);
+
+        /* qc[] = qlp_coeff[] reversed, aligned, and padded with enough
+         * zeros to complete the vector.
+         */
+        sum = order; j = 16;
+        do {
+            qc[--j] = *(qlp_coeff++);
+        } while (--sum);
+        while (j & 3)
+            qc[--j] = 0;
+        
+        /* This switch loads the necessary qlp coefficients and data history
+         * into the q* and h* vectors.  They are arranged like so:
+         *     qF8 = qlp[15] - qlp[8],     q70 = qlp[7] - qlp[0]
+         *     hF8 = data[-16] - data[-9], h70 = data[-8] - data[-1]
+         */
+        r = s = zero;
+        switch (order + 3 & ~3) {
+            case 16:
+                r = vec_ld(0, qc);
+                s = vec_ld(-64, data);
+            case 12:
+                qF8 = vec_pack(r, vec_ld(16, qc));
+                hF8 = vec_pack(s, vec_ld(-48, data));
+            case 8:
+                r = vec_ld(32, qc);
+                s = vec_ld(-32, data);
+            case 4:
+                q70 = vec_pack(r, vec_ld(48, qc));
+                h70 = vec_pack(s, vec_ld(-16, data));
+        }
+
+        /* p is used to shift the history vector to the left one element, and
+         * to insert the recently calculated data element s.  Keep in mind,
+         * restore*() only computes one data element at a time: the vec_sums()
+         * leaves the sum in the high word, and the remaining calculation of s
+         * is entirely serial.
+         */
+        p = (vu8)( 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,30,31);
+
+        if (order > 8) {
+            do {
+                r = vec_ld(0, residual);
+#define restore16(x)                                                           \
+            s = vec_msum(qF8, hF8, zero);                                      \
+            s = vec_sums(vec_msum(q70, h70, s), zero);                         \
+            s = vec_add(vec_splat(r, x), vec_sra(s, lpq));                     \
+            hF8 = vec_sld(hF8, h70, 2); h70 = vec_perm(h70, (vs16)s, p);
+                restore16(0);
+                restore16(1);
+                restore16(2);
+                restore16(3);
+#undef restore16
+                vec_st(vec_unpackl(h70), 0, data);
+                data += 4; residual += 4;
+            } while (--data_len);
+        } else {
+            do {
+                r = vec_ld(0, residual);
+#define restore8(x)                                                            \
+            s = vec_sums(vec_msum(q70, h70, zero), zero);                      \
+            s = vec_add(vec_splat(r, x), vec_sra(s, lpq));                     \
+            h70 = vec_perm(h70, (vs16)s, p);
+                restore8(0);
+                restore8(1);
+                restore8(2);
+                restore8(3);
+#undef restore8
+                vec_st(vec_unpackl(h70), 0, data);
+                data += 4; residual += 4;
+            } while (--data_len);
+        }
+    }
+}
+
+void FLAC__lpc_restore_signal_altivec(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
+{
+    FLAC__int32 *h, *scalar_end, *end = data + data_len;
+    int sum, j;
+    
+    FLAC__ASSERT(order > 0);
+    FLAC__ASSERT(VecRelAligned(data, residual));
+    
+    scalar_end = (order < 3 || order > 16) ? end : VecRoundUp(data);
+    
+    /* This is the scalar algorithm */
+    while (data < scalar_end) {
+        sum = 0; h = data;
+        for (j = 0; j < order; j++)
+            sum += qlp_coeff[j] * *(--h);
+        *(data++) = *(residual++) + (sum >> lp_quantization);
+    }
+    
+    /* If there is data remaining, proceed with the vector algorithm */
+    if (data < end) {
+        vu16 qF8H, qF8L, q70H, q70L, hF8H, hF8L, h70H, h70L, q30, h30;
+        vs32 r, s, sh, sh2, zero = (vs32)(0);
+        vu32 lpq, sixteen = vec_splat_u32(-16);
+        vu8 p0, p1;
+        FLAC__int32 a16 qc[16];
+
+        data_len = (end - data + 3) >> 2; /* vector data length rounded up */
+
+        VecLoad4(lpq, (unsigned int *)&lp_quantization);
+        
+        /* qc[] = qlp_coeff[] reversed, aligned, and padded with enough
+         * zeros to complete the vector.
+         */
+        sum = order; j = 16;
+        do {
+            qc[--j] = *(qlp_coeff++);
+        } while (--sum);
+        while (j & 7)
+            qc[--j] = 0;
+        
+        /* This switch loads the necessary qlp coefficients and data history
+         * into the q70/h70 vectors, and if needed, the qF8/hF8 vectors.  They
+         * are arranged like so:
+         *     qF8X = qlp[15] - qlp[8],     q70X = qlp[7] - qlp[0]
+         *     hF8X = data[-16] - data[-9], h70X = data[-8] - data[-1]
+         * (Where X is H for the high half word, and L is for the low.)
+         *        
+         * p0: permute vector to pack the high half words
+         */
+        p0 = (vu8)( 0, 1, 4, 5, 8, 9,12,13,16,17,20,21,24,25,28,29);
+        r = s = zero;
+        switch (order + 3 & ~3) {
+            case 16:
+                r = vec_ld(0, qc);
+                s = vec_ld(-64, data);
+            case 12:
+                qF8L = (vu16)vec_pack(r, sh = vec_ld(16, qc));
+                qF8H = (vu16)vec_perm(r, sh, p0);
+                hF8L = (vu16)vec_pack(s, sh2 = vec_ld(-48, data));
+                hF8H = (vu16)vec_perm(s, sh2, p0);                
+            case 8:
+                r = vec_ld(32, qc);
+                s = vec_ld(-32, data);
+                q70L = (vu16)vec_pack(r, sh = vec_ld(48, qc));
+                q70H = (vu16)vec_perm(r, sh, p0);
+                h70L = (vu16)vec_pack(s, sh2 = vec_ld(-16, data));
+                h70H = (vu16)vec_perm(s, sh2, p0);
+                
+                /* p0: shift history and insert low half word of result
+                 * p1: shift history and insert high half word of result
+                 */
+                p0 = (vu8)( 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,30,31);
+                p1 = (vu8)( 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,28,29);
+                break;
+            case 4:
+                /* order 4 is slightly different: it operates on the entire
+                 * words of the coefficients and data.
+                 */
+                q30 = (vu16)vec_ld(48, qc);
+                h30 = (vu16)vec_ld(-16, data);
+                
+                /* p0: shift in the entire word of the result
+                 * p1: permute vector to swap half words
+                 */
+                p0 = (vu8)( 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,28,29,30,31);
+                p1 = (vu8)vec_rl((vu32)vec_lvsl(0, (int *)0), sixteen);
+        }
+
+        /* restore8,16() work almost identically to the 16bit version, although
+         * with the extended multiplication described above in the comments of
+         * FLAC__lpc_compute_residual_from_qlp_coefficients_altivec().
+         *
+         * restore4() uses a simpler form with no mixed vec_msum()'s.
+         */
+        switch (order + 3 & ~3) {
+            case 16:
+            case 12:
+                do {
+                    r = vec_ld(0, residual);
+#define restore16(x)                                                           \
+            sh = (vs32)vec_msum(q70L, h70H, vec_msum(qF8L, hF8H, (vu32)zero)); \
+            s  = (vs32)vec_msum(q70L, h70L, vec_msum(qF8L, hF8L, (vu32)zero)); \
+            sh2 = (vs32)vec_msum(q70H, h70L, vec_msum(qF8H, hF8L, (vu32)zero));\
+            s = vec_sums(vec_add(vec_sl(vec_add(sh, sh2), sixteen), s), zero); \
+            s = vec_add(vec_splat(r, x), vec_sra(s, lpq));                     \
+            hF8H = vec_sld(hF8H, h70H, 2); h70H = vec_perm(h70H, (vu16)s, p1); \
+            hF8L = vec_sld(hF8L, h70L, 2); h70L = vec_perm(h70L, (vu16)s, p0);
+                    restore16(0);
+                    restore16(1);
+                    restore16(2);
+                    restore16(3);
+#undef restore16
+                    vec_st((vs32)vec_mergel(h70H, h70L), 0, data);
+                    data += 4; residual += 4;
+                } while (--data_len);
+                break;
+            case 8:
+                do {
+                    r = vec_ld(0, residual);
+#define restore8(x)                                                            \
+            sh = (vs32)vec_msum(q70L, h70H, (vu32)zero);                       \
+            s  = (vs32)vec_msum(q70L, h70L, (vu32)zero);                       \
+            sh = (vs32)vec_msum(q70H, h70L, (vu32)sh);                         \
+            s = vec_sums(vec_add(vec_sl(sh, sixteen), s), zero);               \
+            s = vec_add(vec_splat(r, x), vec_sra(s, lpq));                     \
+            h70H = vec_perm(h70H, (vu16)s, p1);                                \
+            h70L = vec_perm(h70L, (vu16)s, p0);
+                    restore8(0);
+                    restore8(1);
+                    restore8(2);
+                    restore8(3);
+#undef restore8
+                    vec_st((vs32)vec_mergel(h70H, h70L), 0, data);
+                    data += 4; residual += 4;
+                } while (--data_len);
+                break;
+            case 4:
+                do {
+                    r = vec_ld(0, residual);
+#define restore4(x)                                                            \
+            s = (vs32)vec_add(vec_mulo(q30, h30), (vu32)zero);                 \
+            sh = (vs32)vec_msum(q30, vec_perm(h30, h30, p1), (vu32)zero);      \
+            s = vec_sums(vec_add(vec_sl(sh, sixteen), s), zero);               \
+            s = vec_add(vec_splat(r, x), vec_sra(s, lpq));                     \
+            h30 = vec_perm(h30, (vu16)s, p0);
+                    restore4(0);
+                    restore4(1);
+                    restore4(2);
+                    restore4(3);
+#undef restore4
+                    vec_st((vs32)h30, 0, data);
+                    data += 4; residual += 4;
+                } while (--data_len);
+                break;
+        }
+    }
+}
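
A note on the control flow above (illustration only, not from the patch): the residual and restore routines use GCC's "labels as values" extension to jump into an unrolled chain of per-pair steps, indexed by (order - 1) >> 1, and every pass re-enters at the same label.  Below is a stripped-down sketch of that dispatch pattern with hypothetical names and a trivial reduction standing in for the vec_msum() work; passes is assumed to be at least 1.

static unsigned dispatch_sketch(const unsigned c[6], unsigned order, unsigned passes)
{
    /* entry[i] handles orders 2*i+1 and 2*i+2; order is assumed to be 1..6 */
    static void *entry[] = { &&pair1, &&pair2, &&pair3 };
    void *o = entry[(order - 1) >> 1];
    unsigned sum = 0;

    goto *o;
pair3: sum += c[4] + c[5];
pair2: sum += c[2] + c[3];
pair1: sum += c[0] + c[1];
    if (--passes)
        goto *o;
    return sum;
}
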
diff -Nru --exclude=CVS --exclude=flac.pbproj flac-1.1.0/src/libFLAC/stream_decoder.c flac/src/libFLAC/stream_decoder.c
--- flac-1.1.0/src/libFLAC/stream_decoder.c	Tue Jan 14 00:57:37 2003
+++ flac/src/libFLAC/stream_decoder.c	Wed Sep 15 03:14:30 2004
@@ -272,6 +272,15 @@
 	/* now override with asm where appropriate */
 #ifndef FLAC__NO_ASM
 	if(decoder->private_->cpuinfo.use_asm) {
+#ifdef FLAC__CPU_PPC
+		FLAC__ASSERT(decoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_PPC);
+#ifdef FLAC__USE_ALTIVEC
+		if(decoder->private_->cpuinfo.data.ppc.altivec) {
+			decoder->private_->local_lpc_restore_signal = FLAC__lpc_restore_signal_altivec;
+			decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal_16bit_altivec;
+		}
+#endif
+#endif
 #ifdef FLAC__CPU_IA32
 		FLAC__ASSERT(decoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_IA32);
 #ifdef FLAC__HAS_NASM
@@ -727,8 +736,11 @@
 		 * output arrays have a buffer of up to 3 zeroes in front
 		 * (at negative indices) for alignment purposes; we use 4
 		 * to keep the data well-aligned.
+		 *
+		 * FLAC__lpc_restore_signal_16bit_altivec() may access up to 1 vector
+		 * (16 bytes) past the end of the output and residual.
 		 */
-		tmp = (FLAC__int32*)malloc(sizeof(FLAC__int32)*(size+4));
+		tmp = (FLAC__int32*)malloc(sizeof(FLAC__int32)*(size+8));
 		if(tmp == 0) {
 			decoder->protected_->state = FLAC__STREAM_DECODER_MEMORY_ALLOCATION_ERROR;
 			return false;
@@ -736,7 +748,7 @@
 		memset(tmp, 0, sizeof(FLAC__int32)*4);
 		decoder->private_->output[i] = tmp + 4;
 
-		tmp = (FLAC__int32*)malloc(sizeof(FLAC__int32)*size);
+		tmp = (FLAC__int32*)malloc(sizeof(FLAC__int32)*(size+4));
 		if(tmp == 0) {
 			decoder->protected_->state = FLAC__STREAM_DECODER_MEMORY_ALLOCATION_ERROR;
 			return false;
@@ -1852,8 +1864,12 @@
 
 	decoder->private_->frame.subframes[channel].type = FLAC__SUBFRAME_TYPE_LPC;
 
-	subframe->residual = decoder->private_->residual[channel];
+	/*
+	 * Note: FLAC__lpc_restore_signal_16bit_altivec() requires residual and output to
+	 * be relatively 16 byte aligned.  Since it is called with output+order, use residual+order as well.
+	 */
 	subframe->order = order;
+	subframe->residual = decoder->private_->residual[channel]+order;
 
 	/* read warm-up samples */
 	for(u = 0; u < order; u++) {
@@ -1903,7 +1919,7 @@
 	/* read residual */
 	switch(subframe->entropy_coding_method.type) {
 		case FLAC__ENTROPY_CODING_METHOD_PARTITIONED_RICE:
-			if(!read_residual_partitioned_rice_(decoder, order, subframe->entropy_coding_method.data.partitioned_rice.order, &decoder->private_->partitioned_rice_contents[channel], decoder->private_->residual[channel]))
+			if(!read_residual_partitioned_rice_(decoder, order, subframe->entropy_coding_method.data.partitioned_rice.order, &decoder->private_->partitioned_rice_contents[channel], decoder->private_->residual[channel]+order))
 				return false;
 			break;
 		default:
@@ -1914,11 +1930,11 @@
 	memcpy(decoder->private_->output[channel], subframe->warmup, sizeof(FLAC__int32) * order);
 	if(bps + subframe->qlp_coeff_precision + FLAC__bitmath_ilog2(order) <= 32)
 		if(bps <= 16 && subframe->qlp_coeff_precision <= 16)
-			decoder->private_->local_lpc_restore_signal_16bit(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
+			decoder->private_->local_lpc_restore_signal_16bit(decoder->private_->residual[channel]+order, decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
 		else
-			decoder->private_->local_lpc_restore_signal(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
+			decoder->private_->local_lpc_restore_signal(decoder->private_->residual[channel]+order, decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
 	else
-		decoder->private_->local_lpc_restore_signal_64bit(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
+		decoder->private_->local_lpc_restore_signal_64bit(decoder->private_->residual[channel]+order, decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
 
 	return true;
 }
diff -Nru --exclude=CVS --exclude=flac.pbproj flac-1.1.0/src/libFLAC/stream_encoder.c flac/src/libFLAC/stream_encoder.c
--- flac-1.1.0/src/libFLAC/stream_encoder.c	Thu Jan  9 23:27:24 2003
+++ flac/src/libFLAC/stream_encoder.c	Wed Sep 15 03:14:30 2004
@@ -730,6 +730,17 @@
 	/* now override with asm where appropriate */
 #ifndef FLAC__NO_ASM
 	if(encoder->private_->cpuinfo.use_asm) {
+#ifdef FLAC__CPU_PPC
+		FLAC__ASSERT(encoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_PPC);
+#ifdef FLAC__USE_ALTIVEC
+		if(encoder->private_->cpuinfo.data.ppc.altivec) {
+			encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_altivec;
+			encoder->private_->local_fixed_compute_best_predictor = FLAC__fixed_compute_best_predictor_altivec;
+			encoder->private_->local_lpc_compute_residual_from_qlp_coefficients = FLAC__lpc_compute_residual_from_qlp_coefficients_altivec;
+			encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_16bit_altivec;
+		}
+#endif
+#endif
 #ifdef FLAC__CPU_IA32
 		FLAC__ASSERT(encoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_IA32);
 #ifdef FLAC__HAS_NASM
@@ -1554,28 +1565,35 @@
 	 * requires that the input arrays (in our case the integer signals)
 	 * have a buffer of up to 3 zeroes in front (at negative indices) for
 	 * alignment purposes; we use 4 to keep the data well-aligned.
+	 *
+	 * FLAC__lpc_compute_residual_from_qlp_coefficients_16bit_altivec()
+	 * may access up to 2 vectors (32 bytes) past the end of the integer
+	 * signal and residual.
+	 *
+	 * FLAC__lpc_compute_autocorrelation_altivec() may access up to 4 vectors
+	 * (64 bytes) past the end of the real signal.
 	 */
 
 	for(i = 0; ok && i < encoder->protected_->channels; i++) {
-		ok = ok && FLAC__memory_alloc_aligned_int32_array(new_size+4, &encoder->private_->integer_signal_unaligned[i], &encoder->private_->integer_signal[i]);
-		ok = ok && FLAC__memory_alloc_aligned_real_array(new_size, &encoder->private_->real_signal_unaligned[i], &encoder->private_->real_signal[i]);
+		ok = ok && FLAC__memory_alloc_aligned_int32_array(new_size+12, &encoder->private_->integer_signal_unaligned[i], &encoder->private_->integer_signal[i]);
+		ok = ok && FLAC__memory_alloc_aligned_real_array(new_size+20, &encoder->private_->real_signal_unaligned[i], &encoder->private_->real_signal[i]);
 		memset(encoder->private_->integer_signal[i], 0, sizeof(FLAC__int32)*4);
 		encoder->private_->integer_signal[i] += 4;
 	}
 	for(i = 0; ok && i < 2; i++) {
-		ok = ok && FLAC__memory_alloc_aligned_int32_array(new_size+4, &encoder->private_->integer_signal_mid_side_unaligned[i], &encoder->private_->integer_signal_mid_side[i]);
-		ok = ok && FLAC__memory_alloc_aligned_real_array(new_size, &encoder->private_->real_signal_mid_side_unaligned[i], &encoder->private_->real_signal_mid_side[i]);
+		ok = ok && FLAC__memory_alloc_aligned_int32_array(new_size+12, &encoder->private_->integer_signal_mid_side_unaligned[i], &encoder->private_->integer_signal_mid_side[i]);
+		ok = ok && FLAC__memory_alloc_aligned_real_array(new_size+20, &encoder->private_->real_signal_mid_side_unaligned[i], &encoder->private_->real_signal_mid_side[i]);
 		memset(encoder->private_->integer_signal_mid_side[i], 0, sizeof(FLAC__int32)*4);
 		encoder->private_->integer_signal_mid_side[i] += 4;
 	}
 	for(channel = 0; ok && channel < encoder->protected_->channels; channel++) {
 		for(i = 0; ok && i < 2; i++) {
-			ok = ok && FLAC__memory_alloc_aligned_int32_array(new_size, &encoder->private_->residual_workspace_unaligned[channel][i], &encoder->private_->residual_workspace[channel][i]);
+			ok = ok && FLAC__memory_alloc_aligned_int32_array(new_size+12, &encoder->private_->residual_workspace_unaligned[channel][i], &encoder->private_->residual_workspace[channel][i]);
 		}
 	}
 	for(channel = 0; ok && channel < 2; channel++) {
 		for(i = 0; ok && i < 2; i++) {
-			ok = ok && FLAC__memory_alloc_aligned_int32_array(new_size, &encoder->private_->residual_workspace_mid_side_unaligned[channel][i], &encoder->private_->residual_workspace_mid_side[channel][i]);
+			ok = ok && FLAC__memory_alloc_aligned_int32_array(new_size+12, &encoder->private_->residual_workspace_mid_side_unaligned[channel][i], &encoder->private_->residual_workspace_mid_side[channel][i]);
 		}
 	}
 	ok = ok && FLAC__memory_alloc_aligned_uint32_array(new_size, &encoder->private_->abs_residual_unaligned, &encoder->private_->abs_residual);
@@ -2277,6 +2295,12 @@
 	if(ret != 0)
 		return 0; /* this is a hack to indicate to the caller that we can't do lp at this order on this subframe */
 
+	/*
+	 * Note: Altivec lpc_compute_residual_from_qlp_coefficients() requires that signal and residual
+	 * are relatively 16 byte aligned.  Since it is called with signal+order, use residual+order as well.
+	 */
+	residual += order;
+	
 	if(subframe_bps + qlp_coeff_precision + FLAC__bitmath_ilog2(order) <= 32)
 		if(subframe_bps <= 16 && qlp_coeff_precision <= 16)
 			encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit(signal+order, residual_samples, qlp_coeff, order, quantization, residual);
