Revised patch.
[snip]
+@smallexample
+/* Example kernel.  The visible prologue loads x[0..3], scales them by
+   alpha, splats each product across one of vs32-vs35, and derives the
+   column pointers a1, a2 and a3 from ap and lda (scaled to bytes).
+   The main loop is elided ("...").
+
+   The memory operands are cast to array types so that they describe
+   every element the asm may access.  A plain "*y", "*x" or "*ap"
+   operand only tells the compiler about the first double, leaving it
+   free to cache or reorder accesses to the remaining elements.  */
+static void
+dgemv_kernel_4x4 (long n, const double *ap, long lda,
+                  const double *x, double *y, double alpha)
+@{
+  double *a0;
+  double *a1;
+  double *a2;
+  double *a3;
+
+  __asm__
+    (
+     "lxvd2x 34, 0, %10 \n\t" // x0, x1
+     "lxvd2x 35, %11, %10 \n\t" // x2, x3
+     "xxspltd 32, %x9, 0 \n\t" // alpha, alpha
+     "sldi %6, %13, 3 \n\t" // lda * sizeof (double)
+     "xvmuldp 34, 34, 32 \n\t" // x0 * alpha, x1 * alpha
+     "xvmuldp 35, 35, 32 \n\t" // x2 * alpha, x3 * alpha
+     "add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda
+     "add %6, %6, %6 \n\t" // 2 * lda
+     "xxspltd 32, 34, 0 \n\t" // x0 * alpha, x0 * alpha
+     "xxspltd 33, 34, 1 \n\t" // x1 * alpha, x1 * alpha
+     "xxspltd 34, 35, 0 \n\t" // x2 * alpha, x2 * alpha
+     "xxspltd 35, 35, 1 \n\t" // x3 * alpha, x3 * alpha
+     "add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda
+     "add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda
+ ...
+     "#n=%1 ap=%8=%12 lda=%13 x=%7=%10 y=%0=%2 alpha=%9 o16=%11\n"
+     "#a0=%3 a1=%4 a2=%5 a3=%6"
+     :
+       // 0: presumably the elided loop updates n doubles of y; confirm
+       // against the full asm body.
+       "+m" (*(double (*)[n]) y),
+       // Early-clobber: n is read-write, so it must not share a
+       // register with an untied input that happens to hold the same
+       // value (for example o16 when n is 16).
+       "+&r" (n), // 1
+       "+b" (y), // 2
+       "=b" (a0), // 3
+       // Early-clobber: a1 is written by the first "add" above, ahead
+       // of the elided body, so keep it apart from untied inputs.
+       "=&b" (a1), // 4
+       "=&b" (a2), // 5
+       "=&b" (a3) // 6
+     :
+       // 7: the two lxvd2x loads above read four doubles from x.
+       "m" (*(const double (*)[4]) x),
+       // 8: extent of the matrix accessed through ap is not visible
+       // here, so describe it as an array of unknown length.
+       "m" (*(const double (*)[]) ap),
+       "d" (alpha), // 9
+       "r" (x), // 10
+       "b" (16), // 11
+       "3" (ap), // 12
+       "4" (lda) // 13
+     :
+       "cr0",
+       "vs32","vs33","vs34","vs35","vs36","vs37",
+       "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
+     );
+@}
+@end smallexample