One is that the pixel count wasn't being shifted correctly for 8bpp or 16bpp
images in the narrow case. The fix is illustrated by src_8_8:
Before After
Mean StdDev Mean StdDev Change Confidence
L1592.6 40.4 615.1 75.63.8% 56% (insignificant)
L2235.05.5 230.5 19.8 -1.9% 48% (insignificant)
M 229.22.2 229.01.5 -0.1% 20% (insignificant)
HT 60.00.4 62.40.64.0% 100.0%
VT 52.90.5 53.40.50.9% 94.5% (insignificant)
R 45.20.4 47.70.75.6% 100.0%
RT 12.00.4 12.11.60.8% 14% (insignificant)
The second one meant that only the source pointer was being used for preloads
for mid-width rectangles (typically between 32-160 bytes). A routine that
illustrates this is over__, where the destination buffer is supposed
to be preloaded:
Before After
Mean StdDev Mean StdDev Change Confidence
L1 37.60.4 37.90.31.0% 99.5%
L2 30.80.5 30.80.50.1% 22% (insignificant)
M 25.80.0 25.80.00.0% 21% (insignificant)
HT 14.40.1 15.50.18.0% 100.0%
VT 13.80.1 14.60.16.2% 100.0%
R 14.30.1 15.70.1 10.3% 100.0%
RT 6.70.47.60.4 12.5% 100.0%
This bug also explains why medium-width rectangle prefetch was a regression
for over_n_8_. Now it can be re-enabled, and results are:
Before After
Mean StdDev Mean StdDev Change Confidence
L1 22.80.2 22.80.2 -0.2% 41% (insignificant)
L2 21.80.1 21.80.10.1% 37% (insignificant)
M 22.20.0 22.20.1 -0.1% 56% (insignificant)
HT 12.30.1 14.10.1 14.4% 100.0%
VT 11.70.1 13.30.6 13.8% 100.0%
R 10.90.1 12.80.5 17.1% 100.0%
RT 5.90.16.50.1 11.3% 100.0%
---
pixman/pixman-arm-simd-asm.S |2 +-
pixman/pixman-arm-simd-asm.h | 22 ++
2 files changed, 7 insertions(+), 17 deletions(-)
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index 8700da9..f043826 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -576,7 +576,7 @@ generate_composite_function \
generate_composite_function \
pixman_composite_over_n_8__asm_armv6, 0, 8, 32 \
-FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR |
FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_ONLY_PRELOAD_WIDE \
+FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR |
FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
2, /* prefetch distance */ \
over_n_8__init, \
over_n_8__newline, \
diff --git a/pixman/pixman-arm-simd-asm.h b/pixman/pixman-arm-simd-asm.h
index c1db3fc..ee70131 100644
--- a/pixman/pixman-arm-simd-asm.h
+++ b/pixman/pixman-arm-simd-asm.h
@@ -232,7 +232,7 @@
/* In these cases, each line for each channel is in either 1 or 2
cache lines */
PF bic,WK0, base, #31
PF pld,[WK0]
-PF add,WK1, base, X, LSL #2
+PF add,WK1, base, X, LSL #bpp_shift
PF sub,WK1, WK1, #1
PF bic,WK1, WK1, #31
PF cmp,WK1, WK0
@@ -240,9 +240,9 @@
PF pld,[WK1]
90:
.else
-PF bic,WK0, SRC, #31
+PF bic,WK0, base, #31
PF pld,[WK0]
-PF add,WK1, SRC, X, lsl #bpp_shift
+PF add,WK1, base, X, lsl #bpp_shift
PF sub,WK1, WK1, #1
PF bic,WK1, WK1, #31
PF cmp,WK1, WK0
@@ -399,18 +399,8 @@
preload_trailing mask_bpp, mask_bpp_shift, MASK
preload_trailing dst_r_bpp, dst_bpp_shift, DST
add X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
-113:
-process_head , 16, 0, unaligned_src, unaligned_mask, 0
-process_tail , 16, 0
- .if !((flags) & FLAG_PROCESS_DOES_STORE)
-pixst , 16, 0, DST
- .endif
-subsX, X, #128/dst_w_bpp
-bhs 113b
-/* Trailing pixels */
-tst X, #128/dst_w_bpp - 1
-beq exit_label
-trailing_15bytes process_head, process_tail, unaligned_src,
unaligned_mask
+/* The remainder of the line is handled identically to the medium case
*/
+medium_case_inner_loop_and_trailing_pixels process_head,
process_tail, exit_label, unaligned_src, unaligned_mask
.endm
.macro medium_case_inner_loop_and_trailing_pixels process_head, process_tail,
exit_label, unaligned_src, unaligned_mask
@@ -723,7 +713,7 @@ fname:
sub X, X, #128/dst_w_bpp /* simplifies inner loop termination
*/
tst DST, #15
beq 164f
-rsb WK0, DST, #0 /* bits 0-4 = number of leading bytes until
destination aligned */
+rsb WK0, DST, #0 /* bits 0-3 = number of leading bytes until
destination aligned */