This is adapted from the nearest-scaled-cover scanline fetcher, modified to pack output data in 16-bit units.
lowlevel-blt-bench -n src_0565_0565: Before After Mean StdDev Mean StdDev Confidence Change L1 119.6 4.1 72.5 1.1 100.0% -39.4% L2 45.2 1.4 55.4 2.0 100.0% +22.5% M 47.1 0.1 71.3 0.1 100.0% +51.4% HT 26.4 0.2 31.8 0.3 100.0% +20.3% VT 25.0 0.2 30.0 0.3 100.0% +20.3% R 22.6 0.2 27.6 0.2 100.0% +22.0% RT 9.7 0.2 10.3 0.2 100.0% +5.6% affine-bench * 0 0 1 src r5g6b5 r5g6b5: Before After Mean StdDev Mean StdDev Confidence Change 0.5 59.6 0.1 129.6 0.1 100.0% +117.2% 0.75 52.0 0.1 106.3 0.1 100.0% +104.6% 1.0 47.2 0.1 71.7 0.0 100.0% +52.0% 1.5 39.1 0.1 68.1 0.1 100.0% +74.2% 2.0 37.7 0.1 68.7 0.1 100.0% +82.2% --- pixman/pixman-arm-simd-asm-scaled.S | 4 ++ pixman/pixman-arm-simd-asm-scaled.h | 69 ++++++++++++++++++++++++++++++----- pixman/pixman-arm-simd.c | 11 ++++++ 3 files changed, 75 insertions(+), 9 deletions(-) diff --git a/pixman/pixman-arm-simd-asm-scaled.S b/pixman/pixman-arm-simd-asm-scaled.S index fa743a5..1f3d933 100644 --- a/pixman/pixman-arm-simd-asm-scaled.S +++ b/pixman/pixman-arm-simd-asm-scaled.S @@ -170,6 +170,10 @@ generate_nearest_scaled_cover_function \ pixman_get_scanline_nearest_scaled_cover_x8r8g8b8_asm_armv6, 32, \ 2, 3 /* prefetch distances */, nop_macro, convert_x888_8888 +generate_nearest_scaled_cover_function \ + pixman_get_scanline_r5g6b5_nearest_scaled_cover_r5g6b5_asm_armv6, 16, \ + 2, no_mask /* prefetch distances */, nop_macro, nop_macro, 16 + .macro init_ge msr CPSR_s, #0x50000 .endm diff --git a/pixman/pixman-arm-simd-asm-scaled.h b/pixman/pixman-arm-simd-asm-scaled.h index fb6eb44..66d2e12 100644 --- a/pixman/pixman-arm-simd-asm-scaled.h +++ b/pixman/pixman-arm-simd-asm-scaled.h @@ -92,7 +92,12 @@ .macro nearest_scaled_cover_enlarge_nomask_innerloop bpp, reg, convert, mask_hint, may_be_final, exit_label, store adds ACCUM, ACCUM, UX + .if PIXEL_MERGE_OFFSET == 0 mov \reg, PIXEL + .else + orr \reg, \reg, PIXEL, lsl #PIXEL_MERGE_OFFSET + .endif + .set PIXEL_MERGE_OFFSET, (PIXEL_MERGE_OFFSET + out_bpp) & 31 \store branch cc, 
\exit_label, 1203f .ifnc "\may_be_final","" @@ -156,10 +161,20 @@ mov TMP, XHI adds XLO, XLO, UX, lsl #16 adc XHI, XHI, UX, lsr #16 + .if PIXEL_MERGE_OFFSET == 0 ldrx \bpp,, <\reg, [PTR]> + .else + ldrx \bpp,, <PIXEL2, [PTR]> + .endif eor TMP, TMP, XHI bics TMP, TMP, #255/\bpp + .if PIXEL_MERGE_OFFSET == 0 \convert \reg, TMP + .else + \convert PIXEL2, TMP + orr \reg, \reg, PIXEL2, lsl #PIXEL_MERGE_OFFSET + .endif + .set PIXEL_MERGE_OFFSET, (PIXEL_MERGE_OFFSET + out_bpp) & 31 \store branch eq, \exit_label, 1403f subs PLDS, PLDS, #32 @@ -183,7 +198,14 @@ \inner_loop \bpp, WK0, \convert, mask_is_0, 1, 1503f, <add DST, DST, #4> b 1503f .endif + .set PIXEL_MERGE_OFFSET, 0 + .if out_bpp == 32 1502: \inner_loop \bpp, WK0, \convert, mask_is_non_0, 1,, <str WK0, [DST], #4> + .elseif out_bpp == 16 +1502: \inner_loop \bpp, WK0, \convert, mask_is_non_0, 1,, <strh WK0, [DST], #2> + .else + .error "Output bits per pixel not supported" + .endif 1503: .endm @@ -204,15 +226,26 @@ \inner_loop \bpp, WK3, \convert, mask_is_0, 1, 1602f, <add DST, DST, #4*4> b 1602f .endif -1601: \inner_loop \bpp, WK0, \convert +1601: + .set PIXEL_MERGE_OFFSET, 0 + .rept 32 / out_bpp + \inner_loop \bpp, WK0, \convert + .endr + .rept 32 / out_bpp \inner_loop \bpp, WK1, \convert + .endr + .rept 32 / out_bpp \inner_loop \bpp, WK2, \convert + .endr + .rept 32 / out_bpp - 1 + \inner_loop \bpp, WK3, \convert + .endr \inner_loop \bpp, WK3, \convert,, 1,, <stmia DST!!, {WK0,WK1,WK2,WK3}> 1602: .endm .macro process bpp, has_mask, inner_loop, convert - cmp COUNT, #2 * 4 - 1 - 1 @ guaranteed at least one aligned half-cacheline output? + cmp COUNT, #2 * 128 / out_bpp - 1 - 1 @ guaranteed at least one aligned half-cacheline output? 
blo 1706f tst DST, #15 beq 1702f @@ -220,16 +253,21 @@ sub COUNT, COUNT, #1 tst DST, #15 bne 1701b -1702: sub COUNT, COUNT, #4 - 1 +1702: sub COUNT, COUNT, #128 / out_bpp - 1 + .if \has_mask tst MASK, #16 beq 1704f -1703: process4 \bpp, \has_mask, 0, \inner_loop, \convert - subs COUNT, COUNT, #4 + .endif +1703: +.if \has_mask + process4 \bpp, \has_mask, 0, \inner_loop, \convert + subs COUNT, COUNT, #128 / out_bpp bcc 1705f + .endif 1704: process4 \bpp, \has_mask, 1, \inner_loop, \convert - subs COUNT, COUNT, #4 + subs COUNT, COUNT, #128 / out_bpp bcs 1703b -1705: adds COUNT, COUNT, #4 - 1 +1705: adds COUNT, COUNT, #128 / out_bpp - 1 bcc 1707f @ drop through... 1706: process1 \bpp, \has_mask, 1, \inner_loop, \convert @@ -243,7 +281,8 @@ prefetch_distance_src_, \ prefetch_distance_mask_, \ init, \ - convert + convert, \ + out_bpp_ /* void fname(uint32_t width, * pixman_fixed_t x, @@ -260,6 +299,11 @@ pixman_asm_function fname */ .set prefetch_distance_src, prefetch_distance_src_ .set prefetch_distance_mask, prefetch_distance_mask_ + .ifc "out_bpp_","" + .set out_bpp, 32 + .else + .set out_bpp, out_bpp_ + .endif /* * Assign symbolic names to registers @@ -271,7 +315,8 @@ XLO .req a2 @ reduce only UX .req a3 DST .req a4 SRC .req v1 -MASK .req v2 +MASK .req v2 @ only when outputing 32bpp +PIXEL2 .req v2 @ only when outputing <32bpp and reducing PLDS .req v3 PIXEL .req v4 @ enlarge only XHI .req v4 @ reduce only @@ -290,6 +335,7 @@ TMP .req lr blo 1807f-4 \init mla WK2, COUNT, UX, X + .if out_bpp == 32 bics WK0, MASK, #31 beq 1801f @ Use a simplified preload process for the mask, @@ -300,6 +346,7 @@ TMP .req lr .set OFFSET, OFFSET + 32 .endr 1801: + .endif add WK0, SRC, X, lsr #16 - (log2_\bpp - 3) bic WK0, WK0, #31 pld [WK0] @@ -321,11 +368,13 @@ TMP .req lr mov ACCUM, X, lsl #16 mov UX, UX, lsl #16 bic SRC, SRC, #(\bpp-1)/8 + .if out_bpp == 32 teq MASK, #0 beq 1804f mov VALID, #0 process \bpp, 1, nearest_scaled_cover_enlarge_mask_innerloop, \convert 1804: + .endif ldrx 
\bpp,, <PIXEL, [SRC]> \convert PIXEL, TMP process \bpp, 0, nearest_scaled_cover_enlarge_nomask_innerloop, \convert @@ -336,9 +385,11 @@ TMP .req lr mov XHI, X, lsr #16 mov XLO, X, lsl #16 add XHI, XHI, TMP, lsr #log2_\bpp - 3 + .if out_bpp == 32 teq MASK, #0 beq 1806f process \bpp, 1, nearest_scaled_cover_reduce_mask_innerloop, \convert + .endif 1806: process \bpp, 0, nearest_scaled_cover_reduce_nomask_innerloop, \convert 1807: diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c index 7bc1e39..fedb92d 100644 --- a/pixman/pixman-arm-simd.c +++ b/pixman/pixman-arm-simd.c @@ -268,7 +268,15 @@ cputype##_composite_nearest_scaled_cover_##name (pixman_implementation_t *imp, free (scanline_buffer); \ } +void +pixman_get_scanline_r5g6b5_nearest_scaled_cover_r5g6b5_asm_armv6(uint32_t width, + pixman_fixed_t x, + pixman_fixed_t ux, + uint16_t *dest, + const uint16_t *source); + BIND_NEAREST_SCALED_COVER_FAST_PATH_SRC_DST (armv6, src_8888_8888, SRC, src, uint32_t, uint32_t, a8r8g8b8, a8r8g8b8) +BIND_NEAREST_SCALED_COVER_FAST_PATH_SRC_DST (armv6, src_0565_0565, SRC, src, uint16_t, uint16_t, r5g6b5, r5g6b5) void pixman_composite_src_n_8888_asm_armv6 (int32_t w, @@ -479,6 +487,9 @@ static const pixman_fast_path_t arm_simd_fast_paths[] = PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, armv6_composite_over_n_8888_8888_ca), PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, armv6_composite_over_n_8888_8888_ca), + PIXMAN_ARM_NEAREST_SCALED_COVER_SRC_DST_FAST_PATH (armv6, SRC, r5g6b5, r5g6b5, src_0565_0565), + PIXMAN_ARM_NEAREST_SCALED_COVER_SRC_DST_FAST_PATH (armv6, SRC, b5g6r5, b5g6r5, src_0565_0565), + PIXMAN_ARM_NEAREST_SCALED_COVER_SRC_DST_FAST_PATH (armv6, SRC, a8r8g8b8, a8r8g8b8, src_8888_8888), PIXMAN_ARM_NEAREST_SCALED_COVER_SRC_DST_FAST_PATH (armv6, SRC, a8r8g8b8, x8r8g8b8, src_8888_8888), PIXMAN_ARM_NEAREST_SCALED_COVER_SRC_DST_FAST_PATH (armv6, SRC, x8r8g8b8, x8r8g8b8, src_8888_8888), -- 1.7.5.4 _______________________________________________ 
Pixman mailing list
Pixman@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/pixman