This has very little in common with the previous revision of this source
file, but I present it as a patch nevertheless.
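
Since most of the arithmetic below is obscured by instruction scheduling,
here is roughly what the over_* macros compute per channel, written out as
scalar C purely as a reading aid (the helper names are mine, not part of
the patch):

    #include <stdint.h>

    /* Approximate (x * a) / 255, as the asm does with MLA against the
     * constant 0x00800080 followed by UXTAB16 ..., ror #8: add 0x80 for
     * rounding, then multiply by 257/256 to approximate 256/255. */
    static inline uint8_t mul_div_255(uint8_t x, uint8_t a)
    {
        uint32_t t = x * a + 0x80;
        return (t + (t >> 8)) >> 8;
    }

    /* Porter-Duff OVER with premultiplied alpha, one channel:
     * dest = src + dest * (255 - src_alpha) / 255, with the final add
     * saturating just as UQADD8 does. */
    static inline uint8_t over_channel(uint8_t s, uint8_t src_alpha, uint8_t d)
    {
        uint32_t v = s + mul_div_255(d, 255 - src_alpha);
        return v > 255 ? 255 : v;
    }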

diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index b438001..8700da9 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -1,14 +1,14 @@
 /*
- * Copyright © 2008 Mozilla Corporation
- * Copyright © 2010 Nokia Corporation
+ * Copyright © 2012 Raspberry Pi Foundation
+ * Copyright © 2012 RISC OS Open Ltd
  *
  * Permission to use, copy, modify, distribute, and sell this software and its
  * documentation for any purpose is hereby granted without fee, provided that
  * the above copyright notice appear in all copies and that both that
  * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Mozilla Corporation not be used in
+ * documentation, and that the name of the copyright holders not be used in
  * advertising or publicity pertaining to distribution of the software without
- * specific, written prior permission.  Mozilla Corporation makes no
+ * specific, written prior permission.  The copyright holders make no
  * representations about the suitability of this software for any purpose.  It
  * is provided "as is" without express or implied warranty.
  *
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
  * SOFTWARE.
  *
- * Author:  Jeff Muizelaar (j...@infidigm.net)
+ * Author:  Ben Avison (bavi...@riscosopen.org)
  *
  */

@@ -37,412 +37,552 @@
        .altmacro
        .p2align 2

-/* Supplementary macro for setting function attributes */
-.macro pixman_asm_function fname
-       .func fname
-       .global fname
-#ifdef __ELF__
-       .hidden fname
-       .type fname, %function
-#endif
-fname:
-.endm
+#include "pixman-arm-simd-asm.h"

-/*
- * The code below was generated by gcc 4.3.4 from the commented out
- * functions in 'pixman-arm-simd.c' file with the following optimization
- * options: "-O3 -mcpu=arm1136jf-s -fomit-frame-pointer"
- *
- * TODO: replace gcc generated code with hand tuned versions because
- * the code quality is not very good, introduce symbolic register
- * aliases for better readability and maintainability.
+/* A head macro should do all processing which results in an output of up to
+ * 16 bytes, as far as the final load instruction. The corresponding tail macro
+ * should complete the processing of the up-to-16 bytes. The calling macro will
+ * sometimes choose to insert a preload or a decrement of X between them.
+ *   cond           ARM condition code for code block
+ *   numbytes       Number of output bytes that should be generated this time
+ *   firstreg       First WK register in which to place output
+ *   unaligned_src  Whether to use non-wordaligned loads of source image
+ *   unaligned_mask Whether to use non-wordaligned loads of mask image
+ *   preload        If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
  */

-pixman_asm_function pixman_composite_add_8_8_asm_armv6
-       push    {r4, r5, r6, r7, r8, r9, r10, r11}
-       mov     r10, r1
-       sub     sp, sp, #4
-       subs    r10, r10, #1
-       mov     r11, r0
-       mov     r8, r2
-       str     r3, [sp]
-       ldr     r7, [sp, #36]
-       bcc     0f
-6:     cmp     r11, #0
-       beq     1f
-       orr     r3, r8, r7
-       tst     r3, #3
-       beq     2f
-       mov     r1, r8
-       mov     r0, r7
-       mov     r12, r11
-       b       3f
-5:     tst     r3, #3
-       beq     4f
-3:     ldrb    r2, [r0], #1
-       subs    r12, r12, #1
-       ldrb    r3, [r1]
-       uqadd8  r3, r2, r3
-       strb    r3, [r1], #1
-       orr     r3, r1, r0
-       bne     5b
-1:     ldr     r3, [sp]
-       add     r8, r8, r3
-       ldr     r3, [sp, #40]
-       add     r7, r7, r3
-10:    subs    r10, r10, #1
-       bcs     6b
-0:     add     sp, sp, #4
-       pop     {r4, r5, r6, r7, r8, r9, r10, r11}
-       bx      lr
-2:     mov     r12, r11
-       mov     r1, r8
-       mov     r0, r7
-4:     cmp     r12, #3
-       subgt   r6, r12, #4
-       movgt   r9, r12
-       lsrgt   r5, r6, #2
-       addgt   r3, r5, #1
-       movgt   r12, #0
-       lslgt   r4, r3, #2
-       ble     7f
-8:     ldr     r3, [r0, r12]
-       ldr     r2, [r1, r12]
-       uqadd8  r3, r3, r2
-       str     r3, [r1, r12]
-       add     r12, r12, #4
-       cmp     r12, r4
-       bne     8b
-       sub     r3, r9, #4
-       bic     r3, r3, #3
-       add     r3, r3, #4
-       subs    r12, r6, r5, lsl #2
-       add     r1, r1, r3
-       add     r0, r0, r3
-       beq     1b
-7:     mov     r4, #0
-9:     ldrb    r3, [r1, r4]
-       ldrb    r2, [r0, r4]
-       uqadd8  r3, r2, r3
-       strb    r3, [r1, r4]
-       add     r4, r4, #1
-       cmp     r4, r12
-       bne     9b
-       ldr     r3, [sp]
-       add     r8, r8, r3
-       ldr     r3, [sp, #40]
-       add     r7, r7, r3
-       b       10b
-.endfunc
-
-pixman_asm_function pixman_composite_over_8888_8888_asm_armv6
-       push    {r4, r5, r6, r7, r8, r9, r10, r11}
-       sub     sp, sp, #20
-       cmp     r1, #0
-       mov     r12, r2
-       str     r1, [sp, #12]
-       str     r0, [sp, #16]
-       ldr     r2, [sp, #52]
-       beq     0f
-       lsl     r3, r3, #2
-       str     r3, [sp]
-       ldr     r3, [sp, #56]
-       mov     r10, #0
-       lsl     r3, r3, #2
-       str     r3, [sp, #8]
-       mov     r11, r3
-       b       1f
-6:     ldr     r11, [sp, #8]
-1:     ldr     r9, [sp]
-       mov     r0, r12
-       add     r12, r12, r9
-       mov     r1, r2
-       str     r12, [sp, #4]
-       add     r2, r2, r11
-       ldr     r12, [sp, #16]
-       ldr     r3, =0x00800080
-       ldr     r9, =0xff00ff00
-       mov     r11, #255
-       cmp     r12, #0
-       beq     4f
-5:     ldr     r5, [r1], #4
-       ldr     r4, [r0]
-       sub     r8, r11, r5, lsr #24
-       uxtb16  r6, r4
-       uxtb16  r7, r4, ror #8
-       mla     r6, r6, r8, r3
-       mla     r7, r7, r8, r3
-       uxtab16 r6, r6, r6, ror #8
-       uxtab16 r7, r7, r7, ror #8
-       and     r7, r7, r9
-       uxtab16 r6, r7, r6, ror #8
-       uqadd8  r5, r6, r5
-       str     r5, [r0], #4
-       subs    r12, r12, #1
-       bne     5b
-4:     ldr     r3, [sp, #12]
-       add     r10, r10, #1
-       cmp     r10, r3
-       ldr     r12, [sp, #4]
-       bne     6b
-0:     add     sp, sp, #20
-       pop     {r4, r5, r6, r7, r8, r9, r10, r11}
-       bx      lr
-.endfunc
-
-pixman_asm_function pixman_composite_over_8888_n_8888_asm_armv6
-       push    {r4, r5, r6, r7, r8, r9, r10, r11}
-       sub     sp, sp, #28
-       cmp     r1, #0
-       str     r1, [sp, #12]
-       ldrb    r1, [sp, #71]
-       mov     r12, r2
-       str     r0, [sp, #16]
-       ldr     r2, [sp, #60]
-       str     r1, [sp, #24]
-       beq     0f
-       lsl     r3, r3, #2
-       str     r3, [sp, #20]
-       ldr     r3, [sp, #64]
-       mov     r10, #0
-       lsl     r3, r3, #2
-       str     r3, [sp, #8]
-       mov     r11, r3
-       b       1f
-5:     ldr     r11, [sp, #8]
-1:     ldr     r4, [sp, #20]
-       mov     r0, r12
-       mov     r1, r2
-       add     r12, r12, r4
-       add     r2, r2, r11
-       str     r12, [sp]
-       str     r2, [sp, #4]
-       ldr     r12, [sp, #16]
-       ldr     r2, =0x00800080
-       ldr     r3, [sp, #24]
-       mov     r11, #255
-       cmp     r12, #0
-       beq     3f
-4:     ldr     r5, [r1], #4
-       ldr     r4, [r0]
-       uxtb16  r6, r5
-       uxtb16  r7, r5, ror #8
-       mla     r6, r6, r3, r2
-       mla     r7, r7, r3, r2
-       uxtab16 r6, r6, r6, ror #8
-       uxtab16 r7, r7, r7, ror #8
-       uxtb16  r6, r6, ror #8
-       uxtb16  r7, r7, ror #8
-       orr     r5, r6, r7, lsl #8
-       uxtb16  r6, r4
-       uxtb16  r7, r4, ror #8
-       sub     r8, r11, r5, lsr #24
-       mla     r6, r6, r8, r2
-       mla     r7, r7, r8, r2
-       uxtab16 r6, r6, r6, ror #8
-       uxtab16 r7, r7, r7, ror #8
-       uxtb16  r6, r6, ror #8
-       uxtb16  r7, r7, ror #8
-       orr     r6, r6, r7, lsl #8
-       uqadd8  r5, r6, r5
-       str     r5, [r0], #4
-       subs    r12, r12, #1
-       bne     4b
-3:     ldr     r1, [sp, #12]
-       add     r10, r10, #1
-       cmp     r10, r1
-       ldr     r12, [sp]
-       ldr     r2, [sp, #4]
-       bne     5b
-0:     add     sp, sp, #28
-       pop     {r4, r5, r6, r7, r8, r9, r10, r11}
-       bx      lr
-.endfunc
-
-pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6
-       push    {r4, r5, r6, r7, r8, r9, r10, r11}
-       sub     sp, sp, #28
-       cmp     r1, #0
-       ldr     r9, [sp, #60]
-       str     r1, [sp, #12]
-       bic     r1, r9, #-16777216
-       str     r1, [sp, #20]
-       mov     r12, r2
-       lsr     r1, r9, #8
-       ldr     r2, [sp, #20]
-       bic     r1, r1, #-16777216
-       bic     r2, r2, #65280
-       bic     r1, r1, #65280
-       str     r2, [sp, #20]
-       str     r0, [sp, #16]
-       str     r1, [sp, #4]
-       ldr     r2, [sp, #68]
-       beq     0f
-       lsl     r3, r3, #2
-       str     r3, [sp, #24]
-       mov     r0, #0
-       b       1f
-5:     ldr     r3, [sp, #24]
-1:     ldr     r4, [sp, #72]
-       mov     r10, r12
-       mov     r1, r2
-       add     r12, r12, r3
-       add     r2, r2, r4
-       str     r12, [sp, #8]
-       str     r2, [sp]
-       ldr     r12, [sp, #16]
-       ldr     r11, =0x00800080
-       ldr     r2, [sp, #4]
-       ldr     r3, [sp, #20]
-       cmp     r12, #0
-       beq     3f
-4:     ldrb    r5, [r1], #1
-       ldr     r4, [r10]
-       mla     r6, r3, r5, r11
-       mla     r7, r2, r5, r11
-       uxtab16 r6, r6, r6, ror #8
-       uxtab16 r7, r7, r7, ror #8
-       uxtb16  r6, r6, ror #8
-       uxtb16  r7, r7, ror #8
-       orr     r5, r6, r7, lsl #8
-       uxtb16  r6, r4
-       uxtb16  r7, r4, ror #8
-       mvn     r8, r5
-       lsr     r8, r8, #24
-       mla     r6, r6, r8, r11
-       mla     r7, r7, r8, r11
-       uxtab16 r6, r6, r6, ror #8
-       uxtab16 r7, r7, r7, ror #8
-       uxtb16  r6, r6, ror #8
-       uxtb16  r7, r7, ror #8
-       orr     r6, r6, r7, lsl #8
-       uqadd8  r5, r6, r5
-       str     r5, [r10], #4
-       subs    r12, r12, #1
-       bne     4b
-3:     ldr     r4, [sp, #12]
-       add     r0, r0, #1
-       cmp     r0, r4
-       ldr     r12, [sp, #8]
-       ldr     r2, [sp]
-       bne     5b
-0:     add     sp, sp, #28
-       pop     {r4, r5, r6, r7, r8, r9, r10, r11}
-       bx      lr
-.endfunc
+.macro blit_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+        pixld   cond, numbytes, firstreg, SRC, unaligned_src
+.endm

-/*
- * Note: This code is only using armv5te instructions (not even armv6),
- *       but is scheduled for ARM Cortex-A8 pipeline. So it might need to
- *       be split into a few variants, tuned for each microarchitecture.
- *
- * TODO: In order to get good performance on ARM9/ARM11 cores (which don't
- * have efficient write combining), it needs to be changed to use 16-byte
- * aligned writes using STM instruction.
- *
- * Nearest scanline scaler macro template uses the following arguments:
- *  fname                     - name of the function to generate
- *  bpp_shift                 - (1 << bpp_shift) is the size of pixel in bytes
- *  t                         - type suffix for LDR/STR instructions
- *  prefetch_distance         - prefetch in the source image by that many
- *                              pixels ahead
- *  prefetch_braking_distance - stop prefetching when that many pixels are
- *                              remaining before the end of scanline
+generate_composite_function \
+    pixman_composite_src_8888_8888_asm_armv6, 32, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_COND_EXEC, \
+    3, /* prefetch distance */ \
+    nop_macro, /* init */ \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    blit_process_head, \
+    nop_macro /* process tail */
+
+generate_composite_function \
+    pixman_composite_src_0565_0565_asm_armv6, 16, 0, 16, \
+    FLAG_DST_WRITEONLY | FLAG_COND_EXEC, \
+    3, /* prefetch distance */ \
+    nop_macro, /* init */ \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    blit_process_head, \
+    nop_macro /* process tail */
+
+generate_composite_function \
+    pixman_composite_src_8_8_asm_armv6, 8, 0, 8, \
+    FLAG_DST_WRITEONLY | FLAG_COND_EXEC, \
+    3, /* prefetch distance */ \
+    nop_macro, /* init */ \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    blit_process_head, \
+    nop_macro /* process tail */
+
+/******************************************************************************/
+
+.macro src_n_8888_init
+        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
+        mov     STRIDE_S, SRC
+        mov     MASK, SRC
+        mov     STRIDE_M, SRC
+.endm
+
+.macro src_n_0565_init
+        ldrh    SRC, [sp, #ARGS_STACK_OFFSET]
+        orr     SRC, SRC, lsl #16
+        mov     STRIDE_S, SRC
+        mov     MASK, SRC
+        mov     STRIDE_M, SRC
+.endm
+
+.macro src_n_8_init
+        ldrb    SRC, [sp, #ARGS_STACK_OFFSET]
+        orr     SRC, SRC, lsl #8
+        orr     SRC, SRC, lsl #16
+        mov     STRIDE_S, SRC
+        mov     MASK, SRC
+        mov     STRIDE_M, SRC
+.endm
+
+.macro fill_process_tail  cond, numbytes, firstreg
+    WK4     .req    SRC
+    WK5     .req    STRIDE_S
+    WK6     .req    MASK
+    WK7     .req    STRIDE_M
+        pixst   cond, numbytes, 4, DST
+    .unreq  WK4
+    .unreq  WK5
+    .unreq  WK6
+    .unreq  WK7
+.endm
+
+generate_composite_function \
+    pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE \
+    0, /* prefetch distance doesn't apply */ \
+    src_n_8888_init \
+    nop_macro, /* newline */ \
+    nop_macro /* cleanup */ \
+    nop_macro /* process head */ \
+    fill_process_tail
+
+generate_composite_function \
+    pixman_composite_src_n_0565_asm_armv6, 0, 0, 16, \
+    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE \
+    0, /* prefetch distance doesn't apply */ \
+    src_n_0565_init \
+    nop_macro, /* newline */ \
+    nop_macro /* cleanup */ \
+    nop_macro /* process head */ \
+    fill_process_tail
+
+generate_composite_function \
+    pixman_composite_src_n_8_asm_armv6, 0, 0, 8, \
+    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE \
+    0, /* prefetch distance doesn't apply */ \
+    src_n_8_init \
+    nop_macro, /* newline */ \
+    nop_macro /* cleanup */ \
+    nop_macro /* process head */ \
+    fill_process_tail
+
+/******************************************************************************/
+
+.macro src_x888_8888_pixel, cond, reg
+        orr&cond WK&reg, WK&reg, #0xFF000000
+.endm
+
+.macro pixman_composite_src_x888_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+        pixld   cond, numbytes, firstreg, SRC, unaligned_src
+.endm
+
+.macro pixman_composite_src_x888_8888_process_tail   cond, numbytes, firstreg
+        src_x888_8888_pixel cond, %(firstreg+0)
+ .if numbytes >= 8
+        src_x888_8888_pixel cond, %(firstreg+1)
+  .if numbytes == 16
+        src_x888_8888_pixel cond, %(firstreg+2)
+        src_x888_8888_pixel cond, %(firstreg+3)
+  .endif
+ .endif
+.endm
+
+generate_composite_function \
+    pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_COND_EXEC, \
+    3, /* prefetch distance */ \
+    nop_macro, /* init */ \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    pixman_composite_src_x888_8888_process_head, \
+    pixman_composite_src_x888_8888_process_tail
+
+/******************************************************************************/
+
+.macro src_0565_8888_init
+        /* Hold loop invariants in MASK and STRIDE_M */
+        ldr     MASK, =0x07E007E0
+        mov     STRIDE_M, #0xFF000000
+        /* Set GE[3:0] to 1010 so SEL instructions do what we want */
+        ldr     SCRATCH, =0x80008000
+        uadd8   SCRATCH, SCRATCH, SCRATCH
+.endm
+
+.macro src_0565_8888_2pixels, reg1, reg2
+        and     SCRATCH, WK&reg1, MASK             @ 00000GGGGGG0000000000gggggg00000
+        bic     WK&reg2, WK&reg1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb
+        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg
+        mov     WK&reg1, WK&reg2, lsl #16          @ rrrrr000000bbbbb0000000000000000
+        mov     SCRATCH, SCRATCH, ror #19          @ GGGG0000ggggggggggg00000GGGGGGGG
+        bic     WK&reg2, WK&reg2, WK&reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
+        orr     WK&reg1, WK&reg1, WK&reg1, lsr #5  @ rrrrrrrrrr0bbbbbbbbbb00000000000
+        orr     WK&reg2, WK&reg2, WK&reg2, lsr #5  @ RRRRRRRRRR0BBBBBBBBBB00000000000
+        pkhtb   WK&reg1, WK&reg1, WK&reg1, asr #5  @ rrrrrrrr--------bbbbbbbb--------
+        sel     WK&reg1, WK&reg1, SCRATCH          @ rrrrrrrrggggggggbbbbbbbb--------
+        mov     SCRATCH, SCRATCH, ror #16          @ ggg00000GGGGGGGGGGGG0000gggggggg
+        pkhtb   WK&reg2, WK&reg2, WK&reg2, asr #5  @ RRRRRRRR--------BBBBBBBB--------
+        sel     WK&reg2, WK&reg2, SCRATCH          @ RRRRRRRRGGGGGGGGBBBBBBBB--------
+        orr     WK&reg1, STRIDE_M, WK&reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
+        orr     WK&reg2, STRIDE_M, WK&reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
+.endm
+
+/* This version doesn't need STRIDE_M, but is one instruction longer.
+   It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?
+        and     SCRATCH, WK&reg1, MASK             @ 00000GGGGGG0000000000gggggg00000
+        bic     WK&reg1, WK&reg1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb
+        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg
+        mov     WK&reg2, WK&reg1, lsr #16          @ 0000000000000000RRRRR000000BBBBB
+        mov     SCRATCH, SCRATCH, ror #27          @ GGGGGGGGGGGG0000ggggggggggg00000
+        bic     WK&reg1, WK&reg1, WK&reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
+        mov     WK&reg2, WK&reg2, lsl #3           @ 0000000000000RRRRR000000BBBBB000
+        mov     WK&reg1, WK&reg1, lsl #3           @ 0000000000000rrrrr000000bbbbb000
+        orr     WK&reg2, WK&reg2, WK&reg2, lsr #5  @ 0000000000000RRRRRRRRRR0BBBBBBBB
+        orr     WK&reg1, WK&reg1, WK&reg1, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb
+        pkhbt   WK&reg2, WK&reg2, WK&reg2, lsl #5  @ --------RRRRRRRR--------BBBBBBBB
+        pkhbt   WK&reg1, WK&reg1, WK&reg1, lsl #5  @ --------rrrrrrrr--------bbbbbbbb
+        sel     WK&reg2, SCRATCH, WK&reg2          @ --------RRRRRRRRGGGGGGGGBBBBBBBB
+        sel     WK&reg1, SCRATCH, WK&reg1          @ --------rrrrrrrrggggggggbbbbbbbb
+        orr     WK&reg2, WK&reg2, #0xFF000000      @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
+        orr     WK&reg1, WK&reg1, #0xFF000000      @ 11111111rrrrrrrrggggggggbbbbbbbb
+*/
+
+.macro src_0565_8888_1pixel, reg
+        bic     SCRATCH, WK&reg, MASK              @ 0000000000000000rrrrr000000bbbbb
+        and     WK&reg, WK&reg, MASK               @ 000000000000000000000gggggg00000
+        mov     SCRATCH, SCRATCH, lsl #3           @ 0000000000000rrrrr000000bbbbb000
+        mov     WK&reg, WK&reg, lsl #5             @ 0000000000000000gggggg0000000000
+        orr     SCRATCH, SCRATCH, SCRATCH, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb
+        orr     WK&reg, WK&reg, WK&reg, lsr #6     @ 000000000000000gggggggggggg00000
+        pkhbt   SCRATCH, SCRATCH, SCRATCH, lsl #5  @ --------rrrrrrrr--------bbbbbbbb
+        sel     WK&reg, WK&reg, SCRATCH            @ --------rrrrrrrrggggggggbbbbbbbb
+        orr     WK&reg, WK&reg, #0xFF000000        @ 11111111rrrrrrrrggggggggbbbbbbbb
+.endm
+
+.macro src_0565_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ .if numbytes == 16
+        pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src
+ .elseif numbytes == 8
+        pixld   , 4, firstreg, SRC, unaligned_src
+ .elseif numbytes == 4
+        pixld   , 2, firstreg, SRC, unaligned_src
+ .endif
+.endm
+
+.macro src_0565_8888_process_tail   cond, numbytes, firstreg
+ .if numbytes == 16
+        src_0565_8888_2pixels firstreg, %(firstreg+1)
+        src_0565_8888_2pixels %(firstreg+2), %(firstreg+3)
+ .elseif numbytes == 8
+        src_0565_8888_2pixels firstreg, %(firstreg+1)
+ .else
+        src_0565_8888_1pixel firstreg
+ .endif
+.endm
+
+generate_composite_function \
+    pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
+    3, /* prefetch distance */ \
+    src_0565_8888_init, \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    src_0565_8888_process_head, \
+    src_0565_8888_process_tail
+
+/******************************************************************************/
+
+.macro add_8_8_8pixels  cond, dst1, dst2
+        uqadd8&cond  WK&dst1, WK&dst1, MASK
+        uqadd8&cond  WK&dst2, WK&dst2, STRIDE_M
+.endm
+
+.macro add_8_8_4pixels  cond, dst
+        uqadd8&cond  WK&dst, WK&dst, MASK
+.endm
+
+.macro add_8_8_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+    WK4     .req    MASK
+    WK5     .req    STRIDE_M
+ .if numbytes == 16
+        pixld   cond, 8, 4, SRC, unaligned_src
+        pixld   cond, 16, firstreg, DST, 0
+        add_8_8_8pixels cond, firstreg, %(firstreg+1)
+        pixld   cond, 8, 4, SRC, unaligned_src
+ .else
+        pixld   cond, numbytes, 4, SRC, unaligned_src
+        pixld   cond, numbytes, firstreg, DST, 0
+ .endif
+    .unreq  WK4
+    .unreq  WK5
+.endm
+
+.macro add_8_8_process_tail  cond, numbytes, firstreg
+ .if numbytes == 16
+        add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3)
+ .elseif numbytes == 8
+        add_8_8_8pixels cond, firstreg, %(firstreg+1)
+ .else
+        add_8_8_4pixels cond, firstreg
+ .endif
+.endm
+
+generate_composite_function \
+    pixman_composite_add_8_8_asm_armv6, 8, 0, 8, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER, \
+    2, /* prefetch distance */ \
+    nop_macro, /* init */ \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    add_8_8_process_head, \
+    add_8_8_process_tail
+
+/******************************************************************************/
+
+.macro over_8888_8888_init
+        /* Hold loop invariant in MASK */
+        ldr     MASK, =0x00800080
+        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+        uadd8   SCRATCH, MASK, MASK
+        line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
+.endm
+
+.macro over_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+    WK4     .req    STRIDE_D
+    WK5     .req    STRIDE_S
+    WK6     .req    STRIDE_M
+    WK7     .req    ORIG_W
+        pixld   , numbytes, %(4+firstreg), SRC, unaligned_src
+        pixld   , numbytes, firstreg, DST, 0
+    .unreq  WK4
+    .unreq  WK5
+    .unreq  WK6
+    .unreq  WK7
+.endm
+
+.macro over_8888_8888_check_transparent  numbytes, reg0, reg1, reg2, reg3
+        /* Since these colours are premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
+        teq     WK&reg0, #0
+ .if numbytes > 4
+        teqeq   WK&reg1, #0
+  .if numbytes > 8
+        teqeq   WK&reg2, #0
+        teqeq   WK&reg3, #0
+  .endif
+ .endif
+.endm
+
+.macro over_8888_8888_prepare  next
+        mov     WK&next, WK&next, lsr #24
+.endm
+
+.macro over_8888_8888_1pixel src, dst, offset, next
+        /* src = destination component multiplier */
+        rsb     WK&src, WK&src, #255
+        /* Split even/odd bytes of dst into SCRATCH/dst */
+        uxtb16  SCRATCH, WK&dst
+        uxtb16  WK&dst, WK&dst, ror #8
+        /* Multiply through, adding 0.5 to the upper byte of result for rounding */
+        mla     SCRATCH, SCRATCH, WK&src, MASK
+        mla     WK&dst, WK&dst, WK&src, MASK
+        /* Where we would have had a stall between the result of the first MLA and the shifter input,
+         * reload the complete source pixel */
+        ldr     WK&src, [SRC, #offset]
+        /* Multiply by 257/256 to approximate 256/255 */
+        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
+        /* In this stall, start processing the next pixel */
+ .if offset < -4
+        mov     WK&next, WK&next, lsr #24
+ .endif
+        uxtab16 WK&dst, WK&dst, WK&dst, ror #8
+        /* Recombine even/odd bytes of multiplied destination */
+        mov     SCRATCH, SCRATCH, ror #8
+        sel     WK&dst, SCRATCH, WK&dst
+        /* Saturated add of source to multiplied destination */
+        uqadd8  WK&dst, WK&dst, WK&src
+.endm
+
+.macro over_8888_8888_process_tail  cond, numbytes, firstreg
+    WK4     .req    STRIDE_D
+    WK5     .req    STRIDE_S
+    WK6     .req    STRIDE_M
+    WK7     .req    ORIG_W
+        over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg)
+        beq     10f
+        over_8888_8888_prepare  %(4+firstreg)
+ .set PROCESS_REG, firstreg
+ .set PROCESS_OFF, -numbytes
+ .rept numbytes / 4
+        over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
+  .set PROCESS_REG, PROCESS_REG+1
+  .set PROCESS_OFF, PROCESS_OFF+4
+ .endr
+        pixst   , numbytes, firstreg, DST
+10:
+    .unreq  WK4
+    .unreq  WK5
+    .unreq  WK6
+    .unreq  WK7
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_8888_asm_armv6, 32, 0, 32 \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
+    2, /* prefetch distance */ \
+    over_8888_8888_init, \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    over_8888_8888_process_head, \
+    over_8888_8888_process_tail
+
+/******************************************************************************/
+
+/* Multiply each byte of a word by a byte.
+ * Useful when there aren't any obvious ways to fill the stalls with other instructions.
+ * word  Register containing 4 bytes
+ * byte  Register containing byte multiplier (bits 8-31 must be 0)
+ * tmp   Scratch register
+ * half  Register containing the constant 0x00800080
+ * GE[3:0] bits must contain 0101
  */
+.macro mul_8888_8  word, byte, tmp, half
+        /* Split even/odd bytes of word apart */
+        uxtb16  tmp, word
+        uxtb16  word, word, ror #8
+        /* Multiply bytes together with rounding, then by 257/256 */
+        mla     tmp, tmp, byte, half
+        mla     word, word, byte, half /* 1 stall follows */
+        uxtab16 tmp, tmp, tmp, ror #8  /* 1 stall follows */
+        uxtab16 word, word, word, ror #8
+        /* Recombine bytes */
+        mov     tmp, tmp, ror #8
+        sel     word, tmp, word
+.endm
+
+/******************************************************************************/
+
+.macro over_8888_n_8888_init
+        /* Mask is constant */
+        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
+        /* Hold loop invariant in STRIDE_M */
+        ldr     STRIDE_M, =0x00800080
+        /* We only want the alpha bits of the constant mask */
+        mov     MASK, MASK, lsr #24
+        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+        uadd8   SCRATCH, STRIDE_M, STRIDE_M
+        line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W
+.endm
+
+.macro over_8888_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+    WK4     .req    Y
+    WK5     .req    STRIDE_D
+    WK6     .req    STRIDE_S
+    WK7     .req    ORIG_W
+        pixld   , numbytes, %(4+(firstreg%2)), SRC, unaligned_src
+        pixld   , numbytes, firstreg, DST, 0
+    .unreq  WK4
+    .unreq  WK5
+    .unreq  WK6
+    .unreq  WK7
+.endm
+
+.macro over_8888_n_8888_1pixel src, dst
+        mul_8888_8  WK&src, MASK, SCRATCH, STRIDE_M
+        sub     WK7, WK6, WK&src, lsr #24
+        mul_8888_8  WK&dst, WK7, SCRATCH, STRIDE_M
+        uqadd8  WK&dst, WK&dst, WK&src
+.endm
+
+.macro over_8888_n_8888_process_tail  cond, numbytes, firstreg
+    WK4     .req    Y
+    WK5     .req    STRIDE_D
+    WK6     .req    STRIDE_S
+    WK7     .req    ORIG_W
+        over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg)
+        beq     10f
+        mov     WK6, #255
+ .set PROCESS_REG, firstreg
+ .rept numbytes / 4
+  .if numbytes == 16 && PROCESS_REG == 2
+        /* We're using WK6 and WK7 as temporaries, so half way through
+         * 4 pixels, reload the second two source pixels but this time
+         * into WK4 and WK5 */
+        ldmdb   SRC, {WK4, WK5}
+  .endif
+        over_8888_n_8888_1pixel  %(4+(PROCESS_REG%2)), %(PROCESS_REG)
+  .set PROCESS_REG, PROCESS_REG+1
+ .endr
+        pixst   , numbytes, firstreg, DST
+10:
+    .unreq  WK4
+    .unreq  WK5
+    .unreq  WK6
+    .unreq  WK7
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_n_8888_asm_armv6, 32, 0, 32 \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
+    2, /* prefetch distance */ \
+    over_8888_n_8888_init, \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    over_8888_n_8888_process_head, \
+    over_8888_n_8888_process_tail
+
+/******************************************************************************/
+
+.macro over_n_8_8888_init
+        /* Source is constant, but splitting it into even/odd bytes is a loop invariant */
+        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
+        /* Not enough registers to hold this constant, but we still use it here to set GE[3:0] */
+        ldr     SCRATCH, =0x00800080
+        uxtb16  STRIDE_S, SRC
+        uxtb16  SRC, SRC, ror #8
+        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+        uadd8   SCRATCH, SCRATCH, SCRATCH
+        line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
+.endm
+
+.macro over_n_8_8888_newline
+        ldr     STRIDE_D, =0x00800080
+        b       1f
+ .ltorg
+1:
+.endm
+
+.macro over_n_8_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+    WK4     .req    STRIDE_M
+        pixld   , numbytes/4, 4, MASK, unaligned_mask
+        pixld   , numbytes, firstreg, DST, 0
+    .unreq  WK4
+.endm
+
+.macro over_n_8_8888_1pixel src, dst
+        uxtb    Y, WK4, ror #src*8
+        /* Trailing part of multiplication of source */
+        mla     SCRATCH, STRIDE_S, Y, STRIDE_D
+        mla     Y, SRC, Y, STRIDE_D
+        mov     ORIG_W, #255
+        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
+        uxtab16 Y, Y, Y, ror #8
+        mov     SCRATCH, SCRATCH, ror #8
+        sub     ORIG_W, ORIG_W, Y, lsr #24
+        sel     Y, SCRATCH, Y
+        /* Then multiply the destination */
+        mul_8888_8  WK&dst, ORIG_W, SCRATCH, STRIDE_D
+        uqadd8  WK&dst, WK&dst, Y
+.endm
+
+.macro over_n_8_8888_process_tail  cond, numbytes, firstreg
+    WK4     .req    STRIDE_M
+        teq     WK4, #0
+        beq     10f
+ .set PROCESS_REG, firstreg
+ .rept numbytes / 4
+        over_n_8_8888_1pixel  %(PROCESS_REG-firstreg), %(PROCESS_REG)
+  .set PROCESS_REG, PROCESS_REG+1
+ .endr
+        pixst   , numbytes, firstreg, DST
+10:
+    .unreq  WK4
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32 \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_ONLY_PRELOAD_WIDE \
+    2, /* prefetch distance */ \
+    over_n_8_8888_init, \
+    over_n_8_8888_newline, \
+    nop_macro, /* cleanup */ \
+    over_n_8_8888_process_head, \
+    over_n_8_8888_process_tail
+
+/******************************************************************************/

-.macro generate_nearest_scanline_func fname, bpp_shift, t,      \
-                                      prefetch_distance,        \
-                                      prefetch_braking_distance
-
-pixman_asm_function fname
-       W               .req    r0
-       DST             .req    r1
-       SRC             .req    r2
-       VX              .req    r3
-       UNIT_X          .req    ip
-       TMP1            .req    r4
-       TMP2            .req    r5
-       VXMASK          .req    r6
-       PF_OFFS         .req    r7
-       SRC_WIDTH_FIXED .req    r8
-
-       ldr     UNIT_X, [sp]
-       push    {r4, r5, r6, r7, r8, r10}
-       mvn     VXMASK, #((1 << bpp_shift) - 1)
-       ldr     SRC_WIDTH_FIXED, [sp, #28]
-
-       /* define helper macro */
-       .macro  scale_2_pixels
-               ldr&t       TMP1, [SRC, TMP1]
-               and     TMP2, VXMASK, VX, asr #(16 - bpp_shift)
-               adds    VX, VX, UNIT_X
-               str&t       TMP1, [DST], #(1 << bpp_shift)
-9:             subpls  VX, VX, SRC_WIDTH_FIXED
-               bpl     9b
-
-               ldr&t       TMP2, [SRC, TMP2]
-               and     TMP1, VXMASK, VX, asr #(16 - bpp_shift)
-               adds    VX, VX, UNIT_X
-               str&t       TMP2, [DST], #(1 << bpp_shift)
-9:             subpls  VX, VX, SRC_WIDTH_FIXED
-               bpl     9b
-       .endm
-
-       /* now do the scaling */
-       and     TMP1, VXMASK, VX, asr #(16 - bpp_shift)
-       adds    VX, VX, UNIT_X
-9:     subpls  VX, VX, SRC_WIDTH_FIXED
-       bpl     9b
-       subs    W, W, #(8 + prefetch_braking_distance)
-       blt     2f
-       /* calculate prefetch offset */
-       mov     PF_OFFS, #prefetch_distance
-       mla     PF_OFFS, UNIT_X, PF_OFFS, VX
-1:     /* main loop, process 8 pixels per iteration with prefetch */
-       pld     [SRC, PF_OFFS, asr #(16 - bpp_shift)]
-       add     PF_OFFS, UNIT_X, lsl #3
-       scale_2_pixels
-       scale_2_pixels
-       scale_2_pixels
-       scale_2_pixels
-       subs    W, W, #8
-       bge     1b
-2:
-       subs    W, W, #(4 - 8 - prefetch_braking_distance)
-       blt     2f
-1:     /* process the remaining pixels */
-       scale_2_pixels
-       scale_2_pixels
-       subs    W, W, #4
-       bge     1b
-2:
-       tst     W, #2
-       beq     2f
-       scale_2_pixels
-2:
-       tst     W, #1
-       ldrne&t     TMP1, [SRC, TMP1]
-       strne&t     TMP1, [DST]
-       /* cleanup helper macro */
-       .purgem scale_2_pixels
-       .unreq  DST
-       .unreq  SRC
-       .unreq  W
-       .unreq  VX
-       .unreq  UNIT_X
-       .unreq  TMP1
-       .unreq  TMP2
-       .unreq  VXMASK
-       .unreq  PF_OFFS
-       .unreq  SRC_WIDTH_FIXED
-       /* return */
-       pop     {r4, r5, r6, r7, r8, r10}
-       bx      lr
-.endfunc
-.endm
-
-generate_nearest_scanline_func \
-    pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32
-
-generate_nearest_scanline_func \
-    pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2,  , 48, 32
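
P.S. If you want to check the 0565-to-8888 bit-shuffling against something
more readable, this is the per-pixel operation the src_0565_8888_* macros
implement, as scalar C (illustrative only; the asm works on one or two
pixels at a time in register pairs):

    #include <stdint.h>

    /* Expand one r5g6b5 pixel to a8r8g8b8 by replicating the top bits of
     * each field into the vacated low bits, then forcing alpha to 0xFF,
     * matching the "orr ..., lsr #5/#6" replication steps above. */
    static inline uint32_t convert_0565_to_8888(uint16_t p)
    {
        uint32_t r = (p >> 11) & 0x1f;
        uint32_t g = (p >> 5)  & 0x3f;
        uint32_t b =  p        & 0x1f;
        r = (r << 3) | (r >> 2);   /* 5 bits -> 8 bits */
        g = (g << 2) | (g >> 4);   /* 6 bits -> 8 bits */
        b = (b << 3) | (b >> 2);
        return 0xff000000u | (r << 16) | (g << 8) | b;
    }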