I've had this for a long time but haven't submitted it until now.

I've tried to mirror the ia32 setup, so there should be a new subdirectory
src/libFLAC/ppc. The first two attachments go there. The third is a context
diff for src/libFLAC/Makefile.am.

I have some more modified files, which I figured I'd submit after the above
are checked in and working for somebody other than me. If you would prefer me
to submit them all at once, let me know.

I've done some testing and am reasonably confident in it. It seems to provide
a decent performance improvement for decoding.

--
Brady Patterson ([EMAIL PROTECTED])
Give us the Destruct-O-Beam!
; lpc_asm.s -- first attachment, for src/libFLAC/ppc
; $Name$ $Revision$ $Date$

.text
        .align 2
.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16
.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8

_FLAC__lpc_restore_signal_asm_ppc_altivec_16:
;       r3: residual[]
;       r4: data_len
;       r5: qlp_coeff[]
;       r6: order
;       r7: lp_quantization
;       r8: data[]

; see src/libFLAC/lpc.c:FLAC__lpc_restore_signal()
; this is a PowerPC/AltiVec assembly version which requires bps<=16 (or an
; actual bps<=15 for mid-side coding, since that uses an extra bit)
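; (the limit comes from vmulosh below, which multiplies only the low-order
; 16 bits of each 32-bit element, so samples and coefficients must both fit
; in 16 bits)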

; this should be fast; the inner loop is unrolled (it takes no more than
; 3*(order/4) instructions, all of which are arithmetic), and all of the
; coefficients and all relevant history stay in registers, so the outer loop
; has only one load from memory (the residual)

; I haven't yet run this through simg4, so there may be some avoidable stalls,
; and there may be a somewhat more clever way to do the outer loop

; the branch mechanism may prevent dynamic loading; I still need to examine
; this issue, and there may be a more elegant method
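
; for reference, the scalar loop this routine implements (paraphrased from
; FLAC__lpc_restore_signal() in lpc.c):
;
;       for(i = 0; i < data_len; i++) {
;               sum = 0;
;               for(j = 0; j < order; j++)
;                       sum += qlp_coeff[j] * data[i-j-1];
;               data[i] = residual[i] + (sum >> lp_quantization);
;       }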

        stmw r31,-4(r1)

        addi r9,r1,-28
        li r31,0xf
        andc r9,r9,r31 ; for quadword-aligned stack data

        slwi r6,r6,2 ; adjust for word size
        slwi r4,r4,2
        add r4,r4,r8 ; r4 = data+data_len

        mfspr r0,256 ; cache old vrsave
        addis r31,0,hi16(0xfffffc00)
        ori r31,r31,lo16(0xfffffc00)
        mtspr 256,r31 ; declare VRs in vrsave

        cmplw cr0,r8,r4 ; i<data_len
        bc 4,0,L1400 ; if data_len == 0, skip the loop entirely

        ; load coefficients into v0-v7 and initial history into v8-v15
        li r31,0xf
        and r31,r8,r31 ; r31: byte offset of data within its quadword (4*(data%4))
        li r11,16
        subf r31,r31,r11 ; r31: bytes to the next quadword boundary (4*(4-(data%4)))
        slwi r31,r31,3 ; convert bytes to bits for vsro
        li r10,-4
        stw r31,-4(r9)
        lvewx v0,r10,r9
        vspltisb v18,-1
        vsro v18,v18,v0 ; v18: mask vector

        li r31,0x8
        lvsl v0,0,r31
        vsldoi v0,v0,v0,12
        li r31,0xc
        lvsl v1,0,r31
        vspltisb v2,0
        vspltisb v3,-1
        vmrglw v2,v2,v3
        vsel v0,v1,v0,v2 ; v0: reversal permutation vector

        add r10,r5,r6 ; r10: end of qlp_coeff[]
        lvsl v17,0,r5 ; v17: coefficient alignment permutation vector
        vperm v17,v17,v17,v0 ; v17: reversal coefficient alignment permutation vector

        mr r11,r8
        lvsl v16,0,r11 ; v16: history alignment permutation vector

        lvx v0,0,r5
        addi r5,r5,16
        lvx v1,0,r5
        vperm v0,v0,v1,v17
        lvx v8,0,r11
        addi r11,r11,-16
        lvx v9,0,r11
        vperm v8,v9,v8,v16
        cmplw cr0,r5,r10
        bc 12,0,L1101
        vand v0,v0,v18
        addis r31,0,hi16(L1307)
        ori r31,r31,lo16(L1307)
        b L1199

L1101:
        addi r5,r5,16
        lvx v2,0,r5
        vperm v1,v1,v2,v17
        addi r11,r11,-16
        lvx v10,0,r11
        vperm v9,v10,v9,v16
        cmplw cr0,r5,r10
        bc 12,0,L1102
        vand v1,v1,v18
        addis r31,0,hi16(L1306)
        ori r31,r31,lo16(L1306)
        b L1199

L1102:
        addi r5,r5,16
        lvx v3,0,r5
        vperm v2,v2,v3,v17
        addi r11,r11,-16
        lvx v11,0,r11
        vperm v10,v11,v10,v16
        cmplw cr0,r5,r10
        bc 12,0,L1103
        vand v2,v2,v18
        addis r31,0,hi16(L1305)
        ori r31,r31,lo16(L1305)
        b L1199

L1103:
        addi r5,r5,16
        lvx v4,0,r5
        vperm v3,v3,v4,v17
        addi r11,r11,-16
        lvx v12,0,r11
        vperm v11,v12,v11,v16
        cmplw cr0,r5,r10
        bc 12,0,L1104
        vand v3,v3,v18
        addis r31,0,hi16(L1304)
        ori r31,r31,lo16(L1304)
        b L1199

L1104:
        addi r5,r5,16
        lvx v5,0,r5
        vperm v4,v4,v5,v17
        addi r11,r11,-16
        lvx v13,0,r11
        vperm v12,v13,v12,v16
        cmplw cr0,r5,r10
        bc 12,0,L1105
        vand v4,v4,v18
        addis r31,0,hi16(L1303)
        ori r31,r31,lo16(L1303)
        b L1199

L1105:
        addi r5,r5,16
        lvx v6,0,r5
        vperm v5,v5,v6,v17
        addi r11,r11,-16
        lvx v14,0,r11
        vperm v13,v14,v13,v16
        cmplw cr0,r5,r10
        bc 12,0,L1106
        vand v5,v5,v18
        addis r31,0,hi16(L1302)
        ori r31,r31,lo16(L1302)
        b L1199

L1106:
        addi r5,r5,16
        lvx v7,0,r5
        vperm v6,v6,v7,v17
        addi r11,r11,-16
        lvx v15,0,r11
        vperm v14,v15,v14,v16
        cmplw cr0,r5,r10
        bc 12,0,L1107
        vand v6,v6,v18
        addis r31,0,hi16(L1301)
        ori r31,r31,lo16(L1301)
        b L1199

L1107:
        addi r5,r5,16
        lvx v19,0,r5
        vperm v7,v7,v19,v17
        addi r11,r11,-16
        lvx v19,0,r11
        vperm v15,v19,v15,v16
        vand v7,v7,v18
        addis r31,0,hi16(L1300)
        ori r31,r31,lo16(L1300)

L1199:
        mtctr r31
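        ; ctr now holds the entry point into the unrolled multiply-accumulate
        ; chain below: L1307 for order <= 4, back through L1300 for the full
        ; 32-tap case; each pass of the outer loop jumps into it via bcctr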

        ; set up invariant vectors
        vspltish v16,0 ; v16: zero vector

        li r10,-12
        lvsr v17,r10,r8 ; v17: result shift vector
        lvsl v18,r10,r3 ; v18: residual shift back vector

        li r10,-4
        stw r7,-4(r9)
        lvewx v19,r10,r9 ; v19: lp_quantization vector

L1200:
        vmulosh v20,v0,v8 ; v20: sum vector
        bcctr 20,0

L1300:
        vmulosh v21,v7,v15
        vsldoi v15,v15,v14,4 ; increment history
        vaddsws v20,v20,v21

L1301:
        vmulosh v21,v6,v14
        vsldoi v14,v14,v13,4
        vaddsws v20,v20,v21

L1302:
        vmulosh v21,v5,v13
        vsldoi v13,v13,v12,4
        vaddsws v20,v20,v21

L1303:
        vmulosh v21,v4,v12
        vsldoi v12,v12,v11,4
        vaddsws v20,v20,v21

L1304:
        vmulosh v21,v3,v11
        vsldoi v11,v11,v10,4
        vaddsws v20,v20,v21

L1305:
        vmulosh v21,v2,v10
        vsldoi v10,v10,v9,4
        vaddsws v20,v20,v21

L1306:
        vmulosh v21,v1,v9
        vsldoi v9,v9,v8,4
        vaddsws v20,v20,v21

L1307:
        vsumsws v20,v20,v16 ; v20[3]: sum
        vsraw v20,v20,v19 ; v20[3]: sum >> lp_quantization

        lvewx v21,0,r3 ; v21[n]: *residual
        vperm v21,v21,v21,v18 ; v21[3]: *residual
        vaddsws v20,v21,v20 ; v20[3]: *residual + (sum >> lp_quantization)
        vsldoi v18,v18,v18,4 ; increment shift vector

        vperm v21,v20,v20,v17 ; v21[n]: shift for storage
        vsldoi v17,v17,v17,12 ; increment shift vector
        stvewx v21,0,r8

        vsldoi v20,v20,v20,12
        vsldoi v8,v8,v20,4 ; insert value onto history

        addi r3,r3,4
        addi r8,r8,4
        cmplw cr0,r8,r4 ; i<data_len
        bc 12,0,L1200

L1400:
        mtspr 256,r0 ; restore old vrsave
        lmw r31,-4(r1)
        blr

_FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8:
;       r3: residual[]
;       r4: data_len
;       r5: qlp_coeff[]
;       r6: order
;       r7: lp_quantization
;       r8: data[]

; see _FLAC__lpc_restore_signal_asm_ppc_altivec_16() above
; this version assumes order<=8; it uses fewer vector registers, which should
; save time in context switches, and has less code, which may improve
; instruction caching

        stmw r31,-4(r1)

        addi r9,r1,-28
        li r31,0xf
        andc r9,r9,r31 ; for quadword-aligned stack data

        slwi r6,r6,2 ; adjust for word size
        slwi r4,r4,2
        add r4,r4,r8 ; r4 = data+data_len

        mfspr r0,256 ; cache old vrsave
        addis r31,0,hi16(0xffc00000)
        ori r31,r31,lo16(0xffc00000)
        mtspr 256,r31 ; declare VRs in vrsave

        cmplw cr0,r8,r4 ; i<data_len
        bc 4,0,L2400 ; if data_len == 0, skip the loop entirely

        ; load coefficients into v0-v1 and initial history into v2-v3
        li r31,0xf
        and r31,r8,r31 ; r31: byte offset of data within its quadword (4*(data%4))
        li r11,16
        subf r31,r31,r11 ; r31: bytes to the next quadword boundary (4*(4-(data%4)))
        slwi r31,r31,3 ; convert bytes to bits for vsro
        li r10,-4
        stw r31,-4(r9)
        lvewx v0,r10,r9
        vspltisb v6,-1
        vsro v6,v6,v0 ; v6: mask vector

        li r31,0x8
        lvsl v0,0,r31
        vsldoi v0,v0,v0,12
        li r31,0xc
        lvsl v1,0,r31
        vspltisb v2,0
        vspltisb v3,-1
        vmrglw v2,v2,v3
        vsel v0,v1,v0,v2 ; v0: reversal permutation vector

        add r10,r5,r6 ; r10: end of qlp_coeff[]
        lvsl v5,0,r5 ; v5: coefficient alignment permutation vector
        vperm v5,v5,v5,v0 ; v5: reversal coefficient alignment permutation vector

        mr r11,r8
        lvsl v4,0,r11 ; v4: history alignment permutation vector

        lvx v0,0,r5
        addi r5,r5,16
        lvx v1,0,r5
        vperm v0,v0,v1,v5
        lvx v2,0,r11
        addi r11,r11,-16
        lvx v3,0,r11
        vperm v2,v3,v2,v4
        cmplw cr0,r5,r10
        bc 12,0,L2101
        vand v0,v0,v6
        addis r31,0,hi16(L2301)
        ori r31,r31,lo16(L2301)
        b L2199

L2101:
        addi r5,r5,16
        lvx v7,0,r5
        vperm v1,v1,v7,v5
        addi r11,r11,-16
        lvx v7,0,r11
        vperm v3,v7,v3,v4
        vand v1,v1,v6
        addis r31,0,hi16(L2300)
        ori r31,r31,lo16(L2300)

L2199:
        mtctr r31
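        ; as above, ctr selects the chain entry point: L2300 when order > 4,
        ; L2301 when order <= 4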

        ; set up invariant vectors
        vspltish v4,0 ; v4: zero vector

        li r10,-12
        lvsr v5,r10,r8 ; v5: result shift vector
        lvsl v6,r10,r3 ; v6: residual shift back vector

        li r10,-4
        stw r7,-4(r9)
        lvewx v7,r10,r9 ; v7: lp_quantization vector

L2200:
        vmulosh v8,v0,v2 ; v8: sum vector
        bcctr 20,0

L2300:
        vmulosh v9,v1,v3
        vsldoi v3,v3,v2,4
        vaddsws v8,v8,v9

L2301:
        vsumsws v8,v8,v4 ; v8[3]: sum
        vsraw v8,v8,v7 ; v8[3]: sum >> lp_quantization

        lvewx v9,0,r3 ; v9[n]: *residual
        vperm v9,v9,v9,v6 ; v9[3]: *residual
        vaddsws v8,v9,v8 ; v8[3]: *residual + (sum >> lp_quantization)
        vsldoi v6,v6,v6,4 ; increment shift vector

        vperm v9,v8,v8,v5 ; v9[n]: shift for storage
        vsldoi v5,v5,v5,12 ; increment shift vector
        stvewx v9,0,r8

        vsldoi v8,v8,v8,12
        vsldoi v2,v2,v8,4 ; insert value onto history

        addi r3,r3,4
        addi r8,r8,4
        cmplw cr0,r8,r4 ; i<data_len
        bc 12,0,L2200

L2400:
        mtspr 256,r0 ; restore old vrsave
        lmw r31,-4(r1)
        blr
# Makefile.am -- second attachment, for src/libFLAC/ppc
SUFFIXES = .s .lo

.s.lo:
        $(LIBTOOL) --mode=compile as -force_cpusubtype_ALL -o $@ $<
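# the .s.lo rule above runs the system assembler through libtool;
# -force_cpusubtype_ALL is the Apple as(1) flag that marks the object as
# runnable on any PowerPC subtype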

noinst_LTLIBRARIES = libFLAC-asm.la
libFLAC_asm_la_SOURCES = \
        lpc_asm.s
Index: Makefile.am
===================================================================
RCS file: /cvsroot/flac/flac/src/libFLAC/Makefile.am,v
retrieving revision 1.44
diff -c -r1.44 Makefile.am
*** Makefile.am 27 May 2003 05:07:16 -0000      1.44
--- Makefile.am 16 Sep 2003 06:34:15 -0000
***************
*** 44,49 ****
--- 44,53 ----
  libFLAC_la_LIBADD = -lm
  endif
  endif
+ if FLaC__CPU_PPC
+ ARCH_SUBDIRS = ppc
+ libFLAC_la_LIBADD = ppc/libFLAC-asm.la
+ endif
  endif
  
  SUBDIRS = $(ARCH_SUBDIRS) include .
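
A note on the diff: it relies on an automake conditional FLaC__CPU_PPC, which
nothing above defines (the definition belongs in configure.in). Until the rest
of my changes go in, anyone wanting to test can define it with a minimal
sketch along these lines, assuming AC_CANONICAL_SYSTEM is in effect so that
$host_cpu is set:

        dnl sketch only, not the actual configure.in change
        case "$host_cpu" in
                powerpc) cpu_ppc=true ;;
                *)       cpu_ppc=false ;;
        esac
        AM_CONDITIONAL(FLaC__CPU_PPC, test "x$cpu_ppc" = xtrue)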
