[Flac-dev] two small-ish optimizations (death by a thousand cuts)

Eric Wong Wed, 02 Feb 2005 23:19:29 -0800

This lpc_restore_signal was partially inspired by Miroslav's affd, though
my (not very great) ARM asm version resembled this as well.


The other two reduce CPU array indexing overhead in loops a little.

Additionally, a request for help:

 My not-very-optimized lpc_restore_signal is at the URL below.  I
 couldn't get the ldm* instructions to work as advertised, even though
 I've talked to several ARM asm hackers who said they looked right.  I
 can use the fp as a regular register since I'm compiling without
 it.  Comments within should explain what I'm having trouble with:
 
 http://archzoom.sourcecontrol.net/archzoom.cgi/[EMAIL 
PROTECTED]/flac--ipod--1.1.0--patch-19/src/libFLAC/arm/lpc_asm.s

-- 
Eric Wong

--- orig/src/libFLAC/lpc.c
+++ mod/src/libFLAC/lpc.c
@@ -293,6 +293,209 @@
 
 void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, 
const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 
data[])
 {
+       register const FLAC__int32 *qlp0 = qlp_coeff + (order ? order-1 : 0); /* last coefficient; the unrolled terms index backwards from it (qlp0[0], qlp0[-1], ...) */
+       register FLAC__int32 sum; /* prediction accumulator for the current sample */
+       register const FLAC__int32 *history, *qlp;
+       /* data[i] = residual[i] + ((sum over j<order of qlp_coeff[j] * data[i-1-j]) >> lp_quantization); data[-order..-1] must already hold decoded samples */
+       history = data - order; /* pointer subtraction, not data[-order]: negating the unsigned order gives a huge positive index where pointers are wider than unsigned */
+
+       switch (order) { /* orders 1..12 get a fully unrolled dot product; history slides forward one sample per output */
+       case 12:
+               for( ; data_len != 0; --data_len) {
+                       sum = (qlp0[0] * history[0])
+                           + (qlp0[-1] * history[1])
+                           + (qlp0[-2] * history[2])
+                           + (qlp0[-3] * history[3])
+                           + (qlp0[-4] * history[4])
+                           + (qlp0[-5] * history[5])
+                           + (qlp0[-6] * history[6])
+                           + (qlp0[-7] * history[7])
+                           + (qlp0[-8] * history[8])
+                           + (qlp0[-9] * history[9])
+                           + (qlp0[-10] * history[10])
+                           + (qlp0[-11] * history[11])
+                           ;
+                       ++history;
+                       *(data++) = *(residual++) + (sum >> lp_quantization);
+               }
+               return;
+       case 11:
+               for( ; data_len != 0; --data_len) {
+                       sum = (qlp0[0] * history[0])
+                           + (qlp0[-1] * history[1])
+                           + (qlp0[-2] * history[2])
+                           + (qlp0[-3] * history[3])
+                           + (qlp0[-4] * history[4])
+                           + (qlp0[-5] * history[5])
+                           + (qlp0[-6] * history[6])
+                           + (qlp0[-7] * history[7])
+                           + (qlp0[-8] * history[8])
+                           + (qlp0[-9] * history[9])
+                           + (qlp0[-10] * history[10])
+                           ;
+                       ++history;
+                       *(data++) = *(residual++) + (sum >> lp_quantization);
+               }
+               return;
+       case 10:
+               for( ; data_len != 0; --data_len) {
+                       sum = (qlp0[0] * history[0])
+                           + (qlp0[-1] * history[1])
+                           + (qlp0[-2] * history[2])
+                           + (qlp0[-3] * history[3])
+                           + (qlp0[-4] * history[4])
+                           + (qlp0[-5] * history[5])
+                           + (qlp0[-6] * history[6])
+                           + (qlp0[-7] * history[7])
+                           + (qlp0[-8] * history[8])
+                           + (qlp0[-9] * history[9])
+                           ;
+                       ++history;
+                       *(data++) = *(residual++) + (sum >> lp_quantization);
+               }
+               return;
+       case 9:
+               for( ; data_len != 0; --data_len) {
+                       sum = (qlp0[0] * history[0])
+                           + (qlp0[-1] * history[1])
+                           + (qlp0[-2] * history[2])
+                           + (qlp0[-3] * history[3])
+                           + (qlp0[-4] * history[4])
+                           + (qlp0[-5] * history[5])
+                           + (qlp0[-6] * history[6])
+                           + (qlp0[-7] * history[7])
+                           + (qlp0[-8] * history[8])
+                           ;
+                       ++history;
+                       *(data++) = *(residual++) + (sum >> lp_quantization);
+               }
+               return;
+       case 8:
+               for( ; data_len != 0; --data_len) {
+                       sum = (qlp0[0] * history[0])
+                           + (qlp0[-1] * history[1])
+                           + (qlp0[-2] * history[2])
+                           + (qlp0[-3] * history[3])
+                           + (qlp0[-4] * history[4])
+                           + (qlp0[-5] * history[5])
+                           + (qlp0[-6] * history[6])
+                           + (qlp0[-7] * history[7])
+                           ;
+                       ++history;
+                       *(data++) = *(residual++) + (sum >> lp_quantization);
+               }
+               return;
+       case 7:
+               for( ; data_len != 0; --data_len) {
+                       sum = (qlp0[0] * history[0])
+                           + (qlp0[-1] * history[1])
+                           + (qlp0[-2] * history[2])
+                           + (qlp0[-3] * history[3])
+                           + (qlp0[-4] * history[4])
+                           + (qlp0[-5] * history[5])
+                           + (qlp0[-6] * history[6])
+                           ;
+                       ++history;
+                       *(data++) = *(residual++) + (sum >> lp_quantization);
+               } 
+               return;
+       case 6:
+               for( ; data_len != 0; --data_len) {
+                       sum = (qlp0[0] * history[0])
+                           + (qlp0[-1] * history[1])
+                           + (qlp0[-2] * history[2])
+                           + (qlp0[-3] * history[3])
+                           + (qlp0[-4] * history[4])
+                           + (qlp0[-5] * history[5])
+                           ;
+                       ++history;
+                       *(data++) = *(residual++) + (sum >> lp_quantization);
+               } 
+               return;
+       case 5:
+               for( ; data_len != 0; --data_len) {
+                       sum = (qlp0[0] * history[0])
+                           + (qlp0[-1] * history[1])
+                           + (qlp0[-2] * history[2])
+                           + (qlp0[-3] * history[3])
+                           + (qlp0[-4] * history[4])
+                           ;
+                       ++history;
+                       *(data++) = *(residual++) + (sum >> lp_quantization);
+               } 
+               return;
+       case 4:
+               for( ; data_len != 0; --data_len) {
+                       sum = (qlp0[0] * history[0])
+                           + (qlp0[-1] * history[1])
+                           + (qlp0[-2] * history[2])
+                           + (qlp0[-3] * history[3])
+                           ;
+                       ++history;
+                       *(data++) = *(residual++) + (sum >> lp_quantization);
+               } 
+               return;
+       case 3:
+               for( ; data_len != 0; --data_len) {
+                       sum = (qlp0[0] * history[0])
+                           + (qlp0[-1] * history[1])
+                           + (qlp0[-2] * history[2])
+                           ;
+                       ++history;
+                       *(data++) = *(residual++) + (sum >> lp_quantization);
+               }
+               return;
+       case 2:
+               for( ; data_len != 0; --data_len) {
+                       sum = (qlp0[0] * history[0])
+                           + (qlp0[-1] * history[1])
+                           ;
+                       ++history;
+                       *(data++) = *(residual++) + (sum >> lp_quantization);
+               }
+               return;
+       case 1:
+               for( ; data_len != 0; --data_len) {
+                       sum = (qlp0[0] * (*(history++)));
+                       *(data++) = *(residual++) + (sum >> lp_quantization);
+               } 
+               return;
+       default:
+               { 
+                       /* handle everything else: (order > 12, or a degenerate order of 0)
+                        * with Duff's Device to reduce jumps */
+                       const unsigned n0 = (order + 7)/8; /* passes through the do-while: ceil(order/8) */
+                       const int tmp = -(int)order - 1; /* signed index of the slot just before the oldest history sample; history is pre-incremented */
+                       register const FLAC__int32 *qlpd = &qlp_coeff[order]; /* one past the last coefficient; qlp is pre-decremented */
+                       for( ; data_len != 0; --data_len) {
+                               register unsigned n = n0;
+                               sum = 0;
+                               qlp = qlpd;
+                               history = &data[tmp];
+                               /* order == 0 gives n0 == 0: skip the device so --n cannot wrap around; the sample is then just the residual */
+                               if(order != 0) switch(order%8) {
+                               case 0: do {
+                                       sum += (*(--qlp)) * (*(++history));
+                               case 7: sum += (*(--qlp)) * (*(++history));
+                               case 6: sum += (*(--qlp)) * (*(++history));
+                               case 5: sum += (*(--qlp)) * (*(++history));
+                               case 4: sum += (*(--qlp)) * (*(++history));
+                               case 3: sum += (*(--qlp)) * (*(++history));
+                               case 2: sum += (*(--qlp)) * (*(++history));
+                               case 1: sum += (*(--qlp)) * (*(++history));
+                                       } while (--n);     
+                               }
+                               
+                               *(data++) = *(residual++) + (sum >> 
lp_quantization);
+                       }
+                       return;
+               }
+       }
+}
+
+#if 0
+void FLAC__lpc_restore_signal_orig(const FLAC__int32 residual[], unsigned 
data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, 
FLAC__int32 data[])
+{
 #ifdef FLAC__OVERFLOW_DETECT
        FLAC__int64 sumo;
 #endif
@@ -339,6 +542,7 @@
        }
        */
 }
+#endif /* 0 */
 
 void FLAC__lpc_restore_signal_wide(const FLAC__int32 residual[], unsigned 
data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, 
FLAC__int32 data[])
 {

--- orig/src/libFLAC/bitbuffer.c
+++ mod/src/libFLAC/bitbuffer.c
@@ -1466,6 +1469,7 @@
 {
        unsigned i, bits_ = bits;
        FLAC__uint32 v = 0;
+       FLAC__blurb *bbb;
 
        FLAC__ASSERT(0 != bb);
        FLAC__ASSERT(0 != bb->buffer);
@@ -1485,18 +1489,20 @@
 #if FLAC__BITS_PER_BLURB > 8
        if(bb->bits == 0 || bb->consumed_blurbs < bb->blurbs) { /*@@@ comment 
on why this is here*/
 #endif
+               bbb = &bb->buffer[bb->consumed_blurbs];
                if(bb->consumed_bits) {
                        i = FLAC__BITS_PER_BLURB - bb->consumed_bits;
                        if(i <= bits_) {
-                               v = bb->buffer[bb->consumed_blurbs] & 
(FLAC__BLURB_ALL_ONES >> bb->consumed_bits);
+                               v = (*bbb) & (FLAC__BLURB_ALL_ONES >> 
bb->consumed_bits);
                                bits_ -= i;
-                               CRC16_UPDATE_BLURB(bb, 
bb->buffer[bb->consumed_blurbs], bb->read_crc16);
+                               CRC16_UPDATE_BLURB(bb, (*bbb), bb->read_crc16);
+                               ++bbb;
                                bb->consumed_blurbs++;
                                bb->consumed_bits = 0;
                                /* we hold off updating bb->total_consumed_bits 
until the end */
                        }
                        else {
-                               *val = (bb->buffer[bb->consumed_blurbs] & 
(FLAC__BLURB_ALL_ONES >> bb->consumed_bits)) >> (i-bits_);
+                               *val = ((*bbb) & (FLAC__BLURB_ALL_ONES >> 
bb->consumed_bits)) >> (i-bits_);
                                bb->consumed_bits += bits_;
                                bb->total_consumed_bits += bits_;
                                return true;
@@ -1516,9 +1522,10 @@
 #else
                while(bits_ >= FLAC__BITS_PER_BLURB) {
                        v <<= FLAC__BITS_PER_BLURB;
-                       v |= bb->buffer[bb->consumed_blurbs];
+                       v |= (*bbb);
                        bits_ -= FLAC__BITS_PER_BLURB;
-                       CRC16_UPDATE_BLURB(bb, bb->buffer[bb->consumed_blurbs], 
bb->read_crc16);
+                       CRC16_UPDATE_BLURB(bb, (*bbb), bb->read_crc16);
+                       ++bbb;
                        bb->consumed_blurbs++;
                        /* bb->consumed_bits is already 0 */
                        /* we hold off updating bb->total_consumed_bits until 
the end */
@@ -1526,7 +1533,7 @@
 #endif
                if(bits_ > 0) {
                        v <<= bits_;
-                       v |= (bb->buffer[bb->consumed_blurbs] >> 
(FLAC__BITS_PER_BLURB-bits_));
+                       v |= ((*bbb) >> (FLAC__BITS_PER_BLURB-bits_));
                        bb->consumed_bits = bits_;
                        /* we hold off updating bb->total_consumed_bits until 
the end */
                }
--- orig/src/libFLAC/stream_decoder.c
+++ mod/src/libFLAC/stream_decoder.c
@@ -74,6 +74,7 @@
  ***********************************************************************/
 
 static void set_defaults_(FLAC__StreamDecoder *decoder);
+static inline void read_channel_coding(FLAC__StreamDecoder *decoder);
 static FLAC__bool allocate_output_(FLAC__StreamDecoder *decoder, unsigned 
size, unsigned channels);
 static FLAC__bool has_id_filtered_(FLAC__StreamDecoder *decoder, FLAC__byte 
*id);
 static FLAC__bool find_metadata_(FLAC__StreamDecoder *decoder);
@@ -776,6 +768,54 @@
        decoder->private_->metadata_filter_ids_count = 0;
 }
 
+/* Undo any special channel coding: rewrites output[0]/output[1] in place, over header.blocksize samples, according to header.channel_assignment */
+static inline void read_channel_coding(FLAC__StreamDecoder *decoder)
+{
+       register FLAC__int32 left, right;
+       register unsigned i;
+       register FLAC__int32 *lchan, *rchan; /* cursors over output[0] and output[1] */
+       switch(decoder->private_->frame.header.channel_assignment) {
+               case FLAC__CHANNEL_ASSIGNMENT_INDEPENDENT:
+                       /* do nothing */
+                       break;
+               case FLAC__CHANNEL_ASSIGNMENT_LEFT_SIDE:
+                       FLAC__ASSERT(decoder->private_->frame.header.channels 
== 2);
+                       lchan = &(decoder->private_->output[0])[0];
+                       rchan = &(decoder->private_->output[1])[0];
+                       for(i = decoder->private_->frame.header.blocksize; i != 
0; --i) {
+                               *rchan = *(lchan++) - *rchan; /* channel 1 becomes (channel 0 - channel 1) */
+                               ++rchan;
+                       }
+                       break;
+               case FLAC__CHANNEL_ASSIGNMENT_RIGHT_SIDE:
+                       FLAC__ASSERT(decoder->private_->frame.header.channels 
== 2);
+                       lchan = &(decoder->private_->output[0])[0];
+                       rchan = &(decoder->private_->output[1])[0];
+                       for(i = decoder->private_->frame.header.blocksize; i != 
0; --i)
+                               *(lchan++) += *(rchan++); /* channel 0 becomes (channel 0 + channel 1) */
+                       break;
+               case FLAC__CHANNEL_ASSIGNMENT_MID_SIDE:
+                       FLAC__ASSERT(decoder->private_->frame.header.channels 
== 2);
+                       lchan = &(decoder->private_->output[0])[0];
+                       rchan = &(decoder->private_->output[1])[0];
+                       for(i = decoder->private_->frame.header.blocksize; i != 
0; --i) {
+                               register FLAC__int32 mid = *lchan;
+                               register FLAC__int32 side = *rchan;
+                               mid = (FLAC__int32)((FLAC__uint32)mid << 1); /* double mid; shift as unsigned because left-shifting a negative signed value is undefined */
+                               if(side & 1) /* i.e. if 'side' is odd... */
+                                       ++mid;
+                               left = mid + side;
+                               right = mid - side;
+                               *(lchan++) = left >> 1; /* NOTE(review): relies on >> being an arithmetic shift for negative values */
+                               *(rchan++) = right >> 1;
+                       }
+                       break;
+               default:
+                       FLAC__ASSERT(0);
+                       break;
+       }
+}
+
 FLAC__bool allocate_output_(FLAC__StreamDecoder *decoder, unsigned size, 
unsigned channels)
 {
        unsigned i;
@@ -1380,8 +1418,6 @@
 FLAC__bool read_frame_(FLAC__StreamDecoder *decoder, FLAC__bool *got_a_frame, 
FLAC__bool do_full_decode)
 {
        unsigned channel;
-       unsigned i;
-       FLAC__int32 mid, side, left, right;
        FLAC__uint16 frame_crc; /* the one we calculate from the input stream */
        FLAC__uint32 x;
 
@@ -1446,41 +1482,9 @@
        if(!FLAC__bitbuffer_read_raw_uint32(decoder->private_->input, &x, 
FLAC__FRAME_FOOTER_CRC_LEN, read_callback_, decoder))
                return false; /* the read_callback_ sets the state for us */
        if(frame_crc == (FLAC__uint16)x) {
-               if(do_full_decode) {
-                       /* Undo any special channel coding */
-                       
switch(decoder->private_->frame.header.channel_assignment) {
-                               case FLAC__CHANNEL_ASSIGNMENT_INDEPENDENT:
-                                       /* do nothing */
-                                       break;
-                               case FLAC__CHANNEL_ASSIGNMENT_LEFT_SIDE:
-                                       
FLAC__ASSERT(decoder->private_->frame.header.channels == 2);
-                                       for(i = 0; i < 
decoder->private_->frame.header.blocksize; i++)
-                                               decoder->private_->output[1][i] 
= decoder->private_->output[0][i] - decoder->private_->output[1][i];
-                                       break;
-                               case FLAC__CHANNEL_ASSIGNMENT_RIGHT_SIDE:
-                                       
FLAC__ASSERT(decoder->private_->frame.header.channels == 2);
-                                       for(i = 0; i < 
decoder->private_->frame.header.blocksize; i++)
-                                               decoder->private_->output[0][i] 
+= decoder->private_->output[1][i];
-                                       break;
-                               case FLAC__CHANNEL_ASSIGNMENT_MID_SIDE:
-                                       
FLAC__ASSERT(decoder->private_->frame.header.channels == 2);
-                                       for(i = 0; i < 
decoder->private_->frame.header.blocksize; i++) {
-                                               mid = 
decoder->private_->output[0][i];
-                                               side = 
decoder->private_->output[1][i];
-                                               mid <<= 1;
-                                               if(side & 1) /* i.e. if 'side' 
is odd... */
-                                                       mid++;
-                                               left = mid + side;
-                                               right = mid - side;
-                                               decoder->private_->output[0][i] 
= left >> 1;
-                                               decoder->private_->output[1][i] 
= right >> 1;
-                                       }
-                                       break;
-                               default:
-                                       FLAC__ASSERT(0);
-                                       break;
-                       }
-               }
+               if(do_full_decode)
+                       read_channel_coding(decoder);
+               
        }
        else {
                /* Bad frame, emit error and zero the output signal */


/EOF

signature.asc
Description: Digital signature

_______________________________________________
Flac-dev mailing list
Flac-dev@xiph.org
http://lists.xiph.org/mailman/listinfo/flac-dev

[Flac-dev] two small-ish optimizations (death by a thousand cuts)

Reply via email to