On 28 March 2011 05:09, Michael Hope <michael.h...@linaro.org> wrote:
> Hi there.  I'm looking for areas where the toolchain could generate
> faster code, and a good way of doing that is seeing how compiled code
> does against the best hand-written code.  I know of skia, ffmpeg,
> pixman, Orc, and efl - what others are out there?
>

Hi Michael,

Great motivation to optimize the existing libraries with NEON!

As far as I know, Android depends on several libraries, and some of
them are compute-bound:

- libpixelflinger -- a bit like pixman
  There is no official documentation for PixelFlinger, but you can always
check out its source:
    http://android.git.kernel.org/?p=platform/system/core.git;a=summary
  I previously submitted a NEON optimization patch for libpixelflinger to AOSP:
    https://review.source.android.com//#change,16358

- zlib
  Using SIMD, we can optimize the 'copy / repeat an existing sequence'
step of LZ-style encoding; a minimal sketch of the idea follows below.
  A reference Intel SSE2 optimization patch is attached to this mail.
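
  For illustration, here is a minimal sketch in plain C of the trick the
patch uses (my own example, not code from the attached patch;
lz_match_copy is a hypothetical name, 'len' is assumed to be at least 1,
and the output buffer is assumed to be padded by sizeof(unsigned long)-1
bytes because the chunked loop may overrun 'len'):

    #include <string.h>

    /* Hypothetical helper for illustration; not part of zlib.
     * Copy a 'len'-byte match that starts 'dist' bytes back in 'out'.
     * If the match distance is at least one chunk wide, the source and
     * destination of a single chunk cannot overlap, so we can move
     * sizeof(unsigned long) bytes per iteration instead of one. */
    static void lz_match_copy(unsigned char *out, unsigned dist,
                              unsigned len)
    {
        const unsigned char *from = out - dist;
        unsigned char *last = out + len;

        if (dist >= sizeof(unsigned long)) {
            do {
                unsigned long chunk;
                memcpy(&chunk, from, sizeof chunk);  /* unaligned-safe load */
                memcpy(out, &chunk, sizeof chunk);   /* unaligned-safe store */
                out += sizeof chunk;
                from += sizeof chunk;
            } while (out < last);
        } else {
            do {                 /* overlapping runs must stay byte-wise */
                *out++ = *from++;
            } while (out < last);
        }
    }

  The attached patch applies the same idea with 16-byte SSE2 loads and
stores (_mm_loadu_si128 / _mm_storeu_si128) when USE_SSE2 is defined; a
NEON version could use vld1q_u8 / vst1q_u8 in the same way.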

Sincerely,
-jserv
diff -urNp zlib-1.2.5-orig/deflate.c zlib-1.2.5/deflate.c
--- zlib-1.2.5-orig/deflate.c   2010-04-20 12:12:21.000000000 +0800
+++ zlib-1.2.5/deflate.c        2010-07-26 03:53:34.000000000 +0800
@@ -49,6 +49,17 @@
 
 /* @(#) $Id$ */
 
+/* We can use 2-byte chunks only if 'unsigned short' has been defined
+ * appropriately and MAX_MATCH has the default value.
+ */
+#ifdef UNALIGNED_OK
+#  include <limits.h>
+#  include "zutil.h"
+#  if (MAX_MATCH != 258) || (USHRT_MAX != 0xffff)
+#    undef UNALIGNED_OK
+#  endif
+#endif
+
 #include "deflate.h"
 
 const char deflate_copyright[] =
@@ -1119,7 +1130,8 @@ local uInt longest_match(s, cur_match)
          * However the length of the match is limited to the lookahead, so
          * the output of deflate is not affected by the uninitialized values.
          */
-#if (defined(UNALIGNED_OK) && MAX_MATCH == 258)
+#ifdef UNALIGNED_OK
+
         /* This code assumes sizeof(unsigned short) == 2. Do not use
          * UNALIGNED_OK if your compiler uses a different size.
          */
diff -urNp zlib-1.2.5-orig/deflate.h zlib-1.2.5/deflate.h
--- zlib-1.2.5-orig/deflate.h   2010-04-19 12:00:46.000000000 +0800
+++ zlib-1.2.5/deflate.h        2010-07-26 03:53:34.000000000 +0800
@@ -251,9 +251,12 @@ typedef struct internal_state {
     ulg bits_sent;      /* bit length of compressed data sent mod 2^32 */
 #endif
 
-    ush bi_buf;
+    ulg bi_buf;
     /* Output buffer. bits are inserted starting at the bottom (least
-     * significant bits).
+     * significant bits).  Room for at least two short values to allow
+     * for simpler overflow handling.  However, if more than 16 bits
+     * have been buffered, the buffer will be flushed and no more than
+     * 16 bits will be in use afterwards.
      */
     int bi_valid;
     /* Number of valid bits in bi_buf.  All bits above the last valid bit
@@ -274,6 +277,20 @@ typedef struct internal_state {
  */
 #define put_byte(s, c) {s->pending_buf[s->pending++] = (c);}
 
+/* Output a short LSB first on the stream.
+ * IN assertion: there is enough room in pendingBuf.
+ */
+#if defined(LITTLE_ENDIAN) && defined(UNALIGNED_OK)
+#  define put_short(s, w) { \
+    *(ush*)(s->pending_buf + s->pending) = (ush)(w);\
+    s->pending += 2; \
+}
+#else
+#  define put_short(s, w) { \
+    put_byte(s, (uch)((w) & 0xff)); \
+    put_byte(s, (uch)((ush)(w) >> 8)); \
+}
+#endif
 
 #define MIN_LOOKAHEAD (MAX_MATCH+MIN_MATCH+1)
 /* Minimum amount of lookahead, except at the end of the input file.
diff -urNp zlib-1.2.5-orig/inffast.c zlib-1.2.5/inffast.c
--- zlib-1.2.5-orig/inffast.c   2010-04-19 12:16:23.000000000 +0800
+++ zlib-1.2.5/inffast.c        2010-07-26 03:53:34.000000000 +0800
@@ -1,5 +1,6 @@
 /* inffast.c -- fast decoding
- * Copyright (C) 1995-2008, 2010 Mark Adler
+ * Copyright (C) 1995-2004, 2010 Mark Adler
+ *               2010 Optimizations by Stefan Fuhrmann
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
 
@@ -10,16 +11,35 @@
 
 #ifndef ASMINF
 
+/* This is a highly optimized implementation of the decoder function for
+ * large code blocks. It cannot be used to decode close to the end of
+ * the input or output buffers (see below).
+ *
+ * Before trying to hand-tune assembly code for your target, you should
+ * make sure that alignment, endianness, word size optimizations etc. have
+ * already been enabled for the respective target platform.
+ *
+ * For MS VC++ 2008, the performance gain of the specialized code over
+ * DISABLE_INFLATE_FAST_OPTIMIZATIONS (baseline) is as follows:
+ *
+ * x86 (32 bit):    +60% throughput
+ * x64 (64 bit):    +70% throughput
+ *
+ * Measurements were taken on a Core i7 CPU with a mix of small and large
+ * buffers (110MB total) of varying content and an average compression
+ * ratio of 2.2.
+ */
+
 /* Allow machine dependent optimization for post-increment or pre-increment.
-   Based on testing to date,
-   Pre-increment preferred for:
-   - PowerPC G3 (Adler)
-   - MIPS R5000 (Randers-Pehrson)
-   Post-increment preferred for:
-   - none
-   No measurable difference:
-   - Pentium III (Anderson)
-   - M68060 (Nikl)
+ * Based on testing to date,
+ * Pre-increment preferred for:
+ * - PowerPC G3 (Adler)
+ * - MIPS R5000 (Randers-Pehrson)
+ * Post-increment preferred for:
+ * - none
+ * No measurable difference:
+ * - Pentium III (Anderson)
+ * - M68060 (Nikl)
  */
 #ifdef POSTINC
 #  define OFF 0
@@ -29,6 +49,212 @@
 #  define PUP(a) *++(a)
 #endif
 
+/* On a number of architectures, it is more efficient to
+ * read 64 bits from the input stream at once than only
+ * a 32 bit chunk.  That allows for fewer memory accesses
+ * and calculations as well as for more aggressive loop
+ * unrolling.
+ */
+#if defined(_M_X64) || defined(__x86_64)
+#  define HOLD_64BIT_CHUNKS
+#endif
+
+/* For debugging purposes, we may want to disable code
+ * optimizations, as we won't otherwise be able to access
+ * the alternative code paths.
+ * Please note that undefining these features affects
+ * this file only.
+ */
+#ifdef DISABLE_INFLATE_FAST_OPTIMIZATIONS
+#  ifdef UNALIGNED_OK
+#    undef UNALIGNED_OK
+#  endif
+#  ifdef HOLD_64BIT_CHUNKS
+#    undef HOLD_64BIT_CHUNKS
+#  endif
+#  ifdef LITTLE_ENDIAN
+#    undef LITTLE_ENDIAN
+#  endif
+#  ifdef USE_SSE2
+#    undef USE_SSE2
+#  endif
+#endif
+
+/* A reusable code snippet.  It copies 'len' bytes from 'from'
+ * to 'out'.  'len' must be 3 or larger.  This code will be used
+ * when no optimization is available.
+ */
+#define STANDARD_MIN3_COPY\
+    while (len > 2) {\
+        PUP(out) = PUP(from);\
+        PUP(out) = PUP(from);\
+        PUP(out) = PUP(from);\
+        len -= 3;\
+    }\
+    if (len) { \
+        PUP(out) = PUP(from);\
+        if (len > 1)\
+            PUP(out) = PUP(from);\
+    }
+
+/* A reusable code snippet.  It copies data from 'from' to 'out'
+ * up to 'last', with the last chunk possibly exceeding 'last'
+ * by up to 15 bytes.
+ */
+#ifdef USE_SSE2
+#  include <emmintrin.h>
+#  define TRY_CHUNKY_COPY\
+    if ((dist >= sizeof (__m128i)) || (last <= out)) { \
+        do {\
+            _mm_storeu_si128 ((__m128i*)(out+OFF), \
+                              _mm_loadu_si128((const __m128i*)(from+OFF)));\
+            out += sizeof (__m128i);\
+            from += sizeof (__m128i);\
+        } while (out < last); \
+    }
+#else
+#  define TRY_CHUNKY_COPY\
+    if (dist >= sizeof(long) || (last <= out)) { \
+        do {\
+            *(long*)(out+OFF) = *(long*)(from+OFF);\
+            out += sizeof (long);\
+            from += sizeof (long);\
+        } while (out < last); \
+    }
+#endif
+
+/* The 'copy / repeat an existing sequence' step is at the core of LZ-
+ * style encoding.  Therefore, whenever the CPU allows, we use a few
+ * unaligned 4-byte copies instead of many single-byte accesses.
+ * 
+ * The local variable definition actually leads to better code
+ * being generated by the MS compiler.
+ */
+#ifdef UNALIGNED_OK
+#  define QUICK_COPY\
+    {\
+        unsigned char FAR *from = out - dist;\
+        unsigned char FAR *last = out + len;\
+        TRY_CHUNKY_COPY\
+        else {\
+            do { \
+                *(out+OFF+0) = *(from+OFF+0);\
+                *(out+OFF+1) = *(from+OFF+1);\
+                *(out+OFF+2) = *(from+OFF+2);\
+                from += 3;\
+                out += 3;\
+            } while (out < last);\
+        }\
+        out = last;\
+    }
+#else
+#  define QUICK_COPY\
+    from = out - dist;\
+    STANDARD_MIN3_COPY
+#endif
+
+/* Whenever we don't copy / repeat existing sequences, we add new
+ * literals.  This is the code snippet that will be used in an
+ * unrolled loop for extracting literals one-by-one. 
+ * We bail out if a non-literal has been found.  We also assume that
+ * the loop head already made sure we don't read / write beyond 
+ * buffer boundaries.
+ */
+#define EXTRACT_NEXT_IF_LITERAL\
+    here = lcode[hold & lmask];\
+    if (here.op != 0)\
+        goto dolen;\
+\
+    op = (unsigned)(here.bits);\
+    hold >>= op;\
+    bits -= op;\
+    Tracevv((stderr, here.val >= 0x20 && here.val < 0x7f ?\
+            "inflate:         literal '%c'\n" :\
+            "inflate:         literal 0x%02x\n", here.val));\
+    PUP(out) = (unsigned char)(here.val);
+
+/* Unrolled loop content.  Using 32 bit chunks, we can unroll it
+ * only once because every step consumes up to 9 bits of the 
+ * input stream.  We have 25/57 bits (using 32/64 bit chunks)
+ * entering the loop but must leave with at least 9 bits left
+ * for the top of the main loop.
+ */
+#if defined(HOLD_64BIT_CHUNKS)
+#  define LITERAL_UNROLL_SIZE 5
+#  define UNROLLED_LITERAL_LOOP {\
+              EXTRACT_NEXT_IF_LITERAL \
+              EXTRACT_NEXT_IF_LITERAL \
+              EXTRACT_NEXT_IF_LITERAL \
+              EXTRACT_NEXT_IF_LITERAL \
+              EXTRACT_NEXT_IF_LITERAL \
+          }
+#else
+#  define LITERAL_UNROLL_SIZE 1
+#  define UNROLLED_LITERAL_LOOP { EXTRACT_NEXT_IF_LITERAL }
+#endif
+
+/* Chunk type that can be prefetched from the input stream.
+ */
+#if defined(HOLD_64BIT_CHUNKS)
+#  define HOLD_TYPE unsigned long long
+#else
+#  define HOLD_TYPE unsigned long
+#endif
+
+/* Code snippet that reads a single byte from 'in' and
+ * adds it to the prefetched ('hold') data.
+ */
+#define PREFETCH_BYTE \
+    hold += (HOLD_TYPE)(PUP(in)) << bits;\
+    bits += 8;
+
+/* Code snippet that completely fills the prefetch variable.
+ */
+#if defined(LITTLE_ENDIAN) && defined(UNALIGNED_OK)
+#  define TOP_UP_BITS \
+    {\
+        hold |= (*(HOLD_TYPE*)(in + OFF)) << bits;\
+        added = (sizeof (HOLD_TYPE) * 8 - bits) / 8;\
+        in += added;\
+        bits += added * 8; \
+    }
+#else
+#  if defined(HOLD_64BIT_CHUNKS)
+#    define TOP_UP_BITS\
+      if (bits < 33) {\
+          PREFETCH_BYTE\
+          PREFETCH_BYTE\
+          PREFETCH_BYTE\
+          PREFETCH_BYTE\
+      }\
+      if (bits < 49) {\
+          PREFETCH_BYTE\
+          PREFETCH_BYTE\
+      }\
+      if (bits < 57) {\
+          PREFETCH_BYTE\
+      }
+#  else
+#    define TOP_UP_BITS\
+      if (bits < 17) {\
+          PREFETCH_BYTE\
+          PREFETCH_BYTE\
+      }\
+      if (bits < 25) {\
+          PREFETCH_BYTE\
+      }
+#  endif
+#endif
+
+/* For 64 bit chunks, we don't need to prefetch a second
+ * time inside the main loop when decoding the distance.
+ */
+#if defined(HOLD_64BIT_CHUNKS)
+#  define TOP_UP_BITS_32
+#else
+#  define TOP_UP_BITS_32 TOP_UP_BITS
+#endif
+
 /*
    Decode literal, length, and distance codes and write out the resulting
    literal and match bytes until either not enough input or output is
@@ -40,8 +266,8 @@
    Entry assumptions:
 
         state->mode == LEN
-        strm->avail_in >= 6
-        strm->avail_out >= 258
+        strm->avail_in >= 8
+        strm->avail_out >= 273
         start >= strm->avail_out
         state->bits < 8
 
@@ -56,13 +282,15 @@
     - The maximum input bits used by a length/distance pair is 15 bits for the
       length code, 5 bits for the length extra, 15 bits for the distance code,
       and 13 bits for the distance extra.  This totals 48 bits, or six bytes.
-      Therefore if strm->avail_in >= 6, then there is enough input to avoid
-      checking for available input while decoding.
+      However, we prefetch 1x8 or 2x4 bytes.  Therefore if strm->avail_in >= 8,
+      then there is enough input to avoid checking for available input while
+      decoding.
 
     - The maximum bytes that a single length/distance pair can output is 258
-      bytes, which is the maximum length that can be coded.  inflate_fast()
-      requires strm->avail_out >= 258 for each loop to avoid checking for
-      output space.
+      bytes, which is the maximum length that can be coded.  Another 15 bytes
+      of padding are required to simplify copying in chunks of up to 16 bytes.
+      inflate_fast() requires strm->avail_out >= 273 for each loop to avoid
+      checking for output space.
  */
 void ZLIB_INTERNAL inflate_fast(strm, start)
 z_streamp strm;
@@ -81,8 +309,9 @@ unsigned start;         /* inflate()'s s
     unsigned whave;             /* valid bytes in the window */
     unsigned wnext;             /* window write index */
     unsigned char FAR *window;  /* allocated sliding window, if wsize != 0 */
-    unsigned long hold;         /* local strm->hold */
+    HOLD_TYPE hold;             /* local strm->hold */
     unsigned bits;              /* local strm->bits */
+    unsigned added;             /* number of bytes fetched in TOP_UP_BITS */
     code const FAR *lcode;      /* local strm->lencode */
     code const FAR *dcode;      /* local strm->distcode */
     unsigned lmask;             /* mask for first level of length codes */
@@ -97,10 +326,10 @@ unsigned start;         /* inflate()'s s
     /* copy state to local variables */
     state = (struct inflate_state FAR *)strm->state;
     in = strm->next_in - OFF;
-    last = in + (strm->avail_in - 5);
+    last = in + (strm->avail_in - 7);
     out = strm->next_out - OFF;
     beg = out - (start - strm->avail_out);
-    end = out + (strm->avail_out - 257);
+    end = out + (strm->avail_out - 272);
 #ifdef INFLATE_STRICT
     dmax = state->dmax;
 #endif
@@ -117,61 +346,47 @@ unsigned start;         /* inflate()'s s
 
     /* decode literals and length/distances until end-of-block or not enough
        input data or output space */
+    TOP_UP_BITS                                 /* bits = 32/64 */
     do {
-        if (bits < 15) {
-            hold += (unsigned long)(PUP(in)) << bits;
-            bits += 8;
-            hold += (unsigned long)(PUP(in)) << bits;
-            bits += 8;
-        }
+                                                /* bits >= 10/10 */
         here = lcode[hold & lmask];
       dolen:
         op = (unsigned)(here.bits);
         hold >>= op;
         bits -= op;
+        TOP_UP_BITS                             /* bits >= 25/57 */
+
         op = (unsigned)(here.op);
         if (op == 0) {                          /* literal */
             Tracevv((stderr, here.val >= 0x20 && here.val < 0x7f ?
                     "inflate:         literal '%c'\n" :
                     "inflate:         literal 0x%02x\n", here.val));
             PUP(out) = (unsigned char)(here.val);
+                                                /* bits >= 25/57 */
+            if (out + LITERAL_UNROLL_SIZE-1 < end && in < last)
+                UNROLLED_LITERAL_LOOP
+                                                /* bits >= 16/12 */
         }
         else if (op & 16) {                     /* length base */
             len = (unsigned)(here.val);
             op &= 15;                           /* number of extra bits */
             if (op) {
-                if (bits < op) {
-                    hold += (unsigned long)(PUP(in)) << bits;
-                    bits += 8;
-                }
                 len += (unsigned)hold & ((1U << op) - 1);
                 hold >>= op;
                 bits -= op;
             }
             Tracevv((stderr, "inflate:         length %u\n", len));
-            if (bits < 15) {
-                hold += (unsigned long)(PUP(in)) << bits;
-                bits += 8;
-                hold += (unsigned long)(PUP(in)) << bits;
-                bits += 8;
-            }
+                                                /* bits >= 10/42 */
             here = dcode[hold & dmask];
           dodist:
             op = (unsigned)(here.bits);
             hold >>= op;
             bits -= op;
+            TOP_UP_BITS_32                      /* bits >= 25/36 */
             op = (unsigned)(here.op);
             if (op & 16) {                      /* distance base */
                 dist = (unsigned)(here.val);
                 op &= 15;                       /* number of extra bits */
-                if (bits < op) {
-                    hold += (unsigned long)(PUP(in)) << bits;
-                    bits += 8;
-                    if (bits < op) {
-                        hold += (unsigned long)(PUP(in)) << bits;
-                        bits += 8;
-                    }
-                }
                 dist += (unsigned)hold & ((1U << op) - 1);
 #ifdef INFLATE_STRICT
                 if (dist > dmax) {
@@ -182,6 +397,7 @@ unsigned start;         /* inflate()'s s
 #endif
                 hold >>= op;
                 bits -= op;
+                                                /* bits >= 10/21 */
                 Tracevv((stderr, "inflate:         distance %u\n", dist));
                 op = (unsigned)(out - beg);     /* max distance in output */
                 if (dist > op) {                /* see if copy from window */
@@ -190,9 +406,9 @@ unsigned start;         /* inflate()'s s
                         if (state->sane) {
                             strm->msg =
                                 (char *)"invalid distance too far back";
-                            state->mode = BAD;
-                            break;
-                        }
+                        state->mode = BAD;
+                        break;
+                    }
 #ifdef INFLATE_ALLOW_INVALID_DISTANCE_TOOFAR_ARRR
                         if (len <= op - whave) {
                             do {
@@ -253,31 +469,10 @@ unsigned start;         /* inflate()'s s
                             from = out - dist;  /* rest from output */
                         }
                     }
-                    while (len > 2) {
-                        PUP(out) = PUP(from);
-                        PUP(out) = PUP(from);
-                        PUP(out) = PUP(from);
-                        len -= 3;
-                    }
-                    if (len) {
-                        PUP(out) = PUP(from);
-                        if (len > 1)
-                            PUP(out) = PUP(from);
-                    }
+                    STANDARD_MIN3_COPY
                 }
-                else {
-                    from = out - dist;          /* copy direct from output */
-                    do {                        /* minimum length is three */
-                        PUP(out) = PUP(from);
-                        PUP(out) = PUP(from);
-                        PUP(out) = PUP(from);
-                        len -= 3;
-                    } while (len > 2);
-                    if (len) {
-                        PUP(out) = PUP(from);
-                        if (len > 1)
-                            PUP(out) = PUP(from);
-                    }
+                else {                          /* copy direct from output */
+                    QUICK_COPY
                 }
             }
             else if ((op & 64) == 0) {          /* 2nd level distance code */
@@ -304,7 +499,7 @@ unsigned start;         /* inflate()'s s
             state->mode = BAD;
             break;
         }
-    } while (in < last && out < end);
+    } while (out < end && in < last);
 
     /* return unused bytes (on entry, bits < 8, so in won't go too far back) */
     len = bits >> 3;
@@ -315,9 +510,9 @@ unsigned start;         /* inflate()'s s
     /* update state and return */
     strm->next_in = in + OFF;
     strm->next_out = out + OFF;
-    strm->avail_in = (unsigned)(in < last ? 5 + (last - in) : 5 - (in - last));
+    strm->avail_in = (unsigned)(in < last ? 7 + (last - in) : 7 - (in - last));
     strm->avail_out = (unsigned)(out < end ?
-                                 257 + (end - out) : 257 - (out - end));
+                                 272 + (end - out) : 272 - (out - end));
     state->hold = hold;
     state->bits = bits;
     return;
@@ -335,6 +530,26 @@ unsigned start;         /* inflate()'s s
    - Swapping window/direct else
    - Larger unrolled copy loops (three is about right)
    - Moving len -= 3 statement into middle of loop
+
+   The critical code path is the following:
+      here = lcode[hold & lmask];
+      op = (unsigned)(here.bits);
+
+   It requires 
+      2 accesses to hold and lmask (0 ticks if in register, 
+                                    otherwise: 4 ticks typ = 1 + L1 latency)
+     +1 ALU latency (usually 1 tick)
+     +1 L1 latency (2..4 ticks, typ. 3 ticks)
+     +1 member access latency (0 ticks on some arch if 'bits' is the MSB, 
+                               2 ALU ops / 2 ticks otherwise)
+     -> 4 .. 12 ticks latency
+
+   Therefore, we "splice" the data prefetch code (hold) into the critical
+   path (a good compiler will interleave the data load from TOP_UP_BITS
+   with the lcode access).  All calculations can be parallelized very well
+   on most architectures, so that TOP_UP_BITS becomes relatively cheap at
+   4 or fewer ticks of overhead with no branch mispredictions possible.
+   Also, 'hold' will be readily available in the next iteration.
  */
 
 #endif /* !ASMINF */
diff -urNp zlib-1.2.5-orig/inftrees.c zlib-1.2.5/inftrees.c
--- zlib-1.2.5-orig/inftrees.c  2010-04-20 12:12:21.000000000 +0800
+++ zlib-1.2.5/inftrees.c       2010-07-26 03:53:34.000000000 +0800
@@ -138,13 +138,20 @@ unsigned short FAR *work;
         return -1;                      /* incomplete set */
 
     /* generate offsets into symbol table for each length for sorting */
-    offs[1] = 0;
-    for (len = 1; len < MAXBITS; len++)
-        offs[len + 1] = offs[len] + count[len];
+    {
+        unsigned short offset = 0;
+        offs[1] = 0;
+        for (len = 1; len < MAXBITS; len++) {
+            offset += count[len];
+            offs[len + 1] = offset;
+        }
+    }
 
     /* sort symbols by length, by symbol order within each length */
-    for (sym = 0; sym < codes; sym++)
-        if (lens[sym] != 0) work[offs[lens[sym]]++] = (unsigned short)sym;
+    for (sym = 0; sym < codes; sym++) {
+        unsigned len = lens[sym];
+        if (len != 0) work[offs[len]++] = (unsigned short)sym;
+    }
 
     /*
        Create and fill in decoding tables.  In this loop, the table being
@@ -215,14 +222,15 @@ unsigned short FAR *work;
     /* process all codes and make table entries */
     for (;;) {
         /* create table entry */
+        unsigned work_sym = work[sym];
         here.bits = (unsigned char)(len - drop);
-        if ((int)(work[sym]) < end) {
+        if ((int)(work_sym) < end) {
             here.op = (unsigned char)0;
-            here.val = work[sym];
+            here.val = work_sym;
         }
-        else if ((int)(work[sym]) > end) {
-            here.op = (unsigned char)(extra[work[sym]]);
-            here.val = base[work[sym]];
+        else if ((int)(work_sym) > end) {
+            here.op = (unsigned char)(extra[work_sym]);
+            here.val = base[work_sym];
         }
         else {
             here.op = (unsigned char)(32 + 64);         /* end of block */
diff -urNp zlib-1.2.5-orig/trees.c zlib-1.2.5/trees.c
--- zlib-1.2.5-orig/trees.c     2010-04-19 12:03:44.000000000 +0800
+++ zlib-1.2.5/trees.c  2010-07-26 03:53:34.000000000 +0800
@@ -175,15 +175,6 @@ local void gen_trees_header OF((void));
 #endif
 
 /* ===========================================================================
- * Output a short LSB first on the stream.
- * IN assertion: there is enough room in pendingBuf.
- */
-#define put_short(s, w) { \
-    put_byte(s, (uch)((w) & 0xff)); \
-    put_byte(s, (uch)((ush)(w) >> 8)); \
-}
-
-/* ===========================================================================
  * Send a value on a given number of bits.
  * IN assertion: length <= 16 and value fits in length bits.
  */
@@ -203,29 +194,23 @@ local void send_bits(s, value, length)
      * (16 - bi_valid) bits from value, leaving (width - (16-bi_valid))
      * unused bits in value.
      */
-    if (s->bi_valid > (int)Buf_size - length) {
-        s->bi_buf |= (ush)value << s->bi_valid;
+    s->bi_buf |= (ulg)value << s->bi_valid;
+    s->bi_valid += (ulg)length;
+    if (s->bi_valid > Buf_size) {
         put_short(s, s->bi_buf);
-        s->bi_buf = (ush)value >> (Buf_size - s->bi_valid);
-        s->bi_valid += length - Buf_size;
-    } else {
-        s->bi_buf |= (ush)value << s->bi_valid;
-        s->bi_valid += length;
-    }
+        s->bi_buf >>= Buf_size;
+        s->bi_valid -= Buf_size;
+    } 
 }
 #else /* !DEBUG */
 
 #define send_bits(s, value, length) \
-{ int len = length;\
-  if (s->bi_valid > (int)Buf_size - len) {\
-    int val = value;\
-    s->bi_buf |= (ush)val << s->bi_valid;\
+{ s->bi_buf |= (ulg)(value) << s->bi_valid;\
+  s->bi_valid += (ulg)(length);\
+  if (s->bi_valid > Buf_size) {\
     put_short(s, s->bi_buf);\
-    s->bi_buf = (ush)val >> (Buf_size - s->bi_valid);\
-    s->bi_valid += len - Buf_size;\
-  } else {\
-    s->bi_buf |= (ush)(value) << s->bi_valid;\
-    s->bi_valid += len;\
+    s->bi_buf >>= Buf_size;\
+    s->bi_valid -= Buf_size;\
   }\
 }
 #endif /* DEBUG */
@@ -1154,7 +1139,7 @@ local int detect_data_type(s)
             || s->dyn_ltree[13].Freq != 0)
         return Z_TEXT;
     for (n = 32; n < LITERALS; n++)
-        if (s->dyn_ltree[n].Freq != 0)
+            if (s->dyn_ltree[n].Freq != 0)
             return Z_TEXT;
 
     /* There are no "black-listed" or "white-listed" bytes:
diff -urNp zlib-1.2.5-orig/zconf.h zlib-1.2.5/zconf.h
--- zlib-1.2.5-orig/zconf.h     2010-04-19 01:58:06.000000000 +0800
+++ zlib-1.2.5/zconf.h  2010-07-26 03:53:34.000000000 +0800
@@ -160,10 +160,52 @@
 #ifdef SYS16BIT
 #  define MAXSEG_64K
 #endif
-#ifdef MSDOS
+
+/*
+ * Many machines allow efficient access to unaligned data; that is,
+ * reading 2 or more bytes at once from a random, possibly unaligned
+ * memory address is *on average* more efficient than reading the data
+ * one byte at a time and then combining it.
+ */
+#if !defined(UNALIGNED_OK) && defined(MSDOS)
+#  define UNALIGNED_OK
+#endif
+#if !defined(UNALIGNED_OK) && (defined(_M_IX86) || defined(_M_X64))
 #  define UNALIGNED_OK
 #endif
+#if !defined(UNALIGNED_OK) && (defined(i386) || defined(__x86_64))
+#  define UNALIGNED_OK
+#endif
+
+/*
+ * Most information in compressed data streams is stored in LSB first
+ * (little endian) order. If that matches the machine byte order, we may
+ * apply certain optimizations.
+ */
+#if !defined(LITTLE_ENDIAN) && (defined(_M_IX86) || defined(_M_X64))
+#  define LITTLE_ENDIAN
+#endif
+#if !defined(LITTLE_ENDIAN) && (defined(i386) || defined(__x86_64))
+#  define LITTLE_ENDIAN
+#endif
+#if !defined(LITTLE_ENDIAN) && defined(__LITTLE_ENDIAN__)
+#  define LITTLE_ENDIAN
+#endif
+
+/*
+ * With the availability of SSE2, we can optimize certain functions
+ * by operating on large chunks of data at once.
+ */
+#if !defined(USE_SSE2) && defined(__GNUC__) && defined(__SSE2__)
+#  define USE_SSE2
+#endif
+#if !defined(USE_SSE2) && (defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP>=2)))
+#  define USE_SSE2
+#endif
 
+/*
+ * C standard level.
+ */
 #ifdef __STDC_VERSION__
 #  ifndef STDC
 #    define STDC