This is an automated email from the ASF dual-hosted git repository.

ronny pushed a commit to branch update-snappy
in repository https://gitbox.apache.org/repos/asf/couchdb-snappy.git

commit 05c004c8c6394ee8cb7173fec257868ce0f14133
Author: Peter Membrey <[email protected]>
AuthorDate: Fri Nov 15 18:11:31 2013 +0800

    Updating to snappy 1.1.1
---
 c_src/snappy/NEWS                    |  71 +++++
 c_src/snappy/README                  |   4 +-
 c_src/snappy/snappy-internal.h       |   4 +-
 c_src/snappy/snappy-sinksource.cc    |   1 -
 c_src/snappy/snappy-sinksource.h     |   1 +
 c_src/snappy/snappy-stubs-internal.h | 188 +++++++------
 c_src/snappy/snappy-stubs-public.h   |  20 +-
 c_src/snappy/snappy.cc               | 505 +++++++++++++++++++++++++++--------
 c_src/snappy/snappy.h                |  51 +++-
 c_src/snappy_nif.cc                  |  12 +-
 rebar.config                         |   2 +-
 11 files changed, 638 insertions(+), 221 deletions(-)

diff --git a/c_src/snappy/NEWS b/c_src/snappy/NEWS
index d514787..f21e9d0 100644
--- a/c_src/snappy/NEWS
+++ b/c_src/snappy/NEWS
@@ -1,3 +1,74 @@
+Snappy v1.1.1, October 15th 2013:
+
+  * Add support for uncompressing to iovecs (scatter I/O).
+    The bulk of this patch was contributed by Mohit Aron.
+
+  * Speed up decompression by ~2%; much more so (~13-20%) on
+    a few benchmarks on given compilers and CPUs.
+
+  * Fix a few issues with MSVC compilation.
+
+  * Support truncated test data in the benchmark.
+
+
+Snappy v1.1.0, January 18th 2013:
+
+  * Snappy now uses 64 kB block size instead of 32 kB. On average,
+    this means it compresses about 3% denser (more so for some
+    inputs), at the same or better speeds.
+
+  * libsnappy no longer depends on iostream.
+
+  * Some small performance improvements in compression on x86
+    (0.5–1%).
+
+  * Various portability fixes for ARM-based platforms, for MSVC,
+    and for GNU/Hurd.
+
+
+Snappy v1.0.5, February 24th 2012:
+
+  * More speed improvements. Exactly how big will depend on
+    the architecture:
+
+    - 3–10% faster decompression for the base case (x86-64).
+
+    - ARMv7 and higher can now use unaligned accesses,
+      and will see about 30% faster decompression and
+      20–40% faster compression.
+
+    - 32-bit platforms (ARM and 32-bit x86) will see 2–5%
+      faster compression.
+
+    These are all cumulative (e.g., ARM gets all three speedups).
+
+  * Fixed an issue where the unit test would crash on system
+    with less than 256 MB address space available,
+    e.g. some embedded platforms.
+
+  * Added a framing format description, for use over e.g. HTTP,
+    or for a command-line compressor. We do not have any
+    implementations of this at the current point, but there seems
+    to be enough of a general interest in the topic.
+    Also make the format description slightly clearer.
+
+  * Remove some compile-time warnings in -Wall
+    (mostly signed/unsigned comparisons), for easier embedding
+    into projects that use -Wall -Werror.
+
+
+Snappy v1.0.4, September 15th 2011:
+
+  * Speeded up the decompressor somewhat; typically about 2–8%
+    for Core i7, in 64-bit mode (comparable for Opteron).
+    Somewhat more for some tests, almost no gain for others.
+  
+  * Make Snappy compile on certain platforms it didn't before
+    (Solaris with SunPro C++, HP-UX, AIX).
+
+  * Correct some minor errors in the format description.
+
+
 Snappy v1.0.3, June 2nd 2011:
 
   * Speeded up the decompressor somewhat; about 3-6% for Core 2,
diff --git a/c_src/snappy/README b/c_src/snappy/README
index df8f0e1..3bc8888 100644
--- a/c_src/snappy/README
+++ b/c_src/snappy/README
@@ -76,11 +76,11 @@ your calling file, and link against the compiled library.
 
 There are many ways to call Snappy, but the simplest possible is
 
-  snappy::Compress(input, &output);
+  snappy::Compress(input.data(), input.size(), &output);
 
 and similarly
 
-  snappy::Uncompress(input, &output);
+  snappy::Uncompress(input.data(), input.size(), &output);
 
 where "input" and "output" are both instances of std::string.
 
diff --git a/c_src/snappy/snappy-internal.h b/c_src/snappy/snappy-internal.h
index a32eda5..c99d331 100644
--- a/c_src/snappy/snappy-internal.h
+++ b/c_src/snappy/snappy-internal.h
@@ -85,7 +85,7 @@ char* CompressFragment(const char* input,
 static inline int FindMatchLength(const char* s1,
                                   const char* s2,
                                   const char* s2_limit) {
-  DCHECK_GE(s2_limit, s2);
+  assert(s2_limit >= s2);
   int matched = 0;
 
   // Find out how long the match is. We loop over the data 64 bits at a
@@ -122,7 +122,7 @@ static inline int FindMatchLength(const char* s1,
                                   const char* s2,
                                   const char* s2_limit) {
   // Implementation based on the x86-64 version, above.
-  DCHECK_GE(s2_limit, s2);
+  assert(s2_limit >= s2);
   int matched = 0;
 
   while (s2 <= s2_limit - 4 &&
diff --git a/c_src/snappy/snappy-sinksource.cc b/c_src/snappy/snappy-sinksource.cc
index 1017895..5844552 100644
--- a/c_src/snappy/snappy-sinksource.cc
+++ b/c_src/snappy/snappy-sinksource.cc
@@ -68,5 +68,4 @@ char* UncheckedByteArraySink::GetAppendBuffer(size_t len, char* scratch) {
   return dest_;
 }
 
-
 }
diff --git a/c_src/snappy/snappy-sinksource.h b/c_src/snappy/snappy-sinksource.h
index 430baea..faabfa1 100644
--- a/c_src/snappy/snappy-sinksource.h
+++ b/c_src/snappy/snappy-sinksource.h
@@ -60,6 +60,7 @@ class Sink {
   // The default implementation always returns the scratch buffer.
   virtual char* GetAppendBuffer(size_t length, char* scratch);
 
+
  private:
   // No copying
   Sink(const Sink&);
diff --git a/c_src/snappy/snappy-stubs-internal.h b/c_src/snappy/snappy-stubs-internal.h
index 46ee235..12393b6 100644
--- a/c_src/snappy/snappy-stubs-internal.h
+++ b/c_src/snappy/snappy-stubs-internal.h
@@ -35,14 +35,13 @@
 #include "config.h"
 #endif
 
-#include <iostream>
 #include <string>
 
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>
 
-#ifdef HAVE_SYS_MMAN
+#ifdef HAVE_SYS_MMAN_H
 #include <sys/mman.h>
 #endif
 
@@ -86,99 +85,19 @@ using namespace std;
 // version (anyone who wants to regenerate it can just do the call
 // themselves within main()).
 #define DEFINE_bool(flag_name, default_value, description) \
-  bool FLAGS_ ## flag_name = default_value;
+  bool FLAGS_ ## flag_name = default_value
 #define DECLARE_bool(flag_name) \
-  extern bool FLAGS_ ## flag_name;
-#define REGISTER_MODULE_INITIALIZER(name, code)
+  extern bool FLAGS_ ## flag_name
 
 namespace snappy {
 
 static const uint32 kuint32max = static_cast<uint32>(0xFFFFFFFF);
 static const int64 kint64max = static_cast<int64>(0x7FFFFFFFFFFFFFFFLL);
 
-// Logging.
-
-#define LOG(level) LogMessage()
-#define VLOG(level) true ? (void)0 : \
-    snappy::LogMessageVoidify() & snappy::LogMessage()
-
-class LogMessage {
- public:
-  LogMessage() { }
-  ~LogMessage() {
-    cerr << endl;
-  }
-
-  LogMessage& operator<<(const std::string& msg) {
-    cerr << msg;
-    return *this;
-  }
-  LogMessage& operator<<(int x) {
-    cerr << x;
-    return *this;
-  }
-};
-
-// Asserts, both versions activated in debug mode only,
-// and ones that are always active.
-
-#define CRASH_UNLESS(condition) \
-    PREDICT_TRUE(condition) ? (void)0 : \
-    snappy::LogMessageVoidify() & snappy::LogMessageCrash()
-
-class LogMessageCrash : public LogMessage {
- public:
-  LogMessageCrash() { }
-  ~LogMessageCrash() {
-    cerr << endl;
-    abort();
-  }
-};
-
-// This class is used to explicitly ignore values in the conditional
-// logging macros.  This avoids compiler warnings like "value computed
-// is not used" and "statement has no effect".
-
-class LogMessageVoidify {
- public:
-  LogMessageVoidify() { }
-  // This has to be an operator with a precedence lower than << but
-  // higher than ?:
-  void operator&(const LogMessage&) { }
-};
-
-#define CHECK(cond) CRASH_UNLESS(cond)
-#define CHECK_LE(a, b) CRASH_UNLESS((a) <= (b))
-#define CHECK_GE(a, b) CRASH_UNLESS((a) >= (b))
-#define CHECK_EQ(a, b) CRASH_UNLESS((a) == (b))
-#define CHECK_NE(a, b) CRASH_UNLESS((a) != (b))
-#define CHECK_LT(a, b) CRASH_UNLESS((a) < (b))
-#define CHECK_GT(a, b) CRASH_UNLESS((a) > (b))
-
-#ifdef NDEBUG
-
-#define DCHECK(cond) CRASH_UNLESS(true)
-#define DCHECK_LE(a, b) CRASH_UNLESS(true)
-#define DCHECK_GE(a, b) CRASH_UNLESS(true)
-#define DCHECK_EQ(a, b) CRASH_UNLESS(true)
-#define DCHECK_NE(a, b) CRASH_UNLESS(true)
-#define DCHECK_LT(a, b) CRASH_UNLESS(true)
-#define DCHECK_GT(a, b) CRASH_UNLESS(true)
-
-#else
-
-#define DCHECK(cond) CHECK(cond)
-#define DCHECK_LE(a, b) CHECK_LE(a, b)
-#define DCHECK_GE(a, b) CHECK_GE(a, b)
-#define DCHECK_EQ(a, b) CHECK_EQ(a, b)
-#define DCHECK_NE(a, b) CHECK_NE(a, b)
-#define DCHECK_LT(a, b) CHECK_LT(a, b)
-#define DCHECK_GT(a, b) CHECK_GT(a, b)
-
-#endif
-
 // Potentially unaligned loads and stores.
 
+// x86 and PowerPC can simply do these loads and stores native.
+
 #if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__)
 
 #define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16 *>(_p))
@@ -189,6 +108,49 @@ class LogMessageVoidify {
 #define UNALIGNED_STORE32(_p, _val) (*reinterpret_cast<uint32 *>(_p) = (_val))
 #define UNALIGNED_STORE64(_p, _val) (*reinterpret_cast<uint64 *>(_p) = (_val))
 
+// ARMv7 and newer support native unaligned accesses, but only of 16-bit
+// and 32-bit values (not 64-bit); older versions either raise a fatal signal,
+// do an unaligned read and rotate the words around a bit, or do the reads very
+// slowly (trip through kernel mode). There's no simple #define that says just
+// “ARMv7 or higher”, so we have to filter away all ARMv5 and ARMv6
+// sub-architectures.
+//
+// This is a mess, but there's not much we can do about it.
+
+#elif defined(__arm__) && \
+      !defined(__ARM_ARCH_4__) && \
+      !defined(__ARM_ARCH_4T__) && \
+      !defined(__ARM_ARCH_5__) && \
+      !defined(__ARM_ARCH_5T__) && \
+      !defined(__ARM_ARCH_5TE__) && \
+      !defined(__ARM_ARCH_5TEJ__) && \
+      !defined(__ARM_ARCH_6__) && \
+      !defined(__ARM_ARCH_6J__) && \
+      !defined(__ARM_ARCH_6K__) && \
+      !defined(__ARM_ARCH_6Z__) && \
+      !defined(__ARM_ARCH_6ZK__) && \
+      !defined(__ARM_ARCH_6T2__)
+
+#define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16 *>(_p))
+#define UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32 *>(_p))
+
+#define UNALIGNED_STORE16(_p, _val) (*reinterpret_cast<uint16 *>(_p) = (_val))
+#define UNALIGNED_STORE32(_p, _val) (*reinterpret_cast<uint32 *>(_p) = (_val))
+
+// TODO(user): NEON supports unaligned 64-bit loads and stores.
+// See if that would be more efficient on platforms supporting it,
+// at least for copies.
+
+inline uint64 UNALIGNED_LOAD64(const void *p) {
+  uint64 t;
+  memcpy(&t, p, sizeof t);
+  return t;
+}
+
+inline void UNALIGNED_STORE64(void *p, uint64 v) {
+  memcpy(p, &v, sizeof v);
+}
+
 #else
 
 // These functions are provided for architectures that don't support
@@ -226,9 +188,31 @@ inline void UNALIGNED_STORE64(void *p, uint64 v) {
 
 #endif
 
+// This can be more efficient than UNALIGNED_LOAD64 + UNALIGNED_STORE64
+// on some platforms, in particular ARM.
+inline void UnalignedCopy64(const void *src, void *dst) {
+  if (sizeof(void *) == 8) {
+    UNALIGNED_STORE64(dst, UNALIGNED_LOAD64(src));
+  } else {
+    const char *src_char = reinterpret_cast<const char *>(src);
+    char *dst_char = reinterpret_cast<char *>(dst);
+
+    UNALIGNED_STORE32(dst_char, UNALIGNED_LOAD32(src_char));
+    UNALIGNED_STORE32(dst_char + 4, UNALIGNED_LOAD32(src_char + 4));
+  }
+}
+
 // The following guarantees declaration of the byte swap functions.
 #ifdef WORDS_BIGENDIAN
 
+#ifdef HAVE_SYS_BYTEORDER_H
+#include <sys/byteorder.h>
+#endif
+
+#ifdef HAVE_SYS_ENDIAN_H
+#include <sys/endian.h>
+#endif
+
 #ifdef _MSC_VER
 #include <stdlib.h>
 #define bswap_16(x) _byteswap_ushort(x)
@@ -242,8 +226,38 @@ inline void UNALIGNED_STORE64(void *p, uint64 v) {
 #define bswap_32(x) OSSwapInt32(x)
 #define bswap_64(x) OSSwapInt64(x)
 
-#else
+#elif defined(HAVE_BYTESWAP_H)
 #include <byteswap.h>
+
+#elif defined(bswap32)
+// FreeBSD defines bswap{16,32,64} in <sys/endian.h> (already #included).
+#define bswap_16(x) bswap16(x)
+#define bswap_32(x) bswap32(x)
+#define bswap_64(x) bswap64(x)
+
+#elif defined(BSWAP_64)
+// Solaris 10 defines BSWAP_{16,32,64} in <sys/byteorder.h> (already #included).
+#define bswap_16(x) BSWAP_16(x)
+#define bswap_32(x) BSWAP_32(x)
+#define bswap_64(x) BSWAP_64(x)
+
+#else
+
+inline uint16 bswap_16(uint16 x) {
+  return (x << 8) | (x >> 8);
+}
+
+inline uint32 bswap_32(uint32 x) {
+  x = ((x & 0xff00ff00UL) >> 8) | ((x & 0x00ff00ffUL) << 8);
+  return (x >> 16) | (x << 16);
+}
+
+inline uint64 bswap_64(uint64 x) {
+  x = ((x & 0xff00ff00ff00ff00ULL) >> 8) | ((x & 0x00ff00ff00ff00ffULL) << 8);
+  x = ((x & 0xffff0000ffff0000ULL) >> 16) | ((x & 0x0000ffff0000ffffULL) << 16);
+  return (x >> 32) | (x << 32);
+}
+
 #endif
 
 #endif  // WORDS_BIGENDIAN
diff --git a/c_src/snappy/snappy-stubs-public.h b/c_src/snappy/snappy-stubs-public.h
index e545220..ecda439 100644
--- a/c_src/snappy/snappy-stubs-public.h
+++ b/c_src/snappy/snappy-stubs-public.h
@@ -36,11 +36,20 @@
 #ifndef UTIL_SNAPPY_OPENSOURCE_SNAPPY_STUBS_PUBLIC_H_
 #define UTIL_SNAPPY_OPENSOURCE_SNAPPY_STUBS_PUBLIC_H_
 
+#if 1
 #include <stdint.h>
+#endif
+
+#if 1
 #include <stddef.h>
+#endif
+
+#if 0
+#include <sys/uio.h>
+#endif
 
 #define SNAPPY_MAJOR 1
-#define SNAPPY_MINOR 0
+#define SNAPPY_MINOR 1
 #define SNAPPY_PATCHLEVEL 1
 #define SNAPPY_VERSION \
     ((SNAPPY_MAJOR << 16) | (SNAPPY_MINOR << 8) | SNAPPY_PATCHLEVEL)
@@ -75,6 +84,15 @@ typedef std::string string;
   TypeName(const TypeName&);               \
   void operator=(const TypeName&)
 
+#if !0
+// Windows does not have an iovec type, yet the concept is universally useful.
+// It is simple to define it ourselves, so we put it inside our own namespace.
+struct iovec {
+       void* iov_base;
+       size_t iov_len;
+};
+#endif
+
 }  // namespace snappy
 
 #endif  // UTIL_SNAPPY_OPENSOURCE_SNAPPY_STUBS_PUBLIC_H_
diff --git a/c_src/snappy/snappy.cc b/c_src/snappy/snappy.cc
index a591aba..f8d0d23 100644
--- a/c_src/snappy/snappy.cc
+++ b/c_src/snappy/snappy.cc
@@ -82,6 +82,7 @@ enum {
   COPY_2_BYTE_OFFSET = 2,
   COPY_4_BYTE_OFFSET = 3
 };
+static const int kMaximumTagLength = 5;  // COPY_4_BYTE_OFFSET plus the actual offset.
 
 // Copy "len" bytes from "src" to "op", one byte at a time.  Used for
 // handling COPY operations where the input and output regions may
@@ -94,8 +95,8 @@ enum {
 //    ababababababababababab
 // Note that this does not match the semantics of either memcpy()
 // or memmove().
-static inline void IncrementalCopy(const char* src, char* op, int len) {
-  DCHECK_GT(len, 0);
+static inline void IncrementalCopy(const char* src, char* op, ssize_t len) {
+  assert(len > 0);
   do {
     *op++ = *src++;
   } while (--len > 0);
@@ -136,22 +137,22 @@ namespace {
 
 const int kMaxIncrementCopyOverflow = 10;
 
-}  // namespace
-
-static inline void IncrementalCopyFastPath(const char* src, char* op, int len) {
+inline void IncrementalCopyFastPath(const char* src, char* op, ssize_t len) {
   while (op - src < 8) {
-    UNALIGNED_STORE64(op, UNALIGNED_LOAD64(src));
+    UnalignedCopy64(src, op);
     len -= op - src;
     op += op - src;
   }
   while (len > 0) {
-    UNALIGNED_STORE64(op, UNALIGNED_LOAD64(src));
+    UnalignedCopy64(src, op);
     src += 8;
     op += 8;
     len -= 8;
   }
 }
 
+}  // namespace
+
 static inline char* EmitLiteral(char* op,
                                 const char* literal,
                                 int len,
@@ -172,8 +173,8 @@ static inline char* EmitLiteral(char* op,
     //   - The output will always have 32 spare bytes (see
     //     MaxCompressedLength).
     if (allow_fast_path && len <= 16) {
-      UNALIGNED_STORE64(op, UNALIGNED_LOAD64(literal));
-      UNALIGNED_STORE64(op + 8, UNALIGNED_LOAD64(literal + 8));
+      UnalignedCopy64(literal, op);
+      UnalignedCopy64(literal + 8, op + 8);
       return op + len;
     }
   } else {
@@ -194,25 +195,25 @@ static inline char* EmitLiteral(char* op,
   return op + len;
 }
 
-static inline char* EmitCopyLessThan64(char* op, int offset, int len) {
-  DCHECK_LE(len, 64);
-  DCHECK_GE(len, 4);
-  DCHECK_LT(offset, 65536);
+static inline char* EmitCopyLessThan64(char* op, size_t offset, int len) {
+  assert(len <= 64);
+  assert(len >= 4);
+  assert(offset < 65536);
 
   if ((len < 12) && (offset < 2048)) {
-    int len_minus_4 = len - 4;
+    size_t len_minus_4 = len - 4;
     assert(len_minus_4 < 8);            // Must fit in 3 bits
-    *op++ = COPY_1_BYTE_OFFSET | ((len_minus_4) << 2) | ((offset >> 8) << 5);
+    *op++ = COPY_1_BYTE_OFFSET + ((len_minus_4) << 2) + ((offset >> 8) << 5);
     *op++ = offset & 0xff;
   } else {
-    *op++ = COPY_2_BYTE_OFFSET | ((len-1) << 2);
+    *op++ = COPY_2_BYTE_OFFSET + ((len-1) << 2);
     LittleEndian::Store16(op, offset);
     op += 2;
   }
   return op;
 }
 
-static inline char* EmitCopy(char* op, int offset, int len) {
+static inline char* EmitCopy(char* op, size_t offset, int len) {
   // Emit 64 byte copies but make sure to keep at least four bytes reserved
   while (len >= 68) {
     op = EmitCopyLessThan64(op, offset, 64);
@@ -249,12 +250,10 @@ uint16* WorkingMemory::GetHashTable(size_t input_size, int* table_size) {
   // compression, and if the input is short, we won't need that
   // many hash table entries anyway.
   assert(kMaxHashTableSize >= 256);
-  int htsize = 256;
+  size_t htsize = 256;
   while (htsize < kMaxHashTableSize && htsize < input_size) {
     htsize <<= 1;
   }
-  CHECK_EQ(0, htsize & (htsize - 1)) << ": must be power of two";
-  CHECK_LE(htsize, kMaxHashTableSize) << ": hash table too large";
 
   uint16* table;
   if (htsize <= ARRAYSIZE(small_table_)) {
@@ -272,16 +271,49 @@ uint16* WorkingMemory::GetHashTable(size_t input_size, int* table_size) {
 }
 }  // end namespace internal
 
-// For 0 <= offset <= 4, GetUint32AtOffset(UNALIGNED_LOAD64(p), offset) will
+// For 0 <= offset <= 4, GetUint32AtOffset(GetEightBytesAt(p), offset) will
 // equal UNALIGNED_LOAD32(p + offset).  Motivation: On x86-64 hardware we have
 // empirically found that overlapping loads such as
 //  UNALIGNED_LOAD32(p) ... UNALIGNED_LOAD32(p+1) ... UNALIGNED_LOAD32(p+2)
 // are slower than UNALIGNED_LOAD64(p) followed by shifts and casts to uint32.
+//
+// We have different versions for 64- and 32-bit; ideally we would avoid the
+// two functions and just inline the UNALIGNED_LOAD64 call into
+// GetUint32AtOffset, but GCC (at least not as of 4.6) is seemingly not clever
+// enough to avoid loading the value multiple times then. For 64-bit, the load
+// is done when GetEightBytesAt() is called, whereas for 32-bit, the load is
+// done at GetUint32AtOffset() time.
+
+#ifdef ARCH_K8
+
+typedef uint64 EightBytesReference;
+
+static inline EightBytesReference GetEightBytesAt(const char* ptr) {
+  return UNALIGNED_LOAD64(ptr);
+}
+
 static inline uint32 GetUint32AtOffset(uint64 v, int offset) {
-  DCHECK(0 <= offset && offset <= 4) << offset;
+  assert(offset >= 0);
+  assert(offset <= 4);
   return v >> (LittleEndian::IsLittleEndian() ? 8 * offset : 32 - 8 * offset);
 }
 
+#else
+
+typedef const char* EightBytesReference;
+
+static inline EightBytesReference GetEightBytesAt(const char* ptr) {
+  return ptr;
+}
+
+static inline uint32 GetUint32AtOffset(const char* v, int offset) {
+  assert(offset >= 0);
+  assert(offset <= 4);
+  return UNALIGNED_LOAD32(v + offset);
+}
+
+#endif
+
 // Flat array compression that does not emit the "uncompressed length"
 // prefix. Compresses "input" string to the "*op" buffer.
 //
@@ -294,29 +326,29 @@ static inline uint32 GetUint32AtOffset(uint64 v, int offset) {
 // Returns an "end" pointer into "op" buffer.
 // "end - op" is the compressed size of "input".
 namespace internal {
-char* CompressFragment(const char* const input,
-                       const size_t input_size,
+char* CompressFragment(const char* input,
+                       size_t input_size,
                        char* op,
                        uint16* table,
                        const int table_size) {
   // "ip" is the input pointer, and "op" is the output pointer.
   const char* ip = input;
-  CHECK_LE(input_size, kBlockSize);
-  CHECK_EQ(table_size & (table_size - 1), 0) << ": table must be power of two";
+  assert(input_size <= kBlockSize);
+  assert((table_size & (table_size - 1)) == 0); // table must be power of two
   const int shift = 32 - Bits::Log2Floor(table_size);
-  DCHECK_EQ(kuint32max >> shift, table_size - 1);
+  assert(static_cast<int>(kuint32max >> shift) == table_size - 1);
   const char* ip_end = input + input_size;
   const char* base_ip = ip;
   // Bytes in [next_emit, ip) will be emitted as literal bytes.  Or
   // [next_emit, ip_end) after the main loop.
   const char* next_emit = ip;
 
-  const int kInputMarginBytes = 15;
+  const size_t kInputMarginBytes = 15;
   if (PREDICT_TRUE(input_size >= kInputMarginBytes)) {
     const char* ip_limit = input + input_size - kInputMarginBytes;
 
     for (uint32 next_hash = Hash(++ip, shift); ; ) {
-      DCHECK_LT(next_emit, ip);
+      assert(next_emit < ip);
       // The body of this loop calls EmitLiteral once and then EmitCopy one or
       // more times.  (The exception is that when we're close to exhausting
       // the input we goto emit_remainder.)
@@ -349,7 +381,7 @@ char* CompressFragment(const char* const input,
       do {
         ip = next_ip;
         uint32 hash = next_hash;
-        DCHECK_EQ(hash, Hash(ip, shift));
+        assert(hash == Hash(ip, shift));
         uint32 bytes_between_hash_lookups = skip++ >> 5;
         next_ip = ip + bytes_between_hash_lookups;
         if (PREDICT_FALSE(next_ip > ip_limit)) {
@@ -357,8 +389,8 @@ char* CompressFragment(const char* const input,
         }
         next_hash = Hash(next_ip, shift);
         candidate = base_ip + table[hash];
-        DCHECK_GE(candidate, base_ip);
-        DCHECK_LT(candidate, ip);
+        assert(candidate >= base_ip);
+        assert(candidate < ip);
 
         table[hash] = ip - base_ip;
       } while (PREDICT_TRUE(UNALIGNED_LOAD32(ip) !=
@@ -367,7 +399,7 @@ char* CompressFragment(const char* const input,
       // Step 2: A 4-byte match has been found.  We'll later see if more
       // than 4 bytes match.  But, prior to the match, input
       // bytes [next_emit, ip) are unmatched.  Emit them as "literal bytes."
-      DCHECK_LE(next_emit + 16, ip_end);
+      assert(next_emit + 16 <= ip_end);
       op = EmitLiteral(op, next_emit, ip - next_emit, true);
 
       // Step 3: Call EmitCopy, and then see if another EmitCopy could
@@ -378,7 +410,7 @@ char* CompressFragment(const char* const input,
       // though we don't yet know how big the literal will be.  We handle that
      // by proceeding to the next iteration of the main loop.  We also can exit
       // this loop via goto if we get close to exhausting the input.
-      uint64 input_bytes = 0;
+      EightBytesReference input_bytes;
       uint32 candidate_bytes = 0;
 
       do {
@@ -387,8 +419,8 @@ char* CompressFragment(const char* const input,
         const char* base = ip;
         int matched = 4 + FindMatchLength(candidate + 4, ip + 4, ip_end);
         ip += matched;
-        int offset = base - candidate;
-        DCHECK_EQ(0, memcmp(base, candidate, matched));
+        size_t offset = base - candidate;
+        assert(0 == memcmp(base, candidate, matched));
         op = EmitCopy(op, offset, matched);
         // We could immediately start working at ip now, but to improve
         // compression we first update table[Hash(ip - 1, ...)].
@@ -397,7 +429,7 @@ char* CompressFragment(const char* const input,
         if (PREDICT_FALSE(ip >= ip_limit)) {
           goto emit_remainder;
         }
-        input_bytes = UNALIGNED_LOAD64(insert_tail);
+        input_bytes = GetEightBytesAt(insert_tail);
         uint32 prev_hash = HashBytes(GetUint32AtOffset(input_bytes, 0), shift);
         table[prev_hash] = ip - base_ip - 1;
         uint32 cur_hash = HashBytes(GetUint32AtOffset(input_bytes, 1), shift);
@@ -435,12 +467,31 @@ char* CompressFragment(const char* const input,
 //   bool CheckLength() const;
 //
 //   // Called repeatedly during decompression
-//   bool Append(const char* ip, uint32 length, bool allow_fast_path);
-//   bool AppendFromSelf(uint32 offset, uint32 length);
-// };
+//   bool Append(const char* ip, size_t length);
+//   bool AppendFromSelf(uint32 offset, size_t length);
 //
-// "allow_fast_path" is a parameter that says if there is at least 16
-// readable bytes in "ip". It is currently only used by SnappyArrayWriter.
+//   // The rules for how TryFastAppend differs from Append are somewhat
+//   // convoluted:
+//   //
+//   //  - TryFastAppend is allowed to decline (return false) at any
+//   //    time, for any reason -- just "return false" would be
+//   //    a perfectly legal implementation of TryFastAppend.
+//   //    The intention is for TryFastAppend to allow a fast path
+//   //    in the common case of a small append.
+//   //  - TryFastAppend is allowed to read up to <available> bytes
+//   //    from the input buffer, whereas Append is allowed to read
+//   //    <length>. However, if it returns true, it must leave
+//   //    at least five (kMaximumTagLength) bytes in the input buffer
+//   //    afterwards, so that there is always enough space to read the
+//   //    next tag without checking for a refill.
+//   //  - TryFastAppend must always return decline (return false)
+//   //    if <length> is 61 or more, as in this case the literal length is not
+//   //    decoded fully. In practice, this should not be a big problem,
+//   //    as it is unlikely that one would implement a fast path accepting
+//   //    this much data.
+//   //
+//   bool TryFastAppend(const char* ip, size_t available, size_t length);
+// };
 
 // -----------------------------------------------------------------------
 // Lookup table for decompression code.  Generated by ComputeTable() below.
@@ -507,9 +558,9 @@ static uint16 MakeEntry(unsigned int extra,
                         unsigned int len,
                         unsigned int copy_offset) {
   // Check that all of the fields fit within the allocated space
-  DCHECK_EQ(extra,       extra & 0x7);          // At most 3 bits
-  DCHECK_EQ(copy_offset, copy_offset & 0x7);    // At most 3 bits
-  DCHECK_EQ(len,         len & 0x7f);           // At most 7 bits
+  assert(extra       == (extra & 0x7));          // At most 3 bits
+  assert(copy_offset == (copy_offset & 0x7));    // At most 3 bits
+  assert(len         == (len & 0x7f));           // At most 7 bits
   return len | (copy_offset << 8) | (extra << 11);
 }
 
@@ -567,9 +618,15 @@ static void ComputeTable() {
   }
 
   // Check that each entry was initialized exactly once.
-  CHECK_EQ(assigned, 256);
+  if (assigned != 256) {
+    fprintf(stderr, "ComputeTable: assigned only %d of 256\n", assigned);
+    abort();
+  }
   for (int i = 0; i < 256; i++) {
-    CHECK_NE(dst[i], 0xffff);
+    if (dst[i] == 0xffff) {
+      fprintf(stderr, "ComputeTable: did not assign byte %d\n", i);
+      abort();
+    }
   }
 
   if (FLAGS_snappy_dump_decompression_table) {
@@ -584,10 +641,13 @@ static void ComputeTable() {
 
   // Check that computed table matched recorded table
   for (int i = 0; i < 256; i++) {
-    CHECK_EQ(dst[i], char_table[i]);
+    if (dst[i] != char_table[i]) {
+      fprintf(stderr, "ComputeTable: byte %d: computed (%x), expect (%x)\n",
+              i, static_cast<int>(dst[i]), static_cast<int>(char_table[i]));
+      abort();
+    }
   }
 }
-REGISTER_MODULE_INITIALIZER(snappy, ComputeTable());
 #endif /* !NDEBUG */
 
 // Helper class for decompression
@@ -598,7 +658,7 @@ class SnappyDecompressor {
   const char*   ip_limit_;       // Points just past buffered bytes
   uint32        peeked_;         // Bytes peeked from reader (need to skip)
   bool          eof_;            // Hit end of input without an error?
-  char          scratch_[5];     // Temporary buffer for PeekFast() boundaries
+  char          scratch_[kMaximumTagLength];  // See RefillTag().
 
   // Ensure that all of the tag metadata for the next tag is available
   // in [ip_..ip_limit_-1].  Also ensures that [ip,ip+4] is readable even
@@ -630,7 +690,7 @@ class SnappyDecompressor {
   // On succcess, stores the length in *result and returns true.
   // On failure, returns false.
   bool ReadUncompressedLength(uint32* result) {
-    DCHECK(ip_ == NULL);       // Must not have read anything yet
+    assert(ip_ == NULL);       // Must not have read anything yet
     // Length is encoded in 1..5 bytes
     *result = 0;
     uint32 shift = 0;
@@ -655,25 +715,43 @@ class SnappyDecompressor {
   template <class Writer>
   void DecompressAllTags(Writer* writer) {
     const char* ip = ip_;
-    for ( ;; ) {
-      if (ip_limit_ - ip < 5) {
-        ip_ = ip;
-        if (!RefillTag()) return;
-        ip = ip_;
-      }
 
+    // We could have put this refill fragment only at the beginning of the loop.
+    // However, duplicating it at the end of each branch gives the compiler more
+    // scope to optimize the <ip_limit_ - ip> expression based on the local
+    // context, which overall increases speed.
+    #define MAYBE_REFILL() \
+        if (ip_limit_ - ip < kMaximumTagLength) { \
+          ip_ = ip; \
+          if (!RefillTag()) return; \
+          ip = ip_; \
+        }
+
+    MAYBE_REFILL();
+    for ( ;; ) {
       const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip++));
-      const uint32 entry = char_table[c];
-      const uint32 trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11];
-      ip += entry >> 11;
-      const uint32 length = entry & 0xff;
 
       if ((c & 0x3) == LITERAL) {
-        uint32 literal_length = length + trailer;
-        uint32 avail = ip_limit_ - ip;
+        size_t literal_length = (c >> 2) + 1u;
+        if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length)) {
+          assert(literal_length < 61);
+          ip += literal_length;
+          // NOTE(user): There is no MAYBE_REFILL() here, as TryFastAppend()
+          // will not return true unless there's already at least five spare
+          // bytes in addition to the literal.
+          continue;
+        }
+        if (PREDICT_FALSE(literal_length >= 61)) {
+          // Long literal.
+          const size_t literal_length_length = literal_length - 60;
+          literal_length =
+              (LittleEndian::Load32(ip) & wordmask[literal_length_length]) + 1;
+          ip += literal_length_length;
+        }
+
+        size_t avail = ip_limit_ - ip;
         while (avail < literal_length) {
-          bool allow_fast_path = (avail >= 16);
-          if (!writer->Append(ip, avail, allow_fast_path)) return;
+          if (!writer->Append(ip, avail)) return;
           literal_length -= avail;
           reader_->Skip(peeked_);
           size_t n;
@@ -683,12 +761,17 @@ class SnappyDecompressor {
           if (avail == 0) return;  // Premature end of input
           ip_limit_ = ip + avail;
         }
-        bool allow_fast_path = (avail >= 16);
-        if (!writer->Append(ip, literal_length, allow_fast_path)) {
+        if (!writer->Append(ip, literal_length)) {
           return;
         }
         ip += literal_length;
+        MAYBE_REFILL();
       } else {
+        const uint32 entry = char_table[c];
+        const uint32 trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11];
+        const uint32 length = entry & 0xff;
+        ip += entry >> 11;
+
         // copy_offset/256 is encoded in bits 8..10.  By just fetching
         // those bits, we get copy_offset (since the bit-field starts at
         // bit 8).
@@ -696,8 +779,11 @@ class SnappyDecompressor {
         if (!writer->AppendFromSelf(copy_offset + trailer, length)) {
           return;
         }
+        MAYBE_REFILL();
       }
     }
+
+#undef MAYBE_REFILL
   }
 };
 
@@ -717,11 +803,11 @@ bool SnappyDecompressor::RefillTag() {
   }
 
   // Read the tag character
-  DCHECK_LT(ip, ip_limit_);
+  assert(ip < ip_limit_);
   const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip));
   const uint32 entry = char_table[c];
   const uint32 needed = (entry >> 11) + 1;  // +1 byte for 'c'
-  DCHECK_LE(needed, sizeof(scratch_));
+  assert(needed <= sizeof(scratch_));
 
   // Read more bytes from reader if needed
   uint32 nbuf = ip_limit_ - ip;
@@ -742,10 +828,10 @@ bool SnappyDecompressor::RefillTag() {
       nbuf += to_add;
       reader_->Skip(to_add);
     }
-    DCHECK_EQ(nbuf, needed);
+    assert(nbuf == needed);
     ip_ = scratch_;
     ip_limit_ = scratch_ + needed;
-  } else if (nbuf < 5) {
+  } else if (nbuf < kMaximumTagLength) {
     // Have enough bytes, but move into scratch_ so that we do not
     // read past end of input
     memmove(scratch_, ip, nbuf);
@@ -761,23 +847,23 @@ bool SnappyDecompressor::RefillTag() {
 }
 
 template <typename Writer>
-static bool InternalUncompress(Source* r,
-                               Writer* writer,
-                               uint32 max_len) {
+static bool InternalUncompress(Source* r, Writer* writer) {
   // Read the uncompressed length from the front of the compressed input
   SnappyDecompressor decompressor(r);
   uint32 uncompressed_len = 0;
   if (!decompressor.ReadUncompressedLength(&uncompressed_len)) return false;
-  // Protect against possible DoS attack
-  if (static_cast<uint64>(uncompressed_len) > max_len) {
-    return false;
-  }
+  return InternalUncompressAllTags(&decompressor, writer, uncompressed_len);
+}
 
+template <typename Writer>
+static bool InternalUncompressAllTags(SnappyDecompressor* decompressor,
+                                      Writer* writer,
+                                      uint32 uncompressed_len) {
   writer->SetExpectedLength(uncompressed_len);
 
   // Process the entire input
-  decompressor.DecompressAllTags(writer);
-  return (decompressor.eof() && writer->CheckLength());
+  decompressor->DecompressAllTags(writer);
+  return (decompressor->eof() && writer->CheckLength());
 }
 
 bool GetUncompressedLength(Source* source, uint32* result) {
@@ -787,7 +873,7 @@ bool GetUncompressedLength(Source* source, uint32* result) {
 
 size_t Compress(Source* reader, Sink* writer) {
   size_t written = 0;
-  int N = reader->Available();
+  size_t N = reader->Available();
   char ulength[Varint::kMax32];
   char* p = Varint::Encode32(ulength, N);
   writer->Append(ulength, p-ulength);
@@ -801,11 +887,11 @@ size_t Compress(Source* reader, Sink* writer) {
     // Get next block to compress (without copying if possible)
     size_t fragment_size;
     const char* fragment = reader->Peek(&fragment_size);
-    DCHECK_NE(fragment_size, 0) << ": premature end of input";
-    const int num_to_read = min(N, kBlockSize);
+    assert(fragment_size != 0);  // premature end of input
+    const size_t num_to_read = min(N, kBlockSize);
     size_t bytes_read = fragment_size;
 
-    int pending_advance = 0;
+    size_t pending_advance = 0;
     if (bytes_read >= num_to_read) {
       // Buffer returned by reader is large enough
       pending_advance = num_to_read;
@@ -828,11 +914,11 @@ size_t Compress(Source* reader, Sink* writer) {
         bytes_read += n;
         reader->Skip(n);
       }
-      DCHECK_EQ(bytes_read, num_to_read);
+      assert(bytes_read == num_to_read);
       fragment = scratch;
       fragment_size = num_to_read;
     }
-    DCHECK_EQ(fragment_size, num_to_read);
+    assert(fragment_size == num_to_read);
 
     // Get encoding table for compression
     int table_size;
@@ -866,6 +952,183 @@ size_t Compress(Source* reader, Sink* writer) {
   return written;
 }
 
+// -----------------------------------------------------------------------
+// IOVec interfaces
+// -----------------------------------------------------------------------
+
+// A type that writes to an iovec.
+// Note that this is not a "ByteSink", but a type that matches the
+// Writer template argument to SnappyDecompressor::DecompressAllTags().
+class SnappyIOVecWriter {
+ private:
+  const struct iovec* output_iov_;
+  const size_t output_iov_count_;
+
+  // We are currently writing into output_iov_[curr_iov_index_].
+  int curr_iov_index_;
+
+  // Bytes written to output_iov_[curr_iov_index_] so far.
+  size_t curr_iov_written_;
+
+  // Total bytes decompressed into output_iov_ so far.
+  size_t total_written_;
+
+  // Maximum number of bytes that will be decompressed into output_iov_.
+  size_t output_limit_;
+
+  inline char* GetIOVecPointer(int index, size_t offset) {
+    return reinterpret_cast<char*>(output_iov_[index].iov_base) +
+        offset;
+  }
+
+ public:
+  // Does not take ownership of iov. iov must be valid during the
+  // entire lifetime of the SnappyIOVecWriter.
+  inline SnappyIOVecWriter(const struct iovec* iov, size_t iov_count)
+      : output_iov_(iov),
+        output_iov_count_(iov_count),
+        curr_iov_index_(0),
+        curr_iov_written_(0),
+        total_written_(0),
+        output_limit_(-1) {
+  }
+
+  inline void SetExpectedLength(size_t len) {
+    output_limit_ = len;
+  }
+
+  inline bool CheckLength() const {
+    return total_written_ == output_limit_;
+  }
+
+  inline bool Append(const char* ip, size_t len) {
+    if (total_written_ + len > output_limit_) {
+      return false;
+    }
+
+    while (len > 0) {
+      assert(curr_iov_written_ <= output_iov_[curr_iov_index_].iov_len);
+      if (curr_iov_written_ >= output_iov_[curr_iov_index_].iov_len) {
+        // This iovec is full. Go to the next one.
+        if (curr_iov_index_ + 1 >= output_iov_count_) {
+          return false;
+        }
+        curr_iov_written_ = 0;
+        ++curr_iov_index_;
+      }
+
+      const size_t to_write = std::min(
+          len, output_iov_[curr_iov_index_].iov_len - curr_iov_written_);
+      memcpy(GetIOVecPointer(curr_iov_index_, curr_iov_written_),
+             ip,
+             to_write);
+      curr_iov_written_ += to_write;
+      total_written_ += to_write;
+      ip += to_write;
+      len -= to_write;
+    }
+
+    return true;
+  }
+
+  inline bool TryFastAppend(const char* ip, size_t available, size_t len) {
+    const size_t space_left = output_limit_ - total_written_;
+    if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16 &&
+        output_iov_[curr_iov_index_].iov_len - curr_iov_written_ >= 16) {
+      // Fast path, used for the majority (about 95%) of invocations.
+      char* ptr = GetIOVecPointer(curr_iov_index_, curr_iov_written_);
+      UnalignedCopy64(ip, ptr);
+      UnalignedCopy64(ip + 8, ptr + 8);
+      curr_iov_written_ += len;
+      total_written_ += len;
+      return true;
+    }
+
+    return false;
+  }
+
+  inline bool AppendFromSelf(size_t offset, size_t len) {
+    if (offset > total_written_ || offset == 0) {
+      return false;
+    }
+    const size_t space_left = output_limit_ - total_written_;
+    if (len > space_left) {
+      return false;
+    }
+
+    // Locate the iovec from which we need to start the copy.
+    int from_iov_index = curr_iov_index_;
+    size_t from_iov_offset = curr_iov_written_;
+    while (offset > 0) {
+      if (from_iov_offset >= offset) {
+        from_iov_offset -= offset;
+        break;
+      }
+
+      offset -= from_iov_offset;
+      --from_iov_index;
+      assert(from_iov_index >= 0);
+      from_iov_offset = output_iov_[from_iov_index].iov_len;
+    }
+
+    // Copy <len> bytes starting from the iovec pointed to by from_iov_index to
+    // the current iovec.
+    while (len > 0) {
+      assert(from_iov_index <= curr_iov_index_);
+      if (from_iov_index != curr_iov_index_) {
+        const size_t to_copy = std::min(
+            output_iov_[from_iov_index].iov_len - from_iov_offset,
+            len);
+        Append(GetIOVecPointer(from_iov_index, from_iov_offset), to_copy);
+        len -= to_copy;
+        if (len > 0) {
+          ++from_iov_index;
+          from_iov_offset = 0;
+        }
+      } else {
+        assert(curr_iov_written_ <= output_iov_[curr_iov_index_].iov_len);
+        size_t to_copy = std::min(output_iov_[curr_iov_index_].iov_len -
+                                      curr_iov_written_,
+                                  len);
+        if (to_copy == 0) {
+          // This iovec is full. Go to the next one.
+          if (curr_iov_index_ + 1 >= output_iov_count_) {
+            return false;
+          }
+          ++curr_iov_index_;
+          curr_iov_written_ = 0;
+          continue;
+        }
+        if (to_copy > len) {
+          to_copy = len;
+        }
+        IncrementalCopy(GetIOVecPointer(from_iov_index, from_iov_offset),
+                        GetIOVecPointer(curr_iov_index_, curr_iov_written_),
+                        to_copy);
+        curr_iov_written_ += to_copy;
+        from_iov_offset += to_copy;
+        total_written_ += to_copy;
+        len -= to_copy;
+      }
+    }
+
+    return true;
+  }
+
+};
+
+bool RawUncompressToIOVec(const char* compressed, size_t compressed_length,
+                          const struct iovec* iov, size_t iov_cnt) {
+  ByteArraySource reader(compressed, compressed_length);
+  return RawUncompressToIOVec(&reader, iov, iov_cnt);
+}
+
+bool RawUncompressToIOVec(Source* compressed, const struct iovec* iov,
+                          size_t iov_cnt) {
+  SnappyIOVecWriter output(iov, iov_cnt);
+  return InternalUncompress(compressed, &output);
+}
+
 // -----------------------------------------------------------------------
 // Flat array interfaces
 // -----------------------------------------------------------------------
@@ -893,34 +1156,51 @@ class SnappyArrayWriter {
     return op_ == op_limit_;
   }
 
-  inline bool Append(const char* ip, uint32 len, bool allow_fast_path) {
+  inline bool Append(const char* ip, size_t len) {
     char* op = op_;
-    const int space_left = op_limit_ - op;
-    if (allow_fast_path && len <= 16 && space_left >= 16) {
-      // Fast path, used for the majority (about 90%) of dynamic invocations.
-      UNALIGNED_STORE64(op, UNALIGNED_LOAD64(ip));
-      UNALIGNED_STORE64(op + 8, UNALIGNED_LOAD64(ip + 8));
-    } else {
-      if (space_left < len) {
-        return false;
-      }
-      memcpy(op, ip, len);
+    const size_t space_left = op_limit_ - op;
+    if (space_left < len) {
+      return false;
     }
+    memcpy(op, ip, len);
     op_ = op + len;
     return true;
   }
 
-  inline bool AppendFromSelf(uint32 offset, uint32 len) {
+  inline bool TryFastAppend(const char* ip, size_t available, size_t len) {
     char* op = op_;
-    const int space_left = op_limit_ - op;
+    const size_t space_left = op_limit_ - op;
+    if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16) {
+      // Fast path, used for the majority (about 95%) of invocations.
+      UnalignedCopy64(ip, op);
+      UnalignedCopy64(ip + 8, op + 8);
+      op_ = op + len;
+      return true;
+    } else {
+      return false;
+    }
+  }
 
-    if (op - base_ <= offset - 1u) {  // -1u catches offset==0
+  inline bool AppendFromSelf(size_t offset, size_t len) {
+    char* op = op_;
+    const size_t space_left = op_limit_ - op;
+
+    // Check if we try to append from before the start of the buffer.
+    // Normally this would just be a check for "produced < offset",
+    // but "produced <= offset - 1u" is equivalent for every case
+    // except the one where offset==0, where the right side will wrap around
+    // to a very big number. This is convenient, as offset==0 is another
+    // invalid case that we also want to catch, so that we do not go
+    // into an infinite loop.
+    assert(op >= base_);
+    size_t produced = op - base_;
+    if (produced <= offset - 1u) {
       return false;
     }
     if (len <= 16 && offset >= 8 && space_left >= 16) {
       // Fast path, used for the majority (70-80%) of dynamic invocations.
-      UNALIGNED_STORE64(op, UNALIGNED_LOAD64(op - offset));
-      UNALIGNED_STORE64(op + 8, UNALIGNED_LOAD64(op - offset + 8));
+      UnalignedCopy64(op - offset, op);
+      UnalignedCopy64(op - offset + 8, op + 8);
     } else {
       if (space_left >= len + kMaxIncrementCopyOverflow) {
         IncrementalCopyFastPath(op - offset, op, len);
@@ -944,7 +1224,7 @@ bool RawUncompress(const char* compressed, size_t n, char* uncompressed) {
 
 bool RawUncompress(Source* compressed, char* uncompressed) {
   SnappyArrayWriter output(uncompressed);
-  return InternalUncompress(compressed, &output, kuint32max);
+  return InternalUncompress(compressed, &output);
 }
 
 bool Uncompress(const char* compressed, size_t n, string* uncompressed) {
@@ -952,9 +1232,9 @@ bool Uncompress(const char* compressed, size_t n, string* uncompressed) {
   if (!GetUncompressedLength(compressed, n, &ulength)) {
     return false;
   }
-  // Protect against possible DoS attack
-  if ((static_cast<uint64>(ulength) + uncompressed->size()) >
-      uncompressed->max_size()) {
+  // On 32-bit builds: max_size() < kuint32max.  Check for that instead
+  // of crashing (e.g., consider externally specified compressed data).
+  if (ulength > uncompressed->max_size()) {
     return false;
   }
   STLStringResizeUninitialized(uncompressed, ulength);
@@ -976,12 +1256,17 @@ class SnappyDecompressionValidator {
   inline bool CheckLength() const {
     return expected_ == produced_;
   }
-  inline bool Append(const char* ip, uint32 len, bool allow_fast_path) {
+  inline bool Append(const char* ip, size_t len) {
     produced_ += len;
     return produced_ <= expected_;
   }
-  inline bool AppendFromSelf(uint32 offset, uint32 len) {
-    if (produced_ <= offset - 1u) return false;  // -1u catches offset==0
+  inline bool TryFastAppend(const char* ip, size_t available, size_t length) {
+    return false;
+  }
+  inline bool AppendFromSelf(size_t offset, size_t len) {
+    // See SnappyArrayWriter::AppendFromSelf for an explanation of
+    // the "offset - 1u" trick.
+    if (produced_ <= offset - 1u) return false;
     produced_ += len;
     return produced_ <= expected_;
   }
@@ -990,7 +1275,7 @@ class SnappyDecompressionValidator {
 bool IsValidCompressedBuffer(const char* compressed, size_t n) {
   ByteArraySource reader(compressed, n);
   SnappyDecompressionValidator writer;
-  return InternalUncompress(&reader, &writer, kuint32max);
+  return InternalUncompress(&reader, &writer);
 }
 
 void RawCompress(const char* input,
diff --git a/c_src/snappy/snappy.h b/c_src/snappy/snappy.h
index 8d6ef22..e879e79 100644
--- a/c_src/snappy/snappy.h
+++ b/c_src/snappy/snappy.h
@@ -56,6 +56,13 @@ namespace snappy {
   // number of bytes written.
   size_t Compress(Source* source, Sink* sink);
 
+  // Find the uncompressed length of the given stream, as given by the header.
+  // Note that the true length could deviate from this; the stream could e.g.
+  // be truncated.
+  //
+  // Also note that this leaves "*source" in a state that is unsuitable for
+  // further operations, such as RawUncompress(). You will need to rewind
+  // or recreate the source yourself before attempting any further calls.
   bool GetUncompressedLength(Source* source, uint32* result);
 
   // ------------------------------------------------------------------------
@@ -117,6 +124,28 @@ namespace snappy {
   // returns false if the message is corrupted and could not be decrypted
   bool RawUncompress(Source* compressed, char* uncompressed);
 
+  // Given data in "compressed[0..compressed_length-1]" generated by
+  // calling the Snappy::Compress routine, this routine
+  // stores the uncompressed data to the iovec "iov". The number of physical
+  // buffers in "iov" is given by iov_cnt and their cumulative size
+  // must be at least GetUncompressedLength(compressed). The individual buffers
+  // in "iov" must not overlap with each other.
+  //
+  // returns false if the message is corrupted and could not be decrypted
+  bool RawUncompressToIOVec(const char* compressed, size_t compressed_length,
+                            const struct iovec* iov, size_t iov_cnt);
+
+  // Given data from the byte source 'compressed' generated by calling
+  // the Snappy::Compress routine, this routine stores the uncompressed
+  // data to the iovec "iov". The number of physical
+  // buffers in "iov" is given by iov_cnt and their cumulative size
+  // must be at least GetUncompressedLength(compressed). The individual buffers
+  // in "iov" must not overlap with each other.
+  //
+  // returns false if the message is corrupted and could not be decrypted
+  bool RawUncompressToIOVec(Source* compressed, const struct iovec* iov,
+                            size_t iov_cnt);
+
   // Returns the maximal size of the compressed representation of
   // input data that is "source_bytes" bytes in length;
   size_t MaxCompressedLength(size_t source_bytes);
@@ -135,20 +164,20 @@ namespace snappy {
   bool IsValidCompressedBuffer(const char* compressed,
                                size_t compressed_length);
 
-  // *** DO NOT CHANGE THE VALUE OF kBlockSize ***
+  // The size of a compression block. Note that many parts of the compression
+  // code assumes that kBlockSize <= 65536; in particular, the hash table
+  // can only store 16-bit offsets, and EmitCopy() also assumes the offset
+  // is 65535 bytes or less. Note also that if you change this, it will
+  // affect the framing format (see framing_format.txt).
   //
-  // New Compression code chops up the input into blocks of at most
-  // the following size.  This ensures that back-references in the
-  // output never cross kBlockSize block boundaries.  This can be
-  // helpful in implementing blocked decompression.  However the
-  // decompression code should not rely on this guarantee since older
-  // compression code may not obey it.
-  static const int kBlockLog = 15;
-  static const int kBlockSize = 1 << kBlockLog;
+  // Note that there might be older data around that is compressed with larger
+  // block sizes, so the decompression code should not rely on the
+  // non-existence of long backreferences.
+  static const int kBlockLog = 16;
+  static const size_t kBlockSize = 1 << kBlockLog;
 
   static const int kMaxHashTableBits = 14;
-  static const int kMaxHashTableSize = 1 << kMaxHashTableBits;
-
+  static const size_t kMaxHashTableSize = 1 << kMaxHashTableBits;
 }  // end namespace snappy
 
 
diff --git a/c_src/snappy_nif.cc b/c_src/snappy_nif.cc
index 93c1859..4c690ab 100644
--- a/c_src/snappy_nif.cc
+++ b/c_src/snappy_nif.cc
@@ -135,7 +135,7 @@ BEGIN_C
 
 
 ERL_NIF_TERM
-snappy_compress(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
+snappy_compress_erl(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
 {
     ErlNifBinary input;
 
@@ -157,7 +157,7 @@ snappy_compress(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
 
 
 ERL_NIF_TERM
-snappy_decompress(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
+snappy_decompress_erl(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
 {
     ErlNifBinary bin;
     ErlNifBinary ret;
@@ -189,7 +189,7 @@ snappy_decompress(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
 
 
 ERL_NIF_TERM
-snappy_uncompressed_length(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
+snappy_uncompressed_length_erl(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
 {
     ErlNifBinary bin;
     size_t len;
@@ -252,9 +252,9 @@ on_upgrade(ErlNifEnv* env, void** priv, void** old_priv, ERL_NIF_TERM info)
 
 
 static ErlNifFunc nif_functions[] = {
-    {"compress", 1, snappy_compress},
-    {"decompress", 1, snappy_decompress},
-    {"uncompressed_length", 1, snappy_uncompressed_length},
+    {"compress", 1, snappy_compress_erl},
+    {"decompress", 1, snappy_decompress_erl},
+    {"uncompressed_length", 1, snappy_uncompressed_length_erl},
     {"is_valid", 1, snappy_is_valid}
 };
 
diff --git a/rebar.config b/rebar.config
index ad92b23..329f5ec 100644
--- a/rebar.config
+++ b/rebar.config
@@ -1,4 +1,4 @@
-{require_otp_vsn, "R13B04|R14|R15"}.
+{require_otp_vsn, "R13B04|R14|R15|R16"}.
 
 {erl_opts, [debug_info, warn_unused_vars, warn_shadow_vars, warn_unused_import]}.
 {port_env, [
