[ 
https://issues.apache.org/jira/browse/PARQUET-41?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16331602#comment-16331602
 ] 

ASF GitHub Bot commented on PARQUET-41:
---------------------------------------

cjjnjust closed pull request #431: PARQUET-41: Add bloom filter for parquet-cpp
URL: https://github.com/apache/parquet-cpp/pull/431
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:


diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5aa8b930..2b69b109 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -675,6 +675,8 @@ set(LIBPARQUET_SRCS
   src/parquet/column_reader.cc
   src/parquet/column_scanner.cc
   src/parquet/column_writer.cc
+  src/parquet/murmur3.cc
+  src/parquet/bloom.cc
 
   src/parquet/file/metadata.cc
   src/parquet/file/printer.cc
diff --git a/src/parquet/CMakeLists.txt b/src/parquet/CMakeLists.txt
index a2e283e2..a8a87323 100644
--- a/src/parquet/CMakeLists.txt
+++ b/src/parquet/CMakeLists.txt
@@ -27,6 +27,8 @@ install(FILES
   schema.h
   statistics.h
   types.h
+  bloom.h
+  murmur3.h
   DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/parquet")
 
 configure_file(parquet_version.h.in
@@ -56,6 +58,7 @@ ADD_PARQUET_TEST(public-api-test)
 ADD_PARQUET_TEST(types-test)
 ADD_PARQUET_TEST(reader-test)
 ADD_PARQUET_TEST(schema-test)
+ADD_PARQUET_TEST(bloom-test)
 
 ADD_PARQUET_BENCHMARK(column-io-benchmark)
 ADD_PARQUET_BENCHMARK(encoding-benchmark)
diff --git a/src/parquet/bloom-test.cc b/src/parquet/bloom-test.cc
new file mode 100644
index 00000000..5621d13b
--- /dev/null
+++ b/src/parquet/bloom-test.cc
@@ -0,0 +1,73 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include "parquet/murmur3.h"
+#include "parquet/bloom.h"
+
+#include "parquet/util/memory.h"
+
+namespace parquet {
+namespace test {
+TEST(Murmur3Test, TestBloomFilter) {
+  const uint8_t bitset[8] = {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7};
+  uint64_t result[2];  // MurmurHash3_x64_128 produces a 128-bit hash.
+  MurmurHash3_x64_128(bitset, 8, Bloom::DEFAULT_SEED, result);
+  ASSERT_EQ(static_cast<int64_t>(result[0]), -3850979349427597861LL);
+}
+
+
+TEST(FindTest, TestBloomFilter) {
+  std::unique_ptr<Bloom> bloom(new Bloom(1024));
+  for (int i = 0; i < 10; i++) {
+    uint64_t hash_value = bloom->hash(i);
+    bloom->insert(hash_value);
+  }
+  std::shared_ptr<InMemoryOutputStream> sink;
+  sink.reset(new InMemoryOutputStream());
+
+  bloom->writeTo(sink);
+
+  std::shared_ptr<InMemoryInputStream> source(
+      new InMemoryInputStream(sink->GetBuffer()));
+
+  int64_t bytes_available;
+  uint32_t length = *(reinterpret_cast<const uint32_t *>(
+      source->Read(4, &bytes_available)));
+  ASSERT_EQ(length, 1024);
+
+  uint32_t hash = *(reinterpret_cast<const uint32_t *>(
+      source->Read(4, &bytes_available)));
+  ASSERT_EQ(hash, 0);
+
+  uint32_t algo = *(reinterpret_cast<const uint32_t *>(
+      source->Read(4, &bytes_available)));
+  ASSERT_EQ(algo, 0);
+
+  const uint8_t* bitset = source->Read(length, &bytes_available);
+
+  std::unique_ptr<Bloom> de_bloom(new Bloom(bitset, length));
+
+  for (int i = 0; i < 10; i++) {
+    ASSERT_TRUE(de_bloom->find(bloom->hash(i)));
+  }
+}
+
+}  // namespace test
+
+
+}  // namespace parquet
diff --git a/src/parquet/bloom.cc b/src/parquet/bloom.cc
new file mode 100644
index 00000000..cac71e45
--- /dev/null
+++ b/src/parquet/bloom.cc
@@ -0,0 +1,216 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#include "parquet/bloom.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+
+#include "arrow/util/bit-util.h"
+
+#include "parquet/exception.h"
+#include "parquet/murmur3.h"
+#include "parquet/util/logging.h"
+
+namespace parquet {
+constexpr uint32_t Bloom::SALT[8];
+
+Bloom::Bloom(uint32_t num_bytes)
+    : num_bytes(num_bytes),
+      hash_strategy(MURMUR3_X64_128),
+      algorithm(BLOCK),
+      hashFunc(nullptr) {
+  initBitset(num_bytes);
+
+  switch (hash_strategy) {
+  case MURMUR3_X64_128:
+    this->hashFunc = &MurmurHash3_x64_128;
+    break;
+  default:
+    throw parquet::ParquetException("Unknown hash strategy.");
+  }
+}
+
+void Bloom::initBitset(uint32_t num_bytes) {
+  if (num_bytes < BYTES_PER_FILTER_BLOCK) {
+    num_bytes = BYTES_PER_FILTER_BLOCK;
+  }
+
+  if (num_bytes > DEFAULT_MAXIMUM_BLOOM_FILTER_BYTES) {
+    num_bytes = DEFAULT_MAXIMUM_BLOOM_FILTER_BYTES;
+  }
+
+  // Get next power of 2 if it is not power of 2.
+  if ((num_bytes & (num_bytes - 1)) != 0) {
+    num_bytes = static_cast<uint32_t>(::arrow::BitUtil::NextPower2(num_bytes));
+  }
+
+  this->bitset = new uint8_t[num_bytes];
+}
+
+Bloom::Bloom(const uint8_t* bitset, uint32_t num_bytes)
+    : num_bytes(num_bytes),
+      hash_strategy(MURMUR3_X64_128),
+      algorithm(BLOCK) {
+  this->bitset = new uint8_t[num_bytes];
+  memcpy(this->bitset, bitset, num_bytes);
+  switch (hash_strategy) {
+  case MURMUR3_X64_128:
+    this->hashFunc = &MurmurHash3_x64_128;
+    break;
+  default:
+    throw parquet::ParquetException("Unsupported hash strategy.");
+  }
+}
+
+void Bloom::setMask(uint32_t key, uint32_t mask[8]) {
+  for (int i = 0; i < 8; ++i) {
+    mask[i] = key * SALT[i];
+  }
+  
+  for (int i = 0; i < 8; ++i) {
+    mask[i] = mask[i] >> 27;
+  }
+  
+  for (int i = 0; i < 8; ++i) {
+    mask[i] = 0x1U << mask[i];
+  }
+}
+
+uint32_t Bloom::optimalNumOfBits(uint32_t ndv, double fpp) {
+  DCHECK(fpp > 0.0 && fpp < 1.0);
+  // Use -8.0 so the expression is evaluated in floating point; -8 * ndv would
+  // wrap around in unsigned integer arithmetic.
+  const double M = -8.0 * ndv / std::log(1 - std::pow(fpp, 1.0 / 8));
+  const double MAX = Bloom::DEFAULT_MAXIMUM_BLOOM_FILTER_BYTES << 3;
+
+  uint32_t num_bits;
+
+  // Handle overflow.
+  if (M > MAX || M < 0) {
+    num_bits = static_cast<uint32_t>(MAX);
+  } else {
+    num_bits = static_cast<uint32_t>(M);
+  }
+
+  // Get next power of 2 if bits is not power of 2.
+  if ((num_bits & (num_bits - 1)) != 0) {
+    num_bits = static_cast<uint32_t>(::arrow::BitUtil::NextPower2(num_bits));
+  }
+
+  // Minimum
+  if (num_bits < (Bloom::BYTES_PER_FILTER_BLOCK << 3)) {
+    num_bits = Bloom::BYTES_PER_FILTER_BLOCK << 3;
+  }
+
+  return num_bits;
+}
+
+void Bloom::addElement(uint64_t hash) {
+  uint32_t* const bitset32 = reinterpret_cast<uint32_t*>(bitset);
+  const uint32_t bucketIndex = static_cast<uint32_t>(hash >> 32)
+      & (num_bytes / BYTES_PER_FILTER_BLOCK - 1);
+  uint32_t key = static_cast<uint32_t>(hash);
+
+  // Calculate mask for bucket.
+  uint32_t mask[8];
+  setMask(key, mask);
+
+  for (int i = 0; i < 8; i++) {
+    bitset32[bucketIndex * 8 + i] |= mask[i];
+  }
+}
+
+bool Bloom::contains(uint64_t hash) {
+  uint32_t* const bitset32 = reinterpret_cast<uint32_t*>(bitset);
+  const uint32_t bucketIndex = static_cast<uint32_t>((hash >> 32)
+      & (num_bytes / BYTES_PER_FILTER_BLOCK - 1));
+  uint32_t key = static_cast<uint32_t>(hash);
+
+  // Calculate mask for bucket.
+  uint32_t mask[8];
+  setMask(key, mask);
+
+  for (int i = 0; i < 8; ++i) {
+    if (0 == (bitset32[8 * bucketIndex + i] & mask[i])) {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool Bloom::find(uint64_t hash) {
+  return contains(hash);
+}
+
+void Bloom::insert(uint64_t hash) {
+  addElement(hash);
+}
+
+uint64_t Bloom::hash(int value) {
+  uint64_t out[2];
+  (*hashFunc)((void*)&value, sizeof(int), DEFAULT_SEED, &out);
+  return out[0];
+}
+
+uint64_t Bloom::hash(const int64_t value) {
+  uint64_t out[2];
+  (*hashFunc)((void*)&value, sizeof(int64_t), DEFAULT_SEED, &out);
+  return out[0];
+}
+
+uint64_t Bloom::hash(const float value) {
+  uint64_t out[2];
+  (*hashFunc)((void*)&value, sizeof(float), DEFAULT_SEED, &out);
+  return out[0];
+}
+
+
+uint64_t Bloom::hash(const double value) {
+  uint64_t out[2];
+  (*hashFunc)((void*)&value, sizeof(double), DEFAULT_SEED, &out);
+  return out[0];
+}
+
+
+uint64_t Bloom::hash(const Int96 &value) {
+  uint64_t out[2];
+  (*hashFunc)((void*)value.value, sizeof(value.value), DEFAULT_SEED, &out);
+  return out[0];
+}
+
+uint64_t Bloom::hash(const ByteArray &value) {
+  uint64_t out[2];
+  (*hashFunc)((void*)value.ptr, value.len, DEFAULT_SEED, &out);
+  return out[0];
+}
+
+uint64_t Bloom::hash(const FLBA &value, uint32_t len) {
+  uint64_t out[2];
+  (*hashFunc)((void*)value.ptr, len, DEFAULT_SEED, &out);
+  return out[0];
+}
+
+void Bloom::writeTo(const std::shared_ptr<OutputStream>& sink) {
+  sink->Write(reinterpret_cast<const uint8_t *>(&num_bytes), sizeof(uint32_t));
+  sink->Write(reinterpret_cast<const uint8_t *>(&hash_strategy), sizeof(uint32_t));
+  sink->Write(reinterpret_cast<const uint8_t *>(&algorithm), sizeof(uint32_t));
+  sink->Write(bitset, num_bytes);
+}
+
+Bloom::~Bloom() {
+  if (bitset) {
+    delete[] bitset;  // Allocated with new[] in initBitset() and the bitset constructor.
+  }
+}
+} // namespace parquet
+
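
An aside on the block-based scheme implemented in bloom.cc above: each 64-bit hash is split into a block index taken from the upper 32 bits and a key taken from the lower 32 bits; the key is multiplied by the eight odd SALT constants, and the top five bits of each product select one bit in each of the block's eight 32-bit words. The standalone sketch below mirrors setMask/addElement/contains; it is illustrative only, and names such as SetMask and kSalt are not part of the pull request.

#include <cstdint>
#include <cstdio>

// Illustrative copy of the same SALT constants as Bloom::SALT in the diff above.
static const uint32_t kSalt[8] = {0x47b6137bU, 0x44974d91U, 0x8824ad5bU, 0xa2b7289dU,
                                  0x705495c7U, 0x2df1424bU, 0x9efc4947U, 0x5c6bfb31U};

// Derive the eight single-bit masks for one 32-byte block from the low 32 bits
// of the hash, mirroring Bloom::setMask above.
static void SetMask(uint32_t key, uint32_t mask[8]) {
  for (int i = 0; i < 8; ++i) {
    // Multiply by an odd constant and keep the top 5 bits: a value in [0, 32).
    mask[i] = 0x1U << ((key * kSalt[i]) >> 27);
  }
}

int main() {
  uint32_t block[8] = {0};  // One 32-byte "tiny" bloom filter block.
  const uint64_t hash = 0x123456789abcdef0ULL;  // Stand-in for a murmur3 hash.

  uint32_t mask[8];
  SetMask(static_cast<uint32_t>(hash), mask);

  // Insert: set one bit in each of the eight 32-bit words.
  for (int i = 0; i < 8; ++i) block[i] |= mask[i];

  // Lookup: the element is "probably present" only if all eight bits are set.
  bool probably_present = true;
  for (int i = 0; i < 8; ++i) {
    if ((block[i] & mask[i]) == 0) probably_present = false;
  }
  std::printf("probably_present = %d\n", probably_present);  // Prints 1.
  return 0;
}
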
diff --git a/src/parquet/bloom.h b/src/parquet/bloom.h
new file mode 100644
index 00000000..fb3a7996
--- /dev/null
+++ b/src/parquet/bloom.h
@@ -0,0 +1,177 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef PARQUET_BLOOM_H
+#define PARQUET_BLOOM_H
+
+#include <set>
+#include <cstdint>
+
+#include "parquet/types.h"
+#include "parquet/util/memory.h"
+
+namespace parquet {
+
+class OutputStream;
+
+// A Bloom filter is a compact structure that indicates whether an item is
+// definitely not in a set or probably in it. The Bloom class is the underlying
+// class of the Bloom filter: it stores the bitset that represents the set of
+// elements, the hash strategy and the Bloom filter algorithm.
+
+// The Bloom filter algorithm is implemented as a block Bloom filter following
+// Putze et al.'s "Cache-, Hash- and Space-Efficient Bloom Filters". The basic
+// idea is to hash the item into a tiny Bloom filter whose size fits a single
+// cache line or smaller. This implementation sets 8 bits in each tiny Bloom
+// filter. Each tiny Bloom filter is 32 bytes to take advantage of 32-byte SIMD
+// instructions.
+
+class Bloom {
+public:
+  // Hash strategy available for bloom filter.
+  enum HashStrategy {
+    MURMUR3_X64_128
+  };
+
+  // Bloom filter algorithm.
+  enum Algorithm {
+    BLOCK
+  };
+
+  // Default false positive probability used to calculate the optimal number
+  // of bits for the bloom filter.
+  static constexpr double DEFAULT_FPP = 0.01;
+
+  // Bloom filter data header, including number of bytes, hash strategy and
+  // algorithm.
+  static constexpr int HEADER_SIZE = 12;
+
+  // Bytes in a tiny bloom filter block.
+  static constexpr int BYTES_PER_FILTER_BLOCK = 32;
+
+  // Default seed for hash function
+  static constexpr int DEFAULT_SEED = 104729;
+
+  // Default maximum bloom filter size (need to discuss)
+  static constexpr int DEFAULT_MAXIMUM_BLOOM_FILTER_BYTES = 16 * 1024 * 1024;
+
+  // The block-based algorithm needs 8 odd SALT values to calculate eight bit
+  // indexes to set, one bit in each 32-bit word of a block.
+  static constexpr uint32_t SALT[8] = { 0x47b6137bU, 0x44974d91U, 0x8824ad5bU,
+      0xa2b7289dU, 0x705495c7U, 0x2df1424bU, 0x9efc4947U, 0x5c6bfb31U };
+
+  typedef void (*HashFunc)(const void *, int, uint32_t, void*);
+public:
+  /// Constructor of the bloom filter. If num_bytes is zero, the bloom filter
+  /// bitset will be created lazily and the number of bytes will be calculated
+  /// from the distinct values in the cache. It uses murmur3_x64_128 as the
+  /// default hash function and the block-based algorithm as the default
+  /// algorithm.
+  /// @param num_bytes The number of bytes for the bloom filter bitset. Set it
+  ///                  to zero to have the size calculated automatically using
+  ///                  DEFAULT_FPP.
+  explicit Bloom(uint32_t num_bytes);
+  
+  
+  /// Construct the bloom filter from a given bitset; this is used when
+  /// reconstructing a bloom filter from a parquet file. It uses
+  /// murmur3_x64_128 as the default hash function and the block-based
+  /// algorithm as the default algorithm.
+  /// @param bitset The bitset to construct the bloom filter from.
+  /// @param len Length of the bitset in bytes.
+  Bloom(const uint8_t* bitset, uint32_t len);
+  
+  Bloom(const Bloom& orig) = delete;
+  virtual ~Bloom();
+
+  // Calculate the optimal bitset size according to the number of distinct
+  // values and the false positive probability.
+  // @param ndv The number of distinct values.
+  // @param fpp The false positive probability.
+  // @return The optimal number of bits for the given ndv and fpp.
+  static uint32_t optimalNumOfBits(uint32_t ndv, double fpp);
+  
+  // Determine whether an element is in the set or not.
+  // @param hash The hash of the element to look up.
+  // @return false if the value is definitely not in the set; true means the
+  //         value is PROBABLY in the set.
+  bool find(uint64_t hash);
+
+  // Insert an element into the set represented by the bloom filter bitset.
+  // @param hash The hash of the value to insert into the bloom filter.
+  void insert(uint64_t hash);
+
+  // Compute hash for int value by using its plain encoding result.
+  // @param value the value to hash.
+  // @return hash result.
+  uint64_t hash(const int value);
+
+  // Compute hash for an int64 value by using its plain encoding result.
+  // @param value the value to hash.
+  // @return hash result.
+  uint64_t hash(const int64_t value);
+
+  // Compute hash for float value by using its plain encoding result.
+  // @param value the value to hash.
+  // @return hash result.
+  uint64_t hash(const float value);
+
+  // Compute hash for double value by using its plain encoding result.
+  // @param value the value to hash.
+  // @return hash result.
+  uint64_t hash(const double value);
+
+  // Compute hash for Int96 value by using its plain encoding result.
+  // @param value the value to hash.
+  // @return hash result.
+  uint64_t hash(const Int96 &value);
+
+  // Compute hash for ByteArray value by using its plain encoding result.
+  // @param value the value to hash.
+  // @return hash result.
+  uint64_t hash(const ByteArray &value);
+
+  // Compute hash for a fixed-length byte array value by using its plain
+  // encoding result.
+  // @param value the value to hash.
+  // @return hash result.
+  uint64_t hash(const FLBA &value, uint32_t len);
+  
+  // Write the bloom filter to an output stream. The serialized bloom filter
+  // includes the bitset length, hash strategy, algorithm, and bitset.
+  // @param sink The output stream to write to.
+  void writeTo(const std::shared_ptr<OutputStream>& sink);
+
+private:
+  // Create a new bitset for the bloom filter; at least 256 bits will be created.
+  // @param num_bytes Number of bytes for the bitset.
+  void initBitset(uint32_t num_bytes);
+  void setMask(uint32_t key, uint32_t mask[8]);
+  void addElement(uint64_t hash);
+  bool contains(uint64_t hash);
+
+  // The number of bytes of bloom filter bitset.
+  uint32_t num_bytes;
+
+  // Hash strategy used in this bloom filter.
+  HashStrategy hash_strategy;
+
+  // Algorithm applied of this bloom filter.
+  Algorithm algorithm;
+
+  // The underlying byte array for bloom filter bitset.
+  uint8_t* bitset;
+
+  // Hash function applied.
+  HashFunc hashFunc;
+};
+}  // namespace parquet
+#endif  // PARQUET_BLOOM_H
+
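
To make the intended API concrete, here is a hedged usage sketch of the Bloom class declared in bloom.h above, modeled on the unit test in this pull request. BloomRoundTripExample is an illustrative name and the snippet is not part of the diff; it assumes InMemoryOutputStream and InMemoryInputStream from parquet/util/memory.h.

#include <cstdint>
#include <memory>

#include "parquet/bloom.h"
#include "parquet/util/memory.h"

namespace parquet {

void BloomRoundTripExample() {
  // Writer side: a 1024-byte bitset; insert the hashes of a few INT32 values.
  Bloom bloom(1024);
  for (int v = 0; v < 10; ++v) {
    bloom.insert(bloom.hash(v));
  }

  // Serialize: a 12-byte header (length, hash strategy, algorithm) followed by
  // the bitset, as written by Bloom::writeTo.
  std::shared_ptr<InMemoryOutputStream> sink(new InMemoryOutputStream());
  bloom.writeTo(sink);

  // Reader side: read the header fields, then rebuild the filter from the raw
  // bitset bytes.
  std::shared_ptr<InMemoryInputStream> source(
      new InMemoryInputStream(sink->GetBuffer()));
  int64_t bytes_available;
  uint32_t length =
      *reinterpret_cast<const uint32_t*>(source->Read(4, &bytes_available));
  source->Read(8, &bytes_available);  // Skip hash strategy and algorithm.
  const uint8_t* bitset = source->Read(length, &bytes_available);
  Bloom de_bloom(bitset, length);

  // find() can return a false positive but never a false negative.
  bool maybe_present = de_bloom.find(de_bloom.hash(5));
  (void)maybe_present;
}

}  // namespace parquet
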
diff --git a/src/parquet/murmur3.cc b/src/parquet/murmur3.cc
new file mode 100644
index 00000000..ab4cfa1e
--- /dev/null
+++ b/src/parquet/murmur3.cc
@@ -0,0 +1,349 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//-----------------------------------------------------------------------------
+// MurmurHash3 was written by Austin Appleby, and is placed in the public
+// domain. The author hereby disclaims copyright to this source code.
+
+// Note - The x86 and x64 versions do _not_ produce the same results, as the
+// algorithms are optimized for their respective platforms. You can still
+// compile and run any of them on any platform, but your performance with the
+// non-native version will be less than optimal.
+
+#include "murmur3.h"
+
+//-----------------------------------------------------------------------------
+// Platform-specific functions and macros
+
+// Microsoft Visual Studio
+
+#if defined(_MSC_VER)
+
+#define FORCE_INLINE   __forceinline
+
+#include <stdlib.h>
+
+#define ROTL32(x,y)    _rotl(x,y)
+#define ROTL64(x,y)    _rotl64(x,y)
+
+#define BIG_CONSTANT(x) (x)
+
+// Other compilers
+
+#else  // defined(_MSC_VER)
+
+#define        FORCE_INLINE inline __attribute__((always_inline))
+
+inline uint32_t rotl32 ( uint32_t x, int8_t r )
+{
+  return (x << r) | (x >> (32 - r));
+}
+
+inline uint64_t rotl64 ( uint64_t x, int8_t r )
+{
+  return (x << r) | (x >> (64 - r));
+}
+
+#define        ROTL32(x,y)     rotl32(x,y)
+#define ROTL64(x,y)    rotl64(x,y)
+
+#define BIG_CONSTANT(x) (x##LLU)
+
+#endif // !defined(_MSC_VER)
+
+//-----------------------------------------------------------------------------
+// Block read - if your platform needs to do endian-swapping or can only
+// handle aligned reads, do the conversion here
+
+FORCE_INLINE uint32_t getblock32 ( const uint32_t * p, int i )
+{
+  return p[i];
+}
+
+FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, int i )
+{
+  return p[i];
+}
+
+//-----------------------------------------------------------------------------
+// Finalization mix - force all bits of a hash block to avalanche
+
+FORCE_INLINE uint32_t fmix32 ( uint32_t h )
+{
+  h ^= h >> 16;
+  h *= 0x85ebca6b;
+  h ^= h >> 13;
+  h *= 0xc2b2ae35;
+  h ^= h >> 16;
+
+  return h;
+}
+
+//----------
+
+FORCE_INLINE uint64_t fmix64 ( uint64_t k )
+{
+  k ^= k >> 33;
+  k *= BIG_CONSTANT(0xff51afd7ed558ccd);
+  k ^= k >> 33;
+  k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53);
+  k ^= k >> 33;
+
+  return k;
+}
+
+//-----------------------------------------------------------------------------
+
+void MurmurHash3_x86_32 ( const void * key, int len,
+                          uint32_t seed, void * out )
+{
+  const uint8_t * data = (const uint8_t*)key;
+  const int nblocks = len / 4;
+
+  uint32_t h1 = seed;
+
+  const uint32_t c1 = 0xcc9e2d51;
+  const uint32_t c2 = 0x1b873593;
+
+  //----------
+  // body
+
+  const uint32_t * blocks = (const uint32_t *)(data + nblocks*4);
+
+  for(int i = -nblocks; i; i++)
+  {
+    uint32_t k1 = getblock32(blocks,i);
+
+    k1 *= c1;
+    k1 = ROTL32(k1,15);
+    k1 *= c2;
+    
+    h1 ^= k1;
+    h1 = ROTL32(h1,13); 
+    h1 = h1*5+0xe6546b64;
+  }
+
+  //----------
+  // tail
+
+  const uint8_t * tail = (const uint8_t*)(data + nblocks*4);
+
+  uint32_t k1 = 0;
+
+  switch(len & 3)
+  {
+  case 3: k1 ^= tail[2] << 16;
+  case 2: k1 ^= tail[1] << 8;
+  case 1: k1 ^= tail[0];
+          k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
+  };
+
+  //----------
+  // finalization
+
+  h1 ^= len;
+
+  h1 = fmix32(h1);
+
+  *(uint32_t*)out = h1;
+} 
+
+//-----------------------------------------------------------------------------
+
+void MurmurHash3_x86_128 ( const void * key, const int len,
+                           uint32_t seed, void * out )
+{
+  const uint8_t * data = (const uint8_t*)key;
+  const int nblocks = len / 16;
+
+  uint32_t h1 = seed;
+  uint32_t h2 = seed;
+  uint32_t h3 = seed;
+  uint32_t h4 = seed;
+
+  const uint32_t c1 = 0x239b961b; 
+  const uint32_t c2 = 0xab0e9789;
+  const uint32_t c3 = 0x38b34ae5; 
+  const uint32_t c4 = 0xa1e38b93;
+
+  //----------
+  // body
+
+  const uint32_t * blocks = (const uint32_t *)(data + nblocks*16);
+
+  for(int i = -nblocks; i; i++)
+  {
+    uint32_t k1 = getblock32(blocks,i*4+0);
+    uint32_t k2 = getblock32(blocks,i*4+1);
+    uint32_t k3 = getblock32(blocks,i*4+2);
+    uint32_t k4 = getblock32(blocks,i*4+3);
+
+    k1 *= c1; k1  = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
+
+    h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b;
+
+    k2 *= c2; k2  = ROTL32(k2,16); k2 *= c3; h2 ^= k2;
+
+    h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747;
+
+    k3 *= c3; k3  = ROTL32(k3,17); k3 *= c4; h3 ^= k3;
+
+    h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35;
+
+    k4 *= c4; k4  = ROTL32(k4,18); k4 *= c1; h4 ^= k4;
+
+    h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17;
+  }
+
+  //----------
+  // tail
+
+  const uint8_t * tail = (const uint8_t*)(data + nblocks*16);
+
+  uint32_t k1 = 0;
+  uint32_t k2 = 0;
+  uint32_t k3 = 0;
+  uint32_t k4 = 0;
+
+  switch(len & 15)
+  {
+  case 15: k4 ^= tail[14] << 16;
+  case 14: k4 ^= tail[13] << 8;
+  case 13: k4 ^= tail[12] << 0;
+           k4 *= c4; k4  = ROTL32(k4,18); k4 *= c1; h4 ^= k4;
+
+  case 12: k3 ^= tail[11] << 24;
+  case 11: k3 ^= tail[10] << 16;
+  case 10: k3 ^= tail[ 9] << 8;
+  case  9: k3 ^= tail[ 8] << 0;
+           k3 *= c3; k3  = ROTL32(k3,17); k3 *= c4; h3 ^= k3;
+
+  case  8: k2 ^= tail[ 7] << 24;
+  case  7: k2 ^= tail[ 6] << 16;
+  case  6: k2 ^= tail[ 5] << 8;
+  case  5: k2 ^= tail[ 4] << 0;
+           k2 *= c2; k2  = ROTL32(k2,16); k2 *= c3; h2 ^= k2;
+
+  case  4: k1 ^= tail[ 3] << 24;
+  case  3: k1 ^= tail[ 2] << 16;
+  case  2: k1 ^= tail[ 1] << 8;
+  case  1: k1 ^= tail[ 0] << 0;
+           k1 *= c1; k1  = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
+  };
+
+  //----------
+  // finalization
+
+  h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len;
+
+  h1 += h2; h1 += h3; h1 += h4;
+  h2 += h1; h3 += h1; h4 += h1;
+
+  h1 = fmix32(h1);
+  h2 = fmix32(h2);
+  h3 = fmix32(h3);
+  h4 = fmix32(h4);
+
+  h1 += h2; h1 += h3; h1 += h4;
+  h2 += h1; h3 += h1; h4 += h1;
+
+  ((uint32_t*)out)[0] = h1;
+  ((uint32_t*)out)[1] = h2;
+  ((uint32_t*)out)[2] = h3;
+  ((uint32_t*)out)[3] = h4;
+}
+
+//-----------------------------------------------------------------------------
+
+void MurmurHash3_x64_128 ( const void * key, const int len,
+                           const uint32_t seed, void * out )
+{
+  const uint8_t * data = (const uint8_t*)key;
+  const int nblocks = len / 16;
+
+  uint64_t h1 = seed;
+  uint64_t h2 = seed;
+
+  const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
+  const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);
+
+  //----------
+  // body
+
+  const uint64_t * blocks = (const uint64_t *)(data);
+
+  for(int i = 0; i < nblocks; i++)
+  {
+    uint64_t k1 = getblock64(blocks,i*2+0);
+    uint64_t k2 = getblock64(blocks,i*2+1);
+
+    k1 *= c1; k1  = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
+
+    h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729;
+
+    k2 *= c2; k2  = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
+
+    h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;
+  }
+
+  //----------
+  // tail
+
+  const uint8_t * tail = (const uint8_t*)(data + nblocks*16);
+
+  uint64_t k1 = 0;
+  uint64_t k2 = 0;
+
+  switch(len & 15)
+  {
+  case 15: k2 ^= ((uint64_t)tail[14]) << 48;
+  case 14: k2 ^= ((uint64_t)tail[13]) << 40;
+  case 13: k2 ^= ((uint64_t)tail[12]) << 32;
+  case 12: k2 ^= ((uint64_t)tail[11]) << 24;
+  case 11: k2 ^= ((uint64_t)tail[10]) << 16;
+  case 10: k2 ^= ((uint64_t)tail[ 9]) << 8;
+  case  9: k2 ^= ((uint64_t)tail[ 8]) << 0;
+           k2 *= c2; k2  = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
+
+  case  8: k1 ^= ((uint64_t)tail[ 7]) << 56;
+  case  7: k1 ^= ((uint64_t)tail[ 6]) << 48;
+  case  6: k1 ^= ((uint64_t)tail[ 5]) << 40;
+  case  5: k1 ^= ((uint64_t)tail[ 4]) << 32;
+  case  4: k1 ^= ((uint64_t)tail[ 3]) << 24;
+  case  3: k1 ^= ((uint64_t)tail[ 2]) << 16;
+  case  2: k1 ^= ((uint64_t)tail[ 1]) << 8;
+  case  1: k1 ^= ((uint64_t)tail[ 0]) << 0;
+           k1 *= c1; k1  = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
+  };
+
+  //----------
+  // finalization
+
+  h1 ^= len; h2 ^= len;
+
+  h1 += h2;
+  h2 += h1;
+
+  h1 = fmix64(h1);
+  h2 = fmix64(h2);
+
+  h1 += h2;
+  h2 += h1;
+
+  ((uint64_t*)out)[0] = h1;
+  ((uint64_t*)out)[1] = h2;
+}
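
Because MurmurHash3_x64_128 writes a full 128-bit digest through the out pointer, callers must supply a 16-byte buffer; the Bloom class above keeps only the first 64-bit word as the element hash. A minimal usage sketch follows, assuming the header is included as parquet/murmur3.h per the install rule earlier in this PR (illustrative, not part of the diff):

#include <cinttypes>
#include <cstdint>
#include <cstdio>

#include "parquet/murmur3.h"

int main() {
  const uint8_t data[8] = {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7};
  uint64_t out[2];  // 128-bit result: two 64-bit words.
  MurmurHash3_x64_128(data, static_cast<int>(sizeof(data)), /*seed=*/104729, out);

  // Bloom::hash() uses only out[0] as the element's 64-bit hash value.
  std::printf("%" PRIu64 " %" PRIu64 "\n", out[0], out[1]);
  return 0;
}
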
diff --git a/src/parquet/murmur3.h b/src/parquet/murmur3.h
new file mode 100644
index 00000000..e823e7c6
--- /dev/null
+++ b/src/parquet/murmur3.h
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+//-----------------------------------------------------------------------------
+// MurmurHash3 was written by Austin Appleby, and is placed in the public
+// domain. The author hereby disclaims copyright to this source code.
+
+#ifndef _MURMURHASH3_H_
+#define _MURMURHASH3_H_
+
+//-----------------------------------------------------------------------------
+// Platform-specific functions and macros
+
+// Microsoft Visual Studio
+
+#if defined(_MSC_VER) && (_MSC_VER < 1600)
+
+typedef unsigned char uint8_t;
+typedef unsigned int uint32_t;
+typedef unsigned __int64 uint64_t;
+
+// Other compilers
+
+#else  // defined(_MSC_VER)
+
+#include <stdint.h>
+
+#endif // !defined(_MSC_VER)
+
+//-----------------------------------------------------------------------------
+
+void MurmurHash3_x86_32  ( const void * key, int len, uint32_t seed, void * out );
+
+void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out );
+
+void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out );
+
+//-----------------------------------------------------------------------------
+
+#endif // _MURMURHASH3_H_
\ No newline at end of file
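
A final note on sizing, tied to optimalNumOfBits in bloom.cc: with eight probe bits per element, the bit count for ndv distinct values at false positive probability fpp is m = -8 * ndv / ln(1 - fpp^(1/8)), rounded up to a power of two and clamped between BYTES_PER_FILTER_BLOCK * 8 and DEFAULT_MAXIMUM_BLOOM_FILTER_BYTES * 8. The standalone sketch below (illustrative only, not part of the pull request) evaluates the same rule:

#include <cmath>
#include <cstdint>
#include <cstdio>

// Illustrative re-statement of the sizing rule used by optimalNumOfBits.
uint32_t OptimalBits(uint32_t ndv, double fpp) {
  const double kMaxBits = 16.0 * 1024 * 1024 * 8;  // DEFAULT_MAXIMUM_BLOOM_FILTER_BYTES * 8
  const double kMinBits = 32 * 8;                  // BYTES_PER_FILTER_BLOCK * 8

  double m = -8.0 * ndv / std::log(1.0 - std::pow(fpp, 1.0 / 8.0));
  if (m < kMinBits) m = kMinBits;
  if (m > kMaxBits) m = kMaxBits;

  // Round up to the next power of two.
  uint32_t bits = 1;
  while (bits < m) bits <<= 1;
  return bits;
}

int main() {
  // Example: one million distinct values at a 1% false positive rate needs
  // roughly 9.7 million bits, which rounds up to 2^24 bits (2 MiB).
  std::printf("%u\n", OptimalBits(1000000, 0.01));
  return 0;
}
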


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> Add bloom filters to parquet statistics
> ---------------------------------------
>
>                 Key: PARQUET-41
>                 URL: https://issues.apache.org/jira/browse/PARQUET-41
>             Project: Parquet
>          Issue Type: New Feature
>          Components: parquet-format, parquet-mr
>            Reporter: Alex Levenson
>            Assignee: Ferdinand Xu
>            Priority: Major
>              Labels: filter2
>
> For row groups with no dictionary, we could still produce a bloom filter. 
> This could be very useful in filtering entire row groups.
> Pull request:
> https://github.com/apache/parquet-mr/pull/215



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)
