This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/main by this push:
new d4a7df6 ORC-1024: [C++] Fix inconsistent bloom filter hashing for
numeric values (#934)
d4a7df6 is described below
commit d4a7df62c44c4f89777d14c6792578cb977fff88
Author: Quanlong Huang <[email protected]>
AuthorDate: Fri Oct 15 13:16:40 2021 +0800
ORC-1024: [C++] Fix inconsistent bloom filter hashing for numeric values
(#934)
### What changes were proposed in this pull request?
This PR fixes inconsistent bloom filter hashing for numeric values between
the Java client and C++ client. Before this PR, bloom filters written by Java
clients will be incorrectly used in the C++ clients due to the hashing
inconsistency. There is an example in the JIRA description.
After this fix, when reading old ORC files that have bloom filters written
by old C++ clients, the bloom filters won't be used.
### Why are the changes needed?
In Java codes, hash values are in type "long" which is signed. Bitwise
operators like ">>" (shifting) are sign-awared. So we should use "int64_t"
instead of "uint64_t" in the corresponding C++ codes.
### How was this patch tested?
Add hashing tests in both Java and C++ codes for numeric values.
(TestBloomFilter.java & TestBloomFilter.cc)
Add hashing tests in C++ codes for string values. (TestMurmur3.cc)
Add test for reading an ORC file with bloom filters written by C++ client
in version 1.6.11. Verified bad bloom filters won't affect the results.
(TestReader.cc)
ORC-1025 will focus on adding more tests.
---
c++/src/BloomFilter.cc | 33 ++---
c++/src/BloomFilter.hh | 18 ++-
c++/src/Reader.cc | 35 ++++-
c++/src/Reader.hh | 8 ++
c++/test/CMakeLists.txt | 1 +
c++/test/TestBloomFilter.cc | 55 +++++++-
c++/test/TestMurmur3.cc | 41 ++++++
c++/test/TestReader.cc | 36 +++++
examples/bad_bloom_filter_1.6.0.orc | Bin 0 -> 709 bytes
examples/bad_bloom_filter_1.6.11.orc | Bin 0 -> 717 bytes
.../java/org/apache/orc/impl/RecordReaderImpl.java | 39 +++++-
.../src/java/org/apache/orc/util/BloomFilter.java | 10 +-
.../test/org/apache/orc/impl/TestReaderImpl.java | 34 +++++
.../test/org/apache/orc/util/TestBloomFilter.java | 146 +++++++++++++++++++++
14 files changed, 428 insertions(+), 28 deletions(-)
diff --git a/c++/src/BloomFilter.cc b/c++/src/BloomFilter.cc
index 020a271..8a1f188 100644
--- a/c++/src/BloomFilter.cc
+++ b/c++/src/BloomFilter.cc
@@ -101,19 +101,6 @@ namespace orc {
return static_cast<int32_t>(-n * std::log(fpp) / (std::log(2.0) *
std::log(2.0)));
}
- // Thomas Wang's integer hash function
- //
http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm
- inline uint64_t getLongHash(uint64_t key) {
- key = (~key) + (key << 21); // key = (key << 21) - key - 1;
- key = key ^ (key >> 24);
- key = (key + (key << 3)) + (key << 8); // key * 265
- key = key ^ (key >> 14);
- key = (key + (key << 2)) + (key << 4); // key * 21
- key = key ^ (key >> 28);
- key = key + (key << 31);
- return key;
- }
-
// We use the trick mentioned in "Less Hashing, Same Performance:
// Building a Better Bloom Filter" by Kirsch et.al. From abstract
// 'only two hash functions are necessary to effectively implement
@@ -148,20 +135,20 @@ namespace orc {
void BloomFilterImpl::addBytes(const char * data, int64_t length) {
uint64_t hash64 = getBytesHash(data, length);
- addHash(hash64);
+ addHash(static_cast<int64_t>(hash64));
}
void BloomFilterImpl::addLong(int64_t data) {
- addHash(getLongHash(static_cast<uint64_t>(data)));
+ addHash(getLongHash(data));
}
bool BloomFilterImpl::testBytes(const char * data, int64_t length) const {
uint64_t hash64 = getBytesHash(data, length);
- return testHash(hash64);
+ return testHash(static_cast<int64_t>(hash64));
}
bool BloomFilterImpl::testLong(int64_t data) const {
- return testHash(getLongHash(static_cast<uint64_t>(data)));
+ return testHash(getLongHash(data));
}
uint64_t BloomFilterImpl::sizeInBytes() const {
@@ -223,9 +210,11 @@ namespace orc {
DIAGNOSTIC_POP
- void BloomFilterImpl::addHash(uint64_t hash64) {
+ void BloomFilterImpl::addHash(int64_t hash64) {
int32_t hash1 = static_cast<int32_t>(hash64 & 0xffffffff);
- int32_t hash2 = static_cast<int32_t>(hash64 >> 32);
+ // In Java codes, we use "hash64 >>> 32" which is an unsigned shift op.
+ // So we cast hash64 to uint64_t here for an unsigned right shift.
+ int32_t hash2 = static_cast<int32_t>(static_cast<uint64_t>(hash64) >> 32);
for (int32_t i = 1; i <= mNumHashFunctions; ++i) {
int32_t combinedHash = hash1 + i * hash2;
@@ -238,9 +227,11 @@ namespace orc {
}
}
- bool BloomFilterImpl::testHash(uint64_t hash64) const{
+ bool BloomFilterImpl::testHash(int64_t hash64) const{
int32_t hash1 = static_cast<int32_t>(hash64 & 0xffffffff);
- int32_t hash2 = static_cast<int32_t>(hash64 >> 32);
+ // In Java codes, we use "hash64 >>> 32" which is an unsigned shift op.
+ // So we cast hash64 to uint64_t here for an unsigned right shift.
+ int32_t hash2 = static_cast<int32_t>(static_cast<uint64_t>(hash64) >> 32);
for (int32_t i = 1; i <= mNumHashFunctions; ++i) {
int32_t combinedHash = hash1 + i * hash2;
diff --git a/c++/src/BloomFilter.hh b/c++/src/BloomFilter.hh
index 91aa528..cf18a46 100644
--- a/c++/src/BloomFilter.hh
+++ b/c++/src/BloomFilter.hh
@@ -162,12 +162,13 @@ namespace orc {
private:
friend struct BloomFilterUTF8Utils;
+ friend class TestBloomFilter_testBloomFilterBasicOperations_Test;
// compute k hash values from hash64 and set bits
- void addHash(uint64_t hash64);
+ void addHash(int64_t hash64);
// compute k hash values from hash64 and check bits
- bool testHash(uint64_t hash64) const;
+ bool testHash(int64_t hash64) const;
void serialize(proto::BloomFilter& bloomFilter) const;
@@ -191,6 +192,19 @@ namespace orc {
const proto::BloomFilter& bloomFilter);
};
+ // Thomas Wang's integer hash function
+ //
http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm
+ // Put this in header file so tests can use it as well.
+ inline int64_t getLongHash(int64_t key) {
+ key = (~key) + (key << 21); // key = (key << 21) - key - 1;
+ key = key ^ (key >> 24);
+ key = (key + (key << 3)) + (key << 8); // key * 265
+ key = key ^ (key >> 14);
+ key = (key + (key << 2)) + (key << 4); // key * 21
+ key = key ^ (key >> 28);
+ key = key + (key << 31);
+ return key;
+ }
}
#endif //ORC_BLOOMFILTER_IMPL_HH
diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc
index c9b1627..878c208 100644
--- a/c++/src/Reader.cc
+++ b/c++/src/Reader.cc
@@ -35,6 +35,11 @@
#include <set>
namespace orc {
+ // ORC files writen by these versions of cpp writers have inconsistent bloom
filter
+ // hashing. Bloom filters of them should not be used.
+ static const char* BAD_CPP_BLOOM_FILTER_VERSIONS[] = {
+ "1.6.0", "1.6.1", "1.6.2", "1.6.3", "1.6.4", "1.6.5", "1.6.6", "1.6.7",
"1.6.8",
+ "1.6.9", "1.6.10", "1.6.11", "1.7.0"};
const WriterVersionImpl &WriterVersionImpl::VERSION_HIVE_8732() {
static const WriterVersionImpl version(WriterVersion_HIVE_8732);
@@ -243,6 +248,34 @@ namespace orc {
footer->rowindexstride(),
getWriterVersionImpl(_contents.get())));
}
+
+ skipBloomFilters = hasBadBloomFilters();
+ }
+
+ // Check if the file has inconsistent bloom filters.
+ bool RowReaderImpl::hasBadBloomFilters() {
+ // Only C++ writer in old releases could have bad bloom filters.
+ if (footer->writer() != ORC_CPP_WRITER) return false;
+ // 'softwareVersion' is added in 1.5.13, 1.6.11, and 1.7.0.
+ // 1.6.x releases before 1.6.11 won't have it. On the other side, the C++
writer
+ // supports writing bloom filters since 1.6.0. So files written by the C++
writer
+ // and with 'softwareVersion' unset would have bad bloom filters.
+ if (!footer->has_softwareversion()) return true;
+
+ const std::string &fullVersion = footer->softwareversion();
+ std::string version;
+ // Deal with snapshot versions, e.g. 1.6.12-SNAPSHOT.
+ if (fullVersion.find('-') != std::string::npos) {
+ version = fullVersion.substr(0, fullVersion.find('-'));
+ } else {
+ version = fullVersion;
+ }
+ for (const char *v : BAD_CPP_BLOOM_FILTER_VERSIONS) {
+ if (version == v) {
+ return true;
+ }
+ }
+ return false;
}
CompressionKind RowReaderImpl::getCompression() const {
@@ -363,7 +396,7 @@ namespace orc {
throw ParseError("Failed to parse the row index");
}
rowIndexes[colId] = rowIndex;
- } else { // Stream_Kind_BLOOM_FILTER_UTF8
+ } else if (!skipBloomFilters) { // Stream_Kind_BLOOM_FILTER_UTF8
proto::BloomFilterIndex pbBFIndex;
if (!pbBFIndex.ParseFromZeroCopyStream(inStream.get())) {
throw ParseError("Failed to parse bloom filter index");
diff --git a/c++/src/Reader.hh b/c++/src/Reader.hh
index b7b76d4..1bf0cb2 100644
--- a/c++/src/Reader.hh
+++ b/c++/src/Reader.hh
@@ -127,6 +127,7 @@ namespace orc {
proto::Footer* footer;
DataBuffer<uint64_t> firstRowOfStripe;
mutable std::unique_ptr<Type> selectedSchema;
+ bool skipBloomFilters;
// reading state
uint64_t previousRow;
@@ -179,6 +180,13 @@ namespace orc {
*/
void seekToRowGroup(uint32_t rowGroupEntryId);
+ /**
+ * Check if the file has bad bloom filters. We will skip using them in the
+ * following reads.
+ * @return true if it has.
+ */
+ bool hasBadBloomFilters();
+
public:
/**
* Constructor that lets the user specify additional options.
diff --git a/c++/test/CMakeLists.txt b/c++/test/CMakeLists.txt
index 8b5ee7b..00aca6b 100644
--- a/c++/test/CMakeLists.txt
+++ b/c++/test/CMakeLists.txt
@@ -35,6 +35,7 @@ add_executable (orc-test
TestDictionaryEncoding.cc
TestDriver.cc
TestInt128.cc
+ TestMurmur3.cc
TestPredicateLeaf.cc
TestPredicatePushdown.cc
TestReader.cc
diff --git a/c++/test/TestBloomFilter.cc b/c++/test/TestBloomFilter.cc
index 71c9b7b..581897c 100644
--- a/c++/test/TestBloomFilter.cc
+++ b/c++/test/TestBloomFilter.cc
@@ -18,7 +18,6 @@
#include "BloomFilter.hh"
#include "orc/OrcFile.hh"
-#include "wrap/gmock.h"
#include "wrap/gtest-wrapper.h"
namespace orc {
@@ -83,6 +82,39 @@ namespace orc {
EXPECT_EQ(~0x8040201008040201L, longs[1]);
}
+ // Same test as TestBloomFilter#testLongHash() in Java codes. Make sure the
hash values
+ // are consistent between the Java client and C++ client.
+ // TODO(ORC-1025): Add exhaustive test on all numbers.
+ TEST(TestBloomFilter, testLongHash) {
+ EXPECT_EQ(0, orc::getLongHash(0));
+ EXPECT_EQ(6614246905173314819, orc::getLongHash(-1));
+ EXPECT_EQ(-5218250166726157773, orc::getLongHash(-2));
+ EXPECT_EQ(1396019780946710816, orc::getLongHash(-3));
+
+ EXPECT_EQ(3691278333958578070, orc::getLongHash(-9223372036854775805));
+ EXPECT_EQ(-1192099642781211952, orc::getLongHash(-9223372036854775806));
+ EXPECT_EQ(-9102499068535824902, orc::getLongHash(-9223372036854775807));
+
+ EXPECT_EQ(1499534499340523007, orc::getLongHash(790302201));
+ EXPECT_EQ(-5108695154500810163, orc::getLongHash(790302202));
+ EXPECT_EQ(-2450623810987162260, orc::getLongHash(790302203));
+ EXPECT_EQ(-1097054448615658549, orc::getLongHash(18000000000));
+
+ EXPECT_EQ(-4986173376161118712, orc::getLongHash(9223372036064673413));
+ EXPECT_EQ(3785699328822078862, orc::getLongHash(9223372036064673414));
+ EXPECT_EQ(294188322706112357, orc::getLongHash(9223372036064673415));
+ }
+
+#define CheckBitSet(bf, p1, p2, p3, p4, p5) \
+ EXPECT_TRUE(bf.mBitSet->get(p1)); \
+ EXPECT_TRUE(bf.mBitSet->get(p2)); \
+ EXPECT_TRUE(bf.mBitSet->get(p3)); \
+ EXPECT_TRUE(bf.mBitSet->get(p4)); \
+ EXPECT_TRUE(bf.mBitSet->get(p5))
+
+ // Same test as TestBloomFilter#testBasicOperations() in Java codes. We also
+ // verifies the bitSet positions that are set, to make sure both the Java
and C++ codes
+ // hash the same value into the same position.
TEST(TestBloomFilter, testBloomFilterBasicOperations) {
BloomFilterImpl bloomFilter(128);
@@ -99,14 +131,23 @@ namespace orc {
EXPECT_FALSE(bloomFilter.testLong(-1111));
bloomFilter.addLong(1);
+ CheckBitSet(bloomFilter, 567, 288, 246, 306, 228);
bloomFilter.addLong(11);
+ CheckBitSet(bloomFilter, 228, 285, 342, 399, 456);
bloomFilter.addLong(111);
+ CheckBitSet(bloomFilter, 802, 630, 458, 545, 717);
bloomFilter.addLong(1111);
+ CheckBitSet(bloomFilter, 826, 526, 40, 480, 86);
bloomFilter.addLong(0);
+ CheckBitSet(bloomFilter, 0, 0, 0, 0, 0);
bloomFilter.addLong(-1);
+ CheckBitSet(bloomFilter, 120, 308, 335, 108, 535);
bloomFilter.addLong(-11);
+ CheckBitSet(bloomFilter, 323, 685, 215, 577, 107);
bloomFilter.addLong(-111);
+ CheckBitSet(bloomFilter, 357, 318, 279, 15, 54);
bloomFilter.addLong(-1111);
+ CheckBitSet(bloomFilter, 572, 680, 818, 434, 232);
EXPECT_TRUE(bloomFilter.testLong(1));
EXPECT_TRUE(bloomFilter.testLong(11));
@@ -131,14 +172,23 @@ namespace orc {
EXPECT_FALSE(bloomFilter.testDouble(-1111.1111));
bloomFilter.addDouble(1.1);
+ CheckBitSet(bloomFilter, 522, 692, 12, 370, 753);
bloomFilter.addDouble(11.11);
+ CheckBitSet(bloomFilter, 210, 188, 89, 720, 389);
bloomFilter.addDouble(111.111);
+ CheckBitSet(bloomFilter, 831, 252, 583, 500, 335);
bloomFilter.addDouble(1111.1111);
+ CheckBitSet(bloomFilter, 725, 175, 374, 92, 642);
bloomFilter.addDouble(0.0);
+ CheckBitSet(bloomFilter, 0, 0, 0, 0, 0);
bloomFilter.addDouble(-1.1);
+ CheckBitSet(bloomFilter, 636, 163, 565, 206, 679);
bloomFilter.addDouble(-11.11);
+ CheckBitSet(bloomFilter, 473, 192, 743, 462, 181);
bloomFilter.addDouble(-111.111);
+ CheckBitSet(bloomFilter, 167, 152, 472, 295, 24);
bloomFilter.addDouble(-1111.1111);
+ CheckBitSet(bloomFilter, 308, 346, 384, 422, 371);
EXPECT_TRUE(bloomFilter.testDouble(1.1));
EXPECT_TRUE(bloomFilter.testDouble(11.11));
@@ -164,8 +214,11 @@ namespace orc {
static_cast<int64_t>(strlen(cnStr))));
bloomFilter.addBytes(emptyStr, static_cast<int64_t>(strlen(emptyStr)));
+ CheckBitSet(bloomFilter, 656, 807, 480, 151, 304);
bloomFilter.addBytes(enStr, static_cast<int64_t>(strlen(enStr)));
+ CheckBitSet(bloomFilter, 576, 221, 68, 729, 392);
bloomFilter.addBytes(cnStr, static_cast<int64_t>(strlen(cnStr)));
+ CheckBitSet(bloomFilter, 602, 636, 44, 362, 318);
EXPECT_TRUE(bloomFilter.testBytes(emptyStr,
static_cast<int64_t>(strlen(emptyStr))));
diff --git a/c++/test/TestMurmur3.cc b/c++/test/TestMurmur3.cc
new file mode 100644
index 0000000..7bf9e05
--- /dev/null
+++ b/c++/test/TestMurmur3.cc
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Murmur3.hh"
+#include "wrap/gtest-wrapper.h"
+
+namespace orc {
+
+ // Same test as TestMurmur3#testHashCodeM3_64() in Java codes. Make sure the
hash values
+ // are consistent between the Java client and C++ client.
+ // TODO(ORC-1025): Add exhaustive test on more strings.
+ TEST(TestMurmur3, testHash64) {
+ uint8_t origin[] = "It was the best of times, it was the worst of times,"
+ " it was the age of wisdom, it was the age of
foolishness,"
+ " it was the epoch of belief, it was the epoch of
incredulity,"
+ " it was the season of Light, it was the season of
Darkness,"
+ " it was the spring of hope, it was the winter of
despair,"
+ " we had everything before us, we had nothing before
us,"
+ " we were all going direct to Heaven,"
+ " we were all going direct the other way.";
+ uint32_t len = sizeof(origin) / sizeof(uint8_t) - 1;
+ uint64_t hash = Murmur3::hash64(origin, len);
+ EXPECT_EQ(305830725663368540L, hash);
+ }
+
+}
diff --git a/c++/test/TestReader.cc b/c++/test/TestReader.cc
index e31ed11..43c1173 100644
--- a/c++/test/TestReader.cc
+++ b/c++/test/TestReader.cc
@@ -101,4 +101,40 @@ namespace orc {
900, rowsInCurrentStripe, rowIndexStride, includedRowGroups));
}
+ void CheckFileWithSargs(const char* fileName, const char* softwareVersion) {
+ std::stringstream ss;
+ if(const char* example_dir = std::getenv("ORC_EXAMPLE_DIR")) {
+ ss << example_dir;
+ } else {
+ ss << "../../../examples";
+ }
+ // Read a file with bloom filters written by CPP writer in version 1.6.11.
+ ss << "/" << fileName;
+ ReaderOptions readerOpts;
+ std::unique_ptr<Reader> reader =
+ createReader(readLocalFile(ss.str().c_str()), readerOpts);
+ EXPECT_EQ(WriterId::ORC_CPP_WRITER, reader->getWriterId());
+ EXPECT_EQ(softwareVersion, reader->getSoftwareVersion());
+
+ // Create SearchArgument with a EQUALS predicate which can leverage the
bloom filters.
+ RowReaderOptions rowReaderOpts;
+ std::unique_ptr<SearchArgumentBuilder> sarg =
SearchArgumentFactory::newBuilder();
+ // Integer value 18000000000 has an inconsistent hash before the fix of
ORC-1024.
+ sarg->equals(1,
PredicateDataType::LONG,Literal(static_cast<int64_t>(18000000000L)));
+ std::unique_ptr<SearchArgument> final_sarg = sarg->build();
+ rowReaderOpts.searchArgument(std::move(final_sarg));
+ std::unique_ptr<RowReader> rowReader =
reader->createRowReader(rowReaderOpts);
+
+ // Make sure bad bloom filters won't affect the results.
+ std::unique_ptr<ColumnVectorBatch> batch =
+ rowReader->createRowBatch(1024);
+ EXPECT_TRUE(rowReader->next(*batch));
+ EXPECT_EQ(5, batch->numElements);
+ EXPECT_FALSE(rowReader->next(*batch));
+ }
+
+ TEST(TestRowReader, testSkipBadBloomFilters) {
+ CheckFileWithSargs("bad_bloom_filter_1.6.11.orc", "ORC C++ 1.6.11");
+ CheckFileWithSargs("bad_bloom_filter_1.6.0.orc", "ORC C++");
+ }
} // namespace
diff --git a/examples/bad_bloom_filter_1.6.0.orc
b/examples/bad_bloom_filter_1.6.0.orc
new file mode 100644
index 0000000..6c52f4f
Binary files /dev/null and b/examples/bad_bloom_filter_1.6.0.orc differ
diff --git a/examples/bad_bloom_filter_1.6.11.orc
b/examples/bad_bloom_filter_1.6.11.orc
new file mode 100644
index 0000000..fc3ffef
Binary files /dev/null and b/examples/bad_bloom_filter_1.6.11.orc differ
diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
index 19e8f92..794de92 100644
--- a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
@@ -109,6 +109,11 @@ public class RecordReaderImpl implements RecordReader {
// identifies that follow columns bytes must be read
private boolean needsFollowColumnsRead;
private final boolean noSelectedVector;
+ // identifies whether the file has bad bloom filters that we should not use.
+ private final boolean skipBloomFilters;
+ static final String[] BAD_CPP_BLOOM_FILTER_VERSIONS = {
+ "1.6.0", "1.6.1", "1.6.2", "1.6.3", "1.6.4", "1.6.5", "1.6.6", "1.6.7",
"1.6.8",
+ "1.6.9", "1.6.10", "1.6.11", "1.7.0"};
/**
* Given a list of column names, find the given column and return the index.
@@ -329,6 +334,7 @@ public class RecordReaderImpl implements RecordReader {
fileReader.options.getConvertToProlepticGregorian())
.setEncryption(encryption);
reader = TreeReaderFactory.createRootReader(evolution.getReaderSchema(),
readerContext);
+ skipBloomFilters =
hasBadBloomFilters(fileReader.getFileTail().getFooter());
int columns = evolution.getFileSchema().getMaximumId() + 1;
indexes = new OrcIndex(new OrcProto.RowIndex[columns],
@@ -350,6 +356,33 @@ public class RecordReaderImpl implements RecordReader {
}
}
+ /**
+ * Check if the file has inconsistent bloom filters. We will skip using them
+ * in the following reads.
+ * @return true if it has.
+ */
+ private boolean hasBadBloomFilters(OrcProto.Footer footer) {
+ // Only C++ writer in old releases could have bad bloom filters.
+ if (footer.getWriter() != 1) return false;
+ // 'softwareVersion' is added in 1.5.13, 1.6.11, and 1.7.0.
+ // 1.6.x releases before 1.6.11 won't have it. On the other side, the C++
writer
+ // supports writing bloom filters since 1.6.0. So files written by the C++
writer
+ // and with 'softwareVersion' unset would have bad bloom filters.
+ if (!footer.hasSoftwareVersion()) return true;
+ String fullVersion = footer.getSoftwareVersion();
+ String version = fullVersion;
+ // Deal with snapshot versions, e.g. 1.6.12-SNAPSHOT.
+ if (fullVersion.contains("-")) {
+ version = fullVersion.substring(0, fullVersion.indexOf('-'));
+ }
+ for (String v : BAD_CPP_BLOOM_FILTER_VERSIONS) {
+ if (v.equals(version)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
public static final class PositionProviderImpl implements PositionProvider {
private final OrcProto.RowIndexEntry entry;
private int index;
@@ -1178,8 +1211,10 @@ public class RecordReaderImpl implements RecordReader {
}
return sargApp.pickRowGroups(stripes.get(currentStripe),
indexes.getRowGroupIndex(),
- indexes.getBloomFilterKinds(), stripeFooter.getColumnsList(),
- indexes.getBloomFilterIndex(), false);
+ skipBloomFilters ? null : indexes.getBloomFilterKinds(),
+ stripeFooter.getColumnsList(),
+ skipBloomFilters ? null : indexes.getBloomFilterIndex(),
+ false);
}
private void clearStreams() {
diff --git a/java/core/src/java/org/apache/orc/util/BloomFilter.java
b/java/core/src/java/org/apache/orc/util/BloomFilter.java
index 2aeb898..3aa12ec 100644
--- a/java/core/src/java/org/apache/orc/util/BloomFilter.java
+++ b/java/core/src/java/org/apache/orc/util/BloomFilter.java
@@ -192,7 +192,7 @@ public class BloomFilter {
// Thomas Wang's integer hash function
//
http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm
- private long getLongHash(long key) {
+ static long getLongHash(long key) {
key = (~key) + (key << 21); // key = (key << 21) - key - 1;
key = key ^ (key >> 24);
key = (key + (key << 3)) + (key << 8); // key * 265
@@ -247,6 +247,14 @@ public class BloomFilter {
}
/**
+ * Helper method that only used for tests. Check if the given position in
the bitSet is
+ * true. Use default visibility.
+ */
+ boolean testBitSetPos(int pos) {
+ return this.bitSet.get(pos);
+ }
+
+ /**
* Bare metal bit set implementation. For performance reasons, this
implementation does not check
* for index bounds nor expand the bit set size if the specified index is
greater than the size.
*/
diff --git a/java/core/src/test/org/apache/orc/impl/TestReaderImpl.java
b/java/core/src/test/org/apache/orc/impl/TestReaderImpl.java
index e74e5d8..900d5e0 100644
--- a/java/core/src/test/org/apache/orc/impl/TestReaderImpl.java
+++ b/java/core/src/test/org/apache/orc/impl/TestReaderImpl.java
@@ -25,6 +25,9 @@ import org.apache.hadoop.fs.PositionedReadable;
import org.apache.hadoop.fs.Seekable;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.Progressable;
import org.apache.orc.FileFormatException;
@@ -51,6 +54,7 @@ import java.util.ArrayList;
import java.util.List;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -411,4 +415,34 @@ public class TestReaderImpl {
ReaderImpl.getRawDataSizeFromColIndices(list, types, stats));
}
}
+
+ private void CheckFileWithSargs(String fileName, String softwareVersion)
+ throws IOException {
+ Configuration conf = new Configuration();
+ Path path = new Path(workDir, fileName);
+ FileSystem fs = path.getFileSystem(conf);
+ try (ReaderImpl reader = (ReaderImpl) OrcFile.createReader(path,
+ OrcFile.readerOptions(conf).filesystem(fs))) {
+ assertEquals(softwareVersion, reader.getSoftwareVersion());
+
+ Reader.Options opt = new Reader.Options();
+ SearchArgument.Builder builder = SearchArgumentFactory.newBuilder(conf);
+ builder.equals("id", PredicateLeaf.Type.LONG, 18000000000L);
+ opt.searchArgument(builder.build(), new String[]{"id"});
+
+ TypeDescription schema = reader.getSchema();
+ VectorizedRowBatch batch = schema.createRowBatch();
+ try (RecordReader rows = reader.rows(opt)) {
+ assertTrue(rows.nextBatch(batch), "No rows read out!");
+ assertEquals(5, batch.size);
+ assertFalse(rows.nextBatch(batch));
+ }
+ }
+ }
+
+ @Test
+ public void testSkipBadBloomFilters() throws IOException {
+ CheckFileWithSargs("bad_bloom_filter_1.6.11.orc", "ORC C++ 1.6.11");
+ CheckFileWithSargs("bad_bloom_filter_1.6.0.orc", "ORC C++ ");
+ }
}
diff --git a/java/core/src/test/org/apache/orc/util/TestBloomFilter.java
b/java/core/src/test/org/apache/orc/util/TestBloomFilter.java
index 5d17413..3fb6358 100644
--- a/java/core/src/test/org/apache/orc/util/TestBloomFilter.java
+++ b/java/core/src/test/org/apache/orc/util/TestBloomFilter.java
@@ -27,6 +27,7 @@ import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
/**
* Tests for BloomFilter
@@ -61,6 +62,151 @@ public class TestBloomFilter {
assertEquals(~0x8040201008040201L, longs[1]);
}
+ /**
+ * Same test as TestBloomFilter_testLongHash in C++ codes. Make sure the
hash values
+ * are consistent between the Java client and C++ client.
+ * TODO(ORC-1025): Add exhaustive test on all numbers.
+ */
+ @Test
+ public void testLongHash() {
+ assertEquals(0, BloomFilter.getLongHash(0));
+ assertEquals(6614246905173314819L, BloomFilter.getLongHash(-1));
+ assertEquals(-5218250166726157773L, BloomFilter.getLongHash(-2));
+ assertEquals(1396019780946710816L, BloomFilter.getLongHash(-3));
+
+ assertEquals(3691278333958578070L,
BloomFilter.getLongHash(-9223372036854775805L));
+ assertEquals(-1192099642781211952L,
BloomFilter.getLongHash(-9223372036854775806L));
+ assertEquals(-9102499068535824902L,
BloomFilter.getLongHash(-9223372036854775807L));
+
+ assertEquals(1499534499340523007L, BloomFilter.getLongHash(790302201));
+ assertEquals(-5108695154500810163L, BloomFilter.getLongHash(790302202));
+ assertEquals(-2450623810987162260L, BloomFilter.getLongHash(790302203));
+ assertEquals(-1097054448615658549L, BloomFilter.getLongHash(18000000000L));
+
+ assertEquals(-4986173376161118712L,
BloomFilter.getLongHash(9223372036064673413L));
+ assertEquals(3785699328822078862L,
BloomFilter.getLongHash(9223372036064673414L));
+ assertEquals(294188322706112357L,
BloomFilter.getLongHash(9223372036064673415L));
+ }
+
+ private void checkBitSet(BloomFilter bf, int[] pos) {
+ for (int i : pos) {
+ assertTrue(bf.testBitSetPos(i));
+ }
+ }
+
+ /**
+ * Same test as TestBloomFilter_testBloomFilterBasicOperations in C++ codes.
We also
+ * verifies the bitSet positions that are set, to make sure both the Java
and C++ codes
+ * hash the same value into the same position.
+ */
+ @Test
+ public void testBasicOperations() {
+ BloomFilter bloomFilter = new BloomFilterUtf8(128,
BloomFilter.DEFAULT_FPP);
+
+ // test integers
+ bloomFilter.reset();
+ assertFalse(bloomFilter.testLong(1));
+ assertFalse(bloomFilter.testLong(11));
+ assertFalse(bloomFilter.testLong(111));
+ assertFalse(bloomFilter.testLong(1111));
+ assertFalse(bloomFilter.testLong(0));
+ assertFalse(bloomFilter.testLong(-1));
+ assertFalse(bloomFilter.testLong(-11));
+ assertFalse(bloomFilter.testLong(-111));
+ assertFalse(bloomFilter.testLong(-1111));
+
+ bloomFilter.addLong(1);
+ checkBitSet(bloomFilter, new int[]{567, 288, 246, 306, 228});
+ bloomFilter.addLong(11);
+ checkBitSet(bloomFilter, new int[]{228, 285, 342, 399, 456});
+ bloomFilter.addLong(111);
+ checkBitSet(bloomFilter, new int[]{802, 630, 458, 545, 717});
+ bloomFilter.addLong(1111);
+ checkBitSet(bloomFilter, new int[]{826, 526, 40, 480, 86});
+ bloomFilter.addLong(0);
+ checkBitSet(bloomFilter, new int[]{0, 0, 0, 0, 0});
+ bloomFilter.addLong(-1);
+ checkBitSet(bloomFilter, new int[]{120, 308, 335, 108, 535});
+ bloomFilter.addLong(-11);
+ checkBitSet(bloomFilter, new int[]{323, 685, 215, 577, 107});
+ bloomFilter.addLong(-111);
+ checkBitSet(bloomFilter, new int[]{357, 318, 279, 15, 54});
+ bloomFilter.addLong(-1111);
+ checkBitSet(bloomFilter, new int[]{572, 680, 818, 434, 232});
+
+ assertTrue(bloomFilter.testLong(1));
+ assertTrue(bloomFilter.testLong(11));
+ assertTrue(bloomFilter.testLong(111));
+ assertTrue(bloomFilter.testLong(1111));
+ assertTrue(bloomFilter.testLong(0));
+ assertTrue(bloomFilter.testLong(-1));
+ assertTrue(bloomFilter.testLong(-11));
+ assertTrue(bloomFilter.testLong(-111));
+ assertTrue(bloomFilter.testLong(-1111));
+
+ // test doubles
+ bloomFilter.reset();
+ assertFalse(bloomFilter.testDouble(1.1));
+ assertFalse(bloomFilter.testDouble(11.11));
+ assertFalse(bloomFilter.testDouble(111.111));
+ assertFalse(bloomFilter.testDouble(1111.1111));
+ assertFalse(bloomFilter.testDouble(0.0));
+ assertFalse(bloomFilter.testDouble(-1.1));
+ assertFalse(bloomFilter.testDouble(-11.11));
+ assertFalse(bloomFilter.testDouble(-111.111));
+ assertFalse(bloomFilter.testDouble(-1111.1111));
+
+ bloomFilter.addDouble(1.1);
+ checkBitSet(bloomFilter, new int[]{522, 692, 12, 370, 753});
+ bloomFilter.addDouble(11.11);
+ checkBitSet(bloomFilter, new int[]{210, 188, 89, 720, 389});
+ bloomFilter.addDouble(111.111);
+ checkBitSet(bloomFilter, new int[]{831, 252, 583, 500, 335});
+ bloomFilter.addDouble(1111.1111);
+ checkBitSet(bloomFilter, new int[]{725, 175, 374, 92, 642});
+ bloomFilter.addDouble(0.0);
+ checkBitSet(bloomFilter, new int[]{0, 0, 0, 0, 0});
+ bloomFilter.addDouble(-1.1);
+ checkBitSet(bloomFilter, new int[]{636, 163, 565, 206, 679});
+ bloomFilter.addDouble(-11.11);
+ checkBitSet(bloomFilter, new int[]{473, 192, 743, 462, 181});
+ bloomFilter.addDouble(-111.111);
+ checkBitSet(bloomFilter, new int[]{167, 152, 472, 295, 24});
+ bloomFilter.addDouble(-1111.1111);
+ checkBitSet(bloomFilter, new int[]{308, 346, 384, 422, 371});
+
+ assertTrue(bloomFilter.testDouble(1.1));
+ assertTrue(bloomFilter.testDouble(11.11));
+ assertTrue(bloomFilter.testDouble(111.111));
+ assertTrue(bloomFilter.testDouble(1111.1111));
+ assertTrue(bloomFilter.testDouble(0.0));
+ assertTrue(bloomFilter.testDouble(-1.1));
+ assertTrue(bloomFilter.testDouble(-11.11));
+ assertTrue(bloomFilter.testDouble(-111.111));
+ assertTrue(bloomFilter.testDouble(-1111.1111));
+
+ // test strings
+ bloomFilter.reset();
+ String emptyStr = "";
+ String enStr = "english";
+ String cnStr = "中国字";
+
+ assertFalse(bloomFilter.testString(emptyStr));
+ assertFalse(bloomFilter.testString(enStr));
+ assertFalse(bloomFilter.testString(cnStr));
+
+ bloomFilter.addString(emptyStr);
+ checkBitSet(bloomFilter, new int[]{656, 807, 480, 151, 304});
+ bloomFilter.addString(enStr);
+ checkBitSet(bloomFilter, new int[]{576, 221, 68, 729, 392});
+ bloomFilter.addString(cnStr);
+ checkBitSet(bloomFilter, new int[]{602, 636, 44, 362, 318});
+
+ assertTrue(bloomFilter.testString(emptyStr));
+ assertTrue(bloomFilter.testString(enStr));
+ assertTrue(bloomFilter.testString(cnStr));
+ }
+
@Test
public void testBloomFilterSerialize() {
long[] bits = new long[]{0x8040201008040201L, ~0x8040201008040201L};