ORC-101: Use little-endian encoding for bloom filter bitsets and update the spec.
Project: http://git-wip-us.apache.org/repos/asf/orc/repo Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/604dcc80 Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/604dcc80 Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/604dcc80 Branch: refs/heads/master Commit: 604dcc801fb2cdb68fe8284c0facf66a32bfe119 Parents: 9d39cb8 Author: Owen O'Malley <omal...@apache.org> Authored: Tue Sep 20 15:51:36 2016 -0500 Committer: Owen O'Malley <omal...@apache.org> Committed: Wed Sep 21 11:38:57 2016 -0500 ---------------------------------------------------------------------- .../java/org/apache/orc/util/BloomFilterIO.java | 6 +- .../resources/orc-file-dump-bloomfilter.out | 104 +++++++------- .../resources/orc-file-dump-bloomfilter2.out | 116 ++++++++-------- .../tools/src/test/resources/orc-file-dump.json | 134 +++++++++---------- site/_data/releases.yml | 4 + site/_docs/spec-index.md | 11 +- site/_docs/stripes.md | 4 + 7 files changed, 196 insertions(+), 183 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/orc/blob/604dcc80/java/core/src/java/org/apache/orc/util/BloomFilterIO.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/util/BloomFilterIO.java b/java/core/src/java/org/apache/orc/util/BloomFilterIO.java index ebd8c49..a6c3940 100644 --- a/java/core/src/java/org/apache/orc/util/BloomFilterIO.java +++ b/java/core/src/java/org/apache/orc/util/BloomFilterIO.java @@ -24,7 +24,7 @@ import org.apache.orc.OrcProto; import org.apache.orc.TypeDescription; import java.nio.ByteBuffer; -import java.util.Arrays; +import java.nio.ByteOrder; public class BloomFilterIO { @@ -62,7 +62,8 @@ public class BloomFilterIO { case BLOOM_FILTER_UTF8: { ByteString bits = bloomFilter.getUtf8Bitset(); long[] values = new long[bits.size() / 8]; - bits.asReadOnlyByteBuffer().asLongBuffer().get(values); + 
bits.asReadOnlyByteBuffer().order(ByteOrder.LITTLE_ENDIAN) + .asLongBuffer().get(values); return new BloomFilterUtf8(values, numFuncs); } default: @@ -82,6 +83,7 @@ public class BloomFilterIO { long[] bitset = bloomFilter.getBitSet(); if (bloomFilter instanceof BloomFilterUtf8) { ByteBuffer buffer = ByteBuffer.allocate(bitset.length * 8); + buffer.order(ByteOrder.LITTLE_ENDIAN); buffer.asLongBuffer().put(bitset); builder.setUtf8Bitset(ByteString.copyFrom(buffer)); } else { http://git-wip-us.apache.org/repos/asf/orc/blob/604dcc80/java/tools/src/test/resources/orc-file-dump-bloomfilter.out ---------------------------------------------------------------------- diff --git a/java/tools/src/test/resources/orc-file-dump-bloomfilter.out b/java/tools/src/test/resources/orc-file-dump-bloomfilter.out index b879bed..e23327a 100644 --- a/java/tools/src/test/resources/orc-file-dump-bloomfilter.out +++ b/java/tools/src/test/resources/orc-file-dump-bloomfilter.out @@ -39,17 +39,17 @@ File Statistics: Column 3: count: 21000 hasNull: false min: Darkness, max: worst sum: 81761 Stripes: - Stripe: offset: 3 data: 63786 rows: 5000 tail: 86 index: 743 + Stripe: offset: 3 data: 63786 rows: 5000 tail: 86 index: 749 Stream: column 0 section ROW_INDEX start: 3 length 17 Stream: column 1 section ROW_INDEX start: 20 length 166 Stream: column 2 section ROW_INDEX start: 186 length 169 Stream: column 3 section ROW_INDEX start: 355 length 87 - Stream: column 3 section BLOOM_FILTER_UTF8 start: 442 length 304 - Stream: column 1 section DATA start: 746 length 20035 - Stream: column 2 section DATA start: 20781 length 40050 - Stream: column 3 section DATA start: 60831 length 3543 - Stream: column 3 section LENGTH start: 64374 length 25 - Stream: column 3 section DICTIONARY_DATA start: 64399 length 133 + Stream: column 3 section BLOOM_FILTER_UTF8 start: 442 length 310 + Stream: column 1 section DATA start: 752 length 20035 + Stream: column 2 section DATA start: 20787 length 40050 + Stream: column 3 
section DATA start: 60837 length 3543 + Stream: column 3 section LENGTH start: 64380 length 25 + Stream: column 3 section DICTIONARY_DATA start: 64405 length 133 Encoding column 0: DIRECT Encoding column 1: DIRECT_V2 Encoding column 2: DIRECT_V2 @@ -67,17 +67,17 @@ Stripes: Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 - Stripe: offset: 64618 data: 63775 rows: 5000 tail: 86 index: 736 - Stream: column 0 section ROW_INDEX start: 64618 length 17 - Stream: column 1 section ROW_INDEX start: 64635 length 164 - Stream: column 2 section ROW_INDEX start: 64799 length 168 - Stream: column 3 section ROW_INDEX start: 64967 length 83 - Stream: column 3 section BLOOM_FILTER_UTF8 start: 65050 length 304 - Stream: column 1 section DATA start: 65354 length 20035 - Stream: column 2 section DATA start: 85389 length 40050 - Stream: column 3 section DATA start: 125439 length 3532 - Stream: column 3 section LENGTH start: 128971 length 25 - Stream: column 3 section DICTIONARY_DATA start: 128996 length 133 + Stripe: offset: 64624 data: 63775 rows: 5000 tail: 86 index: 742 + Stream: column 0 section ROW_INDEX start: 64624 length 17 + Stream: column 1 section ROW_INDEX start: 64641 length 164 + Stream: column 2 section ROW_INDEX start: 64805 length 168 + Stream: column 3 section ROW_INDEX start: 64973 length 83 + Stream: column 3 section BLOOM_FILTER_UTF8 start: 65056 length 310 + Stream: column 1 section DATA start: 65366 length 20035 + Stream: column 2 section DATA start: 85401 length 40050 + Stream: column 3 section DATA start: 125451 length 3532 + Stream: column 3 section LENGTH start: 128983 length 25 + Stream: column 3 section DICTIONARY_DATA start: 129008 length 133 Encoding column 0: DIRECT Encoding column 1: DIRECT_V2 
Encoding column 2: DIRECT_V2 @@ -95,17 +95,17 @@ Stripes: Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 - Stripe: offset: 129215 data: 63787 rows: 5000 tail: 86 index: 742 - Stream: column 0 section ROW_INDEX start: 129215 length 17 - Stream: column 1 section ROW_INDEX start: 129232 length 163 - Stream: column 2 section ROW_INDEX start: 129395 length 168 - Stream: column 3 section ROW_INDEX start: 129563 length 90 - Stream: column 3 section BLOOM_FILTER_UTF8 start: 129653 length 304 - Stream: column 1 section DATA start: 129957 length 20035 - Stream: column 2 section DATA start: 149992 length 40050 - Stream: column 3 section DATA start: 190042 length 3544 - Stream: column 3 section LENGTH start: 193586 length 25 - Stream: column 3 section DICTIONARY_DATA start: 193611 length 133 + Stripe: offset: 129227 data: 63787 rows: 5000 tail: 86 index: 748 + Stream: column 0 section ROW_INDEX start: 129227 length 17 + Stream: column 1 section ROW_INDEX start: 129244 length 163 + Stream: column 2 section ROW_INDEX start: 129407 length 168 + Stream: column 3 section ROW_INDEX start: 129575 length 90 + Stream: column 3 section BLOOM_FILTER_UTF8 start: 129665 length 310 + Stream: column 1 section DATA start: 129975 length 20035 + Stream: column 2 section DATA start: 150010 length 40050 + Stream: column 3 section DATA start: 190060 length 3544 + Stream: column 3 section LENGTH start: 193604 length 25 + Stream: column 3 section DICTIONARY_DATA start: 193629 length 133 Encoding column 0: DIRECT Encoding column 1: DIRECT_V2 Encoding column 2: DIRECT_V2 @@ -123,17 +123,17 @@ Stripes: Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 Entry 4: numHashFunctions: 4 bitCount: 
6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 - Stripe: offset: 193830 data: 63817 rows: 5000 tail: 85 index: 744 - Stream: column 0 section ROW_INDEX start: 193830 length 17 - Stream: column 1 section ROW_INDEX start: 193847 length 165 - Stream: column 2 section ROW_INDEX start: 194012 length 167 - Stream: column 3 section ROW_INDEX start: 194179 length 91 - Stream: column 3 section BLOOM_FILTER_UTF8 start: 194270 length 304 - Stream: column 1 section DATA start: 194574 length 20035 - Stream: column 2 section DATA start: 214609 length 40050 - Stream: column 3 section DATA start: 254659 length 3574 - Stream: column 3 section LENGTH start: 258233 length 25 - Stream: column 3 section DICTIONARY_DATA start: 258258 length 133 + Stripe: offset: 193848 data: 63817 rows: 5000 tail: 85 index: 750 + Stream: column 0 section ROW_INDEX start: 193848 length 17 + Stream: column 1 section ROW_INDEX start: 193865 length 165 + Stream: column 2 section ROW_INDEX start: 194030 length 167 + Stream: column 3 section ROW_INDEX start: 194197 length 91 + Stream: column 3 section BLOOM_FILTER_UTF8 start: 194288 length 310 + Stream: column 1 section DATA start: 194598 length 20035 + Stream: column 2 section DATA start: 214633 length 40050 + Stream: column 3 section DATA start: 254683 length 3574 + Stream: column 3 section LENGTH start: 258257 length 25 + Stream: column 3 section DICTIONARY_DATA start: 258282 length 133 Encoding column 0: DIRECT Encoding column 1: DIRECT_V2 Encoding column 2: DIRECT_V2 @@ -151,17 +151,17 @@ Stripes: Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 - Stripe: offset: 258476 
data: 12943 rows: 1000 tail: 78 index: 382 - Stream: column 0 section ROW_INDEX start: 258476 length 12 - Stream: column 1 section ROW_INDEX start: 258488 length 38 - Stream: column 2 section ROW_INDEX start: 258526 length 41 - Stream: column 3 section ROW_INDEX start: 258567 length 40 - Stream: column 3 section BLOOM_FILTER_UTF8 start: 258607 length 251 - Stream: column 1 section DATA start: 258858 length 4007 - Stream: column 2 section DATA start: 262865 length 8010 - Stream: column 3 section DATA start: 270875 length 768 - Stream: column 3 section LENGTH start: 271643 length 25 - Stream: column 3 section DICTIONARY_DATA start: 271668 length 133 + Stripe: offset: 258500 data: 12943 rows: 1000 tail: 78 index: 375 + Stream: column 0 section ROW_INDEX start: 258500 length 12 + Stream: column 1 section ROW_INDEX start: 258512 length 38 + Stream: column 2 section ROW_INDEX start: 258550 length 41 + Stream: column 3 section ROW_INDEX start: 258591 length 40 + Stream: column 3 section BLOOM_FILTER_UTF8 start: 258631 length 244 + Stream: column 1 section DATA start: 258875 length 4007 + Stream: column 2 section DATA start: 262882 length 8010 + Stream: column 3 section DATA start: 270892 length 768 + Stream: column 3 section LENGTH start: 271660 length 25 + Stream: column 3 section DICTIONARY_DATA start: 271685 length 133 Encoding column 0: DIRECT Encoding column 1: DIRECT_V2 Encoding column 2: DIRECT_V2 @@ -172,7 +172,7 @@ Stripes: Entry 0: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 -File length: 272427 bytes +File length: 272444 bytes Padding length: 0 bytes Padding ratio: 0% ________________________________________________________________________________________________________________________ http://git-wip-us.apache.org/repos/asf/orc/blob/604dcc80/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out 
---------------------------------------------------------------------- diff --git a/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out b/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out index 75cd5f4..8296382 100644 --- a/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out +++ b/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out @@ -39,7 +39,7 @@ File Statistics: Column 3: count: 21000 hasNull: false min: Darkness, max: worst sum: 81761 Stripes: - Stripe: offset: 3 data: 63786 rows: 5000 tail: 104 index: 14950 + Stripe: offset: 3 data: 63786 rows: 5000 tail: 104 index: 14949 Stream: column 0 section ROW_INDEX start: 3 length 17 Stream: column 1 section ROW_INDEX start: 20 length 166 Stream: column 2 section ROW_INDEX start: 186 length 169 @@ -47,12 +47,12 @@ Stripes: Stream: column 2 section BLOOM_FILTER_UTF8 start: 6890 length 6046 Stream: column 3 section ROW_INDEX start: 12936 length 87 Stream: column 3 section BLOOM_FILTER start: 13023 length 1038 - Stream: column 3 section BLOOM_FILTER_UTF8 start: 14061 length 892 - Stream: column 1 section DATA start: 14953 length 20035 - Stream: column 2 section DATA start: 34988 length 40050 - Stream: column 3 section DATA start: 75038 length 3543 - Stream: column 3 section LENGTH start: 78581 length 25 - Stream: column 3 section DICTIONARY_DATA start: 78606 length 133 + Stream: column 3 section BLOOM_FILTER_UTF8 start: 14061 length 891 + Stream: column 1 section DATA start: 14952 length 20035 + Stream: column 2 section DATA start: 34987 length 40050 + Stream: column 3 section DATA start: 75037 length 3543 + Stream: column 3 section LENGTH start: 78580 length 25 + Stream: column 3 section DICTIONARY_DATA start: 78605 length 133 Encoding column 0: DIRECT Encoding column 1: DIRECT_V2 Encoding column 2: DIRECT_V2 @@ -70,20 +70,20 @@ Stripes: Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4971 loadFactor: 0.5178 expectedFpp: 0.009981772 Entry 4: numHashFunctions: 7 bitCount: 9600 
popCount: 4949 loadFactor: 0.5155 expectedFpp: 0.009676614 Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9347 loadFactor: 0.9736 expectedFpp: 0.829482 - Stripe: offset: 78843 data: 63775 rows: 5000 tail: 103 index: 14941 - Stream: column 0 section ROW_INDEX start: 78843 length 17 - Stream: column 1 section ROW_INDEX start: 78860 length 164 - Stream: column 2 section ROW_INDEX start: 79024 length 168 - Stream: column 2 section BLOOM_FILTER start: 79192 length 6533 - Stream: column 2 section BLOOM_FILTER_UTF8 start: 85725 length 6046 - Stream: column 3 section ROW_INDEX start: 91771 length 83 - Stream: column 3 section BLOOM_FILTER start: 91854 length 1038 - Stream: column 3 section BLOOM_FILTER_UTF8 start: 92892 length 892 - Stream: column 1 section DATA start: 93784 length 20035 - Stream: column 2 section DATA start: 113819 length 40050 - Stream: column 3 section DATA start: 153869 length 3532 - Stream: column 3 section LENGTH start: 157401 length 25 - Stream: column 3 section DICTIONARY_DATA start: 157426 length 133 + Stripe: offset: 78842 data: 63775 rows: 5000 tail: 103 index: 14940 + Stream: column 0 section ROW_INDEX start: 78842 length 17 + Stream: column 1 section ROW_INDEX start: 78859 length 164 + Stream: column 2 section ROW_INDEX start: 79023 length 168 + Stream: column 2 section BLOOM_FILTER start: 79191 length 6533 + Stream: column 2 section BLOOM_FILTER_UTF8 start: 85724 length 6046 + Stream: column 3 section ROW_INDEX start: 91770 length 83 + Stream: column 3 section BLOOM_FILTER start: 91853 length 1038 + Stream: column 3 section BLOOM_FILTER_UTF8 start: 92891 length 891 + Stream: column 1 section DATA start: 93782 length 20035 + Stream: column 2 section DATA start: 113817 length 40050 + Stream: column 3 section DATA start: 153867 length 3532 + Stream: column 3 section LENGTH start: 157399 length 25 + Stream: column 3 section DICTIONARY_DATA start: 157424 length 133 Encoding column 0: DIRECT Encoding column 1: DIRECT_V2 Encoding 
column 2: DIRECT_V2 @@ -101,20 +101,20 @@ Stripes: Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4962 loadFactor: 0.5169 expectedFpp: 0.009855959 Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4966 loadFactor: 0.5173 expectedFpp: 0.009911705 Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9344 loadFactor: 0.9733 expectedFpp: 0.8276205 - Stripe: offset: 157662 data: 63787 rows: 5000 tail: 104 index: 14947 - Stream: column 0 section ROW_INDEX start: 157662 length 17 - Stream: column 1 section ROW_INDEX start: 157679 length 163 - Stream: column 2 section ROW_INDEX start: 157842 length 168 - Stream: column 2 section BLOOM_FILTER start: 158010 length 6533 - Stream: column 2 section BLOOM_FILTER_UTF8 start: 164543 length 6046 - Stream: column 3 section ROW_INDEX start: 170589 length 90 - Stream: column 3 section BLOOM_FILTER start: 170679 length 1038 - Stream: column 3 section BLOOM_FILTER_UTF8 start: 171717 length 892 - Stream: column 1 section DATA start: 172609 length 20035 - Stream: column 2 section DATA start: 192644 length 40050 - Stream: column 3 section DATA start: 232694 length 3544 - Stream: column 3 section LENGTH start: 236238 length 25 - Stream: column 3 section DICTIONARY_DATA start: 236263 length 133 + Stripe: offset: 157660 data: 63787 rows: 5000 tail: 104 index: 14946 + Stream: column 0 section ROW_INDEX start: 157660 length 17 + Stream: column 1 section ROW_INDEX start: 157677 length 163 + Stream: column 2 section ROW_INDEX start: 157840 length 168 + Stream: column 2 section BLOOM_FILTER start: 158008 length 6533 + Stream: column 2 section BLOOM_FILTER_UTF8 start: 164541 length 6046 + Stream: column 3 section ROW_INDEX start: 170587 length 90 + Stream: column 3 section BLOOM_FILTER start: 170677 length 1038 + Stream: column 3 section BLOOM_FILTER_UTF8 start: 171715 length 891 + Stream: column 1 section DATA start: 172606 length 20035 + Stream: column 2 section DATA start: 192641 length 40050 + Stream: column 3 section 
DATA start: 232691 length 3544 + Stream: column 3 section LENGTH start: 236235 length 25 + Stream: column 3 section DICTIONARY_DATA start: 236260 length 133 Encoding column 0: DIRECT Encoding column 1: DIRECT_V2 Encoding column 2: DIRECT_V2 @@ -132,20 +132,20 @@ Stripes: Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4943 loadFactor: 0.5149 expectedFpp: 0.009594797 Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4930 loadFactor: 0.5135 expectedFpp: 0.009419539 Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9333 loadFactor: 0.9722 expectedFpp: 0.82082444 - Stripe: offset: 236500 data: 63817 rows: 5000 tail: 103 index: 14940 - Stream: column 0 section ROW_INDEX start: 236500 length 17 - Stream: column 1 section ROW_INDEX start: 236517 length 165 - Stream: column 2 section ROW_INDEX start: 236682 length 167 - Stream: column 2 section BLOOM_FILTER start: 236849 length 6524 - Stream: column 2 section BLOOM_FILTER_UTF8 start: 243373 length 6046 - Stream: column 3 section ROW_INDEX start: 249419 length 91 - Stream: column 3 section BLOOM_FILTER start: 249510 length 1038 - Stream: column 3 section BLOOM_FILTER_UTF8 start: 250548 length 892 - Stream: column 1 section DATA start: 251440 length 20035 - Stream: column 2 section DATA start: 271475 length 40050 - Stream: column 3 section DATA start: 311525 length 3574 - Stream: column 3 section LENGTH start: 315099 length 25 - Stream: column 3 section DICTIONARY_DATA start: 315124 length 133 + Stripe: offset: 236497 data: 63817 rows: 5000 tail: 103 index: 14939 + Stream: column 0 section ROW_INDEX start: 236497 length 17 + Stream: column 1 section ROW_INDEX start: 236514 length 165 + Stream: column 2 section ROW_INDEX start: 236679 length 167 + Stream: column 2 section BLOOM_FILTER start: 236846 length 6524 + Stream: column 2 section BLOOM_FILTER_UTF8 start: 243370 length 6046 + Stream: column 3 section ROW_INDEX start: 249416 length 91 + Stream: column 3 section BLOOM_FILTER start: 249507 length 
1038 + Stream: column 3 section BLOOM_FILTER_UTF8 start: 250545 length 891 + Stream: column 1 section DATA start: 251436 length 20035 + Stream: column 2 section DATA start: 271471 length 40050 + Stream: column 3 section DATA start: 311521 length 3574 + Stream: column 3 section LENGTH start: 315095 length 25 + Stream: column 3 section DICTIONARY_DATA start: 315120 length 133 Encoding column 0: DIRECT Encoding column 1: DIRECT_V2 Encoding column 2: DIRECT_V2 @@ -163,15 +163,15 @@ Stripes: Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4941 loadFactor: 0.5147 expectedFpp: 0.009567649 Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4993 loadFactor: 0.5201 expectedFpp: 0.010295142 Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9353 loadFactor: 0.9743 expectedFpp: 0.8332165 - Stripe: offset: 315360 data: 12943 rows: 1000 tail: 96 index: 3542 - Stream: column 0 section ROW_INDEX start: 315360 length 12 - Stream: column 1 section ROW_INDEX start: 315372 length 38 - Stream: column 2 section ROW_INDEX start: 315410 length 41 - Stream: column 2 section BLOOM_FILTER start: 315451 length 1337 - Stream: column 2 section BLOOM_FILTER_UTF8 start: 316788 length 1211 - Stream: column 3 section ROW_INDEX start: 317999 length 40 - Stream: column 3 section BLOOM_FILTER start: 318039 length 472 - Stream: column 3 section BLOOM_FILTER_UTF8 start: 318511 length 391 + Stripe: offset: 315356 data: 12943 rows: 1000 tail: 96 index: 3546 + Stream: column 0 section ROW_INDEX start: 315356 length 12 + Stream: column 1 section ROW_INDEX start: 315368 length 38 + Stream: column 2 section ROW_INDEX start: 315406 length 41 + Stream: column 2 section BLOOM_FILTER start: 315447 length 1337 + Stream: column 2 section BLOOM_FILTER_UTF8 start: 316784 length 1211 + Stream: column 3 section ROW_INDEX start: 317995 length 40 + Stream: column 3 section BLOOM_FILTER start: 318035 length 472 + Stream: column 3 section BLOOM_FILTER_UTF8 start: 318507 length 395 Stream: column 1 
section DATA start: 318902 length 4007 Stream: column 2 section DATA start: 322909 length 8010 Stream: column 3 section DATA start: 330919 length 768 http://git-wip-us.apache.org/repos/asf/orc/blob/604dcc80/java/tools/src/test/resources/orc-file-dump.json ---------------------------------------------------------------------- diff --git a/java/tools/src/test/resources/orc-file-dump.json b/java/tools/src/test/resources/orc-file-dump.json index 3dd0dc0..b3e9d12 100644 --- a/java/tools/src/test/resources/orc-file-dump.json +++ b/java/tools/src/test/resources/orc-file-dump.json @@ -254,9 +254,9 @@ "stripeNumber": 1, "stripeInformation": { "offset": 3, - "indexLength": 762, + "indexLength": 768, "dataLength": 63770, - "footerLength": 89, + "footerLength": 88, "rowCount": 5000 }, "streams": [ @@ -288,42 +288,42 @@ "columnId": 3, "section": "BLOOM_FILTER_UTF8", "startOffset": 461, - "length": 304 + "length": 310 }, { "columnId": 1, "section": "DATA", - "startOffset": 765, + "startOffset": 771, "length": 20035 }, { "columnId": 2, "section": "DATA", - "startOffset": 20800, + "startOffset": 20806, "length": 40050 }, { "columnId": 3, "section": "PRESENT", - "startOffset": 60850, + "startOffset": 60856, "length": 17 }, { "columnId": 3, "section": "DATA", - "startOffset": 60867, + "startOffset": 60873, "length": 3510 }, { "columnId": 3, "section": "LENGTH", - "startOffset": 64377, + "startOffset": 64383, "length": 25 }, { "columnId": 3, "section": "DICTIONARY_DATA", - "startOffset": 64402, + "startOffset": 64408, "length": 133 } ], @@ -494,8 +494,8 @@ { "stripeNumber": 2, "stripeInformation": { - "offset": 64624, - "indexLength": 753, + "offset": 64629, + "indexLength": 759, "dataLength": 63763, "footerLength": 87, "rowCount": 5000 @@ -504,67 +504,67 @@ { "columnId": 0, "section": "ROW_INDEX", - "startOffset": 64624, + "startOffset": 64629, "length": 17 }, { "columnId": 1, "section": "ROW_INDEX", - "startOffset": 64641, + "startOffset": 64646, "length": 166 }, { "columnId": 2, 
"section": "ROW_INDEX", - "startOffset": 64807, + "startOffset": 64812, "length": 166 }, { "columnId": 3, "section": "ROW_INDEX", - "startOffset": 64973, + "startOffset": 64978, "length": 100 }, { "columnId": 3, "section": "BLOOM_FILTER_UTF8", - "startOffset": 65073, - "length": 304 + "startOffset": 65078, + "length": 310 }, { "columnId": 1, "section": "DATA", - "startOffset": 65377, + "startOffset": 65388, "length": 20035 }, { "columnId": 2, "section": "DATA", - "startOffset": 85412, + "startOffset": 85423, "length": 40050 }, { "columnId": 3, "section": "PRESENT", - "startOffset": 125462, + "startOffset": 125473, "length": 17 }, { "columnId": 3, "section": "DATA", - "startOffset": 125479, + "startOffset": 125490, "length": 3503 }, { "columnId": 3, "section": "LENGTH", - "startOffset": 128982, + "startOffset": 128993, "length": 25 }, { "columnId": 3, "section": "DICTIONARY_DATA", - "startOffset": 129007, + "startOffset": 129018, "length": 133 } ], @@ -735,77 +735,77 @@ { "stripeNumber": 3, "stripeInformation": { - "offset": 129227, - "indexLength": 754, + "offset": 129238, + "indexLength": 760, "dataLength": 63770, - "footerLength": 89, + "footerLength": 88, "rowCount": 5000 }, "streams": [ { "columnId": 0, "section": "ROW_INDEX", - "startOffset": 129227, + "startOffset": 129238, "length": 17 }, { "columnId": 1, "section": "ROW_INDEX", - "startOffset": 129244, + "startOffset": 129255, "length": 164 }, { "columnId": 2, "section": "ROW_INDEX", - "startOffset": 129408, + "startOffset": 129419, "length": 167 }, { "columnId": 3, "section": "ROW_INDEX", - "startOffset": 129575, + "startOffset": 129586, "length": 102 }, { "columnId": 3, "section": "BLOOM_FILTER_UTF8", - "startOffset": 129677, - "length": 304 + "startOffset": 129688, + "length": 310 }, { "columnId": 1, "section": "DATA", - "startOffset": 129981, + "startOffset": 129998, "length": 20035 }, { "columnId": 2, "section": "DATA", - "startOffset": 150016, + "startOffset": 150033, "length": 40050 }, { "columnId": 
3, "section": "PRESENT", - "startOffset": 190066, + "startOffset": 190083, "length": 17 }, { "columnId": 3, "section": "DATA", - "startOffset": 190083, + "startOffset": 190100, "length": 3510 }, { "columnId": 3, "section": "LENGTH", - "startOffset": 193593, + "startOffset": 193610, "length": 25 }, { "columnId": 3, "section": "DICTIONARY_DATA", - "startOffset": 193618, + "startOffset": 193635, "length": 133 } ], @@ -976,8 +976,8 @@ { "stripeNumber": 4, "stripeInformation": { - "offset": 193840, - "indexLength": 765, + "offset": 193856, + "indexLength": 771, "dataLength": 63756, "footerLength": 89, "rowCount": 5000 @@ -986,67 +986,67 @@ { "columnId": 0, "section": "ROW_INDEX", - "startOffset": 193840, + "startOffset": 193856, "length": 17 }, { "columnId": 1, "section": "ROW_INDEX", - "startOffset": 193857, + "startOffset": 193873, "length": 166 }, { "columnId": 2, "section": "ROW_INDEX", - "startOffset": 194023, + "startOffset": 194039, "length": 171 }, { "columnId": 3, "section": "ROW_INDEX", - "startOffset": 194194, + "startOffset": 194210, "length": 107 }, { "columnId": 3, "section": "BLOOM_FILTER_UTF8", - "startOffset": 194301, - "length": 304 + "startOffset": 194317, + "length": 310 }, { "columnId": 1, "section": "DATA", - "startOffset": 194605, + "startOffset": 194627, "length": 20035 }, { "columnId": 2, "section": "DATA", - "startOffset": 214640, + "startOffset": 214662, "length": 40050 }, { "columnId": 3, "section": "PRESENT", - "startOffset": 254690, + "startOffset": 254712, "length": 17 }, { "columnId": 3, "section": "DATA", - "startOffset": 254707, + "startOffset": 254729, "length": 3496 }, { "columnId": 3, "section": "LENGTH", - "startOffset": 258203, + "startOffset": 258225, "length": 25 }, { "columnId": 3, "section": "DICTIONARY_DATA", - "startOffset": 258228, + "startOffset": 258250, "length": 133 } ], @@ -1217,8 +1217,8 @@ { "stripeNumber": 5, "stripeInformation": { - "offset": 258450, - "indexLength": 383, + "offset": 258472, + "indexLength": 376, 
"dataLength": 12943, "footerLength": 83, "rowCount": 1000 @@ -1227,67 +1227,67 @@ { "columnId": 0, "section": "ROW_INDEX", - "startOffset": 258450, + "startOffset": 258472, "length": 12 }, { "columnId": 1, "section": "ROW_INDEX", - "startOffset": 258462, + "startOffset": 258484, "length": 38 }, { "columnId": 2, "section": "ROW_INDEX", - "startOffset": 258500, + "startOffset": 258522, "length": 41 }, { "columnId": 3, "section": "ROW_INDEX", - "startOffset": 258541, + "startOffset": 258563, "length": 41 }, { "columnId": 3, "section": "BLOOM_FILTER_UTF8", - "startOffset": 258582, - "length": 251 + "startOffset": 258604, + "length": 244 }, { "columnId": 1, "section": "DATA", - "startOffset": 258833, + "startOffset": 258848, "length": 4007 }, { "columnId": 2, "section": "DATA", - "startOffset": 262840, + "startOffset": 262855, "length": 8010 }, { "columnId": 3, "section": "PRESENT", - "startOffset": 270850, + "startOffset": 270865, "length": 16 }, { "columnId": 3, "section": "DATA", - "startOffset": 270866, + "startOffset": 270881, "length": 752 }, { "columnId": 3, "section": "LENGTH", - "startOffset": 271618, + "startOffset": 271633, "length": 25 }, { "columnId": 3, "section": "DICTIONARY_DATA", - "startOffset": 271643, + "startOffset": 271658, "length": 133 } ], @@ -1348,7 +1348,7 @@ }] } ], - "fileLength": 272409, + "fileLength": 272428, "paddingLength": 0, "paddingRatio": 0, "status": "OK" http://git-wip-us.apache.org/repos/asf/orc/blob/604dcc80/site/_data/releases.yml ---------------------------------------------------------------------- diff --git a/site/_data/releases.yml b/site/_data/releases.yml index 3331688..1282115 100644 --- a/site/_data/releases.yml +++ b/site/_data/releases.yml @@ -9,6 +9,7 @@ sha256: 5c394c7ed3a31d20726ded55ed9c5a0eeff1bd5b85b1cb2ee6c3c1a94560578c known-issues: ORC-40: Predicate push down is not implemented in C++. 
+ ORC-101: Bloom filters for string and decimal use inconsistent encoding 1.1.2: date: 2016-07-08 @@ -19,6 +20,7 @@ known-issues: HIVE-14214: Schema evolution and predicate pushdown don't work together. ORC-40: Predicate push down is not implemented in C++. + ORC-101: Bloom filters for string and decimal use inconsistent encoding 1.1.1: date: 2016-06-13 @@ -29,6 +31,7 @@ known-issues: HIVE-14214: Schema evolution and predicate pushdown don't work together. ORC-40: Predicate push down is not implemented in C++. + ORC-101: Bloom filters for string and decimal use inconsistent encoding 1.1.0: date: 2016-06-10 @@ -39,6 +42,7 @@ known-issues: HIVE-14214: Schema evolution and predicate pushdown don't work together. ORC-40: Predicate push down is not implemented in C++. + ORC-101: Bloom filters for string and decimal use inconsistent encoding 1.0.0: date: 2016-01-25 http://git-wip-us.apache.org/repos/asf/orc/blob/604dcc80/site/_docs/spec-index.md ---------------------------------------------------------------------- diff --git a/site/_docs/spec-index.md b/site/_docs/spec-index.md index 009df59..263c9a8 100644 --- a/site/_docs/spec-index.md +++ b/site/_docs/spec-index.md @@ -57,14 +57,17 @@ group (default to 10,000 rows) in a column. Only the row groups that satisfy min/max row index evaluation will be evaluated against the bloom filter index. -Each BloomFilterEntry stores the number of hash functions ('k') used and -the bitset backing the bloom filter. The bitset is serialized as repeated -longs from which the number of bits ('m') for the bloom filter can be derived. -m = bitset.length * 64. +Each BloomFilterEntry stores the number of hash functions ('k') used +and the bitset backing the bloom filter. The original encoding (pre +ORC-101) of bloom filters used the bitset field encoded as a repeating +sequence of longs in the bitset field with a little endian encoding +(0x1 is bit 0 and 0x2 is bit 1.) 
After ORC-101, the encoding is a +sequence of bytes with a little endian encoding in the utf8bitset field. ```message BloomFilter { optional uint32 numHashFunctions = 1; repeated fixed64 bitset = 2; + optional bytes utf8bitset = 3; } ``` http://git-wip-us.apache.org/repos/asf/orc/blob/604dcc80/site/_docs/stripes.md ---------------------------------------------------------------------- diff --git a/site/_docs/stripes.md b/site/_docs/stripes.md index d53f709..cc85feb 100644 --- a/site/_docs/stripes.md +++ b/site/_docs/stripes.md @@ -56,6 +56,10 @@ depends on the type and encoding of the column. SECONDARY = 5; // the index for seeking to particular row groups ROW_INDEX = 6; + // original bloom filters used before ORC-101 + BLOOM_FILTER = 7; + // bloom filters that consistently use utf8 + BLOOM_FILTER_UTF8 = 8; } required Kind kind = 1; // the column id