ORC-101. Use little-endian encoding for bloom filter bitsets and update the spec.


Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/604dcc80
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/604dcc80
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/604dcc80

Branch: refs/heads/master
Commit: 604dcc801fb2cdb68fe8284c0facf66a32bfe119
Parents: 9d39cb8
Author: Owen O'Malley <omal...@apache.org>
Authored: Tue Sep 20 15:51:36 2016 -0500
Committer: Owen O'Malley <omal...@apache.org>
Committed: Wed Sep 21 11:38:57 2016 -0500

----------------------------------------------------------------------
 .../java/org/apache/orc/util/BloomFilterIO.java |   6 +-
 .../resources/orc-file-dump-bloomfilter.out     | 104 +++++++-------
 .../resources/orc-file-dump-bloomfilter2.out    | 116 ++++++++--------
 .../tools/src/test/resources/orc-file-dump.json | 134 +++++++++----------
 site/_data/releases.yml                         |   4 +
 site/_docs/spec-index.md                        |  11 +-
 site/_docs/stripes.md                           |   4 +
 7 files changed, 196 insertions(+), 183 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/orc/blob/604dcc80/java/core/src/java/org/apache/orc/util/BloomFilterIO.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/util/BloomFilterIO.java b/java/core/src/java/org/apache/orc/util/BloomFilterIO.java
index ebd8c49..a6c3940 100644
--- a/java/core/src/java/org/apache/orc/util/BloomFilterIO.java
+++ b/java/core/src/java/org/apache/orc/util/BloomFilterIO.java
@@ -24,7 +24,7 @@ import org.apache.orc.OrcProto;
 import org.apache.orc.TypeDescription;
 
 import java.nio.ByteBuffer;
-import java.util.Arrays;
+import java.nio.ByteOrder;
 
 public class BloomFilterIO  {
 
@@ -62,7 +62,8 @@ public class BloomFilterIO  {
       case BLOOM_FILTER_UTF8: {
         ByteString bits = bloomFilter.getUtf8Bitset();
         long[] values = new long[bits.size() / 8];
-        bits.asReadOnlyByteBuffer().asLongBuffer().get(values);
+        bits.asReadOnlyByteBuffer().order(ByteOrder.LITTLE_ENDIAN)
+            .asLongBuffer().get(values);
         return new BloomFilterUtf8(values, numFuncs);
       }
       default:
@@ -82,6 +83,7 @@ public class BloomFilterIO  {
     long[] bitset = bloomFilter.getBitSet();
     if (bloomFilter instanceof BloomFilterUtf8) {
       ByteBuffer buffer = ByteBuffer.allocate(bitset.length * 8);
+      buffer.order(ByteOrder.LITTLE_ENDIAN);
       buffer.asLongBuffer().put(bitset);
       builder.setUtf8Bitset(ByteString.copyFrom(buffer));
     } else {

http://git-wip-us.apache.org/repos/asf/orc/blob/604dcc80/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-dump-bloomfilter.out b/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
index b879bed..e23327a 100644
--- a/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
+++ b/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
@@ -39,17 +39,17 @@ File Statistics:
   Column 3: count: 21000 hasNull: false min: Darkness, max: worst sum: 81761
 
 Stripes:
-  Stripe: offset: 3 data: 63786 rows: 5000 tail: 86 index: 743
+  Stripe: offset: 3 data: 63786 rows: 5000 tail: 86 index: 749
     Stream: column 0 section ROW_INDEX start: 3 length 17
     Stream: column 1 section ROW_INDEX start: 20 length 166
     Stream: column 2 section ROW_INDEX start: 186 length 169
     Stream: column 3 section ROW_INDEX start: 355 length 87
-    Stream: column 3 section BLOOM_FILTER_UTF8 start: 442 length 304
-    Stream: column 1 section DATA start: 746 length 20035
-    Stream: column 2 section DATA start: 20781 length 40050
-    Stream: column 3 section DATA start: 60831 length 3543
-    Stream: column 3 section LENGTH start: 64374 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 64399 length 133
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 442 length 310
+    Stream: column 1 section DATA start: 752 length 20035
+    Stream: column 2 section DATA start: 20787 length 40050
+    Stream: column 3 section DATA start: 60837 length 3543
+    Stream: column 3 section LENGTH start: 64380 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 64405 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -67,17 +67,17 @@ Stripes:
       Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
       Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
       Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
-  Stripe: offset: 64618 data: 63775 rows: 5000 tail: 86 index: 736
-    Stream: column 0 section ROW_INDEX start: 64618 length 17
-    Stream: column 1 section ROW_INDEX start: 64635 length 164
-    Stream: column 2 section ROW_INDEX start: 64799 length 168
-    Stream: column 3 section ROW_INDEX start: 64967 length 83
-    Stream: column 3 section BLOOM_FILTER_UTF8 start: 65050 length 304
-    Stream: column 1 section DATA start: 65354 length 20035
-    Stream: column 2 section DATA start: 85389 length 40050
-    Stream: column 3 section DATA start: 125439 length 3532
-    Stream: column 3 section LENGTH start: 128971 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 128996 length 133
+  Stripe: offset: 64624 data: 63775 rows: 5000 tail: 86 index: 742
+    Stream: column 0 section ROW_INDEX start: 64624 length 17
+    Stream: column 1 section ROW_INDEX start: 64641 length 164
+    Stream: column 2 section ROW_INDEX start: 64805 length 168
+    Stream: column 3 section ROW_INDEX start: 64973 length 83
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 65056 length 310
+    Stream: column 1 section DATA start: 65366 length 20035
+    Stream: column 2 section DATA start: 85401 length 40050
+    Stream: column 3 section DATA start: 125451 length 3532
+    Stream: column 3 section LENGTH start: 128983 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 129008 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -95,17 +95,17 @@ Stripes:
       Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
       Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
       Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
-  Stripe: offset: 129215 data: 63787 rows: 5000 tail: 86 index: 742
-    Stream: column 0 section ROW_INDEX start: 129215 length 17
-    Stream: column 1 section ROW_INDEX start: 129232 length 163
-    Stream: column 2 section ROW_INDEX start: 129395 length 168
-    Stream: column 3 section ROW_INDEX start: 129563 length 90
-    Stream: column 3 section BLOOM_FILTER_UTF8 start: 129653 length 304
-    Stream: column 1 section DATA start: 129957 length 20035
-    Stream: column 2 section DATA start: 149992 length 40050
-    Stream: column 3 section DATA start: 190042 length 3544
-    Stream: column 3 section LENGTH start: 193586 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 193611 length 133
+  Stripe: offset: 129227 data: 63787 rows: 5000 tail: 86 index: 748
+    Stream: column 0 section ROW_INDEX start: 129227 length 17
+    Stream: column 1 section ROW_INDEX start: 129244 length 163
+    Stream: column 2 section ROW_INDEX start: 129407 length 168
+    Stream: column 3 section ROW_INDEX start: 129575 length 90
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 129665 length 310
+    Stream: column 1 section DATA start: 129975 length 20035
+    Stream: column 2 section DATA start: 150010 length 40050
+    Stream: column 3 section DATA start: 190060 length 3544
+    Stream: column 3 section LENGTH start: 193604 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 193629 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -123,17 +123,17 @@ Stripes:
       Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
       Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
       Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
-  Stripe: offset: 193830 data: 63817 rows: 5000 tail: 85 index: 744
-    Stream: column 0 section ROW_INDEX start: 193830 length 17
-    Stream: column 1 section ROW_INDEX start: 193847 length 165
-    Stream: column 2 section ROW_INDEX start: 194012 length 167
-    Stream: column 3 section ROW_INDEX start: 194179 length 91
-    Stream: column 3 section BLOOM_FILTER_UTF8 start: 194270 length 304
-    Stream: column 1 section DATA start: 194574 length 20035
-    Stream: column 2 section DATA start: 214609 length 40050
-    Stream: column 3 section DATA start: 254659 length 3574
-    Stream: column 3 section LENGTH start: 258233 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 258258 length 133
+  Stripe: offset: 193848 data: 63817 rows: 5000 tail: 85 index: 750
+    Stream: column 0 section ROW_INDEX start: 193848 length 17
+    Stream: column 1 section ROW_INDEX start: 193865 length 165
+    Stream: column 2 section ROW_INDEX start: 194030 length 167
+    Stream: column 3 section ROW_INDEX start: 194197 length 91
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 194288 length 310
+    Stream: column 1 section DATA start: 194598 length 20035
+    Stream: column 2 section DATA start: 214633 length 40050
+    Stream: column 3 section DATA start: 254683 length 3574
+    Stream: column 3 section LENGTH start: 258257 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 258282 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -151,17 +151,17 @@ Stripes:
       Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
       Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
       Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
-  Stripe: offset: 258476 data: 12943 rows: 1000 tail: 78 index: 382
-    Stream: column 0 section ROW_INDEX start: 258476 length 12
-    Stream: column 1 section ROW_INDEX start: 258488 length 38
-    Stream: column 2 section ROW_INDEX start: 258526 length 41
-    Stream: column 3 section ROW_INDEX start: 258567 length 40
-    Stream: column 3 section BLOOM_FILTER_UTF8 start: 258607 length 251
-    Stream: column 1 section DATA start: 258858 length 4007
-    Stream: column 2 section DATA start: 262865 length 8010
-    Stream: column 3 section DATA start: 270875 length 768
-    Stream: column 3 section LENGTH start: 271643 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 271668 length 133
+  Stripe: offset: 258500 data: 12943 rows: 1000 tail: 78 index: 375
+    Stream: column 0 section ROW_INDEX start: 258500 length 12
+    Stream: column 1 section ROW_INDEX start: 258512 length 38
+    Stream: column 2 section ROW_INDEX start: 258550 length 41
+    Stream: column 3 section ROW_INDEX start: 258591 length 40
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 258631 length 244
+    Stream: column 1 section DATA start: 258875 length 4007
+    Stream: column 2 section DATA start: 262882 length 8010
+    Stream: column 3 section DATA start: 270892 length 768
+    Stream: column 3 section LENGTH start: 271660 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 271685 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -172,7 +172,7 @@ Stripes:
       Entry 0: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
       Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
 
-File length: 272427 bytes
+File length: 272444 bytes
 Padding length: 0 bytes
 Padding ratio: 0%
 
________________________________________________________________________________________________________________________

http://git-wip-us.apache.org/repos/asf/orc/blob/604dcc80/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out b/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
index 75cd5f4..8296382 100644
--- a/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
+++ b/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
@@ -39,7 +39,7 @@ File Statistics:
   Column 3: count: 21000 hasNull: false min: Darkness, max: worst sum: 81761
 
 Stripes:
-  Stripe: offset: 3 data: 63786 rows: 5000 tail: 104 index: 14950
+  Stripe: offset: 3 data: 63786 rows: 5000 tail: 104 index: 14949
     Stream: column 0 section ROW_INDEX start: 3 length 17
     Stream: column 1 section ROW_INDEX start: 20 length 166
     Stream: column 2 section ROW_INDEX start: 186 length 169
@@ -47,12 +47,12 @@ Stripes:
     Stream: column 2 section BLOOM_FILTER_UTF8 start: 6890 length 6046
     Stream: column 3 section ROW_INDEX start: 12936 length 87
     Stream: column 3 section BLOOM_FILTER start: 13023 length 1038
-    Stream: column 3 section BLOOM_FILTER_UTF8 start: 14061 length 892
-    Stream: column 1 section DATA start: 14953 length 20035
-    Stream: column 2 section DATA start: 34988 length 40050
-    Stream: column 3 section DATA start: 75038 length 3543
-    Stream: column 3 section LENGTH start: 78581 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 78606 length 133
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 14061 length 891
+    Stream: column 1 section DATA start: 14952 length 20035
+    Stream: column 2 section DATA start: 34987 length 40050
+    Stream: column 3 section DATA start: 75037 length 3543
+    Stream: column 3 section LENGTH start: 78580 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 78605 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -70,20 +70,20 @@ Stripes:
       Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4971 loadFactor: 0.5178 expectedFpp: 0.009981772
       Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4949 loadFactor: 0.5155 expectedFpp: 0.009676614
       Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9347 loadFactor: 0.9736 expectedFpp: 0.829482
-  Stripe: offset: 78843 data: 63775 rows: 5000 tail: 103 index: 14941
-    Stream: column 0 section ROW_INDEX start: 78843 length 17
-    Stream: column 1 section ROW_INDEX start: 78860 length 164
-    Stream: column 2 section ROW_INDEX start: 79024 length 168
-    Stream: column 2 section BLOOM_FILTER start: 79192 length 6533
-    Stream: column 2 section BLOOM_FILTER_UTF8 start: 85725 length 6046
-    Stream: column 3 section ROW_INDEX start: 91771 length 83
-    Stream: column 3 section BLOOM_FILTER start: 91854 length 1038
-    Stream: column 3 section BLOOM_FILTER_UTF8 start: 92892 length 892
-    Stream: column 1 section DATA start: 93784 length 20035
-    Stream: column 2 section DATA start: 113819 length 40050
-    Stream: column 3 section DATA start: 153869 length 3532
-    Stream: column 3 section LENGTH start: 157401 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 157426 length 133
+  Stripe: offset: 78842 data: 63775 rows: 5000 tail: 103 index: 14940
+    Stream: column 0 section ROW_INDEX start: 78842 length 17
+    Stream: column 1 section ROW_INDEX start: 78859 length 164
+    Stream: column 2 section ROW_INDEX start: 79023 length 168
+    Stream: column 2 section BLOOM_FILTER start: 79191 length 6533
+    Stream: column 2 section BLOOM_FILTER_UTF8 start: 85724 length 6046
+    Stream: column 3 section ROW_INDEX start: 91770 length 83
+    Stream: column 3 section BLOOM_FILTER start: 91853 length 1038
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 92891 length 891
+    Stream: column 1 section DATA start: 93782 length 20035
+    Stream: column 2 section DATA start: 113817 length 40050
+    Stream: column 3 section DATA start: 153867 length 3532
+    Stream: column 3 section LENGTH start: 157399 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 157424 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -101,20 +101,20 @@ Stripes:
       Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4962 loadFactor: 0.5169 expectedFpp: 0.009855959
       Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4966 loadFactor: 0.5173 expectedFpp: 0.009911705
       Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9344 loadFactor: 0.9733 expectedFpp: 0.8276205
-  Stripe: offset: 157662 data: 63787 rows: 5000 tail: 104 index: 14947
-    Stream: column 0 section ROW_INDEX start: 157662 length 17
-    Stream: column 1 section ROW_INDEX start: 157679 length 163
-    Stream: column 2 section ROW_INDEX start: 157842 length 168
-    Stream: column 2 section BLOOM_FILTER start: 158010 length 6533
-    Stream: column 2 section BLOOM_FILTER_UTF8 start: 164543 length 6046
-    Stream: column 3 section ROW_INDEX start: 170589 length 90
-    Stream: column 3 section BLOOM_FILTER start: 170679 length 1038
-    Stream: column 3 section BLOOM_FILTER_UTF8 start: 171717 length 892
-    Stream: column 1 section DATA start: 172609 length 20035
-    Stream: column 2 section DATA start: 192644 length 40050
-    Stream: column 3 section DATA start: 232694 length 3544
-    Stream: column 3 section LENGTH start: 236238 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 236263 length 133
+  Stripe: offset: 157660 data: 63787 rows: 5000 tail: 104 index: 14946
+    Stream: column 0 section ROW_INDEX start: 157660 length 17
+    Stream: column 1 section ROW_INDEX start: 157677 length 163
+    Stream: column 2 section ROW_INDEX start: 157840 length 168
+    Stream: column 2 section BLOOM_FILTER start: 158008 length 6533
+    Stream: column 2 section BLOOM_FILTER_UTF8 start: 164541 length 6046
+    Stream: column 3 section ROW_INDEX start: 170587 length 90
+    Stream: column 3 section BLOOM_FILTER start: 170677 length 1038
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 171715 length 891
+    Stream: column 1 section DATA start: 172606 length 20035
+    Stream: column 2 section DATA start: 192641 length 40050
+    Stream: column 3 section DATA start: 232691 length 3544
+    Stream: column 3 section LENGTH start: 236235 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 236260 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -132,20 +132,20 @@ Stripes:
       Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4943 loadFactor: 0.5149 expectedFpp: 0.009594797
       Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4930 loadFactor: 0.5135 expectedFpp: 0.009419539
       Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9333 loadFactor: 0.9722 expectedFpp: 0.82082444
-  Stripe: offset: 236500 data: 63817 rows: 5000 tail: 103 index: 14940
-    Stream: column 0 section ROW_INDEX start: 236500 length 17
-    Stream: column 1 section ROW_INDEX start: 236517 length 165
-    Stream: column 2 section ROW_INDEX start: 236682 length 167
-    Stream: column 2 section BLOOM_FILTER start: 236849 length 6524
-    Stream: column 2 section BLOOM_FILTER_UTF8 start: 243373 length 6046
-    Stream: column 3 section ROW_INDEX start: 249419 length 91
-    Stream: column 3 section BLOOM_FILTER start: 249510 length 1038
-    Stream: column 3 section BLOOM_FILTER_UTF8 start: 250548 length 892
-    Stream: column 1 section DATA start: 251440 length 20035
-    Stream: column 2 section DATA start: 271475 length 40050
-    Stream: column 3 section DATA start: 311525 length 3574
-    Stream: column 3 section LENGTH start: 315099 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 315124 length 133
+  Stripe: offset: 236497 data: 63817 rows: 5000 tail: 103 index: 14939
+    Stream: column 0 section ROW_INDEX start: 236497 length 17
+    Stream: column 1 section ROW_INDEX start: 236514 length 165
+    Stream: column 2 section ROW_INDEX start: 236679 length 167
+    Stream: column 2 section BLOOM_FILTER start: 236846 length 6524
+    Stream: column 2 section BLOOM_FILTER_UTF8 start: 243370 length 6046
+    Stream: column 3 section ROW_INDEX start: 249416 length 91
+    Stream: column 3 section BLOOM_FILTER start: 249507 length 1038
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 250545 length 891
+    Stream: column 1 section DATA start: 251436 length 20035
+    Stream: column 2 section DATA start: 271471 length 40050
+    Stream: column 3 section DATA start: 311521 length 3574
+    Stream: column 3 section LENGTH start: 315095 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 315120 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -163,15 +163,15 @@ Stripes:
       Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4941 loadFactor: 0.5147 expectedFpp: 0.009567649
       Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4993 loadFactor: 0.5201 expectedFpp: 0.010295142
       Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9353 loadFactor: 0.9743 expectedFpp: 0.8332165
-  Stripe: offset: 315360 data: 12943 rows: 1000 tail: 96 index: 3542
-    Stream: column 0 section ROW_INDEX start: 315360 length 12
-    Stream: column 1 section ROW_INDEX start: 315372 length 38
-    Stream: column 2 section ROW_INDEX start: 315410 length 41
-    Stream: column 2 section BLOOM_FILTER start: 315451 length 1337
-    Stream: column 2 section BLOOM_FILTER_UTF8 start: 316788 length 1211
-    Stream: column 3 section ROW_INDEX start: 317999 length 40
-    Stream: column 3 section BLOOM_FILTER start: 318039 length 472
-    Stream: column 3 section BLOOM_FILTER_UTF8 start: 318511 length 391
+  Stripe: offset: 315356 data: 12943 rows: 1000 tail: 96 index: 3546
+    Stream: column 0 section ROW_INDEX start: 315356 length 12
+    Stream: column 1 section ROW_INDEX start: 315368 length 38
+    Stream: column 2 section ROW_INDEX start: 315406 length 41
+    Stream: column 2 section BLOOM_FILTER start: 315447 length 1337
+    Stream: column 2 section BLOOM_FILTER_UTF8 start: 316784 length 1211
+    Stream: column 3 section ROW_INDEX start: 317995 length 40
+    Stream: column 3 section BLOOM_FILTER start: 318035 length 472
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 318507 length 395
     Stream: column 1 section DATA start: 318902 length 4007
     Stream: column 2 section DATA start: 322909 length 8010
     Stream: column 3 section DATA start: 330919 length 768

http://git-wip-us.apache.org/repos/asf/orc/blob/604dcc80/java/tools/src/test/resources/orc-file-dump.json
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-dump.json b/java/tools/src/test/resources/orc-file-dump.json
index 3dd0dc0..b3e9d12 100644
--- a/java/tools/src/test/resources/orc-file-dump.json
+++ b/java/tools/src/test/resources/orc-file-dump.json
@@ -254,9 +254,9 @@
       "stripeNumber": 1,
       "stripeInformation": {
         "offset": 3,
-        "indexLength": 762,
+        "indexLength": 768,
         "dataLength": 63770,
-        "footerLength": 89,
+        "footerLength": 88,
         "rowCount": 5000
       },
       "streams": [
@@ -288,42 +288,42 @@
           "columnId": 3,
           "section": "BLOOM_FILTER_UTF8",
           "startOffset": 461,
-          "length": 304
+          "length": 310
         },
         {
           "columnId": 1,
           "section": "DATA",
-          "startOffset": 765,
+          "startOffset": 771,
           "length": 20035
         },
         {
           "columnId": 2,
           "section": "DATA",
-          "startOffset": 20800,
+          "startOffset": 20806,
           "length": 40050
         },
         {
           "columnId": 3,
           "section": "PRESENT",
-          "startOffset": 60850,
+          "startOffset": 60856,
           "length": 17
         },
         {
           "columnId": 3,
           "section": "DATA",
-          "startOffset": 60867,
+          "startOffset": 60873,
           "length": 3510
         },
         {
           "columnId": 3,
           "section": "LENGTH",
-          "startOffset": 64377,
+          "startOffset": 64383,
           "length": 25
         },
         {
           "columnId": 3,
           "section": "DICTIONARY_DATA",
-          "startOffset": 64402,
+          "startOffset": 64408,
           "length": 133
         }
       ],
@@ -494,8 +494,8 @@
     {
       "stripeNumber": 2,
       "stripeInformation": {
-        "offset": 64624,
-        "indexLength": 753,
+        "offset": 64629,
+        "indexLength": 759,
         "dataLength": 63763,
         "footerLength": 87,
         "rowCount": 5000
@@ -504,67 +504,67 @@
         {
           "columnId": 0,
           "section": "ROW_INDEX",
-          "startOffset": 64624,
+          "startOffset": 64629,
           "length": 17
         },
         {
           "columnId": 1,
           "section": "ROW_INDEX",
-          "startOffset": 64641,
+          "startOffset": 64646,
           "length": 166
         },
         {
           "columnId": 2,
           "section": "ROW_INDEX",
-          "startOffset": 64807,
+          "startOffset": 64812,
           "length": 166
         },
         {
           "columnId": 3,
           "section": "ROW_INDEX",
-          "startOffset": 64973,
+          "startOffset": 64978,
           "length": 100
         },
         {
           "columnId": 3,
           "section": "BLOOM_FILTER_UTF8",
-          "startOffset": 65073,
-          "length": 304
+          "startOffset": 65078,
+          "length": 310
         },
         {
           "columnId": 1,
           "section": "DATA",
-          "startOffset": 65377,
+          "startOffset": 65388,
           "length": 20035
         },
         {
           "columnId": 2,
           "section": "DATA",
-          "startOffset": 85412,
+          "startOffset": 85423,
           "length": 40050
         },
         {
           "columnId": 3,
           "section": "PRESENT",
-          "startOffset": 125462,
+          "startOffset": 125473,
           "length": 17
         },
         {
           "columnId": 3,
           "section": "DATA",
-          "startOffset": 125479,
+          "startOffset": 125490,
           "length": 3503
         },
         {
           "columnId": 3,
           "section": "LENGTH",
-          "startOffset": 128982,
+          "startOffset": 128993,
           "length": 25
         },
         {
           "columnId": 3,
           "section": "DICTIONARY_DATA",
-          "startOffset": 129007,
+          "startOffset": 129018,
           "length": 133
         }
       ],
@@ -735,77 +735,77 @@
     {
       "stripeNumber": 3,
       "stripeInformation": {
-        "offset": 129227,
-        "indexLength": 754,
+        "offset": 129238,
+        "indexLength": 760,
         "dataLength": 63770,
-        "footerLength": 89,
+        "footerLength": 88,
         "rowCount": 5000
       },
       "streams": [
         {
           "columnId": 0,
           "section": "ROW_INDEX",
-          "startOffset": 129227,
+          "startOffset": 129238,
           "length": 17
         },
         {
           "columnId": 1,
           "section": "ROW_INDEX",
-          "startOffset": 129244,
+          "startOffset": 129255,
           "length": 164
         },
         {
           "columnId": 2,
           "section": "ROW_INDEX",
-          "startOffset": 129408,
+          "startOffset": 129419,
           "length": 167
         },
         {
           "columnId": 3,
           "section": "ROW_INDEX",
-          "startOffset": 129575,
+          "startOffset": 129586,
           "length": 102
         },
         {
           "columnId": 3,
           "section": "BLOOM_FILTER_UTF8",
-          "startOffset": 129677,
-          "length": 304
+          "startOffset": 129688,
+          "length": 310
         },
         {
           "columnId": 1,
           "section": "DATA",
-          "startOffset": 129981,
+          "startOffset": 129998,
           "length": 20035
         },
         {
           "columnId": 2,
           "section": "DATA",
-          "startOffset": 150016,
+          "startOffset": 150033,
           "length": 40050
         },
         {
           "columnId": 3,
           "section": "PRESENT",
-          "startOffset": 190066,
+          "startOffset": 190083,
           "length": 17
         },
         {
           "columnId": 3,
           "section": "DATA",
-          "startOffset": 190083,
+          "startOffset": 190100,
           "length": 3510
         },
         {
           "columnId": 3,
           "section": "LENGTH",
-          "startOffset": 193593,
+          "startOffset": 193610,
           "length": 25
         },
         {
           "columnId": 3,
           "section": "DICTIONARY_DATA",
-          "startOffset": 193618,
+          "startOffset": 193635,
           "length": 133
         }
       ],
@@ -976,8 +976,8 @@
     {
       "stripeNumber": 4,
       "stripeInformation": {
-        "offset": 193840,
-        "indexLength": 765,
+        "offset": 193856,
+        "indexLength": 771,
         "dataLength": 63756,
         "footerLength": 89,
         "rowCount": 5000
@@ -986,67 +986,67 @@
         {
           "columnId": 0,
           "section": "ROW_INDEX",
-          "startOffset": 193840,
+          "startOffset": 193856,
           "length": 17
         },
         {
           "columnId": 1,
           "section": "ROW_INDEX",
-          "startOffset": 193857,
+          "startOffset": 193873,
           "length": 166
         },
         {
           "columnId": 2,
           "section": "ROW_INDEX",
-          "startOffset": 194023,
+          "startOffset": 194039,
           "length": 171
         },
         {
           "columnId": 3,
           "section": "ROW_INDEX",
-          "startOffset": 194194,
+          "startOffset": 194210,
           "length": 107
         },
         {
           "columnId": 3,
           "section": "BLOOM_FILTER_UTF8",
-          "startOffset": 194301,
-          "length": 304
+          "startOffset": 194317,
+          "length": 310
         },
         {
           "columnId": 1,
           "section": "DATA",
-          "startOffset": 194605,
+          "startOffset": 194627,
           "length": 20035
         },
         {
           "columnId": 2,
           "section": "DATA",
-          "startOffset": 214640,
+          "startOffset": 214662,
           "length": 40050
         },
         {
           "columnId": 3,
           "section": "PRESENT",
-          "startOffset": 254690,
+          "startOffset": 254712,
           "length": 17
         },
         {
           "columnId": 3,
           "section": "DATA",
-          "startOffset": 254707,
+          "startOffset": 254729,
           "length": 3496
         },
         {
           "columnId": 3,
           "section": "LENGTH",
-          "startOffset": 258203,
+          "startOffset": 258225,
           "length": 25
         },
         {
           "columnId": 3,
           "section": "DICTIONARY_DATA",
-          "startOffset": 258228,
+          "startOffset": 258250,
           "length": 133
         }
       ],
@@ -1217,8 +1217,8 @@
     {
       "stripeNumber": 5,
       "stripeInformation": {
-        "offset": 258450,
-        "indexLength": 383,
+        "offset": 258472,
+        "indexLength": 376,
         "dataLength": 12943,
         "footerLength": 83,
         "rowCount": 1000
@@ -1227,67 +1227,67 @@
         {
           "columnId": 0,
           "section": "ROW_INDEX",
-          "startOffset": 258450,
+          "startOffset": 258472,
           "length": 12
         },
         {
           "columnId": 1,
           "section": "ROW_INDEX",
-          "startOffset": 258462,
+          "startOffset": 258484,
           "length": 38
         },
         {
           "columnId": 2,
           "section": "ROW_INDEX",
-          "startOffset": 258500,
+          "startOffset": 258522,
           "length": 41
         },
         {
           "columnId": 3,
           "section": "ROW_INDEX",
-          "startOffset": 258541,
+          "startOffset": 258563,
           "length": 41
         },
         {
           "columnId": 3,
           "section": "BLOOM_FILTER_UTF8",
-          "startOffset": 258582,
-          "length": 251
+          "startOffset": 258604,
+          "length": 244
         },
         {
           "columnId": 1,
           "section": "DATA",
-          "startOffset": 258833,
+          "startOffset": 258848,
           "length": 4007
         },
         {
           "columnId": 2,
           "section": "DATA",
-          "startOffset": 262840,
+          "startOffset": 262855,
           "length": 8010
         },
         {
           "columnId": 3,
           "section": "PRESENT",
-          "startOffset": 270850,
+          "startOffset": 270865,
           "length": 16
         },
         {
           "columnId": 3,
           "section": "DATA",
-          "startOffset": 270866,
+          "startOffset": 270881,
           "length": 752
         },
         {
           "columnId": 3,
           "section": "LENGTH",
-          "startOffset": 271618,
+          "startOffset": 271633,
           "length": 25
         },
         {
           "columnId": 3,
           "section": "DICTIONARY_DATA",
-          "startOffset": 271643,
+          "startOffset": 271658,
           "length": 133
         }
       ],
@@ -1348,7 +1348,7 @@
       }]
     }
   ],
-  "fileLength": 272409,
+  "fileLength": 272428,
   "paddingLength": 0,
   "paddingRatio": 0,
   "status": "OK"

http://git-wip-us.apache.org/repos/asf/orc/blob/604dcc80/site/_data/releases.yml
----------------------------------------------------------------------
diff --git a/site/_data/releases.yml b/site/_data/releases.yml
index 3331688..1282115 100644
--- a/site/_data/releases.yml
+++ b/site/_data/releases.yml
@@ -9,6 +9,7 @@
   sha256: 5c394c7ed3a31d20726ded55ed9c5a0eeff1bd5b85b1cb2ee6c3c1a94560578c
   known-issues:
     ORC-40: Predicate push down is not implemented in C++.
+    ORC-101: Bloom filters for string and decimal use inconsistent encoding.
 
 1.1.2:
   date: 2016-07-08
@@ -19,6 +20,7 @@
   known-issues:
     HIVE-14214: Schema evolution and predicate pushdown don't work together.
     ORC-40: Predicate push down is not implemented in C++.
+    ORC-101: Bloom filters for string and decimal use inconsistent encoding.
 
 1.1.1:
   date: 2016-06-13
@@ -29,6 +31,7 @@
   known-issues:
     HIVE-14214: Schema evolution and predicate pushdown don't work together.
     ORC-40: Predicate push down is not implemented in C++.
+    ORC-101: Bloom filters for string and decimal use inconsistent encoding.
 
 1.1.0:
   date: 2016-06-10
@@ -39,6 +42,7 @@
   known-issues:
     HIVE-14214: Schema evolution and predicate pushdown don't work together.
     ORC-40: Predicate push down is not implemented in C++.
+    ORC-101: Bloom filters for string and decimal use inconsistent encoding.
 
 1.0.0:
   date: 2016-01-25

http://git-wip-us.apache.org/repos/asf/orc/blob/604dcc80/site/_docs/spec-index.md
----------------------------------------------------------------------
diff --git a/site/_docs/spec-index.md b/site/_docs/spec-index.md
index 009df59..263c9a8 100644
--- a/site/_docs/spec-index.md
+++ b/site/_docs/spec-index.md
@@ -57,14 +57,17 @@ group (default to 10,000 rows) in a column. Only the row 
groups that
 satisfy min/max row index evaluation will be evaluated against the
 bloom filter index.
 
-Each BloomFilterEntry stores the number of hash functions ('k') used and
-the bitset backing the bloom filter. The bitset is serialized as repeated
-longs from which the number of bits ('m') for the bloom filter can be derived.
-m = bitset.length * 64.
+Each BloomFilterEntry stores the number of hash functions ('k') used
+and the bitset backing the bloom filter. The original encoding (pre
+ORC-101) of bloom filters stored the bitset as a repeating sequence of
+longs in the bitset field with a little endian encoding (0x1 is bit 0
+and 0x2 is bit 1). After ORC-101, the bitset is stored as a sequence
+of bytes with a little endian encoding in the utf8bitset field.
 
 ```message BloomFilter {
  optional uint32 numHashFunctions = 1;
  repeated fixed64 bitset = 2;
+ optional bytes utf8bitset = 3;
 }
 ```
 

http://git-wip-us.apache.org/repos/asf/orc/blob/604dcc80/site/_docs/stripes.md
----------------------------------------------------------------------
diff --git a/site/_docs/stripes.md b/site/_docs/stripes.md
index d53f709..cc85feb 100644
--- a/site/_docs/stripes.md
+++ b/site/_docs/stripes.md
@@ -56,6 +56,10 @@ depends on the type and encoding of the column.
  SECONDARY = 5;
  // the index for seeking to particular row groups
  ROW_INDEX = 6;
+ // original bloom filters used before ORC-101
+ BLOOM_FILTER = 7;
+ // bloom filters that consistently use utf8
+ BLOOM_FILTER_UTF8 = 8;
  }
  required Kind kind = 1;
  // the column id

Reply via email to