This is an automated email from the ASF dual-hosted git repository.
planka pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/main by this push:
new 7f90bbd65 ORC-1362: [Java] Enable ENABLE_INDEXES (#1387)
7f90bbd65 is described below
commit 7f90bbd65c1c45f3bf737f776ab1e52c3dc798b9
Author: Deshan Xiao <[email protected]>
AuthorDate: Tue Feb 21 07:36:46 2023 +0800
ORC-1362: [Java] Enable ENABLE_INDEXES (#1387)
### What changes were proposed in this pull request?
Enable the functionality behind `orc.create.index`.
### Why are the changes needed?
In https://github.com/apache/orc/pull/1373, we disabled the configuration
"orc.create.index" because the orc reader cannot work well when
orc.create.index=false.
During the write of a file with `orc.create.index=false` and
`orc.row.index.stride > 0` no indexes are created in the ORC file. The
`orc.row.index.stride` is persisted into the file metadata. During a read that
requires the use of indexes e.g. in the presence of search arguments we get a
read failure as the reader was expecting indexes based on the value of
`orc.row.index.stride` but none exist.
Two changes are proposed here:
* Enable consistent writes i.e. assume orc.row.index.stride=0 if
orc.create.index=false or if orc.row.index.stride >0 then assume
orc.create.index=true
* Enable the reader to handle the absence of row index information. During
the process of selecting row groups in a stripe if the index information is
seen as missing then we assume that all row groups should be read as the index
information is missing.
### How was this patch tested?
UT
---
...estSargSkipPickupGroupWithoutIndexCPlusPlus.orc | Bin 0 -> 307 bytes
...ile.testSargSkipPickupGroupWithoutIndexJava.orc | Bin 0 -> 26489 bytes
java/core/src/java/org/apache/orc/OrcFile.java | 15 +++++++++
.../java/org/apache/orc/impl/RecordReaderImpl.java | 3 +-
.../src/java/org/apache/orc/impl/WriterImpl.java | 5 +--
.../test/org/apache/orc/impl/TestReaderImpl.java | 37 +++++++++++++++++++++
.../test/org/apache/orc/impl/TestWriterImpl.java | 18 ++++++++--
7 files changed, 72 insertions(+), 6 deletions(-)
diff --git
a/examples/TestOrcFile.testSargSkipPickupGroupWithoutIndexCPlusPlus.orc
b/examples/TestOrcFile.testSargSkipPickupGroupWithoutIndexCPlusPlus.orc
new file mode 100644
index 000000000..d32bb1f1d
Binary files /dev/null and
b/examples/TestOrcFile.testSargSkipPickupGroupWithoutIndexCPlusPlus.orc differ
diff --git a/examples/TestOrcFile.testSargSkipPickupGroupWithoutIndexJava.orc
b/examples/TestOrcFile.testSargSkipPickupGroupWithoutIndexJava.orc
new file mode 100644
index 000000000..c57c5b732
Binary files /dev/null and
b/examples/TestOrcFile.testSargSkipPickupGroupWithoutIndexJava.orc differ
diff --git a/java/core/src/java/org/apache/orc/OrcFile.java
b/java/core/src/java/org/apache/orc/OrcFile.java
index a1a00e92f..fc164a977 100644
--- a/java/core/src/java/org/apache/orc/OrcFile.java
+++ b/java/core/src/java/org/apache/orc/OrcFile.java
@@ -564,6 +564,21 @@ public class OrcFile {
*/
public WriterOptions rowIndexStride(int value) {
rowIndexStrideValue = value;
+ if (rowIndexStrideValue <= 0) {
+ buildIndex = false;
+ }
+ return this;
+ }
+
+ /**
+ * Sets whether build the index. The default value is true. If the value is
+ * set to false, rowIndexStrideValue will be set to zero.
+ */
+ public WriterOptions buildIndex(boolean value) {
+ buildIndex = value;
+ if (!buildIndex) {
+ rowIndexStrideValue = 0;
+ }
return this;
}
diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
index 5a478efed..def1fc169 100644
--- a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
@@ -1176,7 +1176,8 @@ public class RecordReaderImpl implements RecordReader {
leafValues[pred] = exceptionAnswer[pred];
} else {
if (indexes[columnIx] == null) {
- throw new AssertionError("Index is not populated for " +
columnIx);
+ LOG.warn("Index is not populated for " + columnIx);
+ return READ_ALL_RGS;
}
OrcProto.RowIndexEntry entry =
indexes[columnIx].getEntry(rowGroup);
if (entry == null) {
diff --git a/java/core/src/java/org/apache/orc/impl/WriterImpl.java
b/java/core/src/java/org/apache/orc/impl/WriterImpl.java
index 264b01a3f..73450c68f 100644
--- a/java/core/src/java/org/apache/orc/impl/WriterImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/WriterImpl.java
@@ -183,14 +183,15 @@ public class WriterImpl implements WriterInternal,
MemoryManager.Callback {
this.encodingStrategy = opts.getEncodingStrategy();
this.compressionStrategy = opts.getCompressionStrategy();
- if (opts.getRowIndexStride() >= 0) {
+ // ORC-1362: if isBuildIndex=false, then rowIndexStride will be set to 0.
+ if (opts.getRowIndexStride() >= 0 && opts.isBuildIndex()) {
this.rowIndexStride = opts.getRowIndexStride();
} else {
this.rowIndexStride = 0;
}
- // ORC-1343: We ignore `opts.isBuildIndex` due to the lack of reader
support
this.buildIndex = rowIndexStride > 0;
+
if (buildIndex && rowIndexStride < MIN_ROW_INDEX_STRIDE) {
throw new IllegalArgumentException("Row stride must be at least " +
MIN_ROW_INDEX_STRIDE);
diff --git a/java/core/src/test/org/apache/orc/impl/TestReaderImpl.java
b/java/core/src/test/org/apache/orc/impl/TestReaderImpl.java
index 8369b03c0..528f04e4f 100644
--- a/java/core/src/test/org/apache/orc/impl/TestReaderImpl.java
+++ b/java/core/src/test/org/apache/orc/impl/TestReaderImpl.java
@@ -527,4 +527,41 @@ public class TestReaderImpl {
}
}
}
+
+ @Test
+ public void testSargSkipPickupGroupWithoutIndex() throws IOException {
+ Configuration conf = new Configuration();
+ // We use ORC files in two languages to test, the previous Java version
could not work
+ // well when orc.row.index.stride > 0 and orc.create.index=false, now it
can skip these row groups.
+ Path[] paths = new Path[] {
+ // Writen by C++ API with schema struct<x:int,y:string>
orc.row.index.stride=0
+ new Path(workDir,
"TestOrcFile.testSargSkipPickupGroupWithoutIndexCPlusPlus.orc"),
+ // Writen by old Java API with schema struct<x:int,y:string>
orc.row.index.stride=1000,orc.create.index=false
+ new Path(workDir,
"TestOrcFile.testSargSkipPickupGroupWithoutIndexJava.orc"),
+ };
+ for (Path path: paths) {
+ FileSystem fs = path.getFileSystem(conf);
+ try (ReaderImpl reader = (ReaderImpl) OrcFile.createReader(path,
+ OrcFile.readerOptions(conf).filesystem(fs))) {
+
+ SearchArgument sarg = SearchArgumentFactory.newBuilder()
+ .startNot()
+ .lessThan("x", PredicateLeaf.Type.LONG, 100L)
+ .end().build();
+
+ try (RecordReader rows =
reader.rows(reader.options().searchArgument(sarg, new String[]{"x"}))) {
+ TypeDescription schema = reader.getSchema();
+ assertEquals("struct<x:int,y:string>", schema.toString());
+ VectorizedRowBatch batch = schema.createRowBatchV2();
+ assertTrue(rows.nextBatch(batch), "No rows read out!");
+ assertEquals(1024, batch.size);
+ LongColumnVector col1 = (LongColumnVector) batch.cols[0];
+ for (int i = 0; i < batch.size; ++i) {
+ assertEquals(i, col1.vector[i]);
+ }
+ assertTrue(rows.nextBatch(batch));
+ }
+ }
+ }
+ }
}
diff --git a/java/core/src/test/org/apache/orc/impl/TestWriterImpl.java
b/java/core/src/test/org/apache/orc/impl/TestWriterImpl.java
index 679fb6cb8..e5d2616cc 100644
--- a/java/core/src/test/org/apache/orc/impl/TestWriterImpl.java
+++ b/java/core/src/test/org/apache/orc/impl/TestWriterImpl.java
@@ -32,13 +32,13 @@ import org.apache.orc.Writer;
import org.apache.orc.*;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertFalse;
public class TestWriterImpl {
@@ -94,11 +94,10 @@ public class TestWriterImpl {
w.close();
}
- @Disabled("ORC-1343: Disable ENABLE_INDEXES tests until reader supports it
properly")
@Test
public void testNoIndexIfEnableIndexIsFalse() throws Exception {
conf.set(OrcConf.OVERWRITE_OUTPUT_FILE.getAttribute(), "true");
- conf.set(OrcConf.ROW_INDEX_STRIDE.getAttribute(), "1000");
+ conf.set(OrcConf.ROW_INDEX_STRIDE.getAttribute(), "0");
conf.setBoolean(OrcConf.ENABLE_INDEXES.getAttribute(), false);
VectorizedRowBatch b = schema.createRowBatch();
LongColumnVector f1 = (LongColumnVector) b.cols[0];
@@ -121,6 +120,19 @@ public class TestWriterImpl {
}
}
+ @Test
+ public void testEnableDisableIndex() {
+ conf.set(OrcConf.ROW_INDEX_STRIDE.getAttribute(), "10000");
+ OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(conf);
+ writerOptions.buildIndex(false);
+ assertEquals(writerOptions.getRowIndexStride(), 0);
+
+ conf.set(OrcConf.ENABLE_INDEXES.getAttribute(), "true");
+ OrcFile.WriterOptions writerOptions2 = OrcFile.writerOptions(conf);
+ writerOptions2.rowIndexStride(0);
+ assertFalse(writerOptions2.isBuildIndex());
+ }
+
@Test
public void testStripes() throws Exception {
conf.set(OrcConf.OVERWRITE_OUTPUT_FILE.getAttribute(), "true");