This is an automated email from the ASF dual-hosted git repository.

planka pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/main by this push:
     new 7f90bbd65 ORC-1362: [Java] Enable ENABLE_INDEXES (#1387)
7f90bbd65 is described below

commit 7f90bbd65c1c45f3bf737f776ab1e52c3dc798b9
Author: Deshan Xiao <[email protected]>
AuthorDate: Tue Feb 21 07:36:46 2023 +0800

    ORC-1362: [Java] Enable ENABLE_INDEXES (#1387)
    
    ### What changes were proposed in this pull request?
    Enable the functionality behind `orc.create.index`.
    
    ### Why are the changes needed?
    In https://github.com/apache/orc/pull/1373, we disabled the configuration 
"orc.create.index" because the orc reader cannot work well when 
orc.create.index=false.
    
    During the write of a file with `orc.create.index=false` and 
`orc.row.index.stride > 0` no indexes are created in the ORC file. The 
`orc.row.index.stride` is persisted into the file metadata. During a read that 
requires the use of indexes e.g. in the presence of search arguments we get a 
read failure as the reader was expecting indexes based on the value of 
`orc.row.index.stride` but none exist.
    
    Two changes are proposed here:
    * Enable consistent writes i.e. assume orc.row.index.stride=0 if 
orc.create.index=false or if orc.row.index.stride >0 then assume 
orc.create.index=true
    * Enable the reader to handle the absence of row index information. During 
the process of selecting row groups in a stripe if the index information is 
seen as missing then we assume that all row groups should be read as the index 
information is missing.
    
    ### How was this patch tested?
    UT
---
 ...estSargSkipPickupGroupWithoutIndexCPlusPlus.orc | Bin 0 -> 307 bytes
 ...ile.testSargSkipPickupGroupWithoutIndexJava.orc | Bin 0 -> 26489 bytes
 java/core/src/java/org/apache/orc/OrcFile.java     |  15 +++++++++
 .../java/org/apache/orc/impl/RecordReaderImpl.java |   3 +-
 .../src/java/org/apache/orc/impl/WriterImpl.java   |   5 +--
 .../test/org/apache/orc/impl/TestReaderImpl.java   |  37 +++++++++++++++++++++
 .../test/org/apache/orc/impl/TestWriterImpl.java   |  18 ++++++++--
 7 files changed, 72 insertions(+), 6 deletions(-)

diff --git 
a/examples/TestOrcFile.testSargSkipPickupGroupWithoutIndexCPlusPlus.orc 
b/examples/TestOrcFile.testSargSkipPickupGroupWithoutIndexCPlusPlus.orc
new file mode 100644
index 000000000..d32bb1f1d
Binary files /dev/null and 
b/examples/TestOrcFile.testSargSkipPickupGroupWithoutIndexCPlusPlus.orc differ
diff --git a/examples/TestOrcFile.testSargSkipPickupGroupWithoutIndexJava.orc 
b/examples/TestOrcFile.testSargSkipPickupGroupWithoutIndexJava.orc
new file mode 100644
index 000000000..c57c5b732
Binary files /dev/null and 
b/examples/TestOrcFile.testSargSkipPickupGroupWithoutIndexJava.orc differ
diff --git a/java/core/src/java/org/apache/orc/OrcFile.java 
b/java/core/src/java/org/apache/orc/OrcFile.java
index a1a00e92f..fc164a977 100644
--- a/java/core/src/java/org/apache/orc/OrcFile.java
+++ b/java/core/src/java/org/apache/orc/OrcFile.java
@@ -564,6 +564,21 @@ public class OrcFile {
      */
     public WriterOptions rowIndexStride(int value) {
       rowIndexStrideValue = value;
+      if (rowIndexStrideValue <= 0) {
+        buildIndex = false;
+      }
+      return this;
+    }
+
+    /**
+     * Sets whether build the index. The default value is true. If the value is
+     * set to false, rowIndexStrideValue will be set to zero.
+     */
+    public WriterOptions buildIndex(boolean value) {
+      buildIndex = value;
+      if (!buildIndex) {
+        rowIndexStrideValue = 0;
+      }
       return this;
     }
 
diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java 
b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
index 5a478efed..def1fc169 100644
--- a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
@@ -1176,7 +1176,8 @@ public class RecordReaderImpl implements RecordReader {
             leafValues[pred] = exceptionAnswer[pred];
           } else {
             if (indexes[columnIx] == null) {
-              throw new AssertionError("Index is not populated for " + 
columnIx);
+              LOG.warn("Index is not populated for " + columnIx);
+              return READ_ALL_RGS;
             }
             OrcProto.RowIndexEntry entry = 
indexes[columnIx].getEntry(rowGroup);
             if (entry == null) {
diff --git a/java/core/src/java/org/apache/orc/impl/WriterImpl.java 
b/java/core/src/java/org/apache/orc/impl/WriterImpl.java
index 264b01a3f..73450c68f 100644
--- a/java/core/src/java/org/apache/orc/impl/WriterImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/WriterImpl.java
@@ -183,14 +183,15 @@ public class WriterImpl implements WriterInternal, 
MemoryManager.Callback {
     this.encodingStrategy = opts.getEncodingStrategy();
     this.compressionStrategy = opts.getCompressionStrategy();
 
-    if (opts.getRowIndexStride() >= 0) {
+    // ORC-1362: if isBuildIndex=false, then rowIndexStride will be set to 0.
+    if (opts.getRowIndexStride() >= 0 && opts.isBuildIndex()) {
       this.rowIndexStride = opts.getRowIndexStride();
     } else {
       this.rowIndexStride = 0;
     }
 
-    // ORC-1343: We ignore `opts.isBuildIndex` due to the lack of reader 
support
     this.buildIndex = rowIndexStride > 0;
+
     if (buildIndex && rowIndexStride < MIN_ROW_INDEX_STRIDE) {
       throw new IllegalArgumentException("Row stride must be at least " +
           MIN_ROW_INDEX_STRIDE);
diff --git a/java/core/src/test/org/apache/orc/impl/TestReaderImpl.java 
b/java/core/src/test/org/apache/orc/impl/TestReaderImpl.java
index 8369b03c0..528f04e4f 100644
--- a/java/core/src/test/org/apache/orc/impl/TestReaderImpl.java
+++ b/java/core/src/test/org/apache/orc/impl/TestReaderImpl.java
@@ -527,4 +527,41 @@ public class TestReaderImpl {
       }
     }
   }
+
+  @Test
+  public void testSargSkipPickupGroupWithoutIndex() throws IOException {
+    Configuration conf = new Configuration();
+    // We use ORC files in two languages to test, the previous Java version 
could not work
+    // well when orc.row.index.stride > 0 and orc.create.index=false, now it 
can skip these row groups.
+    Path[] paths = new Path[] {
+        // Writen by C++ API with schema struct<x:int,y:string> 
orc.row.index.stride=0
+        new Path(workDir, 
"TestOrcFile.testSargSkipPickupGroupWithoutIndexCPlusPlus.orc"),
+        // Writen by old Java API with schema struct<x:int,y:string> 
orc.row.index.stride=1000,orc.create.index=false
+        new Path(workDir, 
"TestOrcFile.testSargSkipPickupGroupWithoutIndexJava.orc"),
+    };
+    for (Path path: paths) {
+      FileSystem fs = path.getFileSystem(conf);
+      try (ReaderImpl reader = (ReaderImpl) OrcFile.createReader(path,
+          OrcFile.readerOptions(conf).filesystem(fs))) {
+
+        SearchArgument sarg = SearchArgumentFactory.newBuilder()
+            .startNot()
+            .lessThan("x", PredicateLeaf.Type.LONG, 100L)
+            .end().build();
+
+        try (RecordReader rows = 
reader.rows(reader.options().searchArgument(sarg, new String[]{"x"}))) {
+          TypeDescription schema = reader.getSchema();
+          assertEquals("struct<x:int,y:string>", schema.toString());
+          VectorizedRowBatch batch = schema.createRowBatchV2();
+          assertTrue(rows.nextBatch(batch), "No rows read out!");
+          assertEquals(1024, batch.size);
+          LongColumnVector col1 = (LongColumnVector) batch.cols[0];
+          for (int i = 0; i < batch.size; ++i) {
+            assertEquals(i, col1.vector[i]);
+          }
+          assertTrue(rows.nextBatch(batch));
+        }
+      }
+    }
+  }
 }
diff --git a/java/core/src/test/org/apache/orc/impl/TestWriterImpl.java 
b/java/core/src/test/org/apache/orc/impl/TestWriterImpl.java
index 679fb6cb8..e5d2616cc 100644
--- a/java/core/src/test/org/apache/orc/impl/TestWriterImpl.java
+++ b/java/core/src/test/org/apache/orc/impl/TestWriterImpl.java
@@ -32,13 +32,13 @@ import org.apache.orc.Writer;
 import org.apache.orc.*;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 
 import java.io.IOException;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertFalse;
 
 public class TestWriterImpl {
 
@@ -94,11 +94,10 @@ public class TestWriterImpl {
     w.close();
   }
 
-  @Disabled("ORC-1343: Disable ENABLE_INDEXES tests until reader supports it 
properly")
   @Test
   public void testNoIndexIfEnableIndexIsFalse() throws Exception {
     conf.set(OrcConf.OVERWRITE_OUTPUT_FILE.getAttribute(), "true");
-    conf.set(OrcConf.ROW_INDEX_STRIDE.getAttribute(), "1000");
+    conf.set(OrcConf.ROW_INDEX_STRIDE.getAttribute(), "0");
     conf.setBoolean(OrcConf.ENABLE_INDEXES.getAttribute(), false);
     VectorizedRowBatch b = schema.createRowBatch();
     LongColumnVector f1 = (LongColumnVector) b.cols[0];
@@ -121,6 +120,19 @@ public class TestWriterImpl {
     }
   }
 
+  @Test
+  public void testEnableDisableIndex() {
+    conf.set(OrcConf.ROW_INDEX_STRIDE.getAttribute(), "10000");
+    OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(conf);
+    writerOptions.buildIndex(false);
+    assertEquals(writerOptions.getRowIndexStride(), 0);
+
+    conf.set(OrcConf.ENABLE_INDEXES.getAttribute(), "true");
+    OrcFile.WriterOptions writerOptions2 = OrcFile.writerOptions(conf);
+    writerOptions2.rowIndexStride(0);
+    assertFalse(writerOptions2.isBuildIndex());
+  }
+
   @Test
   public void testStripes() throws Exception {
     conf.set(OrcConf.OVERWRITE_OUTPUT_FILE.getAttribute(), "true");

Reply via email to