This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/main by this push:
new 71dd81b4a ORC-1948: Fix `GeospatialTreeWriter#writeBatch` updating
ColumnStatistics with incorrect values
71dd81b4a is described below
commit 71dd81b4a2ea16cec2f68c8ca85b91d8a8532698
Author: Bradley <[email protected]>
AuthorDate: Tue Jul 8 08:51:12 2025 -0700
ORC-1948: Fix `GeospatialTreeWriter#writeBatch` updating ColumnStatistics
with incorrect values
### What changes were proposed in this pull request?
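Fix the values used to update column statistics for the geometry type:
`GeospatialTreeWriter#writeBatch` now passes `vec.vector[offset + i]`,
`vec.start[offset + i]` and `vec.length[offset + i]` to
`indexStatistics.updateGeometry` instead of indexing with `i` alone. An unused
`BytesWritable` local and its import are removed as well.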
### Why are the changes needed?
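`GeospatialTreeWriter#writeBatch` uses incorrect values to update column
statistics when `vector.isRepeating` is false: it called
`indexStatistics.updateGeometry` with `vec.vector[i]`, `vec.start[i]` and
`vec.length[i]`, ignoring `offset`. Whenever `offset` is greater than 0, the
statistics were therefore computed from different rows than the ones written
to the stream.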
### How was this patch tested?
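The new parameterized unit test `TestWriterImpl#testGeospatialColumnStatistics`
writes four points and checks the resulting bounding box in two cases: the
whole batch is written (the `offset` parameter in
`GeospatialTreeWriter#writeBatch` is 0), and only row 2 is selected through the
batch filter (the `offset` parameter is greater than 0).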
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #2319 from usberkeley/ORC-1948.
Authored-by: Bradley <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
---
.../orc/impl/writer/GeospatialTreeWriter.java | 6 +--
.../test/org/apache/orc/impl/TestWriterImpl.java | 60 ++++++++++++++++++++++
2 files changed, 62 insertions(+), 4 deletions(-)
diff --git
a/java/core/src/java/org/apache/orc/impl/writer/GeospatialTreeWriter.java
b/java/core/src/java/org/apache/orc/impl/writer/GeospatialTreeWriter.java
index e9a0aa70b..676ca32a9 100644
--- a/java/core/src/java/org/apache/orc/impl/writer/GeospatialTreeWriter.java
+++ b/java/core/src/java/org/apache/orc/impl/writer/GeospatialTreeWriter.java
@@ -20,7 +20,6 @@ package org.apache.orc.impl.writer;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
-import org.apache.hadoop.io.BytesWritable;
import org.apache.orc.OrcProto;
import org.apache.orc.TypeDescription;
import org.apache.orc.impl.CryptoUtils;
@@ -96,10 +95,9 @@ public class GeospatialTreeWriter extends TreeWriterBase {
vec.start[offset + i], vec.length[offset + i]);
this.length.write(vec.length[offset + i]);
rawDataSize += vec.length[offset + i];
- BytesWritable bw = new BytesWritable();
- bw.set(vec.vector[offset + i], vec.start[offset + i],
vec.length[offset + i]);
if (isGeometry) {
- indexStatistics.updateGeometry(vec.vector[i], vec.start[i],
vec.length[i]);
+ indexStatistics.updateGeometry(vec.vector[offset + i],
+ vec.start[offset + i], vec.length[offset + i]);
}
if (createBloomFilter) {
if (bloomFilter != null) {
diff --git a/java/core/src/test/org/apache/orc/impl/TestWriterImpl.java
b/java/core/src/test/org/apache/orc/impl/TestWriterImpl.java
index 903e4e80c..58236502d 100644
--- a/java/core/src/test/org/apache/orc/impl/TestWriterImpl.java
+++ b/java/core/src/test/org/apache/orc/impl/TestWriterImpl.java
@@ -21,6 +21,7 @@ package org.apache.orc.impl;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcConf;
@@ -29,9 +30,15 @@ import org.apache.orc.Reader;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;
import org.apache.orc.*;
+import org.apache.orc.geospatial.BoundingBox;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.ValueSource;
+import org.locationtech.jts.io.ParseException;
+import org.locationtech.jts.io.WKBWriter;
+import org.locationtech.jts.io.WKTReader;
import java.io.IOException;
@@ -178,6 +185,59 @@ public class TestWriterImpl implements TestConf {
assertEquals(10, w.getStripes().size());
}
+ @ParameterizedTest
+ @ValueSource(booleans = {true, false})
+ public void testGeospatialColumnStatistics(boolean useFilter) throws
IOException, ParseException {
+ conf.set(OrcConf.OVERWRITE_OUTPUT_FILE.getAttribute(), "true");
+ // Use the Geometry type
+ schema = TypeDescription.createGeometry();
+ Writer writer = OrcFile.createWriter(testFilePath,
OrcFile.writerOptions(conf).setSchema(schema));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ BytesColumnVector geomColumn = (BytesColumnVector) batch.cols[0];
+
+ WKTReader wktReader = new WKTReader();
+ WKBWriter wkbWriter = new WKBWriter();
+ byte[] point1 = wkbWriter.write(wktReader.read("POINT (1 2)"));
+ byte[] point2 = wkbWriter.write(wktReader.read("POINT (3 4)"));
+ byte[] point3 = wkbWriter.write(wktReader.read("POINT (5 6)"));
+ byte[] point4 = wkbWriter.write(wktReader.read("POINT (7 8)"));
+
+ geomColumn.setVal(0, point1);
+ geomColumn.setVal(1, point2);
+ geomColumn.setVal(2, point3);
+ geomColumn.setVal(3, point4);
+
+ if (useFilter) {
+ int[] selected = {2};
+ batch.setFilterContext(true, selected, selected.length);
+ } else {
+ batch.size = 4;
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
OrcFile.readerOptions(conf));
+ ColumnStatistics[] statistics = reader.getStatistics();
+ GeospatialColumnStatistics geometryStatistics =
(GeospatialColumnStatistics) statistics[0];
+ BoundingBox bbox = geometryStatistics.getBoundingBox();
+ if (useFilter) {
+ assertEquals(5.0, bbox.getXMin());
+ assertEquals(5.0, bbox.getXMax());
+ assertEquals(6.0, bbox.getYMin());
+ assertEquals(6.0, bbox.getYMax());
+ } else {
+ assertEquals(1.0, bbox.getXMin());
+ assertEquals(7.0, bbox.getXMax());
+ assertEquals(2.0, bbox.getYMin());
+ assertEquals(8.0, bbox.getYMax());
+ }
+ assertEquals(Double.NaN, bbox.getZMin());
+ assertEquals(Double.NaN, bbox.getZMax());
+ assertEquals(Double.NaN, bbox.getMMin());
+ assertEquals(Double.NaN, bbox.getMMax());
+ reader.close();
+ }
+
@Test
public void testCloseIsIdempotent() throws IOException {
conf.set(OrcConf.OVERWRITE_OUTPUT_FILE.getAttribute(), "true");