This is an automated email from the ASF dual-hosted git repository. gengliang pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 8e76c2142b3 [SPARK-39294][SQL] Support vectorized Orc scans with DEFAULT values 8e76c2142b3 is described below commit 8e76c2142b382410f1c0091d873b2ee84e9cbd62 Author: Daniel Tenedorio <daniel.tenedo...@databricks.com> AuthorDate: Fri Jun 3 13:48:27 2022 -0700 [SPARK-39294][SQL] Support vectorized Orc scans with DEFAULT values ### What changes were proposed in this pull request? Support vectorized Orc scans when the table schema has associated DEFAULT column values. (Note, this PR depends on https://github.com/apache/spark/pull/36672 which adds the same for Parquet files.) Example: ``` create table t(i int) using orc; insert into t values(42); alter table t add column s string default concat('abc', 'def'); select * from t; > 42, 'abcdef' ``` ### Why are the changes needed? This change makes it easier to build, query, and maintain tables backed by Orc data. ### Does this PR introduce _any_ user-facing change? Yes. ### How was this patch tested? This PR includes new test coverage. Closes #36675 from dtenedor/default-orc-vectorized. 
Authored-by: Daniel Tenedorio <daniel.tenedo...@databricks.com> Signed-off-by: Gengliang Wang <gengli...@apache.org> --- .../execution/datasources/orc/OrcColumnarBatchReader.java | 12 +++++++++++- .../scala/org/apache/spark/sql/sources/InsertSuite.scala | 2 ++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java index 40ed0b2454c..175ad37aace 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java @@ -164,6 +164,7 @@ public class OrcColumnarBatchReader extends RecordReader<Void, ColumnarBatch> { // Just wrap the ORC column vector instead of copying it to Spark column vector. orcVectorWrappers = new org.apache.spark.sql.vectorized.ColumnVector[resultSchema.length()]; + StructType requiredSchema = new StructType(requiredFields); for (int i = 0; i < requiredFields.length; i++) { DataType dt = requiredFields[i].dataType(); if (requestedPartitionColIds[i] != -1) { @@ -176,7 +177,16 @@ public class OrcColumnarBatchReader extends RecordReader<Void, ColumnarBatch> { // Initialize the missing columns once. if (colId == -1) { OnHeapColumnVector missingCol = new OnHeapColumnVector(capacity, dt); - missingCol.putNulls(0, capacity); + // Check if the missing column has an associated default value in the schema metadata. + // If so, fill the corresponding column vector with the value. 
+ Object defaultValue = requiredSchema.existenceDefaultValues()[i]; + if (defaultValue == null) { + missingCol.putNulls(0, capacity); + } else if (!missingCol.appendObjects(capacity, defaultValue).isPresent()) { + throw new IllegalArgumentException("Cannot assign default column value to result " + + "column batch in vectorized Orc reader because the data type is not supported: " + + defaultValue); + } missingCol.setIsConstant(); orcVectorWrappers[i] = missingCol; } else { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala index 35a6f8f8a0b..1b70998c642 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala @@ -1608,6 +1608,8 @@ class InsertSuite extends DataSourceTest with SharedSparkSession { TestCase( dataSource = "orc", Seq( + Config( + None), Config( Some(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false"), insertNullsToStorage = false))), --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org