Re: [PR] [VL] Support mapping columns by index for ORC and Parquet files [incubator-gluten]

via GitHub Thu, 25 Sep 2025 22:30:25 -0700


rui-mo commented on code in PR #10697:
URL: https://github.com/apache/incubator-gluten/pull/10697#discussion_r2376980463



##########
backends-velox/src/test/scala/org/apache/gluten/execution/VeloxScanSuite.scala:
##########
@@ -207,4 +207,64 @@ class VeloxScanSuite extends VeloxWholeStageTransformerSuite {
         }
     }
   }
+
+  test("parquet index based schema evolution") {
+    withSQLConf(VeloxConfig.PARQUET_USE_COLUMN_NAMES.key -> "false") {
+      withTempDir {
+        dir =>
+          val path = dir.getCanonicalPath
+          spark
+            .range(2)
+            .selectExpr("id as a", "cast(id + 10 as string) as b")
+            .write
+            .mode("overwrite")
+            .parquet(path)
+
+          withTable("test") {
+            sql("create table test (c long, d string) using parquet options (path '" + path + "')")
+            var df = sql("select c, d from test")
+            checkAnswer(df, Seq(Row(0L, "10"), Row(1L, "11")))
+
+            df = sql("select d from test")
+            checkAnswer(df, Seq(Row("10"), Row("11")))
+
+            df = sql("select c from test")
+            checkAnswer(df, Seq(Row(0L), Row(1L)))
+
+            df = sql("select d, c from test")
+            checkAnswer(df, Seq(Row("10", 0L), Row("11", 1L)))
+          }
+      }
+    }
+  }
+
+  test("ORC index based schema evolution") {
+    withSQLConf(VeloxConfig.ORC_USE_COLUMN_NAMES.key -> "false") {
+      withTempDir {
+        dir =>
+          val path = dir.getCanonicalPath
+          spark
+            .range(2)
+            .selectExpr("id as a", "cast(id + 10 as string) as b")
+            .write
+            .mode("overwrite")
+            .orc(path)
+
+          withTable("test") {
+            sql("create table test (c long, d string) using orc options (path '" + path + "')")
+            var df = sql("select c, d from test")
+            checkAnswer(df, Seq(Row(0L, "10"), Row(1L, "11")))
+
+            df = sql("select d from test")
+            checkAnswer(df, Seq(Row("10"), Row("11")))
+
+            df = sql("select c from test")
+            checkAnswer(df, Seq(Row(0L), Row(1L)))
+
+            df = sql("select d, c from test")
+            checkAnswer(df, Seq(Row("10", 0L), Row("11", 1L)))
+          }

Review Comment:
   Can we also add tests that select more columns than the data file contains?



##########
cpp/velox/substrait/SubstraitToVeloxPlan.cc:
##########
@@ -1272,26 +1272,30 @@ core::PlanNodePtr SubstraitToVeloxPlanConverter::toVeloxPlan(const ::substrait::
     SubstraitParser::parseColumnTypes(baseSchema, columnTypes);
   }
 
-  // Velox requires Filter Pushdown must being enabled.
-  bool filterPushdownEnabled = true;
   auto names = colNameList;
   auto types = veloxTypeList;
-  auto dataColumns = ROW(std::move(names), std::move(types));
+  // The columns we project from the file.
+  auto baseSchema = ROW(std::move(names), std::move(types));
+  // The columns present in the table, if not available default to the baseSchema.
+  auto tableSchema = splitInfo->tableSchema ? splitInfo->tableSchema : baseSchema;

Review Comment:
   > we need to pass the full table schema to the HiveTableHandle,
   
   Is the table schema set only when mapping by index?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] [VL] Support mapping columns by index for ORC and Parquet files [incubator-gluten]

Reply via email to