This is an automated email from the ASF dual-hosted git repository.
philo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new e121903e5f [GLUTEN-11088][VL] Fall back CSV reader (#11190)
e121903e5f is described below
commit e121903e5f4e4e86b3c52c08a22692fd646badec
Author: Jin Chengcheng <[email protected]>
AuthorDate: Mon Jan 19 16:36:40 2026 +0800
[GLUTEN-11088][VL] Fall back CSV reader (#11190)
---
.../gluten/backendsapi/velox/VeloxRuleApi.scala | 3 -
.../gluten/utils/velox/VeloxTestSettings.scala | 119 +++++++++++----------
.../execution/datasources/csv/GlutenCSVSuite.scala | 4 +
3 files changed, 66 insertions(+), 60 deletions(-)
diff --git a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxRuleApi.scala b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxRuleApi.scala
index 3bf030bf2c..fcc00389d5 100644
--- a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxRuleApi.scala
+++ b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxRuleApi.scala
@@ -65,7 +65,6 @@ object VeloxRuleApi {
injector.injectOptimizerRule(HLLRewriteRule.apply)
injector.injectOptimizerRule(CollapseGetJsonObjectExpressionRule.apply)
injector.injectOptimizerRule(RewriteCastFromArray.apply)
- injector.injectPostHocResolutionRule(ArrowConvertorRule.apply)
injector.injectOptimizerRule(RewriteUnboundedWindow.apply)
if (BackendsApiManager.getSettings.supportAppendDataExec()) {
injector.injectPlannerStrategy(SparkShimLoader.getSparkShims.getRewriteCreateTableAsSelect(_))
@@ -89,7 +88,6 @@ object VeloxRuleApi {
BloomFilterMightContainJointRewriteRule.apply(
c.session,
c.caller.isBloomFilterStatFunction()))
- injector.injectPreTransform(c => ArrowScanReplaceRule.apply(c.session))
injector.injectPreTransform(_ => EliminateRedundantGetTimestamp)
// Legacy: The legacy transform rule.
@@ -172,7 +170,6 @@ object VeloxRuleApi {
BloomFilterMightContainJointRewriteRule.apply(
c.session,
c.caller.isBloomFilterStatFunction()))
- injector.injectPreTransform(c => ArrowScanReplaceRule.apply(c.session))
injector.injectPreTransform(_ => EliminateRedundantGetTimestamp)
// Gluten RAS: The RAS rule.
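With ArrowConvertorRule and ArrowScanReplaceRule no longer injected, CSV scans are planned by vanilla Spark rather than routed through the Arrow-backed reader. As a minimal sketch of how the fallback can be observed from a local session (the object name, app name, and file path below are placeholders, not part of this change):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.FileSourceScanExec

object CsvFallbackCheck {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[1]")
      .appName("csv-fallback-check") // placeholder
      .getOrCreate()
    // Hypothetical path; any CSV file works for this check.
    val df = spark.read.option("header", "true").csv("/tmp/cars.csv")
    // After this change the physical plan is expected to contain Spark's
    // own FileSourceScanExec rather than an Arrow-based scan node.
    val fellBack = df.queryExecution.executedPlan.collect {
      case scan: FileSourceScanExec => scan
    }.nonEmpty
    println(s"CSV scan planned by vanilla Spark: $fellBack")
    spark.stop()
  }
}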
diff --git a/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
index 614a6b70c1..8ce9933df0 100644
--- a/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
+++ b/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
@@ -27,6 +27,7 @@ import org.apache.spark.sql.execution._
import org.apache.spark.sql.execution.adaptive.velox.VeloxAdaptiveQueryExecSuite
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.execution.datasources.binaryfile.GlutenBinaryFileFormatSuite
+import org.apache.spark.sql.execution.datasources.csv.{GlutenCSVLegacyTimeParserSuite, GlutenCSVv1Suite, GlutenCSVv2Suite}
import org.apache.spark.sql.execution.datasources.json.{GlutenJsonLegacyTimeParserSuite, GlutenJsonV1Suite, GlutenJsonV2Suite}
import org.apache.spark.sql.execution.datasources.orc._
import org.apache.spark.sql.execution.datasources.parquet._
@@ -234,61 +235,66 @@ class VeloxTestSettings extends BackendTestSettings {
enableSuite[GlutenBinaryFileFormatSuite]
// Exception.
.exclude("column pruning - non-readable file")
- // TODO: fix in Spark-4.0
- // enableSuite[GlutenCSVv1Suite]
- // // file cars.csv include null string, Arrow not support to read
- // .exclude("DDL test with schema")
- // .exclude("save csv")
- // .exclude("save csv with compression codec option")
- // .exclude("save csv with empty fields with user defined empty values")
- // .exclude("save csv with quote")
- // .exclude("SPARK-13543 Write the output as uncompressed via option()")
- // .exclude("DDL test with tab separated file")
- // .exclude("DDL test parsing decimal type")
- // .exclude("test with tab delimiter and double quote")
- // // Arrow not support corrupt record
- // .exclude("SPARK-27873: disabling enforceSchema should not fail
columnNameOfCorruptRecord")
- // // varchar
- // .exclude("SPARK-48241: CSV parsing failure with char/varchar type
columns")
- // // Flaky and already excluded in other cases
- // .exclude("Gluten - test for FAILFAST parsing mode")
+ enableSuite[GlutenCSVv1Suite]
+ // file cars.csv includes a null string, which Arrow cannot read
+ .exclude("DDL test with schema")
+ .exclude("save csv")
+ .exclude("save csv with compression codec option")
+ .exclude("save csv with empty fields with user defined empty values")
+ .exclude("save csv with quote")
+ .exclude("SPARK-13543 Write the output as uncompressed via option()")
+ .exclude("DDL test with tab separated file")
+ .exclude("DDL test parsing decimal type")
+ .exclude("test with tab delimiter and double quote")
+ .exclude("when mode is null, will fall back to PermissiveMode mode")
+ .exclude("SPARK-46890: CSV fails on a column with default and without
enforcing schema")
+ // Arrow not support corrupt record
+ .exclude("SPARK-27873: disabling enforceSchema should not fail
columnNameOfCorruptRecord")
+ // varchar
+ .exclude("SPARK-48241: CSV parsing failure with char/varchar type columns")
+ // Flaky and already excluded in other cases
+ .exclude("Gluten - test for FAILFAST parsing mode")
- // enableSuite[GlutenCSVv2Suite]
- // .exclude("Gluten - test for FAILFAST parsing mode")
- // // Rule org.apache.spark.sql.execution.datasources.v2.V2ScanRelationPushDown in batch
- // // Early Filter and Projection Push-Down generated an invalid plan
- // .exclude("SPARK-26208: write and read empty data to csv file with headers")
- // // file cars.csv include null string, Arrow not support to read
- // .exclude("old csv data source name works")
- // .exclude("DDL test with schema")
- // .exclude("save csv")
- // .exclude("save csv with compression codec option")
- // .exclude("save csv with empty fields with user defined empty values")
- // .exclude("save csv with quote")
- // .exclude("SPARK-13543 Write the output as uncompressed via option()")
- // .exclude("DDL test with tab separated file")
- // .exclude("DDL test parsing decimal type")
- // .exclude("test with tab delimiter and double quote")
- // // Arrow not support corrupt record
- // .exclude("SPARK-27873: disabling enforceSchema should not fail
columnNameOfCorruptRecord")
- // // varchar
- // .exclude("SPARK-48241: CSV parsing failure with char/varchar type
columns")
+ enableSuite[GlutenCSVv2Suite]
+ .exclude("Gluten - test for FAILFAST parsing mode")
+ // Rule org.apache.spark.sql.execution.datasources.v2.V2ScanRelationPushDown in batch
+ // Early Filter and Projection Push-Down generated an invalid plan
+ .exclude("SPARK-26208: write and read empty data to csv file with headers")
+ // file cars.csv includes a null string, which Arrow cannot read
+ .exclude("old csv data source name works")
+ .exclude("DDL test with schema")
+ .exclude("save csv")
+ .exclude("save csv with compression codec option")
+ .exclude("save csv with empty fields with user defined empty values")
+ .exclude("save csv with quote")
+ .exclude("SPARK-13543 Write the output as uncompressed via option()")
+ .exclude("DDL test with tab separated file")
+ .exclude("DDL test parsing decimal type")
+ .exclude("test with tab delimiter and double quote")
+ .exclude("when mode is null, will fall back to PermissiveMode mode")
+ .exclude("SPARK-46890: CSV fails on a column with default and without
enforcing schema")
+ // Arrow not support corrupt record
+ .exclude("SPARK-27873: disabling enforceSchema should not fail
columnNameOfCorruptRecord")
+ // varchar
+ .exclude("SPARK-48241: CSV parsing failure with char/varchar type columns")
- // enableSuite[GlutenCSVLegacyTimeParserSuite]
- // // file cars.csv include null string, Arrow not support to read
- // .exclude("DDL test with schema")
- // .exclude("save csv")
- // .exclude("save csv with compression codec option")
- // .exclude("save csv with empty fields with user defined empty values")
- // .exclude("save csv with quote")
- // .exclude("SPARK-13543 Write the output as uncompressed via option()")
- // // Arrow not support corrupt record
- // .exclude("SPARK-27873: disabling enforceSchema should not fail
columnNameOfCorruptRecord")
- // .exclude("DDL test with tab separated file")
- // .exclude("DDL test parsing decimal type")
- // .exclude("test with tab delimiter and double quote")
- // // varchar
- // .exclude("SPARK-48241: CSV parsing failure with char/varchar type
columns")
+ enableSuite[GlutenCSVLegacyTimeParserSuite]
+ // file cars.csv includes a null string, which Arrow cannot read
+ .exclude("DDL test with schema")
+ .exclude("save csv")
+ .exclude("save csv with compression codec option")
+ .exclude("save csv with empty fields with user defined empty values")
+ .exclude("save csv with quote")
+ .exclude("SPARK-13543 Write the output as uncompressed via option()")
+ .exclude("when mode is null, will fall back to PermissiveMode mode")
+ .exclude("SPARK-46890: CSV fails on a column with default and without
enforcing schema")
+ // Arrow not support corrupt record
+ .exclude("SPARK-27873: disabling enforceSchema should not fail
columnNameOfCorruptRecord")
+ .exclude("DDL test with tab separated file")
+ .exclude("DDL test parsing decimal type")
+ .exclude("test with tab delimiter and double quote")
+ // varchar
+ .exclude("SPARK-48241: CSV parsing failure with char/varchar type columns")
enableSuite[GlutenJsonV1Suite]
// FIXME: Array direct selection fails
.exclude("Complex field and type inferring")
@@ -553,10 +559,9 @@ class VeloxTestSettings extends BackendTestSettings {
enableSuite[GlutenPathFilterStrategySuite]
enableSuite[GlutenPathFilterSuite]
enableSuite[GlutenPruneFileSourcePartitionsSuite]
- // TODO: fix in Spark-4.0
- // enableSuite[GlutenCSVReadSchemaSuite]
- // enableSuite[GlutenHeaderCSVReadSchemaSuite]
- // .exclude("change column type from int to long")
+ enableSuite[GlutenCSVReadSchemaSuite]
+ enableSuite[GlutenHeaderCSVReadSchemaSuite]
+ .exclude("change column type from int to long")
enableSuite[GlutenJsonReadSchemaSuite]
enableSuite[GlutenOrcReadSchemaSuite]
enableSuite[GlutenVectorizedOrcReadSchemaSuite]
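The enableSuite/exclude chains above come from Gluten's BackendTestSettings DSL. A rough, self-contained sketch of the pattern, assuming only the two methods used in this diff (everything else here is illustrative):

import scala.collection.mutable
import scala.reflect.ClassTag

class SuiteSettings {
  private val excludedTests = mutable.Set.empty[String]
  // Chainable, matching the builder style used in VeloxTestSettings.
  def exclude(testName: String): SuiteSettings = {
    excludedTests += testName
    this
  }
  def shouldRun(testName: String): Boolean = !excludedTests.contains(testName)
}

class BackendTestSettingsSketch {
  private val suites = mutable.Map.empty[String, SuiteSettings]
  def enableSuite[T: ClassTag]: SuiteSettings = {
    val settings = new SuiteSettings
    suites(implicitly[ClassTag[T]].runtimeClass.getCanonicalName) = settings
    settings
  }
  // A suite runs only if enabled, and a test in it only if not excluded.
  def shouldRun(suiteName: String, testName: String): Boolean =
    suites.get(suiteName).exists(_.shouldRun(testName))
}

Under a scheme like this, re-enabling GlutenCSVv1Suite while excluding, say, "save csv" keeps the rest of the upstream suite running against the fallback path.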
diff --git a/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/execution/datasources/csv/GlutenCSVSuite.scala b/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/execution/datasources/csv/GlutenCSVSuite.scala
index 6cfa9f2028..af0f59b9bc 100644
--- a/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/execution/datasources/csv/GlutenCSVSuite.scala
+++ b/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/execution/datasources/csv/GlutenCSVSuite.scala
@@ -132,4 +132,8 @@ class GlutenCSVLegacyTimeParserSuite extends GlutenCSVSuite {
override def sparkConf: SparkConf =
super.sparkConf
.set(SQLConf.LEGACY_TIME_PARSER_POLICY, "legacy")
+
+ // The upstream CSVLegacyTimeParserSuite excludes this test
+ override def excluded: Seq[String] =
+ Seq("Write timestamps correctly in ISO8601 format by default")
}
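The excluded override relies on the test base class consulting the list at registration time. A hypothetical ScalaTest sketch of that mechanism (Gluten's actual base trait may differ):

import org.scalatest.Tag
import org.scalatest.funsuite.AnyFunSuite

abstract class ExclusionAwareSuite extends AnyFunSuite {
  // Test names listed here are registered as ignored instead of run.
  def excluded: Seq[String] = Seq.empty

  override protected def test(testName: String, testTags: Tag*)(testFun: => Any)(
      implicit pos: org.scalactic.source.Position): Unit = {
    if (excluded.contains(testName)) {
      ignore(testName, testTags: _*)(testFun)
    } else {
      super.test(testName, testTags: _*)(testFun)
    }
  }
}

With a hook like this in place, the override above makes GlutenCSVLegacyTimeParserSuite skip the ISO8601 timestamp test, mirroring the exclusion in the upstream suite.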
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]