This is an automated email from the ASF dual-hosted git repository.
philo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new e121903e5f [GLUTEN-11088][VL] Fall back CSV reader (#11190)
e121903e5f is described below
commit e121903e5f4e4e86b3c52c08a22692fd646badec
Author: Jin Chengcheng <[email protected]>
AuthorDate: Mon Jan 19 16:36:40 2026 +0800
[GLUTEN-11088][VL] Fall back CSV reader (#11190)
---
.../gluten/backendsapi/velox/VeloxRuleApi.scala | 3 -
.../gluten/utils/velox/VeloxTestSettings.scala | 119 +++++++++++----------
.../execution/datasources/csv/GlutenCSVSuite.scala | 4 +
3 files changed, 66 insertions(+), 60 deletions(-)
diff --git a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxRuleApi.scala b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxRuleApi.scala
index 3bf030bf2c..fcc00389d5 100644
--- a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxRuleApi.scala
+++ b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxRuleApi.scala
@@ -65,7 +65,6 @@ object VeloxRuleApi {
injector.injectOptimizerRule(HLLRewriteRule.apply)
injector.injectOptimizerRule(CollapseGetJsonObjectExpressionRule.apply)
injector.injectOptimizerRule(RewriteCastFromArray.apply)
- injector.injectPostHocResolutionRule(ArrowConvertorRule.apply)
injector.injectOptimizerRule(RewriteUnboundedWindow.apply)
if (BackendsApiManager.getSettings.supportAppendDataExec()) {
injector.injectPlannerStrategy(SparkShimLoader.getSparkShims.getRewriteCreateTableAsSelect(_))
@@ -89,7 +88,6 @@ object VeloxRuleApi {
BloomFilterMightContainJointRewriteRule.apply(
c.session,
c.caller.isBloomFilterStatFunction()))
- injector.injectPreTransform(c => ArrowScanReplaceRule.apply(c.session))
injector.injectPreTransform(_ => EliminateRedundantGetTimestamp)
// Legacy: The legacy transform rule.
@@ -172,7 +170,6 @@ object VeloxRuleApi {
BloomFilterMightContainJointRewriteRule.apply(
c.session,
c.caller.isBloomFilterStatFunction()))
- injector.injectPreTransform(c => ArrowScanReplaceRule.apply(c.session))
injector.injectPreTransform(_ => EliminateRedundantGetTimestamp)
// Gluten RAS: The RAS rule.
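With ArrowConvertorRule and ArrowScanReplaceRule no longer injected, CSV scans are planned by vanilla Spark rather than routed through the Arrow-backed reader. As a minimal sketch of how the fallback can be observed from a local session (the object name, app name, and file path below are placeholders, not part of this change):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.FileSourceScanExec

object CsvFallbackCheck {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[1]")
      .appName("csv-fallback-check") // placeholder
      .getOrCreate()
    // Hypothetical path; any CSV file works for this check.
    val df = spark.read.option("header", "true").csv("/tmp/cars.csv")
    // After this change the physical plan is expected to contain Spark's
    // own FileSourceScanExec rather than an Arrow-based scan node.
    val fellBack = df.queryExecution.executedPlan.collect {
      case scan: FileSourceScanExec => scan
    }.nonEmpty
    println(s"CSV scan planned by vanilla Spark: $fellBack")
    spark.stop()
  }
}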
diff --git a/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
index 614a6b70c1..8ce9933df0 100644
--- a/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
+++ b/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
@@ -27,6 +27,7 @@ import org.apache.spark.sql.execution._
import org.apache.spark.sql.execution.adaptive.velox.VeloxAdaptiveQueryExecSuite
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.execution.datasources.binaryfile.GlutenBinaryFileFormatSuite
+import org.apache.spark.sql.execution.datasources.csv.{GlutenCSVLegacyTimeParserSuite, GlutenCSVv1Suite, GlutenCSVv2Suite}
import org.apache.spark.sql.execution.datasources.json.{GlutenJsonLegacyTimeParserSuite, GlutenJsonV1Suite, GlutenJsonV2Suite}
import org.apache.spark.sql.execution.datasources.orc._
import org.apache.spark.sql.execution.datasources.parquet._
@@ -234,61 +235,66 @@ class VeloxTestSettings extends BackendTestSettings {
enableSuite[GlutenBinaryFileFormatSuite]
// Exception.
.exclude("column pruning - non-readable file")
- // TODO: fix in Spark-4.0
- // enableSuite[GlutenCSVv1Suite]
- // // file cars.csv include null string, Arrow not support to read
- // .exclude("DDL test with schema")
- // .exclude("save csv")
- // .exclude("save csv with compression codec option")
- // .exclude("save csv with empty fields with user defined empty values")
- // .exclude("save csv with quote")
- // .exclude("SPARK-13543 Write the output as uncompressed via option()")
- // .exclude("DDL test with tab separated file")
- // .exclude("DDL test parsing decimal type")
- // .exclude("test with tab delimiter and double quote")
- // // Arrow not support corrupt record
- // .exclude("SPARK-27873: disabling enforceSchema should not fail
columnNameOfCorruptRecord")
- // // varchar
- // .exclude("SPARK-48241: CSV parsing failure with char/varchar type
columns")
- // // Flaky and already excluded in other cases
- // .exclude("Gluten - test for FAILFAST parsing mode")
+ enableSuite[GlutenCSVv1Suite]
+ // file cars.csv includes a null string, which Arrow cannot read
+ .exclude("DDL test with schema")
+ .exclude("save csv")
+ .exclude("save csv with compression codec option")
+ .exclude("save csv with empty fields with user defined empty values")
+ .exclude("save csv with quote")
+ .exclude("SPARK-13543 Write the output as uncompressed via option()")
+ .exclude("DDL test with tab separated file")
+ .exclude("DDL test parsing decimal type")
+ .exclude("test with tab delimiter and double quote")
+ .exclude("when mode is null, will fall back to PermissiveMode mode")
+ .exclude("SPARK-46890: CSV fails on a column with default and without
enforcing schema")
+ // Arrow not support corrupt record
+ .exclude("SPARK-27873: disabling enforceSchema should not fail
columnNameOfCorruptRecord")
+ // varchar
+ .exclude("SPARK-48241: CSV parsing failure with char/varchar type columns")
+ // Flaky and already excluded in other cases
+ .exclude("Gluten - test for FAILFAST parsing mode")
- // enableSuite[GlutenCSVv2Suite]
- // .exclude("Gluten - test for FAILFAST parsing mode")
- // // Rule org.apache.spark.sql.execution.datasources.v2.V2ScanRelationPushDown in batch
- // // Early Filter and Projection Push-Down generated an invalid plan
- // .exclude("SPARK-26208: write and read empty data to csv file with headers")
- // // file cars.csv include null string, Arrow not support to read
- // .exclude("old csv data source name works")
- // .exclude("DDL test with schema")
- // .exclude("save csv")
- // .exclude("save csv with compression codec option")
- // .exclude("save csv with empty fields with user defined empty values")
- // .exclude("save csv with quote")
- // .exclude("SPARK-13543 Write the output as uncompressed via option()")
- // .exclude("DDL test with tab separated file")
- // .exclude("DDL test parsing decimal type")
- // .exclude("test with tab delimiter and double quote")
- // // Arrow not support corrupt record
- // .exclude("SPARK-27873: disabling enforceSchema should not fail
columnNameOfCorruptRecord")
- // // varchar
- // .exclude("SPARK-48241: CSV parsing failure with char/varchar type
columns")
+ enableSuite[GlutenCSVv2Suite]
+ .exclude("Gluten - test for FAILFAST parsing mode")
+ // Rule org.apache.spark.sql.execution.datasources.v2.V2ScanRelationPushDown in batch
+ // Early Filter and Projection Push-Down generated an invalid plan
+ .exclude("SPARK-26208: write and read empty data to csv file with headers")
+ // file cars.csv includes a null string, which Arrow cannot read
+ .exclude("old csv data source name works")
+ .exclude("DDL test with schema")
+ .exclude("save csv")
+ .exclude("save csv with compression codec option")
+ .exclude("save csv with empty fields with user defined empty values")
+ .exclude("save csv with quote")
+ .exclude("SPARK-13543 Write the output as uncompressed via option()")
+ .exclude("DDL test with tab separated file")
+ .exclude("DDL test parsing decimal type")
+ .exclude("test with tab delimiter and double quote")
+ .exclude("when mode is null, will fall back to PermissiveMode mode")
+ .exclude("SPARK-46890: CSV fails on a column with default and without
enforcing schema")
+ // Arrow not support corrupt record
+ .exclude("SPARK-27873: disabling enforceSchema should not fail
columnNameOfCorruptRecord")
+ // varchar
+ .exclude("SPARK-48241: CSV parsing failure with char/varchar type columns")
- // enableSuite[GlutenCSVLegacyTimeParserSuite]
- // // file cars.csv include null string, Arrow not support to read
- // .exclude("DDL test with schema")
- // .exclude("save csv")
- // .exclude("save csv with compression codec option")
- // .exclude("save csv with empty fields with user defined empty values")
- // .exclude("save csv with quote")
- // .exclude("SPARK-13543 Write the output as uncompressed via option()")
- // // Arrow not support corrupt record
- // .exclude("SPARK-27873: disabling enforceSchema should not fail
columnNameOfCorruptRecord")
- // .exclude("DDL test with tab separated file")
- // .exclude("DDL test parsing decimal type")
- // .exclude("test with tab delimiter and double quote")
- // // varchar
- // .exclude("SPARK-48241: CSV parsing failure with char/varchar type
columns")
+ enableSuite[GlutenCSVLegacyTimeParserSuite]
+ // file cars.csv includes a null string, which Arrow cannot read
+ .exclude("DDL test with schema")
+ .exclude("save csv")
+ .exclude("save csv with compression codec option")
+ .exclude("save csv with empty fields with user defined empty values")
+ .exclude("save csv with quote")
+ .exclude("SPARK-13543 Write the output as uncompressed via option()")
+ .exclude("when mode is null, will fall back to PermissiveMode mode")
+ .exclude("SPARK-46890: CSV fails on a column with default and without
enforcing schema")
+ // Arrow not support corrupt record
+ .exclude("SPARK-27873: disabling enforceSchema should not fail
columnNameOfCorruptRecord")
+ .exclude("DDL test with tab separated file")
+ .exclude("DDL test parsing decimal type")
+ .exclude("test with tab delimiter and double quote")
+ // varchar
+ .exclude("SPARK-48241: CSV parsing failure with char/varchar type columns")
enableSuite[GlutenJsonV1Suite]
// FIXME: Array direct selection fails
.exclude("Complex field and type inferring")
@@ -553,10 +559,9 @@ class VeloxTestSettings extends BackendTestSettings {
enableSuite[GlutenPathFilterStrategySuite]
enableSuite[GlutenPathFilterSuite]
enableSuite[GlutenPruneFileSourcePartitionsSuite]
- // TODO: fix in Spark-4.0
- // enableSuite[GlutenCSVReadSchemaSuite]
- // enableSuite[GlutenHeaderCSVReadSchemaSuite]
- // .exclude("change column type from int to long")
+ enableSuite[GlutenCSVReadSchemaSuite]
+ enableSuite[GlutenHeaderCSVReadSchemaSuite]
+ .exclude("change column type from int to long")
enableSuite[GlutenJsonReadSchemaSuite]
enableSuite[GlutenOrcReadSchemaSuite]
enableSuite[GlutenVectorizedOrcReadSchemaSuite]
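The enableSuite/exclude chains above come from Gluten's BackendTestSettings DSL. A rough, self-contained sketch of the pattern, assuming only the two methods used in this diff (everything else here is illustrative):

import scala.collection.mutable
import scala.reflect.ClassTag

class SuiteSettings {
  private val excludedTests = mutable.Set.empty[String]
  // Chainable, matching the builder style used in VeloxTestSettings.
  def exclude(testName: String): SuiteSettings = {
    excludedTests += testName
    this
  }
  def shouldRun(testName: String): Boolean = !excludedTests.contains(testName)
}

class BackendTestSettingsSketch {
  private val suites = mutable.Map.empty[String, SuiteSettings]
  def enableSuite[T: ClassTag]: SuiteSettings = {
    val settings = new SuiteSettings
    suites(implicitly[ClassTag[T]].runtimeClass.getCanonicalName) = settings
    settings
  }
  // A suite runs only if enabled, and a test in it only if not excluded.
  def shouldRun(suiteName: String, testName: String): Boolean =
    suites.get(suiteName).exists(_.shouldRun(testName))
}

Under a scheme like this, re-enabling GlutenCSVv1Suite while excluding, say, "save csv" keeps the rest of the upstream suite running against the fallback path.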
diff --git a/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/execution/datasources/csv/GlutenCSVSuite.scala b/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/execution/datasources/csv/GlutenCSVSuite.scala
index 6cfa9f2028..af0f59b9bc 100644
--- a/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/execution/datasources/csv/GlutenCSVSuite.scala
+++ b/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/execution/datasources/csv/GlutenCSVSuite.scala
@@ -132,4 +132,8 @@ class GlutenCSVLegacyTimeParserSuite extends GlutenCSVSuite {
override def sparkConf: SparkConf =
super.sparkConf
.set(SQLConf.LEGACY_TIME_PARSER_POLICY, "legacy")
+
+ // The upstream CSVLegacyTimeParserSuite excludes this test
+ override def excluded: Seq[String] =
+ Seq("Write timestamps correctly in ISO8601 format by default")
}
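The excluded override relies on the test base class consulting the list at registration time. A hypothetical ScalaTest sketch of that mechanism (Gluten's actual base trait may differ):

import org.scalatest.Tag
import org.scalatest.funsuite.AnyFunSuite

abstract class ExclusionAwareSuite extends AnyFunSuite {
  // Test names listed here are registered as ignored instead of run.
  def excluded: Seq[String] = Seq.empty

  override protected def test(testName: String, testTags: Tag*)(testFun: => Any)(
      implicit pos: org.scalactic.source.Position): Unit = {
    if (excluded.contains(testName)) {
      ignore(testName, testTags: _*)(testFun)
    } else {
      super.test(testName, testTags: _*)(testFun)
    }
  }
}

With a hook like this in place, the override above makes GlutenCSVLegacyTimeParserSuite skip the ISO8601 timestamp test, mirroring the exclusion in the upstream suite.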
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]