This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.3 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.3 by this push: new 811c92f7c5f [SPARK-37575][SQL][FOLLOWUP] Add legacy flag for the breaking change of write null value in csv to unquoted empty string 811c92f7c5f is described below commit 811c92f7c5f0e1bc4c12d9b121912a91fc67c208 Author: Xinyi Yu <xinyi...@databricks.com> AuthorDate: Fri Apr 15 16:45:47 2022 +0800 [SPARK-37575][SQL][FOLLOWUP] Add legacy flag for the breaking change of write null value in csv to unquoted empty string ### What changes were proposed in this pull request? Add a legacy flag `spark.sql.legacy.nullValueWrittenAsQuotedEmptyStringCsv` for the breaking change introduced in https://github.com/apache/spark/pull/34853 and https://github.com/apache/spark/pull/34905 (followup). The flag is disabled by default, so the null values written as csv will output an unquoted empty string. When the legacy flag is enabled, the null will output quoted empty string. ### Why are the changes needed? The original commit is a breaking change, and breaking changes should be encouraged to add a flag to turn it off for smooth migration between versions. ### Does this PR introduce _any_ user-facing change? With the default value of the conf, there is no user-facing difference. If users turn this conf on, they can restore the pre-change behavior. ### How was this patch tested? Through unit tests. Closes #36110 from anchovYu/flags-null-to-csv. 
Authored-by: Xinyi Yu <xinyi...@databricks.com> Signed-off-by: Wenchen Fan <wenc...@databricks.com> (cherry picked from commit 965f872500a3554142cab3078a7a4d513d2d2ee8) Signed-off-by: Wenchen Fan <wenc...@databricks.com> --- .../spark/sql/catalyst/csv/UnivocityGenerator.scala | 4 ++++ .../org/apache/spark/sql/internal/SQLConf.scala | 10 ++++++++++ .../sql/execution/datasources/csv/CSVSuite.scala | 20 ++++++++++++++------ 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityGenerator.scala index 5dd8c35e4c2..d124a055f63 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityGenerator.scala @@ -24,6 +24,7 @@ import com.univocity.parsers.csv.CsvWriter import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.{DateFormatter, DateTimeUtils, IntervalStringStyles, IntervalUtils, TimestampFormatter} import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ class UnivocityGenerator( @@ -95,6 +96,9 @@ class UnivocityGenerator( while (i < row.numFields) { if (!row.isNullAt(i)) { values(i) = valueConverters(i).apply(row, i) + } else if ( + SQLConf.get.getConf(SQLConf.LEGACY_NULL_VALUE_WRITTEN_AS_QUOTED_EMPTY_STRING_CSV)) { + values(i) = options.nullValue } i += 1 } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 9e4496a2c33..5f803ed963b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -3724,6 +3724,16 @@ object SQLConf { 
.booleanConf .createWithDefault(false) + val LEGACY_NULL_VALUE_WRITTEN_AS_QUOTED_EMPTY_STRING_CSV = + buildConf("spark.sql.legacy.nullValueWrittenAsQuotedEmptyStringCsv") + .internal() + .doc("When set to false, nulls are written as unquoted empty strings in CSV data source. " + + "If set to true, it restores the legacy behavior that nulls were written as quoted " + + "empty strings, `\"\"`.") + .version("3.3.0") + .booleanConf + .createWithDefault(false) + /** * Holds information about keys that have been deprecated. * diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 41b4f909ce9..7cbe6ed9fce 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -807,12 +807,20 @@ abstract class CSVSuite test("SPARK-37575: null values should be saved as nothing rather than " + "quoted empty Strings \"\" with default settings") { - withTempPath { path => - Seq(("Tesla", null: String, "")) - .toDF("make", "comment", "blank") - .write - .csv(path.getCanonicalPath) - checkAnswer(spark.read.text(path.getCanonicalPath), Row("Tesla,,\"\"")) + Seq("true", "false").foreach { confVal => + withSQLConf(SQLConf.LEGACY_NULL_VALUE_WRITTEN_AS_QUOTED_EMPTY_STRING_CSV.key -> confVal) { + withTempPath { path => + Seq(("Tesla", null: String, "")) + .toDF("make", "comment", "blank") + .write + .csv(path.getCanonicalPath) + if (confVal == "false") { + checkAnswer(spark.read.text(path.getCanonicalPath), Row("Tesla,,\"\"")) + } else { + checkAnswer(spark.read.text(path.getCanonicalPath), Row("Tesla,\"\",\"\"")) + } + } + } } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org