This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 965f872500a [SPARK-37575][SQL][FOLLOWUP] Add legacy flag for the breaking change of write null value in csv to unquoted empty string 965f872500a is described below commit 965f872500a3554142cab3078a7a4d513d2d2ee8 Author: Xinyi Yu <xinyi...@databricks.com> AuthorDate: Fri Apr 15 16:45:47 2022 +0800 [SPARK-37575][SQL][FOLLOWUP] Add legacy flag for the breaking change of write null value in csv to unquoted empty string ### What changes were proposed in this pull request? Add a legacy flag `spark.sql.legacy.nullValueWrittenAsQuotedEmptyStringCsv` for the breaking change introduced in https://github.com/apache/spark/pull/34853 and https://github.com/apache/spark/pull/34905 (followup). The flag is disabled by default, so by default null values are written to CSV as unquoted empty strings. When the legacy flag is enabled, null values are written as quoted empty strings (`""`). ### Why are the changes needed? The original commit is a breaking change, and a breaking change should come with a flag that turns it off, for smooth migration between versions. ### Does this PR introduce _any_ user-facing change? With the default value of the conf, there is no user-facing difference. If users turn this conf on, they can restore the pre-change behavior. ### How was this patch tested? Through unit tests. Closes #36110 from anchovYu/flags-null-to-csv. 
Authored-by: Xinyi Yu <xinyi...@databricks.com> Signed-off-by: Wenchen Fan <wenc...@databricks.com> --- .../spark/sql/catalyst/csv/UnivocityGenerator.scala | 4 ++++ .../org/apache/spark/sql/internal/SQLConf.scala | 10 ++++++++++ .../sql/execution/datasources/csv/CSVSuite.scala | 20 ++++++++++++++------ 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityGenerator.scala index 5dd8c35e4c2..d124a055f63 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityGenerator.scala @@ -24,6 +24,7 @@ import com.univocity.parsers.csv.CsvWriter import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.{DateFormatter, DateTimeUtils, IntervalStringStyles, IntervalUtils, TimestampFormatter} import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ class UnivocityGenerator( @@ -95,6 +96,9 @@ class UnivocityGenerator( while (i < row.numFields) { if (!row.isNullAt(i)) { values(i) = valueConverters(i).apply(row, i) + } else if ( + SQLConf.get.getConf(SQLConf.LEGACY_NULL_VALUE_WRITTEN_AS_QUOTED_EMPTY_STRING_CSV)) { + values(i) = options.nullValue } i += 1 } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index ac2a2e350c6..36b666fd59c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -3754,6 +3754,16 @@ object SQLConf { .booleanConf .createWithDefault(false) + val LEGACY_NULL_VALUE_WRITTEN_AS_QUOTED_EMPTY_STRING_CSV = + 
buildConf("spark.sql.legacy.nullValueWrittenAsQuotedEmptyStringCsv") + .internal() + .doc("When set to false, nulls are written as unquoted empty strings in CSV data source. " + + "If set to true, it restores the legacy behavior that nulls were written as quoted " + + "empty strings, `\"\"`.") + .version("3.3.0") + .booleanConf + .createWithDefault(false) + /** * Holds information about keys that have been deprecated. * diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 819bb430173..9637a85ea35 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -807,12 +807,20 @@ abstract class CSVSuite test("SPARK-37575: null values should be saved as nothing rather than " + "quoted empty Strings \"\" with default settings") { - withTempPath { path => - Seq(("Tesla", null: String, "")) - .toDF("make", "comment", "blank") - .write - .csv(path.getCanonicalPath) - checkAnswer(spark.read.text(path.getCanonicalPath), Row("Tesla,,\"\"")) + Seq("true", "false").foreach { confVal => + withSQLConf(SQLConf.LEGACY_NULL_VALUE_WRITTEN_AS_QUOTED_EMPTY_STRING_CSV.key -> confVal) { + withTempPath { path => + Seq(("Tesla", null: String, "")) + .toDF("make", "comment", "blank") + .write + .csv(path.getCanonicalPath) + if (confVal == "false") { + checkAnswer(spark.read.text(path.getCanonicalPath), Row("Tesla,,\"\"")) + } else { + checkAnswer(spark.read.text(path.getCanonicalPath), Row("Tesla,\"\",\"\"")) + } + } + } } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org