[spark] branch master updated: [SPARK-37575][SQL][FOLLOWUP] Add legacy flag for the breaking change of write null value in csv to unquoted empty string

wenchen Fri, 15 Apr 2022 01:46:09 -0700

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new 965f872500a [SPARK-37575][SQL][FOLLOWUP] Add legacy flag for the 
breaking change of write null value in csv to unquoted empty string
965f872500a is described below

commit 965f872500a3554142cab3078a7a4d513d2d2ee8
Author: Xinyi Yu <xinyi...@databricks.com>
AuthorDate: Fri Apr 15 16:45:47 2022 +0800

    [SPARK-37575][SQL][FOLLOWUP] Add legacy flag for the breaking change of 
write null value in csv to unquoted empty string
    
    ### What changes were proposed in this pull request?
    
    Add a legacy flag `spark.sql.legacy.nullValueWrittenAsQuotedEmptyStringCsv` 
for the breaking change introduced in 
https://github.com/apache/spark/pull/34853 and 
https://github.com/apache/spark/pull/34905 (followup).
    
    The flag is disabled by default, so null values are written to CSV as 
unquoted empty strings. When the legacy flag is enabled, null values are 
written as quoted empty strings.
    
    ### Why are the changes needed?
    The original commit is a breaking change, and breaking changes should 
come with a flag that turns them off, for smooth migration between versions.
    
    ### Does this PR introduce _any_ user-facing change?
    With the default value of the conf, there is no user-facing difference.
    If users turn this conf on, they can restore the pre-change behavior.
    
    ### How was this patch tested?
    Through unit tests.
    
    Closes #36110 from anchovYu/flags-null-to-csv.
    
    Authored-by: Xinyi Yu <xinyi...@databricks.com>
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
---
 .../spark/sql/catalyst/csv/UnivocityGenerator.scala  |  4 ++++
 .../org/apache/spark/sql/internal/SQLConf.scala      | 10 ++++++++++
 .../sql/execution/datasources/csv/CSVSuite.scala     | 20 ++++++++++++++------
 3 files changed, 28 insertions(+), 6 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityGenerator.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityGenerator.scala
index 5dd8c35e4c2..d124a055f63 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityGenerator.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityGenerator.scala
@@ -24,6 +24,7 @@ import com.univocity.parsers.csv.CsvWriter
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.util.{DateFormatter, DateTimeUtils, 
IntervalStringStyles, IntervalUtils, TimestampFormatter}
 import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types._
 
 class UnivocityGenerator(
@@ -95,6 +96,9 @@ class UnivocityGenerator(
     while (i < row.numFields) {
       if (!row.isNullAt(i)) {
         values(i) = valueConverters(i).apply(row, i)
+      } else if (
+        
SQLConf.get.getConf(SQLConf.LEGACY_NULL_VALUE_WRITTEN_AS_QUOTED_EMPTY_STRING_CSV))
 {
+        values(i) = options.nullValue
       }
       i += 1
     }
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index ac2a2e350c6..36b666fd59c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -3754,6 +3754,16 @@ object SQLConf {
       .booleanConf
       .createWithDefault(false)
 
+  val LEGACY_NULL_VALUE_WRITTEN_AS_QUOTED_EMPTY_STRING_CSV =
+    buildConf("spark.sql.legacy.nullValueWrittenAsQuotedEmptyStringCsv")
+      .internal()
+      .doc("When set to false, nulls are written as unquoted empty strings in 
CSV data source. " +
+        "If set to true, it restores the legacy behavior that nulls were 
written as quoted " +
+        "empty strings, `\"\"`.")
+      .version("3.3.0")
+      .booleanConf
+      .createWithDefault(false)
+
   /**
    * Holds information about keys that have been deprecated.
    *
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 819bb430173..9637a85ea35 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -807,12 +807,20 @@ abstract class CSVSuite
 
   test("SPARK-37575: null values should be saved as nothing rather than " +
     "quoted empty Strings \"\" with default settings") {
-    withTempPath { path =>
-      Seq(("Tesla", null: String, ""))
-        .toDF("make", "comment", "blank")
-        .write
-        .csv(path.getCanonicalPath)
-      checkAnswer(spark.read.text(path.getCanonicalPath), Row("Tesla,,\"\""))
+    Seq("true", "false").foreach { confVal =>
+      
withSQLConf(SQLConf.LEGACY_NULL_VALUE_WRITTEN_AS_QUOTED_EMPTY_STRING_CSV.key -> 
confVal) {
+        withTempPath { path =>
+          Seq(("Tesla", null: String, ""))
+            .toDF("make", "comment", "blank")
+            .write
+            .csv(path.getCanonicalPath)
+          if (confVal == "false") {
+            checkAnswer(spark.read.text(path.getCanonicalPath), 
Row("Tesla,,\"\""))
+          } else {
+            checkAnswer(spark.read.text(path.getCanonicalPath), 
Row("Tesla,\"\",\"\""))
+          }
+        }
+      }
     }
   }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

[spark] branch master updated: [SPARK-37575][SQL][FOLLOWUP] Add legacy flag for the breaking change of write null value in csv to unquoted empty string

Reply via email to