This is an automated email from the ASF dual-hosted git repository.

gengliang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new f1f17024fc0 [SPARK-40001][SQL] Make NULL writes to JSON DEFAULT columns write 'null' to storage
f1f17024fc0 is described below

commit f1f17024fc0a9df7bf5683e7b3e5e507d87d4613
Author: Daniel Tenedorio <daniel.tenedo...@databricks.com>
AuthorDate: Sat Aug 13 10:53:10 2022 -0700

    [SPARK-40001][SQL] Make NULL writes to JSON DEFAULT columns write 'null' to storage
    
    ### What changes were proposed in this pull request?
    
    Add a config to make NULL writes to JSON DEFAULT columns write 'null' to storage.
    
    When this new config is true, INSERT/UPDATE/MERGE commands writing NULL
    values to DEFAULT columns in JSON tables always write 'null' to the JSON
    storage, overriding any other settings. This can be useful to enforce that
    inserted NULL values are present in storage, to distinguish them from
    missing data.
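
    For example, a minimal sketch of the new behavior (not part of this patch;
    the table name `t` is illustrative):

    ```scala
    // With null-field dropping enabled, the new config alone decides whether
    // an explicitly inserted NULL survives in the JSON file.
    spark.conf.set("spark.sql.jsonGenerator.ignoreNullFields", "true")
    spark.conf.set("spark.sql.jsonGenerator.writeNullIfWithDefaultValue", "true")
    spark.sql("CREATE TABLE t (a INT DEFAULT 42) USING JSON")
    spark.sql("INSERT INTO t VALUES (NULL)")
    // Storage now contains {"a":null}, so this returns NULL; with the config
    // set to false the field would be dropped, and the missing value would
    // resolve to the DEFAULT of 42 on read.
    spark.sql("SELECT * FROM t").show()
    ```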
    
    ### Why are the changes needed?
    
    This helps guard the correctness of query results: a NULL that was explicitly inserted stays distinguishable in storage from a field that was never written.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes, please see above.
    
    ### How was this patch tested?
    
    This PR adds new unit test coverage.
    
    Closes #37431 from dtenedor/json-explicits.
    
    Authored-by: Daniel Tenedorio <daniel.tenedo...@databricks.com>
    Signed-off-by: Gengliang Wang <gengli...@apache.org>
---
 .../spark/sql/catalyst/json/JSONOptions.scala      |  6 ++++++
 .../spark/sql/catalyst/json/JacksonGenerator.scala |  3 ++-
 .../org/apache/spark/sql/internal/SQLConf.scala    | 15 +++++++++++++
 .../org/apache/spark/sql/sources/InsertSuite.scala | 25 ++++++++++++++++++++--
 4 files changed, 46 insertions(+), 3 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala
index 66fd22894f9..9679a60622b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala
@@ -84,6 +84,12 @@ private[sql] class JSONOptions(
   val ignoreNullFields = parameters.get("ignoreNullFields").map(_.toBoolean)
     .getOrElse(SQLConf.get.jsonGeneratorIgnoreNullFields)
 
+  // If this is true, when writing NULL values to columns of JSON tables with explicit DEFAULT
+  // values, never skip writing the NULL values to storage, overriding 'ignoreNullFields' above.
+  // This can be useful to enforce that inserted NULL values are present in storage to differentiate
+  // from missing data.
+  val writeNullIfWithDefaultValue = SQLConf.get.jsonWriteNullIfWithDefaultValue
+
   // A language tag in IETF BCP 47 format
   val locale: Locale = parameters.get("locale").map(Locale.forLanguageTag).getOrElse(Locale.US)
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala
index 336c0ceecc9..a1e25eb4c94 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala
@@ -227,7 +227,8 @@ private[sql] class JacksonGenerator(
       if (!row.isNullAt(i)) {
         gen.writeFieldName(field.name)
         fieldWriters(i).apply(row, i)
-      } else if (!options.ignoreNullFields) {
+      } else if (!options.ignoreNullFields ||
+        (options.writeNullIfWithDefaultValue && field.getExistenceDefaultValue().isDefined)) {
         gen.writeFieldName(field.name)
         gen.writeNull()
       }
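
The emission rule above reads as a small predicate; here is a standalone sketch (the method name and parameters are assumptions for illustration, not part of the patch):

```scala
// A NULL field is written to the JSON output when null-dropping is disabled,
// or when the new flag is on and the column carries an explicit DEFAULT.
def shouldWriteNull(
    ignoreNullFields: Boolean,
    writeNullIfWithDefaultValue: Boolean,
    hasExistenceDefault: Boolean): Boolean =
  !ignoreNullFields || (writeNullIfWithDefaultValue && hasExistenceDefault)

// ignoreNullFields = true normally drops NULL fields entirely...
assert(!shouldWriteNull(ignoreNullFields = true,
  writeNullIfWithDefaultValue = false, hasExistenceDefault = true))
// ...but a DEFAULT column keeps its explicit null once the new flag is set.
assert(shouldWriteNull(ignoreNullFields = true,
  writeNullIfWithDefaultValue = true, hasExistenceDefault = true))
```
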
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index f78fa8c9ef2..98e6d2a1360 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -2924,6 +2924,18 @@ object SQLConf {
       .stringConf
       .createWithDefault("csv,json,orc,parquet")
 
+  val JSON_GENERATOR_WRITE_NULL_IF_WITH_DEFAULT_VALUE =
+    buildConf("spark.sql.jsonGenerator.writeNullIfWithDefaultValue")
+      .internal()
+      .doc("When true, when writing NULL values to columns of JSON tables with 
explicit DEFAULT " +
+        "values using INSERT, UPDATE, or MERGE commands, never skip writing 
the NULL values to " +
+        "storage, overriding spark.sql.jsonGenerator.ignoreNullFields or the 
ignoreNullFields " +
+        "option. This can be useful to enforce that inserted NULL values are 
present in " +
+        "storage to differentiate from missing data.")
+      .version("3.4.0")
+      .booleanConf
+      .createWithDefault(true)
+
   val USE_NULLS_FOR_MISSING_DEFAULT_COLUMN_VALUES =
     buildConf("spark.sql.defaultColumn.useNullsForMissingDefaultValues")
       .internal()
@@ -4523,6 +4535,9 @@ class SQLConf extends Serializable with Logging {
 
   def defaultColumnAllowedProviders: String = getConf(SQLConf.DEFAULT_COLUMN_ALLOWED_PROVIDERS)
 
+  def jsonWriteNullIfWithDefaultValue: Boolean =
+    getConf(JSON_GENERATOR_WRITE_NULL_IF_WITH_DEFAULT_VALUE)
+
   def useNullsForMissingDefaultColumnValues: Boolean =
     getConf(SQLConf.USE_NULLS_FOR_MISSING_DEFAULT_COLUMN_VALUES)
 
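
Since the option is registered on SQLConf, it can be inspected and toggled per session; a hypothetical spark-shell check (illustrative only, as the conf is marked internal):

```scala
import org.apache.spark.sql.internal.SQLConf

// Read the session value (true by default, per createWithDefault above).
val enabled: Boolean = SQLConf.get.jsonWriteNullIfWithDefaultValue

// Flip it for this session; JSONOptions captures the value when a write job
// constructs it, so the change applies to subsequent JSON writes.
spark.conf.set(SQLConf.JSON_GENERATOR_WRITE_NULL_IF_WITH_DEFAULT_VALUE.key, "false")
```
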
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala
index 9ba01badd19..3936f2b995c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala
@@ -1654,8 +1654,7 @@ class InsertSuite extends DataSourceTest with SharedSparkSession {
         dataSource = "json",
         Seq(
           Config(
-            None,
-            insertNullsToStorage = false),
+            None),
           Config(
             Some(SQLConf.JSON_GENERATOR_IGNORE_NULL_FIELDS.key -> "false")))),
       TestCase(
@@ -1702,6 +1701,28 @@ class InsertSuite extends DataSourceTest with SharedSparkSession {
     }
   }
 
+  test("SPARK-40001 JSON DEFAULT columns = 
JSON_GENERATOR_WRITE_NULL_IF_WITH_DEFAULT_VALUE off") {
+    val error = "DEFAULT values are not supported for JSON tables"
+    // Check that the JSON_GENERATOR_WRITE_NULL_IF_WITH_DEFAULT_VALUE config overrides the
+    // JSON_GENERATOR_IGNORE_NULL_FIELDS config.
+    withSQLConf(SQLConf.JSON_GENERATOR_WRITE_NULL_IF_WITH_DEFAULT_VALUE.key -> "true",
+      SQLConf.JSON_GENERATOR_IGNORE_NULL_FIELDS.key -> "true") {
+      withTable("t") {
+        sql("create table t (a int default 42) using json")
+        sql("insert into t values (null)")
+        checkAnswer(spark.table("t"), Row(null))
+      }
+    }
+    withSQLConf(SQLConf.JSON_GENERATOR_WRITE_NULL_IF_WITH_DEFAULT_VALUE.key -> "false",
+      SQLConf.JSON_GENERATOR_IGNORE_NULL_FIELDS.key -> "true") {
+      withTable("t") {
+        sql("create table t (a int default 42) using json")
+        sql("insert into t values (null)")
+        checkAnswer(spark.table("t"), Row(42))
+      }
+    }
+  }
+
   test("SPARK-39359 Restrict DEFAULT columns to allowlist of supported data 
source types") {
     withSQLConf(SQLConf.DEFAULT_COLUMN_ALLOWED_PROVIDERS.key -> "csv,json,orc") {
       val unsupported = "DEFAULT values are not supported for target data source"

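To see the on-disk difference the new test asserts, one could read the table's files back as plain text (a sketch; assumes a JSON table `t` populated as in the test above):

```scala
// With the flag on, the inserted row is stored as {"a":null}; with it off,
// the row is stored as {} and reads back as the DEFAULT value 42.
val file = spark.table("t").inputFiles.head
spark.read.text(file).show(truncate = false)
```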

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
