Copilot commented on code in PR #21264:
URL: https://github.com/apache/datafusion/pull/21264#discussion_r3012180514
##########
datafusion/spark/src/function/json/json_tuple.rs:
##########
@@ -144,14 +147,25 @@ fn json_tuple_inner(args: &[ArrayRef], return_type:
&DataType) -> Result<ArrayRe
}
let field_name = field_arrays[field_idx].value(row_idx);
match map.get(field_name) {
- Some(serde_json::Value::Null) => {
- builder.append_null();
- }
- Some(serde_json::Value::String(s)) => {
- builder.append_value(s);
- }
- Some(other) => {
- builder.append_value(other.to_string());
+ Some(raw) => {
+ let raw_str = raw.get();
+ if raw_str == "null" {
+ builder.append_null();
+ } else if raw_str.starts_with('"') {
+ // String value: parse to unescape
+ match serde_json::from_str::<String>(raw_str) {
+ Ok(s) => builder.append_value(s),
+ Err(_) => builder.append_value(raw_str),
+ }
+ } else {
+ // Numbers, booleans: use raw text as-is
+ // Spark uppercases exponent: 1.5e10 → 1.5E10
+ if raw_str.contains('e') {
Review Comment:
The exponent uppercasing logic is applied to any non-string JSON value that
contains the character 'e'. This will corrupt the boolean literals `false`
(becomes `falsE`) and `true` (becomes `truE`), and can also mutate nested
objects/arrays whose raw JSON contains 'e' in keys or strings (e.g.
`{ "here": 1 }` becomes `{ "hErE": 1 }`). Uppercasing should only be applied
when the raw value is a JSON number that uses exponent notation (e/E), not to
booleans, objects, or arrays.
```suggestion
                            // Numbers, booleans, objects, arrays: use raw text as-is
                            // Spark uppercases exponent in numeric literals: 1.5e10 → 1.5E10
                            if (raw_str.contains('e') || raw_str.contains('E'))
                                && serde_json::from_str::<serde_json::Number>(raw_str).is_ok()
                            {
                                // Only adjust exponent marker for valid numeric literals
```
##########
datafusion/spark/src/function/json/json_tuple.rs:
##########
@@ -219,6 +234,73 @@ mod tests {
}
}
+ /// Helper to run json_tuple with a single field and return the result
string.
+ fn json_tuple_single(json: &str, field: &str) -> Option<String> {
+ let json_arr: ArrayRef = Arc::new(StringArray::from(vec![json]));
+ let field_arr: ArrayRef = Arc::new(StringArray::from(vec![field]));
+
+ let return_type =
+ DataType::Struct(vec![Field::new("c0", DataType::Utf8,
true)].into());
+
+ let result = json_tuple_inner(&[json_arr, field_arr],
&return_type).unwrap();
+ let struct_arr =
result.as_any().downcast_ref::<StructArray>().unwrap();
+ let col = struct_arr.column(0);
+ let str_arr = col.as_any().downcast_ref::<StringArray>().unwrap();
+
+ if str_arr.is_null(0) {
+ None
+ } else {
+ Some(str_arr.value(0).to_string())
+ }
+ }
+
+ #[test]
+ fn test_number_scientific_notation() {
+ // Spark: json_tuple('{"v":1.5e10}', 'v') → '1.5E10'
+ assert_eq!(
+ json_tuple_single(r#"{"v":1.5e10}"#, "v"),
+ Some("1.5E10".to_string())
+ );
+ }
+
+ #[test]
+ fn test_number_large_integer() {
+ // Spark: json_tuple('{"v":99999999999999999999}', 'v') →
'99999999999999999999'
+ assert_eq!(
+ json_tuple_single(r#"{"v":99999999999999999999}"#, "v"),
+ Some("99999999999999999999".to_string())
+ );
+ }
+
+ #[test]
+ fn test_number_negative_zero() {
+ // Spark: json_tuple('{"v":-0}', 'v') → '0'
+ // RawValue preserves '-0', but Spark returns '0'
+ // This is acceptable — both are valid representations
+ let result = json_tuple_single(r#"{"v":-0}"#, "v");
+ assert!(
+ result == Some("-0".to_string()) || result ==
Some("0".to_string()),
+ "expected '-0' or '0', got {:?}",
+ result
Review Comment:
This unit test allows either "-0" or "0", which makes it possible for
`json_tuple` to diverge from Spark without failing CI (the comment states Spark
returns "0"). If Spark compatibility is required here, the test should assert a
single expected value after the implementation normalizes negative zero.
```suggestion
        // Ensure compatibility with Spark by expecting normalized "0"
        assert_eq!(
            json_tuple_single(r#"{"v":-0}"#, "v"),
            Some("0".to_string())
```
##########
datafusion/spark/src/function/json/json_tuple.rs:
##########
@@ -144,14 +147,25 @@ fn json_tuple_inner(args: &[ArrayRef], return_type:
&DataType) -> Result<ArrayRe
}
let field_name = field_arrays[field_idx].value(row_idx);
match map.get(field_name) {
- Some(serde_json::Value::Null) => {
- builder.append_null();
- }
- Some(serde_json::Value::String(s)) => {
- builder.append_value(s);
- }
- Some(other) => {
- builder.append_value(other.to_string());
+ Some(raw) => {
+ let raw_str = raw.get();
+ if raw_str == "null" {
+ builder.append_null();
+ } else if raw_str.starts_with('"') {
+ // String value: parse to unescape
+ match serde_json::from_str::<String>(raw_str) {
+ Ok(s) => builder.append_value(s),
+ Err(_) => builder.append_value(raw_str),
+ }
+ } else {
+ // Numbers, booleans: use raw text as-is
+ // Spark uppercases exponent: 1.5e10 → 1.5E10
+ if raw_str.contains('e') {
+ builder.append_value(raw_str.replace('e',
"E"));
+ } else {
+ builder.append_value(raw_str);
+ }
Review Comment:
`json_tuple` currently returns the raw text for `-0` (i.e. "-0"), but the
test comment notes Spark returns "0". If the goal is Spark compatibility,
consider normalizing negative zero to "0" (and then assert exactly "0" in the
unit test) so behavior is deterministic and matches Spark.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]