Copilot commented on code in PR #21264:
URL: https://github.com/apache/datafusion/pull/21264#discussion_r3012180514
##########
datafusion/spark/src/function/json/json_tuple.rs:
##########
@@ -144,14 +147,25 @@ fn json_tuple_inner(args: &[ArrayRef], return_type:
&DataType) -> Result<ArrayRe
}
let field_name = field_arrays[field_idx].value(row_idx);
match map.get(field_name) {
- Some(serde_json::Value::Null) => {
- builder.append_null();
- }
- Some(serde_json::Value::String(s)) => {
- builder.append_value(s);
- }
- Some(other) => {
- builder.append_value(other.to_string());
+ Some(raw) => {
+ let raw_str = raw.get();
+ if raw_str == "null" {
+ builder.append_null();
+ } else if raw_str.starts_with('"') {
+ // String value: parse to unescape
+ match serde_json::from_str::<String>(raw_str) {
+ Ok(s) => builder.append_value(s),
+ Err(_) => builder.append_value(raw_str),
+ }
+ } else {
+ // Numbers, booleans: use raw text as-is
+ // Spark uppercases exponent: 1.5e10 → 1.5E10
+ if raw_str.contains('e') {
Review Comment:
The exponent uppercasing logic is applied to any non-string JSON value that
contains the character 'e'. This will corrupt the boolean literals `false`
(becomes `falsE`) and `true` (becomes `truE`), and can also mutate nested
objects/arrays whose raw JSON contains 'e' in keys or strings (e.g.
`{ "here": 1 }` becomes `{ "hErE": 1 }`). Uppercasing should only be applied
when the raw value is a JSON number that uses exponent notation (e/E), not to
booleans, objects, or arrays.
```suggestion
                            // Numbers, booleans, objects, arrays: use raw text as-is
                            // Spark uppercases exponent in numeric literals: 1.5e10 → 1.5E10
                            if (raw_str.contains('e') || raw_str.contains('E'))
                                && serde_json::from_str::<serde_json::Number>(raw_str).is_ok()
                            {
                                // Only adjust exponent marker for valid numeric literals
```
##########
datafusion/spark/src/function/json/json_tuple.rs:
##########
@@ -219,6 +234,73 @@ mod tests {
}
}
+ /// Helper to run json_tuple with a single field and return the result
string.
+ fn json_tuple_single(json: &str, field: &str) -> Option<String> {
+ let json_arr: ArrayRef = Arc::new(StringArray::from(vec![json]));
+ let field_arr: ArrayRef = Arc::new(StringArray::from(vec![field]));
+
+ let return_type =
+ DataType::Struct(vec![Field::new("c0", DataType::Utf8,
true)].into());
+
+ let result = json_tuple_inner(&[json_arr, field_arr],
&return_type).unwrap();
+ let struct_arr =
result.as_any().downcast_ref::<StructArray>().unwrap();
+ let col = struct_arr.column(0);
+ let str_arr = col.as_any().downcast_ref::<StringArray>().unwrap();
+
+ if str_arr.is_null(0) {
+ None
+ } else {
+ Some(str_arr.value(0).to_string())
+ }
+ }
+
+ #[test]
+ fn test_number_scientific_notation() {
+ // Spark: json_tuple('{"v":1.5e10}', 'v') → '1.5E10'
+ assert_eq!(
+ json_tuple_single(r#"{"v":1.5e10}"#, "v"),
+ Some("1.5E10".to_string())
+ );
+ }
+
+ #[test]
+ fn test_number_large_integer() {
+ // Spark: json_tuple('{"v":99999999999999999999}', 'v') →
'99999999999999999999'
+ assert_eq!(
+ json_tuple_single(r#"{"v":99999999999999999999}"#, "v"),
+ Some("99999999999999999999".to_string())
+ );
+ }
+
+ #[test]
+ fn test_number_negative_zero() {
+ // Spark: json_tuple('{"v":-0}', 'v') → '0'
+ // RawValue preserves '-0', but Spark returns '0'
+ // This is acceptable — both are valid representations
+ let result = json_tuple_single(r#"{"v":-0}"#, "v");
+ assert!(
+ result == Some("-0".to_string()) || result ==
Some("0".to_string()),
+ "expected '-0' or '0', got {:?}",
+ result
Review Comment:
This unit test allows either "-0" or "0", which makes it possible for
`json_tuple` to diverge from Spark without failing CI (the comment states Spark
returns "0"). If Spark compatibility is required here, the test should assert a
single expected value after the implementation normalizes negative zero.
```suggestion
        // Ensure compatibility with Spark by expecting normalized "0"
        assert_eq!(
            json_tuple_single(r#"{"v":-0}"#, "v"),
            Some("0".to_string())
```
##########
datafusion/spark/src/function/json/json_tuple.rs:
##########
@@ -144,14 +147,25 @@ fn json_tuple_inner(args: &[ArrayRef], return_type:
&DataType) -> Result<ArrayRe
}
let field_name = field_arrays[field_idx].value(row_idx);
match map.get(field_name) {
- Some(serde_json::Value::Null) => {
- builder.append_null();
- }
- Some(serde_json::Value::String(s)) => {
- builder.append_value(s);
- }
- Some(other) => {
- builder.append_value(other.to_string());
+ Some(raw) => {
+ let raw_str = raw.get();
+ if raw_str == "null" {
+ builder.append_null();
+ } else if raw_str.starts_with('"') {
+ // String value: parse to unescape
+ match serde_json::from_str::<String>(raw_str) {
+ Ok(s) => builder.append_value(s),
+ Err(_) => builder.append_value(raw_str),
+ }
+ } else {
+ // Numbers, booleans: use raw text as-is
+ // Spark uppercases exponent: 1.5e10 → 1.5E10
+ if raw_str.contains('e') {
+ builder.append_value(raw_str.replace('e',
"E"));
+ } else {
+ builder.append_value(raw_str);
+ }
Review Comment:
`json_tuple` currently returns the raw text for `-0` (i.e. "-0"), but the
test comment notes Spark returns "0". If the goal is Spark compatibility,
consider normalizing negative zero to "0" (and then assert exactly "0" in the
unit test) so behavior is deterministic and matches Spark.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]