This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 5740774acc Fix AvroReader: Add union resolving for nested struct 
arrays (#12686)
5740774acc is described below

commit 5740774acc9febb51fafb17235066c4c5ef2b32a
Author: JonasDev1 <[email protected]>
AuthorDate: Thu Oct 3 00:07:07 2024 +0200

    Fix AvroReader: Add union resolving for nested struct arrays (#12686)
    
    * Add union resolving for nested struct arrays
    
    * Add test
    
    * Change test
    
    * Reproduce index error
    
    * fmt
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 .../datasource/avro_to_arrow/arrow_array_reader.rs | 89 +++++++++++++++++++++-
 1 file changed, 88 insertions(+), 1 deletion(-)

diff --git a/datafusion/core/src/datasource/avro_to_arrow/arrow_array_reader.rs 
b/datafusion/core/src/datasource/avro_to_arrow/arrow_array_reader.rs
index 3a5d50bba0..98b6702bc3 100644
--- a/datafusion/core/src/datasource/avro_to_arrow/arrow_array_reader.rs
+++ b/datafusion/core/src/datasource/avro_to_arrow/arrow_array_reader.rs
@@ -573,7 +573,7 @@ impl<'a, R: Read> AvroArrowArrayReader<'a, R> {
                 // extract list values, with non-lists converted to Value::Null
                 let array_item_count = rows
                     .iter()
-                    .map(|row| match row {
+                    .map(|row| match maybe_resolve_union(row) {
                         Value::Array(values) => values.len(),
                         _ => 1,
                     })
@@ -1643,6 +1643,93 @@ mod test {
         assert_batches_eq!(expected, &[batch]);
     }
 
+    #[test]
+    fn test_avro_nullable_struct_array() {
+        let schema = apache_avro::Schema::parse_str(
+            r#"
+            {
+              "type": "record",
+              "name": "r1",
+              "fields": [
+                {
+                  "name": "col1",
+                  "type": [
+                    "null",
+                    {
+                        "type": "array",
+                        "items": {
+                            "type": [
+                                "null",
+                                {
+                                    "type": "record",
+                                    "name": "Item",
+                                    "fields": [
+                                        {
+                                            "name": "id",
+                                            "type": "long"
+                                        }
+                                    ]
+                                }
+                            ]
+                        }
+                    }
+                  ],
+                  "default": null
+                }
+              ]
+            }"#,
+        )
+        .unwrap();
+        let jv1 = serde_json::json!({
+            "col1": [
+                {
+                    "id": 234
+                },
+                {
+                    "id": 345
+                }
+            ]
+        });
+        let r1 = apache_avro::to_value(jv1)
+            .unwrap()
+            .resolve(&schema)
+            .unwrap();
+        let r2 = apache_avro::to_value(serde_json::json!({ "col1": null }))
+            .unwrap()
+            .resolve(&schema)
+            .unwrap();
+
+        let mut w = apache_avro::Writer::new(&schema, vec![]);
+        for _i in 0..5 {
+            w.append(r1.clone()).unwrap();
+        }
+        w.append(r2).unwrap();
+        let bytes = w.into_inner().unwrap();
+
+        let mut reader = ReaderBuilder::new()
+            .read_schema()
+            .with_batch_size(20)
+            .build(std::io::Cursor::new(bytes))
+            .unwrap();
+        let batch = reader.next().unwrap().unwrap();
+        assert_eq!(batch.num_rows(), 6);
+        assert_eq!(batch.num_columns(), 1);
+
+        let expected = [
+            "+------------------------+",
+            "| col1                   |",
+            "+------------------------+",
+            "| [{id: 234}, {id: 345}] |",
+            "| [{id: 234}, {id: 345}] |",
+            "| [{id: 234}, {id: 345}] |",
+            "| [{id: 234}, {id: 345}] |",
+            "| [{id: 234}, {id: 345}] |",
+            "|                        |",
+            "+------------------------+",
+        ];
+        assert_batches_eq!(expected, &[batch]);
+    }
+
     #[test]
     fn test_avro_iterator() {
         let reader = build_reader("alltypes_plain.avro", 5);


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to