This is an automated email from the ASF dual-hosted git repository.

jayzhan pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 215f30f74a fix: improve `unnest_generic_list` handling of null list 
(#9975)
215f30f74a is described below

commit 215f30f74a12e91fd7dca0d30e37014c8c3493ed
Author: Jonah Gao <[email protected]>
AuthorDate: Mon Apr 8 11:08:06 2024 +0800

    fix: improve `unnest_generic_list` handling of null list (#9975)
    
    * fix: improve `unnest_generic_list` handling of null list
    
    * fix clippy
    
    * fix comment
---
 datafusion/physical-plan/src/unnest.rs | 139 +++++++++++++++++++++++++++------
 1 file changed, 117 insertions(+), 22 deletions(-)

diff --git a/datafusion/physical-plan/src/unnest.rs 
b/datafusion/physical-plan/src/unnest.rs
index 324e2ea2d7..6ea1b3c40c 100644
--- a/datafusion/physical-plan/src/unnest.rs
+++ b/datafusion/physical-plan/src/unnest.rs
@@ -364,32 +364,31 @@ fn unnest_generic_list<T: OffsetSizeTrait, P: 
ArrowPrimitiveType<Native = T>>(
     options: &UnnestOptions,
 ) -> Result<Arc<dyn Array + 'static>> {
     let values = list_array.values();
-    if list_array.null_count() == 0 || !options.preserve_nulls {
-        Ok(values.clone())
-    } else {
-        let mut take_indicies_builder =
-            PrimitiveArray::<P>::builder(values.len() + 
list_array.null_count());
-        let mut take_offset = 0;
+    if list_array.null_count() == 0 {
+        return Ok(values.clone());
+    }
 
-        list_array.iter().for_each(|elem| match elem {
-            Some(array) => {
-                for i in 0..array.len() {
-                    // take_offset + i is always positive
-                    let take_index = P::Native::from_usize(take_offset + 
i).unwrap();
-                    take_indicies_builder.append_value(take_index);
-                }
-                take_offset += array.len();
-            }
-            None => {
+    let mut take_indicies_builder =
+        PrimitiveArray::<P>::builder(values.len() + list_array.null_count());
+    let offsets = list_array.value_offsets();
+    for row in 0..list_array.len() {
+        if list_array.is_null(row) {
+            if options.preserve_nulls {
                 take_indicies_builder.append_null();
             }
-        });
-        Ok(kernels::take::take(
-            &values,
-            &take_indicies_builder.finish(),
-            None,
-        )?)
+        } else {
+            let start = offsets[row].as_usize();
+            let end = offsets[row + 1].as_usize();
+            for idx in start..end {
+                
take_indicies_builder.append_value(P::Native::from_usize(idx).unwrap());
+            }
+        }
     }
+    Ok(kernels::take::take(
+        &values,
+        &take_indicies_builder.finish(),
+        None,
+    )?)
 }
 
 fn build_batch_fixedsize_list(
@@ -596,3 +595,99 @@ where
 
     Ok(RecordBatch::try_new(schema.clone(), arrays.to_vec())?)
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::{
+        array::AsArray,
+        datatypes::{DataType, Field},
+    };
+    use arrow_array::StringArray;
+    use arrow_buffer::{BooleanBufferBuilder, NullBuffer, OffsetBuffer};
+
+    // Create a ListArray with the following list values:
+    //  [A, B, C], [], NULL, [D], NULL, [NULL, F]
+    fn make_test_array() -> ListArray {
+        let mut values = vec![];
+        let mut offsets = vec![0];
+        let mut valid = BooleanBufferBuilder::new(2);
+
+        // [A, B, C]
+        values.extend_from_slice(&[Some("A"), Some("B"), Some("C")]);
+        offsets.push(values.len() as i32);
+        valid.append(true);
+
+        // []
+        offsets.push(values.len() as i32);
+        valid.append(true);
+
+        // NULL with non-zero value length
+        // Issue https://github.com/apache/arrow-datafusion/issues/9932
+        values.push(Some("?"));
+        offsets.push(values.len() as i32);
+        valid.append(false);
+
+        // [D]
+        values.push(Some("D"));
+        offsets.push(values.len() as i32);
+        valid.append(true);
+
+        // Another NULL with zero value length
+        offsets.push(values.len() as i32);
+        valid.append(false);
+
+        // [NULL, F]
+        values.extend_from_slice(&[None, Some("F")]);
+        offsets.push(values.len() as i32);
+        valid.append(true);
+
+        let field = Arc::new(Field::new("item", DataType::Utf8, true));
+        ListArray::new(
+            field,
+            OffsetBuffer::new(offsets.into()),
+            Arc::new(StringArray::from(values)),
+            Some(NullBuffer::new(valid.finish())),
+        )
+    }
+
+    #[test]
+    fn test_unnest_generic_list() -> datafusion_common::Result<()> {
+        let list_array = make_test_array();
+
+        // Test with preserve_nulls = false
+        let options = UnnestOptions {
+            preserve_nulls: false,
+        };
+        let unnested_array =
+            unnest_generic_list::<i32, Int32Type>(&list_array, &options)?;
+        let strs = 
unnested_array.as_string::<i32>().iter().collect::<Vec<_>>();
+        assert_eq!(
+            strs,
+            vec![Some("A"), Some("B"), Some("C"), Some("D"), None, Some("F")]
+        );
+
+        // Test with preserve_nulls = true
+        let options = UnnestOptions {
+            preserve_nulls: true,
+        };
+        let unnested_array =
+            unnest_generic_list::<i32, Int32Type>(&list_array, &options)?;
+        let strs = 
unnested_array.as_string::<i32>().iter().collect::<Vec<_>>();
+        assert_eq!(
+            strs,
+            vec![
+                Some("A"),
+                Some("B"),
+                Some("C"),
+                None,
+                Some("D"),
+                None,
+                None,
+                Some("F")
+            ]
+        );
+
+        Ok(())
+    }
+}

Reply via email to