Jefffrey commented on code in PR #18531:
URL: https://github.com/apache/datafusion/pull/18531#discussion_r2517024999


##########
datafusion/functions-nested/src/map.rs:
##########
@@ -84,22 +80,42 @@ fn make_map_batch(args: &[ColumnarValue]) -> 
Result<ColumnarValue> {
 
             row_keys
                 .iter()
-                .try_for_each(|key| check_unique_keys(key.as_ref()))?;
+                .try_for_each(|key| validate_map_keys(key.as_ref()))?;
         }
         ColumnarValue::Scalar(_) => {
-            check_unique_keys(key_array)?;
+            validate_map_keys(key_array)?;
         }
     }
 
     let values = get_first_array_ref(values_arg)?;
+
+    // For const evaluation with NULL maps, we must use make_map_array_internal
+    // because make_map_batch_internal doesn't handle NULL list elements 
correctly
+    if can_evaluate_to_const && keys.null_count() > 0 {
+        // If there are NULL maps, use the array path which handles them 
correctly
+        return if let DataType::LargeList(..) = keys_arg.data_type() {
+            make_map_array_internal::<i64>(keys, values)
+        } else {
+            make_map_array_internal::<i32>(keys, values)
+        };
+    }

Review Comment:
   We should combine it with the existing check inside 
`make_map_batch_internal`:
   
   
https://github.com/apache/datafusion/blob/becc71be04732ca2f62139192dcc76caf6c08fd3/datafusion/functions-nested/src/map.rs#L133-L139
   
   e.g. `!can_evaluate_to_const || keys.null_count() > 0`
   
   Though it would be nice if we could fix the scalar fast path in 
`make_map_batch_internal` to not need this workaround
   
   (Also this made me realize FixedSizeLists are broken for key array input, 
since it only handles List/LargeList)



##########
datafusion/functions-nested/src/map.rs:
##########
@@ -393,17 +466,133 @@ fn make_map_array_internal<O: OffsetSizeTrait>(
         .add_child_data(flattened_values.to_data())
         .build()?;
 
-    let map_data = ArrayData::builder(DataType::Map(
+    let mut map_data_builder = ArrayData::builder(DataType::Map(
         Arc::new(Field::new(
             "entries",
             struct_data.data_type().clone(),
             false,
         )),
         false,
     ))
-    .len(keys.len())
+    .len(original_len) // Use the original number of rows, not the filtered 
count
     .add_child_data(struct_data)
-    .add_buffer(Buffer::from_slice_ref(offset_buffer.as_slice()))
-    .build()?;
+    .add_buffer(Buffer::from_slice_ref(offset_buffer.as_slice()));

Review Comment:
   I just realized this would lead to an error with large lists; large lists 
have offset `i64` and `offset_buffer` would similarly be `Vec<i64>` but 
MapArrays expect offsets as i32



##########
datafusion/functions-nested/src/map.rs:
##########
@@ -356,27 +372,84 @@ fn make_map_array_internal<O: OffsetSizeTrait>(
     keys: ArrayRef,
     values: ArrayRef,
 ) -> Result<ColumnarValue> {
-    let mut offset_buffer = vec![O::zero()];
-    let mut running_offset = O::zero();
+    // Save original data types and array length before list_to_arrays 
transforms them
+    let keys_data_type = keys.data_type().clone();
+    let values_data_type = values.data_type().clone();
+    let original_len = keys.len(); // This is the number of rows in the input
+
+    // Save the nulls bitmap from the original keys array (before 
list_to_arrays)
+    // This tells us which MAP values are NULL (not which keys within maps are 
null)
+    let nulls_bitmap = keys.nulls().cloned();
 
     let keys = list_to_arrays::<O>(&keys);
     let values = list_to_arrays::<O>(&values);
 
     let mut key_array_vec = vec![];
     let mut value_array_vec = vec![];
     for (k, v) in keys.iter().zip(values.iter()) {
-        running_offset = running_offset.add(O::usize_as(k.len()));
-        offset_buffer.push(running_offset);
         key_array_vec.push(k.as_ref());
         value_array_vec.push(v.as_ref());
     }
 
-    // concatenate all the arrays
-    let flattened_keys = arrow::compute::concat(key_array_vec.as_ref())?;
-    if flattened_keys.null_count() > 0 {
-        return exec_err!("keys cannot be null");
+    // Build offset buffer that accounts for NULL maps
+    // For each row, if it's NULL, the offset stays the same (empty range)
+    // If it's not NULL, the offset advances by the number of entries in that 
map
+    let mut offset_buffer = vec![O::zero()];
+    let mut running_offset = O::zero();
+    let mut non_null_idx = 0;
+
+    for i in 0..original_len {
+        let is_null = nulls_bitmap.as_ref().is_some_and(|nulls| 
nulls.is_null(i));
+        if is_null {
+            // NULL map: offset doesn't advance (empty range)
+            offset_buffer.push(running_offset);
+        } else {
+            // Non-NULL map: advance offset by the number of entries
+            if non_null_idx < key_array_vec.len() {
+                running_offset =
+                    
running_offset.add(O::usize_as(key_array_vec[non_null_idx].len()));
+                non_null_idx += 1;
+            }
+            offset_buffer.push(running_offset);
+        }
     }

Review Comment:
   ```suggestion
       let mut running_offset = O::zero();
       let mut offset_buffer = vec![running_offset];
       let mut non_null_idx = 0;
       for i in 0..original_len {
           let is_null = nulls_bitmap.as_ref().is_some_and(|nulls| 
nulls.is_null(i));
           if !is_null {
               running_offset += O::usize_as(keys[non_null_idx].len());
               non_null_idx += 1;
           }
           offset_buffer.push(running_offset);
       }
   ```
   
   Bit more compact



##########
datafusion/functions-nested/src/map.rs:
##########
@@ -356,27 +372,84 @@ fn make_map_array_internal<O: OffsetSizeTrait>(
     keys: ArrayRef,
     values: ArrayRef,
 ) -> Result<ColumnarValue> {
-    let mut offset_buffer = vec![O::zero()];
-    let mut running_offset = O::zero();
+    // Save original data types and array length before list_to_arrays 
transforms them
+    let keys_data_type = keys.data_type().clone();
+    let values_data_type = values.data_type().clone();
+    let original_len = keys.len(); // This is the number of rows in the input
+
+    // Save the nulls bitmap from the original keys array (before 
list_to_arrays)
+    // This tells us which MAP values are NULL (not which keys within maps are 
null)
+    let nulls_bitmap = keys.nulls().cloned();
 
     let keys = list_to_arrays::<O>(&keys);
     let values = list_to_arrays::<O>(&values);
 
     let mut key_array_vec = vec![];
     let mut value_array_vec = vec![];
     for (k, v) in keys.iter().zip(values.iter()) {
-        running_offset = running_offset.add(O::usize_as(k.len()));
-        offset_buffer.push(running_offset);
         key_array_vec.push(k.as_ref());
         value_array_vec.push(v.as_ref());
     }
 
-    // concatenate all the arrays
-    let flattened_keys = arrow::compute::concat(key_array_vec.as_ref())?;
-    if flattened_keys.null_count() > 0 {
-        return exec_err!("keys cannot be null");
+    // Build offset buffer that accounts for NULL maps
+    // For each row, if it's NULL, the offset stays the same (empty range)
+    // If it's not NULL, the offset advances by the number of entries in that 
map
+    let mut offset_buffer = vec![O::zero()];
+    let mut running_offset = O::zero();
+    let mut non_null_idx = 0;
+
+    for i in 0..original_len {
+        let is_null = nulls_bitmap.as_ref().is_some_and(|nulls| 
nulls.is_null(i));
+        if is_null {
+            // NULL map: offset doesn't advance (empty range)
+            offset_buffer.push(running_offset);
+        } else {
+            // Non-NULL map: advance offset by the number of entries
+            if non_null_idx < key_array_vec.len() {
+                running_offset =
+                    
running_offset.add(O::usize_as(key_array_vec[non_null_idx].len()));
+                non_null_idx += 1;
+            }
+            offset_buffer.push(running_offset);
+        }
     }
-    let flattened_values = arrow::compute::concat(value_array_vec.as_ref())?;
+
+    // concatenate all the arrays
+    // If key_array_vec is empty, it means all maps were NULL (list elements 
were NULL).
+    // In this case, we need to create empty arrays with the correct data type.
+    let (flattened_keys, flattened_values) = if key_array_vec.is_empty() {
+        // All maps are NULL - create empty arrays
+        // We need to infer the data type from the original keys/values arrays
+        let key_type = match &keys_data_type {
+            DataType::List(field) | DataType::LargeList(field) => {
+                field.data_type().clone()
+            }
+            DataType::FixedSizeList(field, _) => field.data_type().clone(),
+            _ => return exec_err!("Expected List, LargeList or FixedSizeList 
for keys"),
+        };
+
+        let value_type = match &values_data_type {
+            DataType::List(field) | DataType::LargeList(field) => {
+                field.data_type().clone()
+            }
+            DataType::FixedSizeList(field, _) => field.data_type().clone(),
+            _ => {
+                return exec_err!("Expected List, LargeList or FixedSizeList 
for values")
+            }
+        };

Review Comment:
   ```suggestion
           let key_type = get_element_type(&keys_data_type)?;
           let value_type = get_element_type(&values_data_type)?;
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to