This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new f870dcd878 fix: Support Dictionary[Int32, Binary] for bitmap count 
spark function (#18273)
f870dcd878 is described below

commit f870dcd87852d77c16e49c8b7c38337becf787ef
Author: Kazantsev Maksim <[email protected]>
AuthorDate: Mon Oct 27 00:06:54 2025 -0700

    fix: Support Dictionary[Int32, Binary] for bitmap count spark function 
(#18273)
    
    ## Which issue does this PR close?
    
    Closes https://github.com/apache/datafusion/issues/18058
    
    ## Rationale for this change
    
    When adding the bitmap_count function to Comet, we get the following
    error: org.apache.comet.CometNativeException: Error from DataFusion:
    bitmap_count expects Binary/BinaryView/FixedSizeBinary/LargeBinary as
    argument, got Dictionary(Int32, Binary).
    
    This change makes bitmap_count accept dictionary-encoded Binary input
    with Int8, Int16, Int32, or Int64 keys.
    
    ## Are these changes tested?
    
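    Yes. Added a new unit test for a Dictionary(Int32, Binary) scalar and
    sqllogictest cases for Dictionary keys of type Int8, Int16, Int32, and
    Int64.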
    
    ---------
    
    Co-authored-by: Kazantsev Maksim <[email protected]>
---
 .../spark/src/function/bitmap/bitmap_count.rs      | 65 ++++++++++++++++++++--
 .../test_files/spark/bitmap/bitmap_count.slt       | 32 +++++++++++
 2 files changed, 91 insertions(+), 6 deletions(-)

diff --git a/datafusion/spark/src/function/bitmap/bitmap_count.rs 
b/datafusion/spark/src/function/bitmap/bitmap_count.rs
index 15bd33229a..56a9c5edb8 100644
--- a/datafusion/spark/src/function/bitmap/bitmap_count.rs
+++ b/datafusion/spark/src/function/bitmap/bitmap_count.rs
@@ -19,13 +19,13 @@ use std::any::Any;
 use std::sync::Arc;
 
 use arrow::array::{
-    Array, ArrayRef, BinaryArray, BinaryViewArray, FixedSizeBinaryArray, 
Int64Array,
-    LargeBinaryArray,
+    as_dictionary_array, Array, ArrayRef, BinaryArray, BinaryViewArray,
+    FixedSizeBinaryArray, Int64Array, LargeBinaryArray,
 };
-use arrow::datatypes::DataType;
 use arrow::datatypes::DataType::{
-    Binary, BinaryView, FixedSizeBinary, Int64, LargeBinary,
+    Binary, BinaryView, Dictionary, FixedSizeBinary, LargeBinary,
 };
+use arrow::datatypes::{DataType, Int16Type, Int32Type, Int64Type, Int8Type};
 use datafusion_common::utils::take_function_args;
 use datafusion_common::{internal_err, Result};
 use datafusion_expr::{
@@ -71,7 +71,7 @@ impl ScalarUDFImpl for BitmapCount {
     }
 
     fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
-        Ok(Int64)
+        Ok(DataType::Int64)
     }
 
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> 
Result<ColumnarValue> {
@@ -90,6 +90,17 @@ macro_rules! downcast_and_count_ones {
     }};
 }
 
+macro_rules! downcast_dict_and_count_ones {
+    ($input_dict:expr, $key_array_type:ident) => {{
+        let dict_array = as_dictionary_array::<$key_array_type>($input_dict);
+        let array = dict_array.downcast_dict::<BinaryArray>().unwrap();
+        Ok(array
+            .into_iter()
+            .map(binary_count_ones)
+            .collect::<Int64Array>())
+    }};
+}
+
 pub fn bitmap_count_inner(arg: &[ArrayRef]) -> Result<ArrayRef> {
     let [input_array] = take_function_args("bitmap_count", arg)?;
 
@@ -100,6 +111,17 @@ pub fn bitmap_count_inner(arg: &[ArrayRef]) -> 
Result<ArrayRef> {
         FixedSizeBinary(_size) => {
             downcast_and_count_ones!(input_array, FixedSizeBinaryArray)
         }
+        Dictionary(k, v) if v.as_ref() == &Binary => match k.as_ref() {
+            DataType::Int8 => downcast_dict_and_count_ones!(input_array, 
Int8Type),
+            DataType::Int16 => downcast_dict_and_count_ones!(input_array, 
Int16Type),
+            DataType::Int32 => downcast_dict_and_count_ones!(input_array, 
Int32Type),
+            DataType::Int64 => downcast_dict_and_count_ones!(input_array, 
Int64Type),
+            data_type => {
+                internal_err!(
+                    "bitmap_count does not support Dictionary({data_type}, 
Binary)"
+                )
+            }
+        },
         data_type => {
             internal_err!("bitmap_count does not support {data_type}")
         }
@@ -114,8 +136,12 @@ mod tests {
     use crate::function::utils::test::test_scalar_function;
     use arrow::array::{Array, Int64Array};
     use arrow::datatypes::DataType::Int64;
+    use arrow::datatypes::{DataType, Field};
+    use datafusion_common::config::ConfigOptions;
     use datafusion_common::{Result, ScalarValue};
-    use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
+    use datafusion_expr::ColumnarValue::Scalar;
+    use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
+    use std::sync::Arc;
 
     macro_rules! test_bitmap_count_binary_invoke {
         ($INPUT:expr, $EXPECTED:expr) => {
@@ -171,4 +197,31 @@ mod tests {
         );
         Ok(())
     }
+
+    #[test]
+    fn test_dictionary_encoded_bitmap_count_invoke() -> Result<()> {
+        let dict = Scalar(ScalarValue::Dictionary(
+            Box::new(DataType::Int32),
+            Box::new(ScalarValue::Binary(Some(vec![0xFFu8, 0xFFu8]))),
+        ));
+
+        let arg_fields = vec![Field::new(
+            "a",
+            DataType::Dictionary(Box::new(DataType::Int32), 
Box::new(DataType::Binary)),
+            true,
+        )
+        .into()];
+        let args = ScalarFunctionArgs {
+            args: vec![dict.clone()],
+            arg_fields,
+            number_rows: 1,
+            return_field: Field::new("f", Int64, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
+        };
+        let udf = BitmapCount::new();
+        let actual = udf.invoke_with_args(args)?;
+        let expect = Scalar(ScalarValue::Int64(Some(16)));
+        assert_eq!(*actual.into_array(1)?, *expect.into_array(1)?);
+        Ok(())
+    }
 }
diff --git a/datafusion/sqllogictest/test_files/spark/bitmap/bitmap_count.slt 
b/datafusion/sqllogictest/test_files/spark/bitmap/bitmap_count.slt
index 2789efef7b..39dca51222 100644
--- a/datafusion/sqllogictest/test_files/spark/bitmap/bitmap_count.slt
+++ b/datafusion/sqllogictest/test_files/spark/bitmap/bitmap_count.slt
@@ -59,3 +59,35 @@ SELECT bitmap_count(arrow_cast(a, 'FixedSizeBinary(2)')) 
FROM (VALUES (X'1010'),
 5
 16
 NULL
+
+query I
+SELECT bitmap_count(arrow_cast(a, 'Dictionary(Int32, Binary)')) FROM (VALUES 
(X'1010'), (X'0AB0'), (X'FFFF'), (NULL)) AS t(a);
+----
+2
+5
+16
+NULL
+
+query I
+SELECT bitmap_count(arrow_cast(a, 'Dictionary(Int8, Binary)')) FROM (VALUES 
(X'1010'), (X'0AB0'), (X'FFFF'), (NULL)) AS t(a);
+----
+2
+5
+16
+NULL
+
+query I
+SELECT bitmap_count(arrow_cast(a, 'Dictionary(Int16, Binary)')) FROM (VALUES 
(X'1010'), (X'0AB0'), (X'FFFF'), (NULL)) AS t(a);
+----
+2
+5
+16
+NULL
+
+query I
+SELECT bitmap_count(arrow_cast(a, 'Dictionary(Int64, Binary)')) FROM (VALUES 
(X'1010'), (X'0AB0'), (X'FFFF'), (NULL)) AS t(a);
+----
+2
+5
+16
+NULL


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to