This is an automated email from the ASF dual-hosted git repository.
jeffreyvo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new fd7df66724 feat(spark): implement Spark `bitmap` function `bitmap_count` (#17179)
fd7df66724 is described below
commit fd7df66724f958a2d44ba1fda1b11dc6833f0296
Author: Evgenii Glotov <[email protected]>
AuthorDate: Sun Aug 24 17:49:28 2025 +0300
feat(spark): implement Spark `bitmap` function `bitmap_count` (#17179)
* feat(spark): implement Spark `misc` function `bitmap_count`
* chore: add ASF license text
* chore: move bitmap_count to spark/bitmap module, improve error handling,
add sqllogictests for different types, remove hint
* fix: BitmapCount derive PartialEq, Eq, Hash
* chore: reminder to implement TypeSignature for BitmapCount when possible
---
.../spark/src/function/bitmap/bitmap_count.rs | 178 +++++++++++++++++++++
datafusion/spark/src/function/{ => bitmap}/mod.rs | 46 +++---
datafusion/spark/src/function/mod.rs | 1 +
datafusion/spark/src/lib.rs | 2 +
.../test_files/spark/bitmap/bitmap_count.slt | 61 +++++++
5 files changed, 263 insertions(+), 25 deletions(-)
diff --git a/datafusion/spark/src/function/bitmap/bitmap_count.rs b/datafusion/spark/src/function/bitmap/bitmap_count.rs
new file mode 100644
index 0000000000..966b0930f0
--- /dev/null
+++ b/datafusion/spark/src/function/bitmap/bitmap_count.rs
@@ -0,0 +1,178 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::array::{
+ Array, ArrayRef, BinaryArray, BinaryViewArray, FixedSizeBinaryArray, Int64Array,
+ LargeBinaryArray,
+};
+use arrow::datatypes::DataType;
+use arrow::datatypes::DataType::{
+ Binary, BinaryView, FixedSizeBinary, Int64, LargeBinary,
+};
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{internal_datafusion_err, internal_err, plan_err, Result};
+use datafusion_expr::{
+ ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+};
+use datafusion_functions::utils::make_scalar_function;
+use datafusion_functions::{downcast_arg, downcast_named_arg};
+
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct BitmapCount {
+ signature: Signature,
+}
+
+impl Default for BitmapCount {
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
+impl BitmapCount {
+ pub fn new() -> Self {
+ Self {
+ // TODO: add definitive TypeSignature after https://github.com/apache/datafusion/issues/17291 is done
+ signature: Signature::any(1, Volatility::Immutable),
+ }
+ }
+}
+
+impl ScalarUDFImpl for BitmapCount {
+ fn as_any(&self) -> &dyn Any {
+ self
+ }
+
+ fn name(&self) -> &str {
+ "bitmap_count"
+ }
+
+ fn signature(&self) -> &Signature {
+ &self.signature
+ }
+
+ fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+ match arg_types.first() {
+ Some(Binary | BinaryView | FixedSizeBinary(_) | LargeBinary) => Ok(Int64),
+ Some(data_type) => plan_err!(
+ "bitmap_count expects Binary/BinaryView/FixedSizeBinary/LargeBinary as argument, got {:?}",
+ data_type
+ ),
+ None => internal_err!("bitmap_count does not support zero arguments"),
+ }
+ }
+
+ fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+ make_scalar_function(bitmap_count_inner, vec![])(&args.args)
+ }
+}
+
+fn binary_count_ones(opt: Option<&[u8]>) -> Option<i64> {
+ opt.map(|value| value.iter().map(|b| b.count_ones() as i64).sum())
+}
+
+macro_rules! downcast_and_count_ones {
+ ($input_array:expr, $array_type:ident) => {{
+ let arr = downcast_arg!($input_array, $array_type);
+ Ok(arr.iter().map(binary_count_ones).collect::<Int64Array>())
+ }};
+}
+
+pub fn bitmap_count_inner(arg: &[ArrayRef]) -> Result<ArrayRef> {
+ let [input_array] = take_function_args("bitmap_count", arg)?;
+
+ let res: Result<Int64Array> = match &input_array.data_type() {
+ Binary => downcast_and_count_ones!(input_array, BinaryArray),
+ BinaryView => downcast_and_count_ones!(input_array, BinaryViewArray),
+ LargeBinary => downcast_and_count_ones!(input_array, LargeBinaryArray),
+ FixedSizeBinary(_size) => {
+ downcast_and_count_ones!(input_array, FixedSizeBinaryArray)
+ }
+ data_type => {
+ internal_err!("bitmap_count does not support {:?}", data_type)
+ }
+ };
+
+ Ok(Arc::new(res?))
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::function::bitmap::bitmap_count::BitmapCount;
+ use crate::function::utils::test::test_scalar_function;
+ use arrow::array::{Array, Int64Array};
+ use arrow::datatypes::DataType::Int64;
+ use datafusion_common::{Result, ScalarValue};
+ use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
+
+ macro_rules! test_bitmap_count_binary_invoke {
+ ($INPUT:expr, $EXPECTED:expr) => {
+ test_scalar_function!(
+ BitmapCount::new(),
+ vec![ColumnarValue::Scalar(ScalarValue::Binary($INPUT))],
+ $EXPECTED,
+ i64,
+ Int64,
+ Int64Array
+ );
+
+ test_scalar_function!(
+ BitmapCount::new(),
+ vec![ColumnarValue::Scalar(ScalarValue::LargeBinary($INPUT))],
+ $EXPECTED,
+ i64,
+ Int64,
+ Int64Array
+ );
+
+ test_scalar_function!(
+ BitmapCount::new(),
+ vec![ColumnarValue::Scalar(ScalarValue::BinaryView($INPUT))],
+ $EXPECTED,
+ i64,
+ Int64,
+ Int64Array
+ );
+
+ test_scalar_function!(
+ BitmapCount::new(),
+ vec![ColumnarValue::Scalar(ScalarValue::FixedSizeBinary(
+ $INPUT.map(|a| a.len()).unwrap_or(0) as i32,
+ $INPUT
+ ))],
+ $EXPECTED,
+ i64,
+ Int64,
+ Int64Array
+ );
+ };
+ }
+
+ #[test]
+ fn test_bitmap_count_invoke() -> Result<()> {
+ test_bitmap_count_binary_invoke!(None::<Vec<u8>>, Ok(None));
+ test_bitmap_count_binary_invoke!(Some(vec![0x0Au8]), Ok(Some(2)));
+ test_bitmap_count_binary_invoke!(Some(vec![0xFFu8, 0xFFu8]), Ok(Some(16)));
+ test_bitmap_count_binary_invoke!(
+ Some(vec![0x0Au8, 0xB0u8, 0xCDu8]),
+ Ok(Some(10))
+ );
+ Ok(())
+ }
+}
diff --git a/datafusion/spark/src/function/mod.rs b/datafusion/spark/src/function/bitmap/mod.rs
similarity index 63%
copy from datafusion/spark/src/function/mod.rs
copy to datafusion/spark/src/function/bitmap/mod.rs
index cac8741a89..8532c32ac9 100644
--- a/datafusion/spark/src/function/mod.rs
+++ b/datafusion/spark/src/function/bitmap/mod.rs
@@ -15,28 +15,24 @@
// specific language governing permissions and limitations
// under the License.
-pub mod aggregate;
-pub mod array;
-pub mod bitwise;
-pub mod collection;
-pub mod conditional;
-pub mod conversion;
-pub mod csv;
-pub mod datetime;
-pub mod error_utils;
-pub mod functions_nested_utils;
-pub mod generator;
-pub mod hash;
-pub mod json;
-pub mod lambda;
-pub mod map;
-pub mod math;
-pub mod misc;
-pub mod predicate;
-pub mod string;
-pub mod r#struct;
-pub mod table;
-pub mod url;
-pub mod utils;
-pub mod window;
-pub mod xml;
+pub mod bitmap_count;
+
+use datafusion_expr::ScalarUDF;
+use datafusion_functions::make_udf_function;
+use std::sync::Arc;
+
+make_udf_function!(bitmap_count::BitmapCount, bitmap_count);
+
+pub mod expr_fn {
+ use datafusion_functions::export_functions;
+
+ export_functions!((
+ bitmap_count,
+ "Returns the number of set bits in the input bitmap.",
+ arg
+ ));
+}
+
+pub fn functions() -> Vec<Arc<ScalarUDF>> {
+ vec![bitmap_count()]
+}
diff --git a/datafusion/spark/src/function/mod.rs b/datafusion/spark/src/function/mod.rs
index cac8741a89..3f4f94cfaa 100644
--- a/datafusion/spark/src/function/mod.rs
+++ b/datafusion/spark/src/function/mod.rs
@@ -17,6 +17,7 @@
pub mod aggregate;
pub mod array;
+pub mod bitmap;
pub mod bitwise;
pub mod collection;
pub mod conditional;
diff --git a/datafusion/spark/src/lib.rs b/datafusion/spark/src/lib.rs
index 4ce9be1263..531883a6c4 100644
--- a/datafusion/spark/src/lib.rs
+++ b/datafusion/spark/src/lib.rs
@@ -104,6 +104,7 @@ use std::sync::Arc;
pub mod expr_fn {
pub use super::function::aggregate::expr_fn::*;
pub use super::function::array::expr_fn::*;
+ pub use super::function::bitmap::expr_fn::*;
pub use super::function::bitwise::expr_fn::*;
pub use super::function::collection::expr_fn::*;
pub use super::function::conditional::expr_fn::*;
@@ -130,6 +131,7 @@ pub mod expr_fn {
pub fn all_default_scalar_functions() -> Vec<Arc<ScalarUDF>> {
function::array::functions()
.into_iter()
+ .chain(function::bitmap::functions())
.chain(function::bitwise::functions())
.chain(function::collection::functions())
.chain(function::conditional::functions())
diff --git a/datafusion/sqllogictest/test_files/spark/bitmap/bitmap_count.slt b/datafusion/sqllogictest/test_files/spark/bitmap/bitmap_count.slt
new file mode 100644
index 0000000000..2789efef7b
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/bitmap/bitmap_count.slt
@@ -0,0 +1,61 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+query I
+SELECT bitmap_count(X'1010');
+----
+2
+
+query I
+SELECT bitmap_count(X'FFFF');
+----
+16
+
+query I
+SELECT bitmap_count(X'0');
+----
+0
+
+query I
+SELECT bitmap_count(a) FROM (VALUES (X'0AB0'), (X'0AB0CD'), (NULL)) AS t(a);
+----
+5
+10
+NULL
+
+# Tests with different binary types
+query I
+SELECT bitmap_count(arrow_cast(a, 'LargeBinary')) FROM (VALUES (X'0AB0'), (X'0AB0CD'), (NULL)) AS t(a);
+----
+5
+10
+NULL
+
+query I
+SELECT bitmap_count(arrow_cast(a, 'BinaryView')) FROM (VALUES (X'0AB0'), (X'0AB0CD'), (NULL)) AS t(a);
+----
+5
+10
+NULL
+
+query I
+SELECT bitmap_count(arrow_cast(a, 'FixedSizeBinary(2)')) FROM (VALUES (X'1010'), (X'0AB0'), (X'FFFF'), (NULL)) AS t(a);
+----
+2
+5
+16
+NULL
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]