This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new d7d646164d feat: Implement Spark `bin` function (#20479)
d7d646164d is described below
commit d7d646164de5da5b6c320a5500757b02dbfcc309
Author: Kazantsev Maksim <[email protected]>
AuthorDate: Wed Feb 25 15:38:51 2026 +0400
feat: Implement Spark `bin` function (#20479)
## Which issue does this PR close?
N/A
## Rationale for this change
Add new function:
https://spark.apache.org/docs/latest/api/sql/index.html#bin
## What changes are included in this PR?
- Implementation
- Unit Tests
- SLT tests
## Are these changes tested?
Yes, tests added as part of this PR.
## Are there any user-facing changes?
No, this PR only adds a new function.
---------
Co-authored-by: Kazantsev Maksim <[email protected]>
---
datafusion/spark/src/function/math/bin.rs | 109 +++++++++++++++++++++
datafusion/spark/src/function/math/mod.rs | 8 ++
.../sqllogictest/test_files/spark/math/bin.slt | 79 +++++++++++----
3 files changed, 176 insertions(+), 20 deletions(-)
diff --git a/datafusion/spark/src/function/math/bin.rs
b/datafusion/spark/src/function/math/bin.rs
new file mode 100644
index 0000000000..5d3ed0f77a
--- /dev/null
+++ b/datafusion/spark/src/function/math/bin.rs
@@ -0,0 +1,109 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{ArrayRef, AsArray, StringArray};
+use arrow::datatypes::{DataType, Field, FieldRef, Int64Type};
+use datafusion::logical_expr::{ColumnarValue, Signature, TypeSignature,
Volatility};
+use datafusion_common::types::{NativeType, logical_int64};
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, internal_err};
+use datafusion_expr::{Coercion, ScalarFunctionArgs, ScalarUDFImpl,
TypeSignatureClass};
+use datafusion_functions::utils::make_scalar_function;
+use std::any::Any;
+use std::sync::Arc;
+
/// Spark-compatible `bin` expression.
///
/// Returns the binary (base-2) string representation of its integer
/// argument, matching Spark's `bin` SQL function.
/// <https://spark.apache.org/docs/latest/api/sql/index.html#bin>
#[derive(Debug, PartialEq, Eq, Hash)]
pub struct SparkBin {
    // Built in `new()`: accepts any numeric argument, implicitly
    // coerced to Int64 before invocation.
    signature: Signature,
}
+
+impl Default for SparkBin {
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
+impl SparkBin {
+ pub fn new() -> Self {
+ Self {
+ signature: Signature::one_of(
+ vec![TypeSignature::Coercible(vec![Coercion::new_implicit(
+ TypeSignatureClass::Native(logical_int64()),
+ vec![TypeSignatureClass::Numeric],
+ NativeType::Int64,
+ )])],
+ Volatility::Immutable,
+ ),
+ }
+ }
+}
+
impl ScalarUDFImpl for SparkBin {
    fn as_any(&self) -> &dyn Any {
        self
    }

    fn name(&self) -> &str {
        "bin"
    }

    fn signature(&self) -> &Signature {
        &self.signature
    }

    // The return type depends on argument nullability, so the
    // field-based variant below is the real implementation; reaching
    // this method indicates a planner bug.
    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
        internal_err!("return_field_from_args should be used instead")
    }

    // Always returns Utf8, propagating the nullability of the single
    // input argument (a null input produces a null output).
    fn return_field_from_args(
        &self,
        args: datafusion_expr::ReturnFieldArgs,
    ) -> Result<FieldRef> {
        Ok(Arc::new(Field::new(
            self.name(),
            DataType::Utf8,
            args.arg_fields[0].is_nullable(),
        )))
    }

    // `make_scalar_function` normalizes scalar arguments to arrays
    // before delegating to the array kernel.
    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
        make_scalar_function(spark_bin_inner, vec![])(&args.args)
    }
}
+
+fn spark_bin_inner(arg: &[ArrayRef]) -> Result<ArrayRef> {
+ let [array] = take_function_args("bin", arg)?;
+ match &array.data_type() {
+ DataType::Int64 => {
+ let result: StringArray = array
+ .as_primitive::<Int64Type>()
+ .iter()
+ .map(|opt| opt.map(spark_bin))
+ .collect();
+ Ok(Arc::new(result))
+ }
+ data_type => {
+ internal_err!("bin does not support: {data_type}")
+ }
+ }
+}
+
/// Formats an `i64` in base 2. Negative values render as their 64-bit
/// two's-complement bit pattern (Rust's `Binary` formatting of signed
/// integers), which matches Spark's `bin` semantics.
fn spark_bin(value: i64) -> String {
    format!("{:b}", value)
}
diff --git a/datafusion/spark/src/function/math/mod.rs
b/datafusion/spark/src/function/math/mod.rs
index 92d8e90ac3..7f7d04e06b 100644
--- a/datafusion/spark/src/function/math/mod.rs
+++ b/datafusion/spark/src/function/math/mod.rs
@@ -16,6 +16,7 @@
// under the License.
pub mod abs;
+pub mod bin;
pub mod expm1;
pub mod factorial;
pub mod hex;
@@ -42,6 +43,7 @@ make_udf_function!(width_bucket::SparkWidthBucket,
width_bucket);
make_udf_function!(trigonometry::SparkCsc, csc);
make_udf_function!(trigonometry::SparkSec, sec);
make_udf_function!(negative::SparkNegative, negative);
+make_udf_function!(bin::SparkBin, bin);
pub mod expr_fn {
use datafusion_functions::export_functions;
@@ -70,6 +72,11 @@ pub mod expr_fn {
"Returns the negation of expr (unary minus).",
arg1
));
+ export_functions!((
+ bin,
+ "Returns the string representation of the long value represented in
binary.",
+ arg1
+ ));
}
pub fn functions() -> Vec<Arc<ScalarUDF>> {
@@ -86,5 +93,6 @@ pub fn functions() -> Vec<Arc<ScalarUDF>> {
csc(),
sec(),
negative(),
+ bin(),
]
}
diff --git a/datafusion/sqllogictest/test_files/spark/math/bin.slt
b/datafusion/sqllogictest/test_files/spark/math/bin.slt
index 1fa24e6cda..b2e2aadde4 100644
--- a/datafusion/sqllogictest/test_files/spark/math/bin.slt
+++ b/datafusion/sqllogictest/test_files/spark/math/bin.slt
@@ -15,23 +15,62 @@
# specific language governing permissions and limitations
# under the License.
-# This file was originally created by a porting script from:
-#
https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
-# This file is part of the implementation of the datafusion-spark function
library.
-# For more information, please see:
-# https://github.com/apache/datafusion/issues/15914
-
-## Original Query: SELECT bin(-13);
-## PySpark 3.5.5 Result: {'bin(-13)':
'1111111111111111111111111111111111111111111111111111111111110011',
'typeof(bin(-13))': 'string', 'typeof(-13)': 'int'}
-#query
-#SELECT bin(-13::int);
-
-## Original Query: SELECT bin(13);
-## PySpark 3.5.5 Result: {'bin(13)': '1101', 'typeof(bin(13))': 'string',
'typeof(13)': 'int'}
-#query
-#SELECT bin(13::int);
-
-## Original Query: SELECT bin(13.3);
-## PySpark 3.5.5 Result: {'bin(13.3)': '1101', 'typeof(bin(13.3))': 'string',
'typeof(13.3)': 'decimal(3,1)'}
-#query
-#SELECT bin(13.3::decimal(3,1));
+query T
+SELECT bin(arrow_cast(NULL, 'Int8'));
+----
+NULL
+
+query T
+SELECT bin(arrow_cast(0, 'Int8'));
+----
+0
+
+query T
+SELECT bin(arrow_cast(13, 'Int8'));
+----
+1101
+
+query T
+SELECT bin(arrow_cast(13.36, 'Float16'));
+----
+1101
+
+query T
+SELECT bin(13.3::decimal(3,1));
+----
+1101
+
+query T
+SELECT bin(arrow_cast(-13, 'Int8'));
+----
+1111111111111111111111111111111111111111111111111111111111110011
+
+query T
+SELECT bin(arrow_cast(256, 'Int16'));
+----
+100000000
+
+query T
+SELECT bin(arrow_cast(-32768, 'Int16'));
+----
+1111111111111111111111111111111111111111111111111000000000000000
+
+query T
+SELECT bin(arrow_cast(-2147483648, 'Int32'));
+----
+1111111111111111111111111111111110000000000000000000000000000000
+
+query T
+SELECT bin(arrow_cast(1073741824, 'Int32'));
+----
+1000000000000000000000000000000
+
+query T
+SELECT bin(arrow_cast(-9223372036854775808, 'Int64'));
+----
+1000000000000000000000000000000000000000000000000000000000000000
+
+query T
+SELECT bin(arrow_cast(9223372036854775807, 'Int64'));
+----
+111111111111111111111111111111111111111111111111111111111111111
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]