This is an automated email from the ASF dual-hosted git repository.
mbutrovich pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git
The following commit(s) were added to refs/heads/main by this push:
new 89a3c5f8f chore: move string function handling to new expression registry (#2931)
89a3c5f8f is described below
commit 89a3c5f8f8017509018a51373081c38ed1455b66
Author: Andy Grove <[email protected]>
AuthorDate: Thu Dec 18 05:30:07 2025 -0700
chore: move string function handling to new expression registry (#2931)
---
native/core/src/execution/expressions/mod.rs | 1 +
native/core/src/execution/expressions/strings.rs | 100 +++++++++++++++++++++
native/core/src/execution/planner.rs | 41 +--------
.../src/execution/planner/expression_registry.rs | 16 +++-
4 files changed, 120 insertions(+), 38 deletions(-)
diff --git a/native/core/src/execution/expressions/mod.rs b/native/core/src/execution/expressions/mod.rs
index 105afd595..a06b41b2c 100644
--- a/native/core/src/execution/expressions/mod.rs
+++ b/native/core/src/execution/expressions/mod.rs
@@ -22,6 +22,7 @@ pub mod bitwise;
pub mod comparison;
pub mod logical;
pub mod nullcheck;
+pub mod strings;
pub mod subquery;
pub use datafusion_comet_spark_expr::EvalMode;
diff --git a/native/core/src/execution/expressions/strings.rs b/native/core/src/execution/expressions/strings.rs
new file mode 100644
index 000000000..5f4300eb1
--- /dev/null
+++ b/native/core/src/execution/expressions/strings.rs
@@ -0,0 +1,100 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! String expression builders
+
+use std::cmp::max;
+use std::sync::Arc;
+
+use arrow::datatypes::SchemaRef;
+use datafusion::common::ScalarValue;
+use datafusion::physical_expr::expressions::{LikeExpr, Literal};
+use datafusion::physical_expr::PhysicalExpr;
+use datafusion_comet_proto::spark_expression::Expr;
+use datafusion_comet_spark_expr::{RLike, SubstringExpr};
+
+use crate::execution::{
+ expressions::extract_expr,
+ operators::ExecutionError,
+ planner::{expression_registry::ExpressionBuilder, PhysicalPlanner},
+};
+
+/// Builder for Substring expressions
+pub struct SubstringBuilder;
+
+impl ExpressionBuilder for SubstringBuilder {
+ fn build(
+ &self,
+ spark_expr: &Expr,
+ input_schema: SchemaRef,
+ planner: &PhysicalPlanner,
+ ) -> Result<Arc<dyn PhysicalExpr>, ExecutionError> {
+ let expr = extract_expr!(spark_expr, Substring);
+ let child = planner.create_expr(expr.child.as_ref().unwrap(), input_schema)?;
+ // Spark Substring's start is 1-based when start > 0
+ let start = expr.start - i32::from(expr.start > 0);
+ // substring negative len is treated as 0 in Spark
+ let len = max(expr.len, 0);
+
+ Ok(Arc::new(SubstringExpr::new(
+ child,
+ start as i64,
+ len as u64,
+ )))
+ }
+}
+
+/// Builder for Like expressions
+pub struct LikeBuilder;
+
+impl ExpressionBuilder for LikeBuilder {
+ fn build(
+ &self,
+ spark_expr: &Expr,
+ input_schema: SchemaRef,
+ planner: &PhysicalPlanner,
+ ) -> Result<Arc<dyn PhysicalExpr>, ExecutionError> {
+ let expr = extract_expr!(spark_expr, Like);
+ let left = planner.create_expr(expr.left.as_ref().unwrap(), Arc::clone(&input_schema))?;
+ let right = planner.create_expr(expr.right.as_ref().unwrap(), input_schema)?;
+
+ Ok(Arc::new(LikeExpr::new(false, false, left, right)))
+ }
+}
+
+/// Builder for Rlike (regex like) expressions
+pub struct RlikeBuilder;
+
+impl ExpressionBuilder for RlikeBuilder {
+ fn build(
+ &self,
+ spark_expr: &Expr,
+ input_schema: SchemaRef,
+ planner: &PhysicalPlanner,
+ ) -> Result<Arc<dyn PhysicalExpr>, ExecutionError> {
+ let expr = extract_expr!(spark_expr, Rlike);
+ let left = planner.create_expr(expr.left.as_ref().unwrap(), Arc::clone(&input_schema))?;
+ let right = planner.create_expr(expr.right.as_ref().unwrap(), input_schema)?;
+
+ match right.as_any().downcast_ref::<Literal>().unwrap().value() {
+ ScalarValue::Utf8(Some(pattern)) => Ok(Arc::new(RLike::try_new(left, pattern)?)),
+ _ => Err(ExecutionError::GeneralError(
+ "RLike only supports scalar patterns".to_string(),
+ )),
+ }
+ }
+}
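
For context on the index handling in SubstringBuilder above: Spark's substring start is 1-based when positive, and a negative length is treated as zero, so the builder shifts the start and clamps the length before constructing SubstringExpr. A minimal standalone sketch of the same conversion (the helper name to_datafusion_substring_args is illustrative only, not part of this patch):

    use std::cmp::max;

    // Mirrors the adjustment done in SubstringBuilder: Spark's start is
    // 1-based when positive, and a negative length is treated as zero.
    fn to_datafusion_substring_args(start: i32, len: i32) -> (i64, u64) {
        let start = start - i32::from(start > 0);
        let len = max(len, 0);
        (start as i64, len as u64)
    }

    fn main() {
        // substring(col, 1, 3) in Spark maps to a 0-based start of 0.
        assert_eq!(to_datafusion_substring_args(1, 3), (0, 3));
        // A negative length is clamped to zero.
        assert_eq!(to_datafusion_substring_args(2, -5), (1, 0));
    }
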
diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs
index 67b2523be..a019823b1 100644
--- a/native/core/src/execution/planner.rs
+++ b/native/core/src/execution/planner.rs
@@ -52,7 +52,7 @@ use datafusion::{
logical_expr::Operator as DataFusionOperator,
physical_expr::{
expressions::{
- in_list, BinaryExpr, CaseExpr, CastExpr, Column, IsNullExpr, LikeExpr,
+ in_list, BinaryExpr, CaseExpr, CastExpr, Column, IsNullExpr,
Literal as DataFusionLiteral,
},
PhysicalExpr, PhysicalSortExpr, ScalarFunctionExpr,
@@ -124,9 +124,9 @@ use datafusion_comet_proto::{
use datafusion_comet_spark_expr::monotonically_increasing_id::MonotonicallyIncreasingId;
use datafusion_comet_spark_expr::{
ArrayInsert, Avg, AvgDecimal, Cast, CheckOverflow, Correlation, Covariance, CreateNamedStruct,
- GetArrayStructFields, GetStructField, IfExpr, ListExtract, NormalizeNaNAndZero, RLike,
- RandExpr, RandnExpr, SparkCastOptions, Stddev, SubstringExpr, SumDecimal, TimestampTruncExpr,
- ToJson, UnboundColumn, Variance,
+ GetArrayStructFields, GetStructField, IfExpr, ListExtract, NormalizeNaNAndZero, RandExpr,
+ RandnExpr, SparkCastOptions, Stddev, SumDecimal, TimestampTruncExpr, ToJson, UnboundColumn,
+ Variance,
};
use itertools::Itertools;
use jni::objects::GlobalRef;
@@ -433,39 +433,6 @@ impl PhysicalPlanner {
Ok(Arc::new(TimestampTruncExpr::new(child, format, timezone)))
}
- ExprStruct::Substring(expr) => {
- let child = self.create_expr(expr.child.as_ref().unwrap(), input_schema)?;
- // Spark Substring's start is 1-based when start > 0
- let start = expr.start - i32::from(expr.start > 0);
- // substring negative len is treated as 0 in Spark
- let len = max(expr.len, 0);
-
- Ok(Arc::new(SubstringExpr::new(
- child,
- start as i64,
- len as u64,
- )))
- }
- ExprStruct::Like(expr) => {
- let left =
- self.create_expr(expr.left.as_ref().unwrap(), Arc::clone(&input_schema))?;
- let right = self.create_expr(expr.right.as_ref().unwrap(), input_schema)?;
-
- Ok(Arc::new(LikeExpr::new(false, false, left, right)))
- }
- ExprStruct::Rlike(expr) => {
- let left =
- self.create_expr(expr.left.as_ref().unwrap(), Arc::clone(&input_schema))?;
- let right = self.create_expr(expr.right.as_ref().unwrap(), input_schema)?;
- match right.as_any().downcast_ref::<Literal>().unwrap().value() {
- ScalarValue::Utf8(Some(pattern)) => {
- Ok(Arc::new(RLike::try_new(left, pattern)?))
- }
- _ => Err(GeneralError(
- "RLike only supports scalar patterns".to_string(),
- )),
- }
- }
ExprStruct::CheckOverflow(expr) => {
let child = self.create_expr(expr.child.as_ref().unwrap(), input_schema)?;
let data_type = to_arrow_datatype(expr.datatype.as_ref().unwrap());
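
As in the removed planner arm, the new RlikeBuilder only accepts a literal pattern: the right-hand physical expression is downcast to Literal and must hold a Utf8 scalar, otherwise a GeneralError is returned. A hedged, self-contained illustration of that check using only DataFusion types (the helper scalar_pattern is illustrative, not part of the patch):

    use std::sync::Arc;

    use datafusion::common::ScalarValue;
    use datafusion::physical_expr::expressions::Literal;
    use datafusion::physical_expr::PhysicalExpr;

    // Extract a scalar string pattern from a physical expression, as the
    // Rlike builder does before constructing RLike; returns None when the
    // expression is not a Utf8 literal.
    fn scalar_pattern(expr: &Arc<dyn PhysicalExpr>) -> Option<String> {
        match expr.as_any().downcast_ref::<Literal>()?.value() {
            ScalarValue::Utf8(Some(pattern)) => Some(pattern.clone()),
            _ => None,
        }
    }

    fn main() {
        let lit: Arc<dyn PhysicalExpr> =
            Arc::new(Literal::new(ScalarValue::Utf8(Some("a.*".to_string()))));
        assert_eq!(scalar_pattern(&lit), Some("a.*".to_string()));
    }
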
diff --git a/native/core/src/execution/planner/expression_registry.rs b/native/core/src/execution/planner/expression_registry.rs
index 227484ca8..3321f6118 100644
--- a/native/core/src/execution/planner/expression_registry.rs
+++ b/native/core/src/execution/planner/expression_registry.rs
@@ -177,8 +177,10 @@ impl ExpressionRegistry {
// Register null check expressions
self.register_null_check_expressions();
+ // Register string expressions
+ self.register_string_expressions();
+
// TODO: Register other expression categories in future phases
- // self.register_string_expressions();
// self.register_temporal_expressions();
// etc.
}
@@ -269,6 +271,18 @@ impl ExpressionRegistry {
.insert(ExpressionType::IsNotNull, Box::new(IsNotNullBuilder));
}
+ /// Register string expression builders
+ fn register_string_expressions(&mut self) {
+ use crate::execution::expressions::strings::*;
+
+ self.builders
+ .insert(ExpressionType::Substring, Box::new(SubstringBuilder));
+ self.builders
+ .insert(ExpressionType::Like, Box::new(LikeBuilder));
+ self.builders
+ .insert(ExpressionType::Rlike, Box::new(RlikeBuilder));
+ }
+
/// Extract expression type from Spark protobuf expression
fn get_expression_type(spark_expr: &Expr) -> Result<ExpressionType, ExecutionError> {
match spark_expr.expr_struct.as_ref() {
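
The registry dispatch these builders plug into is not shown in this diff. As a simplified, self-contained model of the pattern (a map from expression type to boxed builder trait objects, mirroring the builders.insert calls above), here is a sketch; the types below are stand-ins with pared-down signatures, not the actual Comet ExpressionRegistry API:

    use std::collections::HashMap;

    // Stand-in for the protobuf expression; the real builders take
    // datafusion_comet_proto::spark_expression::Expr plus a schema and planner.
    struct Expr { text: String }

    #[allow(dead_code)]
    #[derive(Debug, PartialEq, Eq, Hash)]
    enum ExpressionType { Substring, Like, Rlike }

    // Pared-down version of the ExpressionBuilder trait: each builder turns
    // a Spark expression into some planned representation (a String here).
    trait ExpressionBuilder {
        fn build(&self, expr: &Expr) -> Result<String, String>;
    }

    struct LikeBuilder;
    impl ExpressionBuilder for LikeBuilder {
        fn build(&self, expr: &Expr) -> Result<String, String> {
            Ok(format!("LikeExpr({})", expr.text))
        }
    }

    // Registry: expression type -> boxed builder, mirroring
    // builders.insert(ExpressionType::Like, Box::new(LikeBuilder)).
    struct ExpressionRegistry {
        builders: HashMap<ExpressionType, Box<dyn ExpressionBuilder>>,
    }

    impl ExpressionRegistry {
        fn new() -> Self {
            let mut builders: HashMap<ExpressionType, Box<dyn ExpressionBuilder>> =
                HashMap::new();
            builders.insert(ExpressionType::Like, Box::new(LikeBuilder));
            Self { builders }
        }

        fn build(&self, ty: &ExpressionType, expr: &Expr) -> Result<String, String> {
            match self.builders.get(ty) {
                Some(builder) => builder.build(expr),
                None => Err(format!("no builder registered for {ty:?}")),
            }
        }
    }

    fn main() {
        let registry = ExpressionRegistry::new();
        let expr = Expr { text: "name LIKE 'a%'".to_string() };
        println!("{:?}", registry.build(&ExpressionType::Like, &expr));
    }
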