This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new c2879f5105 move strpos, substr functions to datafusion_functions 
(#9849)
c2879f5105 is described below

commit c2879f510533a01bc04ef75da4f1416d0ddb99f6
Author: Bruce Ritchie <[email protected]>
AuthorDate: Fri Mar 29 09:53:03 2024 -0400

    move strpos, substr functions to datafusion_functions (#9849)
    
    * Fix to_timestamp benchmark
    
    * Remove reference to simd and nightly build as simd is no longer an 
available feature in DataFusion and building with nightly may not be a good 
recommendation when getting started.
    
    * Fixed missing trim() function.
    
    * Create unicode module in datafusion/functions/src/unicode and 
unicode_expressions feature flag, move char_length function
    
    * move Left, Lpad, Reverse, Right, Rpad functions to datafusion_functions
    
    * move strpos, substr functions to datafusion_functions
    
    * Cleanup tests
---
 datafusion/expr/src/built_in_function.rs           |  36 +-
 datafusion/expr/src/expr_fn.rs                     |   6 -
 datafusion/functions/src/unicode/mod.rs            |  31 ++
 datafusion/functions/src/unicode/strpos.rs         | 121 +++++++
 datafusion/functions/src/unicode/substr.rs         | 392 +++++++++++++++++++++
 datafusion/physical-expr/src/functions.rs          | 258 +-------------
 .../physical-expr/src/unicode_expressions.rs       |  95 -----
 datafusion/proto/Cargo.toml                        |   1 +
 datafusion/proto/proto/datafusion.proto            |   4 +-
 datafusion/proto/src/generated/pbjson.rs           |   6 -
 datafusion/proto/src/generated/prost.rs            |   8 +-
 datafusion/proto/src/logical_plan/from_proto.rs    |  29 +-
 datafusion/proto/src/logical_plan/to_proto.rs      |   2 -
 .../proto/tests/cases/roundtrip_logical_plan.rs    |  29 +-
 datafusion/proto/tests/cases/serialize.rs          |   5 +-
 datafusion/sql/src/expr/mod.rs                     |   9 +-
 datafusion/sql/src/expr/substring.rs               |  16 +-
 datafusion/sqllogictest/test_files/scalar.slt      |   2 +-
 18 files changed, 598 insertions(+), 452 deletions(-)

diff --git a/datafusion/expr/src/built_in_function.rs 
b/datafusion/expr/src/built_in_function.rs
index 196d278dc7..423fc11c1d 100644
--- a/datafusion/expr/src/built_in_function.rs
+++ b/datafusion/expr/src/built_in_function.rs
@@ -113,10 +113,6 @@ pub enum BuiltinScalarFunction {
     InitCap,
     /// random
     Random,
-    /// strpos
-    Strpos,
-    /// substr
-    Substr,
     /// translate
     Translate,
     /// substr_index
@@ -211,8 +207,6 @@ impl BuiltinScalarFunction {
             BuiltinScalarFunction::EndsWith => Volatility::Immutable,
             BuiltinScalarFunction::InitCap => Volatility::Immutable,
             BuiltinScalarFunction::Radians => Volatility::Immutable,
-            BuiltinScalarFunction::Strpos => Volatility::Immutable,
-            BuiltinScalarFunction::Substr => Volatility::Immutable,
             BuiltinScalarFunction::Translate => Volatility::Immutable,
             BuiltinScalarFunction::SubstrIndex => Volatility::Immutable,
             BuiltinScalarFunction::FindInSet => Volatility::Immutable,
@@ -252,12 +246,6 @@ impl BuiltinScalarFunction {
             BuiltinScalarFunction::Pi => Ok(Float64),
             BuiltinScalarFunction::Random => Ok(Float64),
             BuiltinScalarFunction::EndsWith => Ok(Boolean),
-            BuiltinScalarFunction::Strpos => {
-                utf8_to_int_type(&input_expr_types[0], "strpos/instr/position")
-            }
-            BuiltinScalarFunction::Substr => {
-                utf8_to_str_type(&input_expr_types[0], "substr")
-            }
             BuiltinScalarFunction::SubstrIndex => {
                 utf8_to_str_type(&input_expr_types[0], "substr_index")
             }
@@ -341,24 +329,12 @@ impl BuiltinScalarFunction {
                 Signature::uniform(1, vec![Utf8, LargeUtf8], self.volatility())
             }
 
-            BuiltinScalarFunction::EndsWith | BuiltinScalarFunction::Strpos => 
{
-                Signature::one_of(
-                    vec![
-                        Exact(vec![Utf8, Utf8]),
-                        Exact(vec![Utf8, LargeUtf8]),
-                        Exact(vec![LargeUtf8, Utf8]),
-                        Exact(vec![LargeUtf8, LargeUtf8]),
-                    ],
-                    self.volatility(),
-                )
-            }
-
-            BuiltinScalarFunction::Substr => Signature::one_of(
+            BuiltinScalarFunction::EndsWith => Signature::one_of(
                 vec![
-                    Exact(vec![Utf8, Int64]),
-                    Exact(vec![LargeUtf8, Int64]),
-                    Exact(vec![Utf8, Int64, Int64]),
-                    Exact(vec![LargeUtf8, Int64, Int64]),
+                    Exact(vec![Utf8, Utf8]),
+                    Exact(vec![Utf8, LargeUtf8]),
+                    Exact(vec![LargeUtf8, Utf8]),
+                    Exact(vec![LargeUtf8, LargeUtf8]),
                 ],
                 self.volatility(),
             ),
@@ -537,8 +513,6 @@ impl BuiltinScalarFunction {
             BuiltinScalarFunction::ConcatWithSeparator => &["concat_ws"],
             BuiltinScalarFunction::EndsWith => &["ends_with"],
             BuiltinScalarFunction::InitCap => &["initcap"],
-            BuiltinScalarFunction::Strpos => &["strpos", "instr", "position"],
-            BuiltinScalarFunction::Substr => &["substr"],
             BuiltinScalarFunction::Translate => &["translate"],
             BuiltinScalarFunction::SubstrIndex => &["substr_index", 
"substring_index"],
             BuiltinScalarFunction::FindInSet => &["find_in_set"],
diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs
index 21dab72855..09170ae639 100644
--- a/datafusion/expr/src/expr_fn.rs
+++ b/datafusion/expr/src/expr_fn.rs
@@ -579,9 +579,6 @@ scalar_expr!(Log, log, base x, "logarithm of a `x` for a 
particular `base`");
 
 scalar_expr!(InitCap, initcap, string, "converts the first letter of each word 
in `string` in uppercase and the remaining characters in lowercase");
 scalar_expr!(EndsWith, ends_with, string suffix, "whether the `string` ends 
with the `suffix`");
-scalar_expr!(Strpos, strpos, string substring, "finds the position from where 
the `substring` matches the `string`");
-scalar_expr!(Substr, substr, string position, "substring from the `position` 
to the end");
-scalar_expr!(Substr, substring, string position length, "substring from the 
`position` with `length` characters");
 scalar_expr!(Translate, translate, string from to, "replaces the characters in 
`from` with the counterpart in `to`");
 nary_scalar_expr!(Coalesce, coalesce, "returns `coalesce(args...)`, which 
evaluates to the value of the first [Expr] which is not NULL");
 //there is a func concat_ws before, so use concat_ws_expr as name.c
@@ -1015,9 +1012,6 @@ mod test {
         test_scalar_expr!(Lcm, lcm, arg_1, arg_2);
         test_scalar_expr!(InitCap, initcap, string);
         test_scalar_expr!(EndsWith, ends_with, string, characters);
-        test_scalar_expr!(Strpos, strpos, string, substring);
-        test_scalar_expr!(Substr, substr, string, position);
-        test_scalar_expr!(Substr, substring, string, position, count);
         test_scalar_expr!(Translate, translate, string, from, to);
         test_scalar_expr!(SubstrIndex, substr_index, string, delimiter, count);
         test_scalar_expr!(FindInSet, find_in_set, string, stringlist);
diff --git a/datafusion/functions/src/unicode/mod.rs 
b/datafusion/functions/src/unicode/mod.rs
index ea4e70a921..ddab0d1e27 100644
--- a/datafusion/functions/src/unicode/mod.rs
+++ b/datafusion/functions/src/unicode/mod.rs
@@ -27,6 +27,8 @@ mod lpad;
 mod reverse;
 mod right;
 mod rpad;
+mod strpos;
+mod substr;
 
 // create UDFs
 make_udf_function!(
@@ -39,6 +41,8 @@ make_udf_function!(lpad::LPadFunc, LPAD, lpad);
 make_udf_function!(right::RightFunc, RIGHT, right);
 make_udf_function!(reverse::ReverseFunc, REVERSE, reverse);
 make_udf_function!(rpad::RPadFunc, RPAD, rpad);
+make_udf_function!(strpos::StrposFunc, STRPOS, strpos);
+make_udf_function!(substr::SubstrFunc, SUBSTR, substr);
 
 pub mod expr_fn {
     use datafusion_expr::Expr;
@@ -53,6 +57,11 @@ pub mod expr_fn {
         super::character_length().call(vec![string])
     }
 
+    #[doc = "finds the position from where the `substring` matches the 
`string`"]
+    pub fn instr(string: Expr, substring: Expr) -> Expr {
+        strpos(string, substring)
+    }
+
     #[doc = "the number of characters in the `string`"]
     pub fn length(string: Expr) -> Expr {
         character_length(string)
@@ -68,6 +77,11 @@ pub mod expr_fn {
         super::lpad().call(args)
     }
 
+    #[doc = "finds the position from where the `substring` matches the 
`string`"]
+    pub fn position(string: Expr, substring: Expr) -> Expr {
+        strpos(string, substring)
+    }
+
     #[doc = "reverses the `string`"]
     pub fn reverse(string: Expr) -> Expr {
         super::reverse().call(vec![string])
@@ -82,6 +96,21 @@ pub mod expr_fn {
     pub fn rpad(args: Vec<Expr>) -> Expr {
         super::rpad().call(args)
     }
+
+    #[doc = "finds the position from where the `substring` matches the 
`string`"]
+    pub fn strpos(string: Expr, substring: Expr) -> Expr {
+        super::strpos().call(vec![string, substring])
+    }
+
+    #[doc = "substring from the `position` to the end"]
+    pub fn substr(string: Expr, position: Expr) -> Expr {
+        super::substr().call(vec![string, position])
+    }
+
+    #[doc = "substring from the `position` with `length` characters"]
+    pub fn substring(string: Expr, position: Expr, length: Expr) -> Expr {
+        super::substr().call(vec![string, position, length])
+    }
 }
 
 ///   Return a list of all functions in this package
@@ -93,5 +122,7 @@ pub fn functions() -> Vec<Arc<ScalarUDF>> {
         reverse(),
         right(),
         rpad(),
+        strpos(),
+        substr(),
     ]
 }
diff --git a/datafusion/functions/src/unicode/strpos.rs 
b/datafusion/functions/src/unicode/strpos.rs
new file mode 100644
index 0000000000..1e8bfa37d4
--- /dev/null
+++ b/datafusion/functions/src/unicode/strpos.rs
@@ -0,0 +1,121 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::array::{
+    ArrayRef, ArrowPrimitiveType, GenericStringArray, OffsetSizeTrait, 
PrimitiveArray,
+};
+use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type};
+
+use datafusion_common::cast::as_generic_string_array;
+use datafusion_common::{exec_err, Result};
+use datafusion_expr::TypeSignature::Exact;
+use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility};
+
+use crate::utils::{make_scalar_function, utf8_to_int_type};
+
+#[derive(Debug)]
+pub(super) struct StrposFunc {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+
+impl StrposFunc {
+    pub fn new() -> Self {
+        use DataType::*;
+        Self {
+            signature: Signature::one_of(
+                vec![
+                    Exact(vec![Utf8, Utf8]),
+                    Exact(vec![Utf8, LargeUtf8]),
+                    Exact(vec![LargeUtf8, Utf8]),
+                    Exact(vec![LargeUtf8, LargeUtf8]),
+                ],
+                Volatility::Immutable,
+            ),
+            aliases: vec![String::from("instr"), String::from("position")],
+        }
+    }
+}
+
+impl ScalarUDFImpl for StrposFunc {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "strpos"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        utf8_to_int_type(&arg_types[0], "strpos/instr/position")
+    }
+
+    fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
+        match args[0].data_type() {
+            DataType::Utf8 => make_scalar_function(strpos::<Int32Type>, 
vec![])(args),
+            DataType::LargeUtf8 => {
+                make_scalar_function(strpos::<Int64Type>, vec![])(args)
+            }
+            other => exec_err!("Unsupported data type {other:?} for function 
strpos"),
+        }
+    }
+
+    fn aliases(&self) -> &[String] {
+        &self.aliases
+    }
+}
+
+/// Returns starting index of specified substring within string, or zero if 
it's not present. (Same as position(substring in string), but note the reversed 
argument order.)
+/// strpos('high', 'ig') = 2
+/// The implementation uses UTF-8 code points as characters
+fn strpos<T: ArrowPrimitiveType>(args: &[ArrayRef]) -> Result<ArrayRef>
+where
+    T::Native: OffsetSizeTrait,
+{
+    let string_array: &GenericStringArray<T::Native> =
+        as_generic_string_array::<T::Native>(&args[0])?;
+
+    let substring_array: &GenericStringArray<T::Native> =
+        as_generic_string_array::<T::Native>(&args[1])?;
+
+    let result = string_array
+        .iter()
+        .zip(substring_array.iter())
+        .map(|(string, substring)| match (string, substring) {
+            (Some(string), Some(substring)) => {
+                // the find method returns the byte index of the substring
+                // Next, we count the number of the chars until that byte
+                T::Native::from_usize(
+                    string
+                        .find(substring)
+                        .map(|x| string[..x].chars().count() + 1)
+                        .unwrap_or(0),
+                )
+            }
+            _ => None,
+        })
+        .collect::<PrimitiveArray<T>>();
+
+    Ok(Arc::new(result) as ArrayRef)
+}
diff --git a/datafusion/functions/src/unicode/substr.rs 
b/datafusion/functions/src/unicode/substr.rs
new file mode 100644
index 0000000000..403157e2a8
--- /dev/null
+++ b/datafusion/functions/src/unicode/substr.rs
@@ -0,0 +1,392 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::cmp::max;
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait};
+use arrow::datatypes::DataType;
+
+use datafusion_common::cast::{as_generic_string_array, as_int64_array};
+use datafusion_common::{exec_err, Result};
+use datafusion_expr::TypeSignature::Exact;
+use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility};
+
+use crate::utils::{make_scalar_function, utf8_to_str_type};
+
+#[derive(Debug)]
+pub(super) struct SubstrFunc {
+    signature: Signature,
+}
+
+impl SubstrFunc {
+    pub fn new() -> Self {
+        use DataType::*;
+        Self {
+            signature: Signature::one_of(
+                vec![
+                    Exact(vec![Utf8, Int64]),
+                    Exact(vec![LargeUtf8, Int64]),
+                    Exact(vec![Utf8, Int64, Int64]),
+                    Exact(vec![LargeUtf8, Int64, Int64]),
+                ],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SubstrFunc {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "substr"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        utf8_to_str_type(&arg_types[0], "substr")
+    }
+
+    fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
+        match args[0].data_type() {
+            DataType::Utf8 => make_scalar_function(substr::<i32>, 
vec![])(args),
+            DataType::LargeUtf8 => make_scalar_function(substr::<i64>, 
vec![])(args),
+            other => exec_err!("Unsupported data type {other:?} for function 
substr"),
+        }
+    }
+}
+
+/// Extracts the substring of string starting at the start'th character, and 
extending for count characters if that is specified. (Same as substring(string 
from start for count).)
+/// substr('alphabet', 3) = 'phabet'
+/// substr('alphabet', 3, 2) = 'ph'
+/// The implementation uses UTF-8 code points as characters
+pub fn substr<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
+    match args.len() {
+        2 => {
+            let string_array = as_generic_string_array::<T>(&args[0])?;
+            let start_array = as_int64_array(&args[1])?;
+
+            let result = string_array
+                .iter()
+                .zip(start_array.iter())
+                .map(|(string, start)| match (string, start) {
+                    (Some(string), Some(start)) => {
+                        if start <= 0 {
+                            Some(string.to_string())
+                        } else {
+                            Some(string.chars().skip(start as usize - 
1).collect())
+                        }
+                    }
+                    _ => None,
+                })
+                .collect::<GenericStringArray<T>>();
+
+            Ok(Arc::new(result) as ArrayRef)
+        }
+        3 => {
+            let string_array = as_generic_string_array::<T>(&args[0])?;
+            let start_array = as_int64_array(&args[1])?;
+            let count_array = as_int64_array(&args[2])?;
+
+            let result = string_array
+                .iter()
+                .zip(start_array.iter())
+                .zip(count_array.iter())
+                .map(|((string, start), count)| match (string, start, count) {
+                    (Some(string), Some(start), Some(count)) => {
+                        if count < 0 {
+                            exec_err!(
+                                "negative substring length not allowed: 
substr(<str>, {start}, {count})"
+                            )
+                        } else {
+                            let skip = max(0, start - 1);
+                            let count = max(0, count + (if start < 1 {start - 
1} else {0}));
+                            Ok(Some(string.chars().skip(skip as 
usize).take(count as usize).collect::<String>()))
+                        }
+                    }
+                    _ => Ok(None),
+                })
+                .collect::<Result<GenericStringArray<T>>>()?;
+
+            Ok(Arc::new(result) as ArrayRef)
+        }
+        other => {
+            exec_err!("substr was called with {other} arguments. It requires 2 
or 3.")
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use arrow::array::{Array, StringArray};
+    use arrow::datatypes::DataType::Utf8;
+
+    use datafusion_common::{exec_err, Result, ScalarValue};
+    use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
+
+    use crate::unicode::substr::SubstrFunc;
+    use crate::utils::test::test_function;
+
+    #[test]
+    fn test_functions() -> Result<()> {
+        test_function!(
+            SubstrFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("alphabet")),
+                ColumnarValue::Scalar(ScalarValue::from(0i64)),
+            ],
+            Ok(Some("alphabet")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            SubstrFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("joséésoj")),
+                ColumnarValue::Scalar(ScalarValue::from(5i64)),
+            ],
+            Ok(Some("ésoj")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            SubstrFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("joséésoj")),
+                ColumnarValue::Scalar(ScalarValue::from(-5i64)),
+            ],
+            Ok(Some("joséésoj")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            SubstrFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("alphabet")),
+                ColumnarValue::Scalar(ScalarValue::from(1i64)),
+            ],
+            Ok(Some("alphabet")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            SubstrFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("alphabet")),
+                ColumnarValue::Scalar(ScalarValue::from(2i64)),
+            ],
+            Ok(Some("lphabet")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            SubstrFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("alphabet")),
+                ColumnarValue::Scalar(ScalarValue::from(3i64)),
+            ],
+            Ok(Some("phabet")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            SubstrFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("alphabet")),
+                ColumnarValue::Scalar(ScalarValue::from(-3i64)),
+            ],
+            Ok(Some("alphabet")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            SubstrFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("alphabet")),
+                ColumnarValue::Scalar(ScalarValue::from(30i64)),
+            ],
+            Ok(Some("")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            SubstrFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("alphabet")),
+                ColumnarValue::Scalar(ScalarValue::Int64(None)),
+            ],
+            Ok(None),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            SubstrFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("alphabet")),
+                ColumnarValue::Scalar(ScalarValue::from(3i64)),
+                ColumnarValue::Scalar(ScalarValue::from(2i64)),
+            ],
+            Ok(Some("ph")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            SubstrFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("alphabet")),
+                ColumnarValue::Scalar(ScalarValue::from(3i64)),
+                ColumnarValue::Scalar(ScalarValue::from(20i64)),
+            ],
+            Ok(Some("phabet")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            SubstrFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("alphabet")),
+                ColumnarValue::Scalar(ScalarValue::from(0i64)),
+                ColumnarValue::Scalar(ScalarValue::from(5i64)),
+            ],
+            Ok(Some("alph")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        // starting from 5 (10 + -5)
+        test_function!(
+            SubstrFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("alphabet")),
+                ColumnarValue::Scalar(ScalarValue::from(-5i64)),
+                ColumnarValue::Scalar(ScalarValue::from(10i64)),
+            ],
+            Ok(Some("alph")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        // starting from -1 (4 + -5)
+        test_function!(
+            SubstrFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("alphabet")),
+                ColumnarValue::Scalar(ScalarValue::from(-5i64)),
+                ColumnarValue::Scalar(ScalarValue::from(4i64)),
+            ],
+            Ok(Some("")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        // starting from 0 (5 + -5)
+        test_function!(
+            SubstrFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("alphabet")),
+                ColumnarValue::Scalar(ScalarValue::from(-5i64)),
+                ColumnarValue::Scalar(ScalarValue::from(5i64)),
+            ],
+            Ok(Some("")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            SubstrFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("alphabet")),
+                ColumnarValue::Scalar(ScalarValue::Int64(None)),
+                ColumnarValue::Scalar(ScalarValue::from(20i64)),
+            ],
+            Ok(None),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            SubstrFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("alphabet")),
+                ColumnarValue::Scalar(ScalarValue::from(3i64)),
+                ColumnarValue::Scalar(ScalarValue::Int64(None)),
+            ],
+            Ok(None),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            SubstrFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("alphabet")),
+                ColumnarValue::Scalar(ScalarValue::from(1i64)),
+                ColumnarValue::Scalar(ScalarValue::from(-1i64)),
+            ],
+            exec_err!("negative substring length not allowed: substr(<str>, 1, 
-1)"),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            SubstrFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("joséésoj")),
+                ColumnarValue::Scalar(ScalarValue::from(5i64)),
+                ColumnarValue::Scalar(ScalarValue::from(2i64)),
+            ],
+            Ok(Some("és")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        #[cfg(not(feature = "unicode_expressions"))]
+        test_function!(
+            SubstrFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("alphabet")),
+                ColumnarValue::Scalar(ScalarValue::from(0i64)),
+            ],
+            internal_err!(
+                "function substr requires compilation with feature flag: 
unicode_expressions."
+            ),
+            &str,
+            Utf8,
+            StringArray
+        );
+
+        Ok(())
+    }
+}
diff --git a/datafusion/physical-expr/src/functions.rs 
b/datafusion/physical-expr/src/functions.rs
index c1b4900e39..513dd71d40 100644
--- a/datafusion/physical-expr/src/functions.rs
+++ b/datafusion/physical-expr/src/functions.rs
@@ -281,34 +281,6 @@ pub fn create_physical_fun(
                 exec_err!("Unsupported data type {other:?} for function 
ends_with")
             }
         }),
-        BuiltinScalarFunction::Strpos => Arc::new(|args| match 
args[0].data_type() {
-            DataType::Utf8 => {
-                let func = invoke_if_unicode_expressions_feature_flag!(
-                    strpos, Int32Type, "strpos"
-                );
-                make_scalar_function_inner(func)(args)
-            }
-            DataType::LargeUtf8 => {
-                let func = invoke_if_unicode_expressions_feature_flag!(
-                    strpos, Int64Type, "strpos"
-                );
-                make_scalar_function_inner(func)(args)
-            }
-            other => exec_err!("Unsupported data type {other:?} for function 
strpos"),
-        }),
-        BuiltinScalarFunction::Substr => Arc::new(|args| match 
args[0].data_type() {
-            DataType::Utf8 => {
-                let func =
-                    invoke_if_unicode_expressions_feature_flag!(substr, i32, 
"substr");
-                make_scalar_function_inner(func)(args)
-            }
-            DataType::LargeUtf8 => {
-                let func =
-                    invoke_if_unicode_expressions_feature_flag!(substr, i64, 
"substr");
-                make_scalar_function_inner(func)(args)
-            }
-            other => exec_err!("Unsupported data type {other:?} for function 
substr"),
-        }),
         BuiltinScalarFunction::Translate => Arc::new(|args| match 
args[0].data_type() {
             DataType::Utf8 => {
                 let func = invoke_if_unicode_expressions_feature_flag!(
@@ -450,7 +422,7 @@ mod tests {
     };
 
     use datafusion_common::cast::as_uint64_array;
-    use datafusion_common::{exec_err, internal_err, plan_err};
+    use datafusion_common::{internal_err, plan_err};
     use datafusion_common::{DataFusionError, Result, ScalarValue};
     use datafusion_expr::type_coercion::functions::data_types;
     use datafusion_expr::Signature;
@@ -663,234 +635,6 @@ mod tests {
             BooleanArray
         );
         #[cfg(feature = "unicode_expressions")]
-        test_function!(
-            Substr,
-            &[lit("alphabet"), lit(ScalarValue::Int64(Some(0))),],
-            Ok(Some("alphabet")),
-            &str,
-            Utf8,
-            StringArray
-        );
-        #[cfg(feature = "unicode_expressions")]
-        test_function!(
-            Substr,
-            &[lit("joséésoj"), lit(ScalarValue::Int64(Some(5))),],
-            Ok(Some("ésoj")),
-            &str,
-            Utf8,
-            StringArray
-        );
-        #[cfg(feature = "unicode_expressions")]
-        test_function!(
-            Substr,
-            &[lit("joséésoj"), lit(ScalarValue::Int64(Some(-5))),],
-            Ok(Some("joséésoj")),
-            &str,
-            Utf8,
-            StringArray
-        );
-        #[cfg(feature = "unicode_expressions")]
-        test_function!(
-            Substr,
-            &[lit("alphabet"), lit(ScalarValue::Int64(Some(1))),],
-            Ok(Some("alphabet")),
-            &str,
-            Utf8,
-            StringArray
-        );
-        #[cfg(feature = "unicode_expressions")]
-        test_function!(
-            Substr,
-            &[lit("alphabet"), lit(ScalarValue::Int64(Some(2))),],
-            Ok(Some("lphabet")),
-            &str,
-            Utf8,
-            StringArray
-        );
-        #[cfg(feature = "unicode_expressions")]
-        test_function!(
-            Substr,
-            &[lit("alphabet"), lit(ScalarValue::Int64(Some(3))),],
-            Ok(Some("phabet")),
-            &str,
-            Utf8,
-            StringArray
-        );
-        #[cfg(feature = "unicode_expressions")]
-        test_function!(
-            Substr,
-            &[lit("alphabet"), lit(ScalarValue::Int64(Some(-3))),],
-            Ok(Some("alphabet")),
-            &str,
-            Utf8,
-            StringArray
-        );
-        #[cfg(feature = "unicode_expressions")]
-        test_function!(
-            Substr,
-            &[lit("alphabet"), lit(ScalarValue::Int64(Some(30))),],
-            Ok(Some("")),
-            &str,
-            Utf8,
-            StringArray
-        );
-        #[cfg(feature = "unicode_expressions")]
-        test_function!(
-            Substr,
-            &[lit("alphabet"), lit(ScalarValue::Int64(None)),],
-            Ok(None),
-            &str,
-            Utf8,
-            StringArray
-        );
-        #[cfg(feature = "unicode_expressions")]
-        test_function!(
-            Substr,
-            &[
-                lit("alphabet"),
-                lit(ScalarValue::Int64(Some(3))),
-                lit(ScalarValue::Int64(Some(2))),
-            ],
-            Ok(Some("ph")),
-            &str,
-            Utf8,
-            StringArray
-        );
-        #[cfg(feature = "unicode_expressions")]
-        test_function!(
-            Substr,
-            &[
-                lit("alphabet"),
-                lit(ScalarValue::Int64(Some(3))),
-                lit(ScalarValue::Int64(Some(20))),
-            ],
-            Ok(Some("phabet")),
-            &str,
-            Utf8,
-            StringArray
-        );
-        #[cfg(feature = "unicode_expressions")]
-        test_function!(
-            Substr,
-            &[
-                lit("alphabet"),
-                lit(ScalarValue::Int64(Some(0))),
-                lit(ScalarValue::Int64(Some(5))),
-            ],
-            Ok(Some("alph")),
-            &str,
-            Utf8,
-            StringArray
-        );
-        // starting from 5 (10 + -5)
-        #[cfg(feature = "unicode_expressions")]
-        test_function!(
-            Substr,
-            &[
-                lit("alphabet"),
-                lit(ScalarValue::Int64(Some(-5))),
-                lit(ScalarValue::Int64(Some(10))),
-            ],
-            Ok(Some("alph")),
-            &str,
-            Utf8,
-            StringArray
-        );
-        // starting from -1 (4 + -5)
-        #[cfg(feature = "unicode_expressions")]
-        test_function!(
-            Substr,
-            &[
-                lit("alphabet"),
-                lit(ScalarValue::Int64(Some(-5))),
-                lit(ScalarValue::Int64(Some(4))),
-            ],
-            Ok(Some("")),
-            &str,
-            Utf8,
-            StringArray
-        );
-        // starting from 0 (5 + -5)
-        #[cfg(feature = "unicode_expressions")]
-        test_function!(
-            Substr,
-            &[
-                lit("alphabet"),
-                lit(ScalarValue::Int64(Some(-5))),
-                lit(ScalarValue::Int64(Some(5))),
-            ],
-            Ok(Some("")),
-            &str,
-            Utf8,
-            StringArray
-        );
-        #[cfg(feature = "unicode_expressions")]
-        test_function!(
-            Substr,
-            &[
-                lit("alphabet"),
-                lit(ScalarValue::Int64(None)),
-                lit(ScalarValue::Int64(Some(20))),
-            ],
-            Ok(None),
-            &str,
-            Utf8,
-            StringArray
-        );
-        #[cfg(feature = "unicode_expressions")]
-        test_function!(
-            Substr,
-            &[
-                lit("alphabet"),
-                lit(ScalarValue::Int64(Some(3))),
-                lit(ScalarValue::Int64(None)),
-            ],
-            Ok(None),
-            &str,
-            Utf8,
-            StringArray
-        );
-        #[cfg(feature = "unicode_expressions")]
-        test_function!(
-            Substr,
-            &[
-                lit("alphabet"),
-                lit(ScalarValue::Int64(Some(1))),
-                lit(ScalarValue::Int64(Some(-1))),
-            ],
-            exec_err!("negative substring length not allowed: substr(<str>, 1, 
-1)"),
-            &str,
-            Utf8,
-            StringArray
-        );
-        #[cfg(feature = "unicode_expressions")]
-        test_function!(
-            Substr,
-            &[
-                lit("joséésoj"),
-                lit(ScalarValue::Int64(Some(5))),
-                lit(ScalarValue::Int64(Some(2))),
-            ],
-            Ok(Some("és")),
-            &str,
-            Utf8,
-            StringArray
-        );
-        #[cfg(not(feature = "unicode_expressions"))]
-        test_function!(
-            Substr,
-            &[
-                lit("alphabet"),
-                lit(ScalarValue::Int64(Some(0))),
-            ],
-            internal_err!(
-                "function substr requires compilation with feature flag: 
unicode_expressions."
-            ),
-            &str,
-            Utf8,
-            StringArray
-        );
-        #[cfg(feature = "unicode_expressions")]
         test_function!(
             Translate,
             &[lit("12345"), lit("143"), lit("ax"),],
diff --git a/datafusion/physical-expr/src/unicode_expressions.rs 
b/datafusion/physical-expr/src/unicode_expressions.rs
index faff21111a..ecbd1ea320 100644
--- a/datafusion/physical-expr/src/unicode_expressions.rs
+++ b/datafusion/physical-expr/src/unicode_expressions.rs
@@ -21,7 +21,6 @@
 
 //! Unicode expressions
 
-use std::cmp::max;
 use std::sync::Arc;
 
 use arrow::{
@@ -36,100 +35,6 @@ use datafusion_common::{
     exec_err, Result,
 };
 
-/// Returns starting index of specified substring within string, or zero if 
it's not present. (Same as position(substring in string), but note the reversed 
argument order.)
-/// strpos('high', 'ig') = 2
-/// The implementation uses UTF-8 code points as characters
-pub fn strpos<T: ArrowPrimitiveType>(args: &[ArrayRef]) -> Result<ArrayRef>
-where
-    T::Native: OffsetSizeTrait,
-{
-    let string_array: &GenericStringArray<T::Native> =
-        as_generic_string_array::<T::Native>(&args[0])?;
-
-    let substring_array: &GenericStringArray<T::Native> =
-        as_generic_string_array::<T::Native>(&args[1])?;
-
-    let result = string_array
-        .iter()
-        .zip(substring_array.iter())
-        .map(|(string, substring)| match (string, substring) {
-            (Some(string), Some(substring)) => {
-                // the find method returns the byte index of the substring
-                // Next, we count the number of the chars until that byte
-                T::Native::from_usize(
-                    string
-                        .find(substring)
-                        .map(|x| string[..x].chars().count() + 1)
-                        .unwrap_or(0),
-                )
-            }
-            _ => None,
-        })
-        .collect::<PrimitiveArray<T>>();
-
-    Ok(Arc::new(result) as ArrayRef)
-}
-
-/// Extracts the substring of string starting at the start'th character, and 
extending for count characters if that is specified. (Same as substring(string 
from start for count).)
-/// substr('alphabet', 3) = 'phabet'
-/// substr('alphabet', 3, 2) = 'ph'
-/// The implementation uses UTF-8 code points as characters
-pub fn substr<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
-    match args.len() {
-        2 => {
-            let string_array = as_generic_string_array::<T>(&args[0])?;
-            let start_array = as_int64_array(&args[1])?;
-
-            let result = string_array
-                .iter()
-                .zip(start_array.iter())
-                .map(|(string, start)| match (string, start) {
-                    (Some(string), Some(start)) => {
-                        if start <= 0 {
-                            Some(string.to_string())
-                        } else {
-                            Some(string.chars().skip(start as usize - 
1).collect())
-                        }
-                    }
-                    _ => None,
-                })
-                .collect::<GenericStringArray<T>>();
-
-            Ok(Arc::new(result) as ArrayRef)
-        }
-        3 => {
-            let string_array = as_generic_string_array::<T>(&args[0])?;
-            let start_array = as_int64_array(&args[1])?;
-            let count_array = as_int64_array(&args[2])?;
-
-            let result = string_array
-                .iter()
-                .zip(start_array.iter())
-                .zip(count_array.iter())
-                .map(|((string, start), count)| match (string, start, count) {
-                    (Some(string), Some(start), Some(count)) => {
-                        if count < 0 {
-                            exec_err!(
-                                "negative substring length not allowed: 
substr(<str>, {start}, {count})"
-                            )
-                        } else {
-                            let skip = max(0, start - 1);
-                            let count = max(0, count + (if start < 1 {start - 
1} else {0}));
-                            Ok(Some(string.chars().skip(skip as 
usize).take(count as usize).collect::<String>()))
-                        }
-                    }
-                    _ => Ok(None),
-                })
-                .collect::<Result<GenericStringArray<T>>>()?;
-
-            Ok(Arc::new(result) as ArrayRef)
-        }
-        other => {
-            exec_err!("substr was called with {other} arguments. It requires 2 
or 3.")
-        }
-    }
-}
-
 /// Replaces each character in string that matches a character in the from set 
with the corresponding character in the to set. If from is longer than to, 
occurrences of the extra characters in from are deleted.
 /// translate('12345', '143', 'ax') = 'a2x5'
 pub fn translate<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
diff --git a/datafusion/proto/Cargo.toml b/datafusion/proto/Cargo.toml
index f5297aefcd..bec2b8c53a 100644
--- a/datafusion/proto/Cargo.toml
+++ b/datafusion/proto/Cargo.toml
@@ -54,6 +54,7 @@ serde = { version = "1.0", optional = true }
 serde_json = { workspace = true, optional = true }
 
 [dev-dependencies]
+datafusion-functions = { workspace = true, default-features = true }
 doc-comment = { workspace = true }
 strum = { version = "0.26.1", features = ["derive"] }
 tokio = { workspace = true, features = ["rt-multi-thread"] }
diff --git a/datafusion/proto/proto/datafusion.proto 
b/datafusion/proto/proto/datafusion.proto
index 6319372d98..3a187eabe8 100644
--- a/datafusion/proto/proto/datafusion.proto
+++ b/datafusion/proto/proto/datafusion.proto
@@ -593,8 +593,8 @@ enum ScalarFunction {
   // 49 was SHA512
   // 50 was SplitPart
   // StartsWith = 51;
-  Strpos = 52;
-  Substr = 53;
+  // 52 was Strpos
+  // 53 was Substr
   // ToHex = 54;
   // 55 was ToTimestamp
   // 56 was ToTimestampMillis
diff --git a/datafusion/proto/src/generated/pbjson.rs 
b/datafusion/proto/src/generated/pbjson.rs
index 7281bc9dc2..07b91b26d6 100644
--- a/datafusion/proto/src/generated/pbjson.rs
+++ b/datafusion/proto/src/generated/pbjson.rs
@@ -22932,8 +22932,6 @@ impl serde::Serialize for ScalarFunction {
             Self::ConcatWithSeparator => "ConcatWithSeparator",
             Self::InitCap => "InitCap",
             Self::Random => "Random",
-            Self::Strpos => "Strpos",
-            Self::Substr => "Substr",
             Self::Translate => "Translate",
             Self::Coalesce => "Coalesce",
             Self::Power => "Power",
@@ -22986,8 +22984,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction {
             "ConcatWithSeparator",
             "InitCap",
             "Random",
-            "Strpos",
-            "Substr",
             "Translate",
             "Coalesce",
             "Power",
@@ -23069,8 +23065,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction {
                     "ConcatWithSeparator" => 
Ok(ScalarFunction::ConcatWithSeparator),
                     "InitCap" => Ok(ScalarFunction::InitCap),
                     "Random" => Ok(ScalarFunction::Random),
-                    "Strpos" => Ok(ScalarFunction::Strpos),
-                    "Substr" => Ok(ScalarFunction::Substr),
                     "Translate" => Ok(ScalarFunction::Translate),
                     "Coalesce" => Ok(ScalarFunction::Coalesce),
                     "Power" => Ok(ScalarFunction::Power),
diff --git a/datafusion/proto/src/generated/prost.rs 
b/datafusion/proto/src/generated/prost.rs
index 2fe89efb9c..babeccec59 100644
--- a/datafusion/proto/src/generated/prost.rs
+++ b/datafusion/proto/src/generated/prost.rs
@@ -2892,8 +2892,8 @@ pub enum ScalarFunction {
     /// 49 was SHA512
     /// 50 was SplitPart
     /// StartsWith = 51;
-    Strpos = 52,
-    Substr = 53,
+    /// 52 was Strpos
+    /// 53 was Substr
     /// ToHex = 54;
     /// 55 was ToTimestamp
     /// 56 was ToTimestampMillis
@@ -3005,8 +3005,6 @@ impl ScalarFunction {
             ScalarFunction::ConcatWithSeparator => "ConcatWithSeparator",
             ScalarFunction::InitCap => "InitCap",
             ScalarFunction::Random => "Random",
-            ScalarFunction::Strpos => "Strpos",
-            ScalarFunction::Substr => "Substr",
             ScalarFunction::Translate => "Translate",
             ScalarFunction::Coalesce => "Coalesce",
             ScalarFunction::Power => "Power",
@@ -3053,8 +3051,6 @@ impl ScalarFunction {
             "ConcatWithSeparator" => Some(Self::ConcatWithSeparator),
             "InitCap" => Some(Self::InitCap),
             "Random" => Some(Self::Random),
-            "Strpos" => Some(Self::Strpos),
-            "Substr" => Some(Self::Substr),
             "Translate" => Some(Self::Translate),
             "Coalesce" => Some(Self::Coalesce),
             "Power" => Some(Self::Power),
diff --git a/datafusion/proto/src/logical_plan/from_proto.rs 
b/datafusion/proto/src/logical_plan/from_proto.rs
index 2c6f2e479b..ff3d6773d5 100644
--- a/datafusion/proto/src/logical_plan/from_proto.rs
+++ b/datafusion/proto/src/logical_plan/from_proto.rs
@@ -42,10 +42,10 @@ use datafusion_expr::{
     expr::{self, InList, Sort, WindowFunction},
     factorial, find_in_set, floor, gcd, initcap, iszero, lcm, ln, log, log10, 
log2,
     logical_plan::{PlanType, StringifiedPlan},
-    nanvl, pi, power, radians, random, round, signum, sin, sinh, sqrt, strpos, 
substr,
-    substr_index, substring, translate, trunc, AggregateFunction, Between, 
BinaryExpr,
-    BuiltInWindowFunction, BuiltinScalarFunction, Case, Cast, Expr, 
GetFieldAccess,
-    GetIndexedField, GroupingSet,
+    nanvl, pi, power, radians, random, round, signum, sin, sinh, sqrt, 
substr_index,
+    translate, trunc, AggregateFunction, Between, BinaryExpr, 
BuiltInWindowFunction,
+    BuiltinScalarFunction, Case, Cast, Expr, GetFieldAccess, GetIndexedField,
+    GroupingSet,
     GroupingSet::GroupingSets,
     JoinConstraint, JoinType, Like, Operator, TryCast, WindowFrame, 
WindowFrameBound,
     WindowFrameUnits,
@@ -455,8 +455,6 @@ impl From<&protobuf::ScalarFunction> for 
BuiltinScalarFunction {
             ScalarFunction::EndsWith => Self::EndsWith,
             ScalarFunction::InitCap => Self::InitCap,
             ScalarFunction::Random => Self::Random,
-            ScalarFunction::Strpos => Self::Strpos,
-            ScalarFunction::Substr => Self::Substr,
             ScalarFunction::Translate => Self::Translate,
             ScalarFunction::Coalesce => Self::Coalesce,
             ScalarFunction::Pi => Self::Pi,
@@ -1389,25 +1387,6 @@ pub fn parse_expr(
                     parse_expr(&args[0], registry, codec)?,
                     parse_expr(&args[1], registry, codec)?,
                 )),
-                ScalarFunction::Strpos => Ok(strpos(
-                    parse_expr(&args[0], registry, codec)?,
-                    parse_expr(&args[1], registry, codec)?,
-                )),
-                ScalarFunction::Substr => {
-                    if args.len() > 2 {
-                        assert_eq!(args.len(), 3);
-                        Ok(substring(
-                            parse_expr(&args[0], registry, codec)?,
-                            parse_expr(&args[1], registry, codec)?,
-                            parse_expr(&args[2], registry, codec)?,
-                        ))
-                    } else {
-                        Ok(substr(
-                            parse_expr(&args[0], registry, codec)?,
-                            parse_expr(&args[1], registry, codec)?,
-                        ))
-                    }
-                }
                 ScalarFunction::Translate => Ok(translate(
                     parse_expr(&args[0], registry, codec)?,
                     parse_expr(&args[1], registry, codec)?,
diff --git a/datafusion/proto/src/logical_plan/to_proto.rs 
b/datafusion/proto/src/logical_plan/to_proto.rs
index ea682a5a22..89d49c5658 100644
--- a/datafusion/proto/src/logical_plan/to_proto.rs
+++ b/datafusion/proto/src/logical_plan/to_proto.rs
@@ -1446,8 +1446,6 @@ impl TryFrom<&BuiltinScalarFunction> for 
protobuf::ScalarFunction {
             BuiltinScalarFunction::EndsWith => Self::EndsWith,
             BuiltinScalarFunction::InitCap => Self::InitCap,
             BuiltinScalarFunction::Random => Self::Random,
-            BuiltinScalarFunction::Strpos => Self::Strpos,
-            BuiltinScalarFunction::Substr => Self::Substr,
             BuiltinScalarFunction::Translate => Self::Translate,
             BuiltinScalarFunction::Coalesce => Self::Coalesce,
             BuiltinScalarFunction::Pi => Self::Pi,
diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs 
b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
index 3c43f10075..3a47f556c0 100644
--- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
+++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
@@ -34,8 +34,8 @@ use datafusion::test_util::{TestTableFactory, 
TestTableProvider};
 use datafusion_common::config::{FormatOptions, TableOptions};
 use datafusion_common::scalar::ScalarStructBuilder;
 use datafusion_common::{
-    internal_err, not_impl_err, plan_err, DFField, DFSchema, DFSchemaRef,
-    DataFusionError, FileType, Result, ScalarValue,
+    internal_datafusion_err, internal_err, not_impl_err, plan_err, DFField, 
DFSchema,
+    DFSchemaRef, DataFusionError, FileType, Result, ScalarValue,
 };
 use datafusion_expr::dml::CopyTo;
 use datafusion_expr::expr::{
@@ -44,8 +44,7 @@ use datafusion_expr::expr::{
 };
 use datafusion_expr::logical_plan::{Extension, UserDefinedLogicalNodeCore};
 use datafusion_expr::{
-    col, create_udaf, lit, Accumulator, AggregateFunction,
-    BuiltinScalarFunction::{Sqrt, Substr},
+    col, create_udaf, lit, Accumulator, AggregateFunction, 
BuiltinScalarFunction::Sqrt,
     ColumnarValue, Expr, ExprSchemable, LogicalPlan, Operator, 
PartitionEvaluator,
     ScalarUDF, ScalarUDFImpl, Signature, TryCast, Volatility, WindowFrame,
     WindowFrameBound, WindowFrameUnits, WindowFunctionDefinition, WindowUDF,
@@ -60,6 +59,7 @@ use datafusion_proto::logical_plan::LogicalExtensionCodec;
 use datafusion_proto::logical_plan::{from_proto, DefaultLogicalExtensionCodec};
 use datafusion_proto::protobuf;
 
+use datafusion::execution::FunctionRegistry;
 use prost::Message;
 
 #[cfg(feature = "json")]
@@ -1863,17 +1863,28 @@ fn roundtrip_cube() {
 
 #[test]
 fn roundtrip_substr() {
+    let ctx = SessionContext::new();
+
+    let fun = ctx
+        .state()
+        .udf("substr")
+        .map_err(|e| {
+            internal_datafusion_err!("Unable to find expected 'substr' 
function: {e:?}")
+        })
+        .unwrap();
+
     // substr(string, position)
-    let test_expr =
-        Expr::ScalarFunction(ScalarFunction::new(Substr, vec![col("col"), 
lit(1_i64)]));
+    let test_expr = Expr::ScalarFunction(ScalarFunction::new_udf(
+        fun.clone(),
+        vec![col("col"), lit(1_i64)],
+    ));
 
     // substr(string, position, count)
-    let test_expr_with_count = Expr::ScalarFunction(ScalarFunction::new(
-        Substr,
+    let test_expr_with_count = Expr::ScalarFunction(ScalarFunction::new_udf(
+        fun,
         vec![col("col"), lit(1_i64), lit(1_i64)],
     ));
 
-    let ctx = SessionContext::new();
     roundtrip_expr_test(test_expr, ctx.clone());
     roundtrip_expr_test(test_expr_with_count, ctx);
 }
diff --git a/datafusion/proto/tests/cases/serialize.rs 
b/datafusion/proto/tests/cases/serialize.rs
index d4a1ab44a6..972382b841 100644
--- a/datafusion/proto/tests/cases/serialize.rs
+++ b/datafusion/proto/tests/cases/serialize.rs
@@ -260,10 +260,7 @@ fn test_expression_serialization_roundtrip() {
     let lit = Expr::Literal(ScalarValue::Utf8(None));
     for builtin_fun in BuiltinScalarFunction::iter() {
         // default to 4 args (though some exprs like substr have error 
checking)
-        let num_args = match builtin_fun {
-            BuiltinScalarFunction::Substr => 3,
-            _ => 4,
-        };
+        let num_args = 4;
         let args: Vec<_> = 
std::iter::repeat(&lit).take(num_args).cloned().collect();
         let expr = Expr::ScalarFunction(ScalarFunction::new(builtin_fun, 
args));
 
diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs
index d1fc031949..43bf2d8715 100644
--- a/datafusion/sql/src/expr/mod.rs
+++ b/datafusion/sql/src/expr/mod.rs
@@ -823,12 +823,17 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
         schema: &DFSchema,
         planner_context: &mut PlannerContext,
     ) -> Result<Expr> {
-        let fun = BuiltinScalarFunction::Strpos;
+        let fun = self
+            .context_provider
+            .get_function_meta("strpos")
+            .ok_or_else(|| {
+                internal_datafusion_err!("Unable to find expected 'strpos' 
function")
+            })?;
         let substr =
             self.sql_expr_to_logical_expr(substr_expr, schema, 
planner_context)?;
         let fullstr = self.sql_expr_to_logical_expr(str_expr, schema, 
planner_context)?;
         let args = vec![fullstr, substr];
-        Ok(Expr::ScalarFunction(ScalarFunction::new(fun, args)))
+        Ok(Expr::ScalarFunction(ScalarFunction::new_udf(fun, args)))
     }
     fn sql_agg_with_filter_to_expr(
         &self,
diff --git a/datafusion/sql/src/expr/substring.rs 
b/datafusion/sql/src/expr/substring.rs
index a5d1abf0f2..f58c6f3b94 100644
--- a/datafusion/sql/src/expr/substring.rs
+++ b/datafusion/sql/src/expr/substring.rs
@@ -16,10 +16,10 @@
 // under the License.
 
 use crate::planner::{ContextProvider, PlannerContext, SqlToRel};
-use datafusion_common::plan_err;
+use datafusion_common::{internal_datafusion_err, plan_err};
 use datafusion_common::{DFSchema, Result, ScalarValue};
 use datafusion_expr::expr::ScalarFunction;
-use datafusion_expr::{BuiltinScalarFunction, Expr};
+use datafusion_expr::Expr;
 use sqlparser::ast::Expr as SQLExpr;
 
 impl<'a, S: ContextProvider> SqlToRel<'a, S> {
@@ -68,9 +68,13 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
             }
         };
 
-        Ok(Expr::ScalarFunction(ScalarFunction::new(
-            BuiltinScalarFunction::Substr,
-            args,
-        )))
+        let fun = self
+            .context_provider
+            .get_function_meta("substr")
+            .ok_or_else(|| {
+                internal_datafusion_err!("Unable to find expected 'substr' 
function")
+            })?;
+
+        Ok(Expr::ScalarFunction(ScalarFunction::new_udf(fun, args)))
     }
 }
diff --git a/datafusion/sqllogictest/test_files/scalar.slt 
b/datafusion/sqllogictest/test_files/scalar.slt
index a77a2bf405..20c8b3d25f 100644
--- a/datafusion/sqllogictest/test_files/scalar.slt
+++ b/datafusion/sqllogictest/test_files/scalar.slt
@@ -2087,7 +2087,7 @@ select position('' in '')
 1
 
 
-query error DataFusion error: Error during planning: The STRPOS/INSTR/POSITION 
function can only accept strings, but got Int64.
+query error DataFusion error: Execution error: The STRPOS/INSTR/POSITION 
function can only accept strings, but got Int64.
 select position(1 in 1)
 
 

Reply via email to