This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 3ab1301c53 fix: handle empty delimiter in split_part (closes #20503)
(#20542)
3ab1301c53 is described below
commit 3ab1301c532047e666676da3e091af25e73d3a0b
Author: Gabriel Ferraté <[email protected]>
AuthorDate: Thu Feb 26 22:03:34 2026 +0100
fix: handle empty delimiter in split_part (closes #20503) (#20542)
## Which issue does this PR close?
- Closes #20503
## Rationale for this change
`split_part` did not handle empty delimiters in a PostgreSQL-compatible
way (`split("")` in Rust creates leading/trailing empty fields).
This could return unexpected results for positions like `1` / `-1` and
out-of-range values.
This PR aligns behavior with Postgres semantics for empty delimiters.
## What changes are included in this PR?
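A small change to how `split_part` handles an empty delimiter: the input is
treated as a single field, so positions `1` and `-1` return the whole string
and any other non-zero position returns an empty string.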
## Are these changes tested?
Yes. Unit tests were added in `split_part.rs`, and sqllogictest cases were
added in `expr.slt`.
## Are there any user-facing changes?
Yes, behavior is now more consistent with PostgreSQL for
`split_part(str, '', n)`.
No API changes.
---
datafusion/functions/src/string/split_part.rs | 130 +++++++++++++++++++++++++-
datafusion/sqllogictest/test_files/expr.slt | 20 ++++
2 files changed, 148 insertions(+), 2 deletions(-)
diff --git a/datafusion/functions/src/string/split_part.rs
b/datafusion/functions/src/string/split_part.rs
index e24dbd63d1..0bd197818e 100644
--- a/datafusion/functions/src/string/split_part.rs
+++ b/datafusion/functions/src/string/split_part.rs
@@ -231,7 +231,15 @@ where
"split_part index {n} exceeds maximum
supported value"
)
})?;
- string.split(delimiter).nth(idx)
+
+ if delimiter.is_empty() {
+ // Match PostgreSQL split_part behavior for
empty delimiter:
+ // treat the input as a single field ("ab" ->
["ab"]),
+ // rather than Rust's split("") result (["",
"a", "b", ""]).
+ (n == 1).then_some(string)
+ } else {
+ string.split(delimiter).nth(idx)
+ }
}
std::cmp::Ordering::Less => {
// Negative index: use rsplit().nth() to
efficiently get from the end
@@ -241,7 +249,14 @@ where
"split_part index {n} exceeds minimum
supported value"
)
})?;
- string.rsplit(delimiter).nth(idx)
+ if delimiter.is_empty() {
+ // Match PostgreSQL split_part behavior for
empty delimiter:
+ // treat the input as a single field ("ab" ->
["ab"]),
+ // rather than Rust's split("") result (["",
"a", "b", ""]).
+ (n == -1).then_some(string)
+ } else {
+ string.rsplit(delimiter).nth(idx)
+ }
}
std::cmp::Ordering::Equal => {
return exec_err!("field position must not be
zero");
@@ -341,6 +356,117 @@ mod tests {
Utf8,
StringArray
);
+ // Edge cases with delimiters
+ test_function!(
+ SplitPartFunc::new(),
+ vec![
+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from(",")))),
+ ColumnarValue::Scalar(ScalarValue::Int64(Some(1))),
+ ],
+ Ok(Some("a")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ SplitPartFunc::new(),
+ vec![
+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from(",")))),
+ ColumnarValue::Scalar(ScalarValue::Int64(Some(3))),
+ ],
+ Ok(Some("")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ SplitPartFunc::new(),
+ vec![
+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("")))),
+ ColumnarValue::Scalar(ScalarValue::Int64(Some(1))),
+ ],
+ Ok(Some("a,b")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ SplitPartFunc::new(),
+ vec![
+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("")))),
+ ColumnarValue::Scalar(ScalarValue::Int64(Some(2))),
+ ],
+ Ok(Some("")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ SplitPartFunc::new(),
+ vec![
+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
+ ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("
")))),
+ ColumnarValue::Scalar(ScalarValue::Int64(Some(1))),
+ ],
+ Ok(Some("a,b")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ SplitPartFunc::new(),
+ vec![
+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
+ ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("
")))),
+ ColumnarValue::Scalar(ScalarValue::Int64(Some(2))),
+ ],
+ Ok(Some("")),
+ &str,
+ Utf8,
+ StringArray
+ );
+
+ // Edge cases with delimiters with negative n
+ test_function!(
+ SplitPartFunc::new(),
+ vec![
+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("")))),
+ ColumnarValue::Scalar(ScalarValue::Int64(Some(-1))),
+ ],
+ Ok(Some("a,b")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ SplitPartFunc::new(),
+ vec![
+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
+ ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("
")))),
+ ColumnarValue::Scalar(ScalarValue::Int64(Some(-1))),
+ ],
+ Ok(Some("a,b")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ SplitPartFunc::new(),
+ vec![
+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("")))),
+ ColumnarValue::Scalar(ScalarValue::Int64(Some(-2))),
+ ],
+ Ok(Some("")),
+ &str,
+ Utf8,
+ StringArray
+ );
Ok(())
}
diff --git a/datafusion/sqllogictest/test_files/expr.slt
b/datafusion/sqllogictest/test_files/expr.slt
index c737efca4a..6d19d1436e 100644
--- a/datafusion/sqllogictest/test_files/expr.slt
+++ b/datafusion/sqllogictest/test_files/expr.slt
@@ -701,6 +701,26 @@ SELECT split_part('abc~@~def~@~ghi', '~@~', -100)
----
(empty)
+query T
+SELECT split_part('a,b', '', 1)
+----
+a,b
+
+query T
+SELECT split_part('a,b', '', -1)
+----
+a,b
+
+query T
+SELECT split_part('a,b', '', 2)
+----
+(empty)
+
+query T
+SELECT split_part('a,b', '', -2)
+----
+(empty)
+
statement error DataFusion error: Execution error: field position must not be
zero
SELECT split_part('abc~@~def~@~ghi', '~@~', 0)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]