This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 3ab1301c53 fix: handle empty delimiter in split_part (closes #20503) 
(#20542)
3ab1301c53 is described below

commit 3ab1301c532047e666676da3e091af25e73d3a0b
Author: Gabriel Ferraté <[email protected]>
AuthorDate: Thu Feb 26 22:03:34 2026 +0100

    fix: handle empty delimiter in split_part (closes #20503) (#20542)
    
    ## Which issue does this PR close?
    
    - Closes #20503
    
    ## Rationale for this change
    
    `split_part` did not handle empty delimiters in a PostgreSQL-compatible
    way (`split("")` in Rust creates leading/trailing empty fields).
    This could return unexpected results for positions like `1` / `-1` and
    out-of-range values.
    This PR aligns behavior with Postgres semantics for empty delimiters.
    
    ## What changes are included in this PR?
    
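    When the delimiter is empty, `split_part` now treats the input as a
    single field instead of calling Rust's `split("")`: positions `1` and
    `-1` return the whole string, and any other non-zero position returns
    an empty string.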
    
    ## Are these changes tested?
    
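    Yes. New unit tests in `split_part.rs` and sqllogictest cases in
    `expr.slt` cover empty delimiters with both positive and negative
    positions.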
    
    ## Are there any user-facing changes?
    
    Yes, behavior is now more consistent with PostgreSQL for
    `split_part(str, '', n)`.
    No API changes.
---
 datafusion/functions/src/string/split_part.rs | 130 +++++++++++++++++++++++++-
 datafusion/sqllogictest/test_files/expr.slt   |  20 ++++
 2 files changed, 148 insertions(+), 2 deletions(-)

diff --git a/datafusion/functions/src/string/split_part.rs 
b/datafusion/functions/src/string/split_part.rs
index e24dbd63d1..0bd197818e 100644
--- a/datafusion/functions/src/string/split_part.rs
+++ b/datafusion/functions/src/string/split_part.rs
@@ -231,7 +231,15 @@ where
                                     "split_part index {n} exceeds maximum 
supported value"
                                 )
                             })?;
-                            string.split(delimiter).nth(idx)
+
+                            if delimiter.is_empty() {
+                                // Match PostgreSQL split_part behavior for 
empty delimiter:
+                                // treat the input as a single field ("ab" -> 
["ab"]),
+                                // rather than Rust's split("") result (["", 
"a", "b", ""]).
+                                (n == 1).then_some(string)
+                            } else {
+                                string.split(delimiter).nth(idx)
+                            }
                         }
                         std::cmp::Ordering::Less => {
                             // Negative index: use rsplit().nth() to 
efficiently get from the end
@@ -241,7 +249,14 @@ where
                                     "split_part index {n} exceeds minimum 
supported value"
                                 )
                             })?;
-                            string.rsplit(delimiter).nth(idx)
+                            if delimiter.is_empty() {
+                                // Match PostgreSQL split_part behavior for 
empty delimiter:
+                                // treat the input as a single field ("ab" -> 
["ab"]),
+                                // rather than Rust's split("") result (["", 
"a", "b", ""]).
+                                (n == -1).then_some(string)
+                            } else {
+                                string.rsplit(delimiter).nth(idx)
+                            }
                         }
                         std::cmp::Ordering::Equal => {
                             return exec_err!("field position must not be 
zero");
@@ -341,6 +356,117 @@ mod tests {
             Utf8,
             StringArray
         );
+        // Edge cases with delimiters
+        test_function!(
+            SplitPartFunc::new(),
+            vec![
+                
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
+                
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from(",")))),
+                ColumnarValue::Scalar(ScalarValue::Int64(Some(1))),
+            ],
+            Ok(Some("a")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            SplitPartFunc::new(),
+            vec![
+                
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
+                
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from(",")))),
+                ColumnarValue::Scalar(ScalarValue::Int64(Some(3))),
+            ],
+            Ok(Some("")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            SplitPartFunc::new(),
+            vec![
+                
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
+                
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("")))),
+                ColumnarValue::Scalar(ScalarValue::Int64(Some(1))),
+            ],
+            Ok(Some("a,b")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            SplitPartFunc::new(),
+            vec![
+                
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
+                
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("")))),
+                ColumnarValue::Scalar(ScalarValue::Int64(Some(2))),
+            ],
+            Ok(Some("")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            SplitPartFunc::new(),
+            vec![
+                
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from(" 
")))),
+                ColumnarValue::Scalar(ScalarValue::Int64(Some(1))),
+            ],
+            Ok(Some("a,b")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            SplitPartFunc::new(),
+            vec![
+                
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from(" 
")))),
+                ColumnarValue::Scalar(ScalarValue::Int64(Some(2))),
+            ],
+            Ok(Some("")),
+            &str,
+            Utf8,
+            StringArray
+        );
+
+        // Edge cases with delimiters with negative n
+        test_function!(
+            SplitPartFunc::new(),
+            vec![
+                
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
+                
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("")))),
+                ColumnarValue::Scalar(ScalarValue::Int64(Some(-1))),
+            ],
+            Ok(Some("a,b")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            SplitPartFunc::new(),
+            vec![
+                
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from(" 
")))),
+                ColumnarValue::Scalar(ScalarValue::Int64(Some(-1))),
+            ],
+            Ok(Some("a,b")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            SplitPartFunc::new(),
+            vec![
+                
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
+                
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("")))),
+                ColumnarValue::Scalar(ScalarValue::Int64(Some(-2))),
+            ],
+            Ok(Some("")),
+            &str,
+            Utf8,
+            StringArray
+        );
 
         Ok(())
     }
diff --git a/datafusion/sqllogictest/test_files/expr.slt 
b/datafusion/sqllogictest/test_files/expr.slt
index c737efca4a..6d19d1436e 100644
--- a/datafusion/sqllogictest/test_files/expr.slt
+++ b/datafusion/sqllogictest/test_files/expr.slt
@@ -701,6 +701,26 @@ SELECT split_part('abc~@~def~@~ghi', '~@~', -100)
 ----
 (empty)
 
+query T
+SELECT split_part('a,b', '', 1)
+----
+a,b
+
+query T
+SELECT split_part('a,b', '', -1)
+----
+a,b
+
+query T
+SELECT split_part('a,b', '', 2)
+----
+(empty)
+
+query T
+SELECT split_part('a,b', '', -2)
+----
+(empty)
+
 statement error DataFusion error: Execution error: field position must not be 
zero
 SELECT split_part('abc~@~def~@~ghi', '~@~', 0)
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to