This is an automated email from the ASF dual-hosted git repository.

timsaucer pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-python.git


The following commit(s) were added to refs/heads/main by this push:
     new 675e41ed feat: add regexp_instr function (#1382)
675e41ed is described below

commit 675e41ed988360fd0758639e3fa52a2536282ebd
Author: Daniel Mesejo <[email protected]>
AuthorDate: Wed Feb 18 17:38:59 2026 +0100

    feat: add regexp_instr function (#1382)
    
    * feat: add regexp_instr function
    
    The current implementation of regexp_instr in DataFusion does not support
    the endoption argument. Hence None is passed for it in the binding that
    exposes the function to Python.
    
    * chore: add test for all optional arguments
    
    * fix: make start truly optional in regexp_count
---
 python/datafusion/functions.py | 37 +++++++++++++++++++++++++++++++++++--
 python/tests/test_functions.py | 33 ++++++++++++++++++++++++++++++++-
 src/functions.rs               | 24 ++++++++++++++++++++++++
 3 files changed, 91 insertions(+), 3 deletions(-)

diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py
index 2aed9dd3..431afcc3 100644
--- a/python/datafusion/functions.py
+++ b/python/datafusion/functions.py
@@ -225,6 +225,7 @@ __all__ = [
     "range",
     "rank",
     "regexp_count",
+    "regexp_instr",
     "regexp_like",
     "regexp_match",
     "regexp_replace",
@@ -816,7 +817,7 @@ def regexp_replace(
 
 
 def regexp_count(
-    string: Expr, pattern: Expr, start: Expr, flags: Expr | None = None
+    string: Expr, pattern: Expr, start: Expr | None = None, flags: Expr | None 
= None
 ) -> Expr:
     """Returns the number of matches in a string.
 
@@ -825,10 +826,42 @@ def regexp_count(
     """
     if flags is not None:
         flags = flags.expr
-    start = start.expr if start is not None else Expr.expr
+    start = start.expr if start is not None else start
     return Expr(f.regexp_count(string.expr, pattern.expr, start, flags))
 
 
+def regexp_instr(
+    values: Expr,
+    regex: Expr,
+    start: Expr | None = None,
+    n: Expr | None = None,
+    flags: Expr | None = None,
+    sub_expr: Expr | None = None,
+) -> Expr:
+    """Returns the position of a regular expression match in a string.
+
+    Searches ``values`` for the ``n``-th occurrence of ``regex``, starting at 
position
+    ``start`` (the first position is 1). Returns the starting or ending 
position based
+    on ``end_position``. Use ``flags`` to control regex behavior and 
``sub_expr`` to
+    return the position of a specific capture group instead of the entire 
match.
+    """
+    start = start.expr if start is not None else None
+    n = n.expr if n is not None else None
+    flags = flags.expr if flags is not None else None
+    sub_expr = sub_expr.expr if sub_expr is not None else None
+
+    return Expr(
+        f.regexp_instr(
+            values.expr,
+            regex.expr,
+            start,
+            n,
+            flags,
+            sub_expr,
+        )
+    )
+
+
 def repeat(string: Expr, n: Expr) -> Expr:
     """Repeats the ``string`` to ``n`` times."""
     return Expr(f.repeat(string.expr, n.expr))
diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py
index 34c8c5c9..7b3332ed 100644
--- a/python/tests/test_functions.py
+++ b/python/tests/test_functions.py
@@ -769,7 +769,38 @@ def test_array_function_obj_tests(stmt, py_expr):
             pa.array(["H-o", "W-d", "!"], type=pa.string_view()),
         ),
         (
-            f.regexp_count(column("a"), literal("(ell|orl)"), literal(1)),
+            f.regexp_count(column("a"), literal("(ell|orl)"), 
start=literal(1)),
+            pa.array([1, 1, 0], type=pa.int64()),
+        ),
+        (
+            f.regexp_count(column("a"), literal("(ell|orl)")),
+            pa.array([1, 1, 0], type=pa.int64()),
+        ),
+        (
+            f.regexp_instr(column("a"), literal("(ell|orl)")),
+            pa.array([2, 2, 0], type=pa.int64()),
+        ),
+        (
+            f.regexp_instr(column("a"), literal("([lr])"), n=literal(2)),
+            pa.array([4, 4, 0], type=pa.int64()),
+        ),
+        (
+            f.regexp_instr(
+                column("a"),
+                literal("(x)?([hw])"),
+                start=literal(1),
+                n=literal(1),
+                flags=literal("i"),
+                sub_expr=literal(2),
+            ),
+            pa.array([1, 1, 0], type=pa.int64()),
+        ),
+        (
+            f.regexp_instr(column("a"), literal("([hw])"), flags=literal("i")),
+            pa.array([1, 1, 0], type=pa.int64()),
+        ),
+        (
+            f.regexp_instr(column("a"), literal("(x)?([HW])"), 
sub_expr=literal(2)),
             pa.array([1, 1, 0], type=pa.int64()),
         ),
     ],
diff --git a/src/functions.rs b/src/functions.rs
index 5c802920..90b3a0a4 100644
--- a/src/functions.rs
+++ b/src/functions.rs
@@ -189,6 +189,29 @@ fn regexp_count(
     .into())
 }
 
+#[pyfunction]
+#[pyo3(signature = (values, regex, start=None, n=None, flags=None, 
subexpr=None))]
+/// Returns the position in a string where the specified occurrence of a 
regular expression is located
+fn regexp_instr(
+    values: PyExpr,
+    regex: PyExpr,
+    start: Option<PyExpr>,
+    n: Option<PyExpr>,
+    flags: Option<PyExpr>,
+    subexpr: Option<PyExpr>,
+) -> PyResult<PyExpr> {
+    Ok(functions::expr_fn::regexp_instr(
+        values.into(),
+        regex.into(),
+        start.map(|x| x.expr).or(Some(lit(1))),
+        n.map(|x| x.expr).or(Some(lit(1))),
+        None,
+        flags.map(|x| x.expr).or(Some(lit(""))),
+        subexpr.map(|x| x.expr).or(Some(lit(0))),
+    )
+    .into())
+}
+
 /// Creates a new Sort Expr
 #[pyfunction]
 fn order_by(expr: PyExpr, asc: bool, nulls_first: bool) -> 
PyResult<PySortExpr> {
@@ -988,6 +1011,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> 
PyResult<()> {
     m.add_wrapped(wrap_pyfunction!(radians))?;
     m.add_wrapped(wrap_pyfunction!(random))?;
     m.add_wrapped(wrap_pyfunction!(regexp_count))?;
+    m.add_wrapped(wrap_pyfunction!(regexp_instr))?;
     m.add_wrapped(wrap_pyfunction!(regexp_like))?;
     m.add_wrapped(wrap_pyfunction!(regexp_match))?;
     m.add_wrapped(wrap_pyfunction!(regexp_replace))?;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to