This is an automated email from the ASF dual-hosted git repository.
timsaucer pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-python.git
The following commit(s) were added to refs/heads/main by this push:
new 675e41ed feat: add regexp_instr function (#1382)
675e41ed is described below
commit 675e41ed988360fd0758639e3fa52a2536282ebd
Author: Daniel Mesejo <[email protected]>
AuthorDate: Wed Feb 18 17:38:59 2026 +0100
feat: add regexp_instr function (#1382)
* feat: add regexp_instr function
The current implementation of regexp_instr in DataFusion does not support
the endoption argument. Hence None is passed for it in the implementation of
the function exposing it to Python.
* chore: add test for all optional arguments
* fix: make start truly optional in regexp_count
---
python/datafusion/functions.py | 37 +++++++++++++++++++++++++++++++++++--
python/tests/test_functions.py | 33 ++++++++++++++++++++++++++++++++-
src/functions.rs | 24 ++++++++++++++++++++++++
3 files changed, 91 insertions(+), 3 deletions(-)
diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py
index 2aed9dd3..431afcc3 100644
--- a/python/datafusion/functions.py
+++ b/python/datafusion/functions.py
@@ -225,6 +225,7 @@ __all__ = [
"range",
"rank",
"regexp_count",
+ "regexp_instr",
"regexp_like",
"regexp_match",
"regexp_replace",
@@ -816,7 +817,7 @@ def regexp_replace(
def regexp_count(
- string: Expr, pattern: Expr, start: Expr, flags: Expr | None = None
+ string: Expr, pattern: Expr, start: Expr | None = None, flags: Expr | None
= None
) -> Expr:
"""Returns the number of matches in a string.
@@ -825,10 +826,42 @@ def regexp_count(
"""
if flags is not None:
flags = flags.expr
- start = start.expr if start is not None else Expr.expr
+ start = start.expr if start is not None else start
return Expr(f.regexp_count(string.expr, pattern.expr, start, flags))
+def regexp_instr(
+ values: Expr,
+ regex: Expr,
+ start: Expr | None = None,
+ n: Expr | None = None,
+ flags: Expr | None = None,
+ sub_expr: Expr | None = None,
+) -> Expr:
+ """Returns the position of a regular expression match in a string.
+
+ Searches ``values`` for the ``n``-th occurrence of ``regex``, starting at
position
+    ``start`` (the first position is 1). Returns the position at which that
+    occurrence is located; there is no ``end_position`` argument, so an ending
+    position cannot be requested. Use ``flags`` to control regex behavior and
``sub_expr`` to
+ return the position of a specific capture group instead of the entire
match.
+ """
+ start = start.expr if start is not None else None
+ n = n.expr if n is not None else None
+ flags = flags.expr if flags is not None else None
+ sub_expr = sub_expr.expr if sub_expr is not None else None
+
+ return Expr(
+ f.regexp_instr(
+ values.expr,
+ regex.expr,
+ start,
+ n,
+ flags,
+ sub_expr,
+ )
+ )
+
+
def repeat(string: Expr, n: Expr) -> Expr:
"""Repeats the ``string`` to ``n`` times."""
return Expr(f.repeat(string.expr, n.expr))
diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py
index 34c8c5c9..7b3332ed 100644
--- a/python/tests/test_functions.py
+++ b/python/tests/test_functions.py
@@ -769,7 +769,38 @@ def test_array_function_obj_tests(stmt, py_expr):
pa.array(["H-o", "W-d", "!"], type=pa.string_view()),
),
(
- f.regexp_count(column("a"), literal("(ell|orl)"), literal(1)),
+ f.regexp_count(column("a"), literal("(ell|orl)"),
start=literal(1)),
+ pa.array([1, 1, 0], type=pa.int64()),
+ ),
+ (
+ f.regexp_count(column("a"), literal("(ell|orl)")),
+ pa.array([1, 1, 0], type=pa.int64()),
+ ),
+ (
+ f.regexp_instr(column("a"), literal("(ell|orl)")),
+ pa.array([2, 2, 0], type=pa.int64()),
+ ),
+ (
+ f.regexp_instr(column("a"), literal("([lr])"), n=literal(2)),
+ pa.array([4, 4, 0], type=pa.int64()),
+ ),
+ (
+ f.regexp_instr(
+ column("a"),
+ literal("(x)?([hw])"),
+ start=literal(1),
+ n=literal(1),
+ flags=literal("i"),
+ sub_expr=literal(2),
+ ),
+ pa.array([1, 1, 0], type=pa.int64()),
+ ),
+ (
+ f.regexp_instr(column("a"), literal("([hw])"), flags=literal("i")),
+ pa.array([1, 1, 0], type=pa.int64()),
+ ),
+ (
+ f.regexp_instr(column("a"), literal("(x)?([HW])"),
sub_expr=literal(2)),
pa.array([1, 1, 0], type=pa.int64()),
),
],
diff --git a/src/functions.rs b/src/functions.rs
index 5c802920..90b3a0a4 100644
--- a/src/functions.rs
+++ b/src/functions.rs
@@ -189,6 +189,29 @@ fn regexp_count(
.into())
}
+#[pyfunction]
+#[pyo3(signature = (values, regex, start=None, n=None, flags=None,
subexpr=None))]
+/// Returns the position in a string where the specified occurrence of a
regular expression is located
+fn regexp_instr(
+ values: PyExpr,
+ regex: PyExpr,
+ start: Option<PyExpr>,
+ n: Option<PyExpr>,
+ flags: Option<PyExpr>,
+ subexpr: Option<PyExpr>,
+) -> PyResult<PyExpr> {
+ Ok(functions::expr_fn::regexp_instr(
+ values.into(),
+ regex.into(),
+ start.map(|x| x.expr).or(Some(lit(1))),
+ n.map(|x| x.expr).or(Some(lit(1))),
+ None,
+ flags.map(|x| x.expr).or(Some(lit(""))),
+ subexpr.map(|x| x.expr).or(Some(lit(0))),
+ )
+ .into())
+}
+
/// Creates a new Sort Expr
#[pyfunction]
fn order_by(expr: PyExpr, asc: bool, nulls_first: bool) ->
PyResult<PySortExpr> {
@@ -988,6 +1011,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) ->
PyResult<()> {
m.add_wrapped(wrap_pyfunction!(radians))?;
m.add_wrapped(wrap_pyfunction!(random))?;
m.add_wrapped(wrap_pyfunction!(regexp_count))?;
+ m.add_wrapped(wrap_pyfunction!(regexp_instr))?;
m.add_wrapped(wrap_pyfunction!(regexp_like))?;
m.add_wrapped(wrap_pyfunction!(regexp_match))?;
m.add_wrapped(wrap_pyfunction!(regexp_replace))?;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]