This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 0e99e3a645 improve LIKE regex (#6145)
0e99e3a645 is described below

commit 0e99e3a64532665218bcb0d048c4e9961e39a913
Author: Samuel Colvin <[email protected]>
AuthorDate: Mon Jul 29 19:45:11 2024 +0100

    improve LIKE regex (#6145)
---
 arrow-string/src/predicate.rs | 83 +++++++++++++++++++++++++++++--------------
 1 file changed, 57 insertions(+), 26 deletions(-)

diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs
index 01e3710a6d..c7ccffb3ad 100644
--- a/arrow-string/src/predicate.rs
+++ b/arrow-string/src/predicate.rs
@@ -140,39 +140,54 @@ fn ends_with_ignore_ascii_case(haystack: &str, needle: &str) -> bool {
 
 /// Transforms a like `pattern` to a regex compatible pattern. To achieve that, it does:
 ///
-/// 1. Replace like wildcards for regex expressions as the pattern will be evaluated using regex match: `%` => `.*` and `_` => `.`
-/// 2. Escape regex meta characters to match them and not be evaluated as regex special chars. For example: `.` => `\\.`
-/// 3. Replace escaped like wildcards removing the escape characters to be able to match it as a regex. For example: `\\%` => `%`
+/// 1. Replace `LIKE` multi-character wildcards `%` => `.*` (unless they're at the start or end of the pattern,
+///    where the regex is just truncated - e.g. `%foo%` => `foo` rather than `^.*foo.*$`)
+/// 2. Replace `LIKE` single-character wildcards `_` => `.`
+/// 3. Escape regex meta characters to match them and not be evaluated as regex special chars. e.g. `.` => `\\.`
+/// 4. Replace escaped `LIKE` wildcards removing the escape characters to be able to match it as a regex. e.g. `\\%` => `%`
 fn regex_like(pattern: &str, case_insensitive: bool) -> Result<Regex, ArrowError> {
     let mut result = String::with_capacity(pattern.len() * 2);
-    result.push('^');
     let mut chars_iter = pattern.chars().peekable();
+    match chars_iter.peek() {
+        // if the pattern starts with `%`, we avoid starting the regex with a slow but meaningless `^.*`
+        Some('%') => {
+            chars_iter.next();
+        }
+        _ => result.push('^'),
+    };
+
     while let Some(c) = chars_iter.next() {
-        if c == '\\' {
-            let next = chars_iter.peek();
-            match next {
-                Some(next) if is_like_pattern(*next) => {
-                    result.push(*next);
-                    // Skipping the next char as it is already appended
-                    chars_iter.next();
+        match c {
+            '\\' => {
+                match chars_iter.peek() {
+                    Some(next) if is_like_pattern(*next) => {
+                        result.push(*next);
+                        // Skipping the next char as it is already appended
+                        chars_iter.next();
+                    }
+                    _ => {
+                        result.push('\\');
+                        result.push('\\');
+                    }
                 }
-                _ => {
-                    result.push('\\');
+            }
+            '%' => result.push_str(".*"),
+            '_' => result.push('.'),
+            c => {
+                if regex_syntax::is_meta_character(c) {
                     result.push('\\');
                 }
+                result.push(c);
             }
-        } else if regex_syntax::is_meta_character(c) {
-            result.push('\\');
-            result.push(c);
-        } else if c == '%' {
-            result.push_str(".*");
-        } else if c == '_' {
-            result.push('.');
-        } else {
-            result.push(c);
         }
     }
-    result.push('$');
+    // instead of ending the regex with `.*$` and making it needlessly slow, we just end the regex
+    if result.ends_with(".*") {
+        result.pop();
+        result.pop();
+    } else {
+        result.push('$');
+    }
     RegexBuilder::new(&result)
         .case_insensitive(case_insensitive)
         .dot_matches_new_line(true)
@@ -197,9 +212,25 @@ mod tests {
     use super::*;
 
     #[test]
-    fn test_replace_like_wildcards() {
-        let a_eq = "_%";
-        let expected = "^..*$";
+    fn test_replace_start_end_percent() {
+        let a_eq = "%foobar%";
+        let expected = "foobar";
+        let r = regex_like(a_eq, false).unwrap();
+        assert_eq!(r.to_string(), expected);
+    }
+
+    #[test]
+    fn test_replace_middle_percent() {
+        let a_eq = "foo%bar";
+        let expected = "^foo.*bar$";
+        let r = regex_like(a_eq, false).unwrap();
+        assert_eq!(r.to_string(), expected);
+    }
+
+    #[test]
+    fn test_replace_underscore() {
+        let a_eq = "foo_bar";
+        let expected = "^foo.bar$";
         let r = regex_like(a_eq, false).unwrap();
         assert_eq!(r.to_string(), expected);
     }

Reply via email to