rluvaton commented on code in PR #6926:
URL: https://github.com/apache/arrow-rs/pull/6926#discussion_r1900231741
##########
arrow-string/src/predicate.rs:
##########
@@ -190,62 +270,97 @@ impl<'a> Predicate<'a> {
BooleanArray::new(values, nulls)
} else {
BooleanArray::from_unary(array, |haystack| {
- ends_with(haystack, v,
equals_ignore_ascii_case_kernel) != negate
+ ends_with(haystack$($as_bytes_fn)*, v$($as_bytes_fn)*,
equals_ignore_ascii_case_kernel) != negate
})
}
}
- Predicate::Regex(v) => {
+ Self::Regex(v) => {
BooleanArray::from_unary(array, |haystack|
v.is_match(haystack) != negate)
}
}
}
+
+ fn regex_like(pattern: &Self::UnsizedItem, case_insensitive: bool) ->
Result<Self::RegexType, ArrowError> {
+ let regex_pattern =
transform_pattern_like_to_regex_compatible_pattern(pattern$($as_char_iter)+,
pattern.len());
+ <$RegexBuilder>::new(&regex_pattern)
+ .case_insensitive(case_insensitive)
+ .dot_matches_new_line(true)
+ .build()
+ .map_err(|e| {
+ ArrowError::InvalidArgumentError(format!(
+ "Unable to build regex from LIKE pattern: {e}"
+ ))
+ })
+ }
+}
+ }
}
-fn equals_bytes(lhs: &[u8], rhs: &[u8], byte_eq_kernel: impl Fn((&u8, &u8)) ->
bool) -> bool {
- lhs.len() == rhs.len() && zip(lhs, rhs).all(byte_eq_kernel)
+impl_predicate!(
+
+type PredicateUnsizedItem = str;
+type MatchingRegexBuilder = RegexBuilder;
+type ViewArray = StringViewArray;
+
+impl<'a> PredicateImpl<'a> for Predicate<'a> {
+ type UnsizedItem = PredicateUnsizedItem;
+ type RegexType = Regex;
+
+ ...
+
}
-/// This is faster than `str::starts_with` for small strings.
-/// See <https://github.com/apache/arrow-rs/issues/6107> for more details.
-fn starts_with(haystack: &str, needle: &str, byte_eq_kernel: impl Fn((&u8,
&u8)) -> bool) -> bool {
- if needle.len() > haystack.len() {
- false
- } else {
- zip(haystack.as_bytes(), needle.as_bytes()).all(byte_eq_kernel)
- }
+fn as_bytes(item: &PredicateUnsizedItem) -> &[u8] {
+ item.as_bytes()
}
-/// This is faster than `str::ends_with` for small strings.
-/// See <https://github.com/apache/arrow-rs/issues/6107> for more details.
-fn ends_with(haystack: &str, needle: &str, byte_eq_kernel: impl Fn((&u8, &u8))
-> bool) -> bool {
- if needle.len() > haystack.len() {
- false
- } else {
- zip(
- haystack.as_bytes().iter().rev(),
- needle.as_bytes().iter().rev(),
- )
- .all(byte_eq_kernel)
- }
+
+fn to_char_iter(pattern: &PredicateUnsizedItem) -> impl Iterator<Item=char> {
+ pattern.chars()
}
-fn equals_kernel((n, h): (&u8, &u8)) -> bool {
- n == h
+const PERCENT: &'static PredicateUnsizedItem = "%";
+const PERCENT_ESCAPED: &'static PredicateUnsizedItem = "\\%";
+);
+
+impl_predicate!(
+
+type PredicateUnsizedItem = [u8];
+type MatchingRegexBuilder = BinaryRegexBuilder;
+type ViewArray = BinaryViewArray;
+
+impl<'a> PredicateImpl<'a> for BinaryPredicate<'a> {
+ type UnsizedItem = PredicateUnsizedItem;
+ type RegexType = BinaryRegex;
+
+ ...
+
}
-fn equals_ignore_ascii_case_kernel((n, h): (&u8, &u8)) -> bool {
- n.eq_ignore_ascii_case(h)
+fn as_bytes(item: &PredicateUnsizedItem) -> &[u8] {
+ item
+}
+
+fn to_char_iter(pattern: &PredicateUnsizedItem) -> impl Iterator<Item=char> {
+ pattern.iter().map(|&b| b as char)
}
+const PERCENT: &'static PredicateUnsizedItem = b"%";
+const PERCENT_ESCAPED: &'static PredicateUnsizedItem = b"\\%";
+);
Review Comment:
I tried to make the macro implementation more readable by writing it as
look-alike skeleton code.
I needed a function that converts to bytes, a function that converts to a
char iterator, and the percent and escaped-percent values.
To make the context of those functions visible, instead of just passing the
implementation (i.e. passing `str::as_bytes` for string and nothing for binary),
I created the imaginary wrapper function `as_bytes` so that its input and output
can be seen.
(Note that we are not really creating the functions `as_bytes` and
`to_char_iter`, or the consts `PERCENT` and `PERCENT_ESCAPED`.)
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]