tlm365 commented on code in PR #6849:
URL: https://github.com/apache/arrow-rs/pull/6849#discussion_r1894744658
##########
arrow-string/src/regexp.rs:
##########
@@ -333,42 +353,74 @@ fn get_scalar_pattern_flag<'a, OffsetSize: OffsetSizeTrait>(
}
}
+fn get_scalar_pattern_flag_utf8view<'a>(
+ regex_array: &'a dyn Array,
+ flag_array: Option<&'a dyn Array>,
+) -> (Option<&'a str>, Option<&'a str>) {
+ let regex = regex_array.as_string_view();
+ let regex = regex.is_valid(0).then(|| regex.value(0));
+
+ if let Some(flag_array) = flag_array {
+ let flag = flag_array.as_string_view();
+ (regex, flag.is_valid(0).then(|| flag.value(0)))
+ } else {
+ (regex, None)
+ }
+}
+
+macro_rules! process_regexp_match {
+ ($array:expr, $regex:expr, $list_builder:expr) => {
+ $array
+ .iter()
+ .map(|value| {
+ match value {
+ // Required for Postgres compatibility:
+ // SELECT regexp_match('foobarbequebaz', ''); = {""}
+ Some(_) if $regex.as_str().is_empty() => {
+ $list_builder.values().append_value("");
+ $list_builder.append(true);
+ }
+ Some(value) => match $regex.captures(value) {
+ Some(caps) => {
+ let mut iter = caps.iter();
+ if caps.len() > 1 {
+ iter.next();
+ }
+ for m in iter.flatten() {
+ $list_builder.values().append_value(m.as_str());
+ }
+ $list_builder.append(true);
+ }
+ None => $list_builder.append(false),
+ },
+ None => $list_builder.append(false),
+ }
+ Ok(())
+ })
+ .collect::<Result<Vec<()>, ArrowError>>()?
+ };
+}
+
fn regexp_scalar_match<OffsetSize: OffsetSizeTrait>(
array: &GenericStringArray<OffsetSize>,
regex: &Regex,
) -> Result<ArrayRef, ArrowError> {
let builder: GenericStringBuilder<OffsetSize> =
GenericStringBuilder::with_capacity(0, 0);
let mut list_builder = ListBuilder::new(builder);
- array
- .iter()
- .map(|value| {
- match value {
- // Required for Postgres compatibility:
- // SELECT regexp_match('foobarbequebaz', ''); = {""}
- Some(_) if regex.as_str() == "" => {
- list_builder.values().append_value("");
- list_builder.append(true);
- }
- Some(value) => match regex.captures(value) {
- Some(caps) => {
- let mut iter = caps.iter();
- if caps.len() > 1 {
- iter.next();
- }
- for m in iter.flatten() {
- list_builder.values().append_value(m.as_str());
- }
+ process_regexp_match!(array, regex, list_builder);
- list_builder.append(true);
- }
- None => list_builder.append(false),
- },
- _ => list_builder.append(false),
- }
- Ok(())
- })
- .collect::<Result<Vec<()>, ArrowError>>()?;
+ Ok(Arc::new(list_builder.finish()))
+}
+
+fn regexp_scalar_match_utf8view(
+ array: &StringViewArray,
+ regex: &Regex,
+) -> Result<ArrayRef, ArrowError> {
+ let builder = StringViewBuilder::with_capacity(0);
+ let mut list_builder = ListBuilder::new(builder);
+
+ process_regexp_match!(array, regex, list_builder);
Review Comment:
@wiedld Thanks so much for reviewing ❤️
> Confirmed this expands to the original logic, for both StringView (here)
> and StringArrays (above), to match on an array of regex. Why the use of macros?

> It's the same logic (for a single regex) into a macro to use for both
> StringView and StringArray. Why the macro?
I just wanted to reduce duplicated code here. I remember struggling when I
tried to implement generic functions instead of a macro. But if the macro makes
the code hard to read/debug/maintain, I can try removing it and using another
solution.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]