pitrou commented on code in PR #45900:
URL: https://github.com/apache/arrow/pull/45900#discussion_r2010299255


##########
cpp/src/arrow/compute/kernels/scalar_string_test.cc:
##########
@@ -314,16 +314,26 @@ TYPED_TEST(TestBinaryKernels, NonUtf8Regex) {
                      this->MakeArray({"\xfc\x40", "this \xfc\x40 that 
\xfc\x40"}),
                      this->MakeArray({"bazz", "this bazz that \xfc\x40"}), 
&options);
   }
-  // TODO the following test is broken (GH-45735)
   {
-    ExtractRegexOptions options("(?P<letter>[\\xfc])(?P<digit>\\d)");
-    auto null_bitmap = std::make_shared<Buffer>("0");
+    ExtractRegexOptions options("(?P<letter>[\xfc])(?P<digit>\\d+)");
     auto output = StructArray::Make(
-        {this->MakeArray({"\xfc", "1"}), this->MakeArray({"\xfc", "2"})},
-        {field("letter", this->type()), field("digit", this->type())}, 
null_bitmap);
-    this->CheckUnary("extract_regex", this->MakeArray({"foo\xfc 1bar", 
"\x02\xfc\x40"}),
+        {this->MakeArray({"\xfc", "\xfc"}), this->MakeArray({"14", "2"})},
+        {field("letter", this->type()), field("digit", this->type())});
+    this->CheckUnary("extract_regex",
+                     this->MakeArray({"foo\xfc\x31\x34 bar", "\x02\xfc\x32"}),
                      std::static_pointer_cast<Array>(*output), &options);
   }
+  {
+    ExtractRegexSpanOptions options("(?P<letter>[\xfc])(?P<digit>\\d+)");
+    auto offset_type = is_binary_like(this->type()->id()) ? int32() : int64();
+    auto out_type = struct_({field("letter", fixed_size_list(offset_type, 2)),
+                             field("digit", fixed_size_list(offset_type, 2))});
+    this->CheckUnary("extract_regex_span",
+                     this->MakeArray({"foo\xfc\x31\x34 bar", "\x02\xfc\x32"}), 
out_type,
+                     R"([{"letter": [3, 1], "digit": [4,2]},
+                                             {"letter": [1, 1], "digit": [2, 
1]}])",

Review Comment:
   We can make the alignment nicer here.
   ```suggestion
                        R"([{"letter": [3, 1], "digit": [4, 2]},
                            {"letter": [1, 1], "digit": [2, 1]}])",
   ```



##########
cpp/src/arrow/compute/kernels/scalar_string_ascii.cc:
##########
@@ -2246,8 +2239,15 @@ struct ExtractRegexData : public BaseExtractRegexData {
 
 Result<TypeHolder> ResolveExtractRegexOutput(KernelContext* ctx,
                                              const std::vector<TypeHolder>& 
types) {
+  auto input_type = types[0].type;
+  if (input_type == nullptr) {
+    // No input type specified
+    return nullptr;
+  }
+  DCHECK(is_base_binary_like(input_type->id()));
+  auto is_utf8 = is_string(input_type->id());
   ExtractRegexOptions options = ExtractRegexState::Get(ctx);
-  ARROW_ASSIGN_OR_RAISE(auto data, ExtractRegexData::Make(options));
+  ARROW_ASSIGN_OR_RAISE(auto data, ExtractRegexData::Make(options, is_utf8));

Review Comment:
   Can we also change the `is_utf8` parameter to be required, to avoid relying 
on its default value?
   I mean replace all instances of `bool is_utf8 = true` with `bool is_utf8`.



##########
cpp/src/arrow/compute/kernels/scalar_string_test.cc:
##########
@@ -371,18 +381,28 @@ TYPED_TEST(TestBinaryKernels, NonUtf8WithNullRegex) {
                      this->template MakeArray<std::string>({{"\x00\x40", 2}}),
                      this->type(), R"(["bazz"])", &options);
   }
-  // TODO the following test is broken (GH-45735)
   {
-    ExtractRegexOptions options("(?P<null>[\\x00])(?P<digit>\\d)");
-    auto null_bitmap = std::make_shared<Buffer>("0");
+    ExtractRegexOptions options(std::string("(?P<null>[\x00])(?P<digit>\\d+)", 
27));
     auto output = StructArray::Make(
-        {this->template MakeArray<std::string>({{"\x00", 1}, {"1", 1}}),
-         this->template MakeArray<std::string>({{"\x00", 1}, {"2", 1}})},
-        {field("null", this->type()), field("digit", this->type())}, 
null_bitmap);
-    this->CheckUnary(
-        "extract_regex",
-        this->template MakeArray<std::string>({{"foo\x00 1bar", 9}, 
{"\x02\x00\x40", 3}}),
-        std::static_pointer_cast<Array>(*output), &options);
+        {this->template MakeArray<std::string>({{"\x00", 1}, {"\x00", 1}}),
+         this->template MakeArray<std::string>({"14", "2"})},
+        {field("null", this->type()), field("digit", this->type())});
+    this->CheckUnary("extract_regex",
+                     this->template MakeArray<std::string>(
+                         {{"foo\x00\x31\x34 bar", 10}, {"\x02\x00\x32", 3}}),
+                     std::static_pointer_cast<Array>(*output), &options);
+  }
+  {
+    ExtractRegexSpanOptions 
options(std::string("(?P<null>[\x00])(?P<digit>\\d+)", 27));
+    auto offset_type = is_binary_like(this->type()->id()) ? int32() : int64();
+    auto out_type = struct_({field("null", fixed_size_list(offset_type, 2)),
+                             field("digit", fixed_size_list(offset_type, 2))});
+    this->CheckUnary("extract_regex_span",
+                     this->template MakeArray<std::string>(
+                         {{"foo\x00\x31\x34 bar", 10}, {"\x02\x00\x32", 3}}),
+                     out_type, R"([{"null": [3, 1], "digit": [4,2]},
+                                             {"null": [1, 1], "digit": [2, 
1]}])",

Review Comment:
   Here as well
   ```suggestion
                        out_type, R"([{"null": [3, 1], "digit": [4, 2]},
                                      {"null": [1, 1], "digit": [2, 1]}])",
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to