This is an automated email from the ASF dual-hosted git repository.

lihaopeng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new cb2e915869d [refactor](opt) improve BE code readability of 
multi_match_any function (#39354)
cb2e915869d is described below

commit cb2e915869d1590640ba44b25c40207b3603ae99
Author: Chester <cheste...@tencent.com>
AuthorDate: Mon Sep 23 09:34:30 2024 +0800

    [refactor](opt) improve BE code readability of multi_match_any function 
(#39354)
    
    To improve the readability of the BE code for the **multi_match_any**
    function, this PR refactors it as follows:
    1. replace the deprecated C-style header 'stddef.h' with its C++
    counterpart 'cstddef'
    2. apply the readability-qualified-auto check
    3. apply the readability-braces-around-statements check
    4. extract the code shared by `vector_constant()` and `vector_vector()` into
    two functions:
    `prepare_regexps_and_scratch()` and `on_match()`
    5. simplify the null-input handling in `execute_impl()` by:
    removing the two rarely used variables `haystack_nullable` and
    `needles_nullable`;
    adding the function `handle_nullable_column()`
---
 .../functions/functions_multi_string_search.cpp    | 200 +++++++++++----------
 1 file changed, 101 insertions(+), 99 deletions(-)

diff --git a/be/src/vec/functions/functions_multi_string_search.cpp 
b/be/src/vec/functions/functions_multi_string_search.cpp
index f7a1b8d7a90..7736a1a039b 100644
--- a/be/src/vec/functions/functions_multi_string_search.cpp
+++ b/be/src/vec/functions/functions_multi_string_search.cpp
@@ -20,10 +20,10 @@
 
 #include <hs/hs_common.h>
 #include <hs/hs_runtime.h>
-#include <stddef.h>
 
 #include <algorithm>
 #include <boost/iterator/iterator_facade.hpp>
+#include <cstddef>
 #include <limits>
 #include <memory>
 #include <optional>
@@ -80,42 +80,30 @@ public:
         auto haystack_column = block.get_by_position(arguments[0]).column;
         auto needles_column = block.get_by_position(arguments[1]).column;
 
-        bool haystack_nullable = false;
-        bool needles_nullable = false;
-
-        if (haystack_column->is_nullable()) {
-            haystack_nullable = true;
-        }
-
-        if (needles_column->is_nullable()) {
-            needles_nullable = true;
-        }
-
         auto haystack_ptr = remove_nullable(haystack_column);
         auto needles_ptr = remove_nullable(needles_column);
 
-        const ColumnString* col_haystack_vector =
-                check_and_get_column<ColumnString>(&*haystack_ptr);
+        const auto* col_haystack_vector = 
check_and_get_column<ColumnString>(&*haystack_ptr);
         const ColumnConst* col_haystack_const =
                 check_and_get_column_const<ColumnString>(&*haystack_ptr);
 
-        const ColumnArray* col_needles_vector =
-                check_and_get_column<ColumnArray>(needles_ptr.get());
+        const auto* col_needles_vector = 
check_and_get_column<ColumnArray>(needles_ptr.get());
         const ColumnConst* col_needles_const =
                 check_and_get_column_const<ColumnArray>(needles_ptr.get());
 
-        if (!col_needles_const && !col_needles_vector)
+        if (!col_needles_const && !col_needles_vector) {
             return Status::InvalidArgument(
                     "function '{}' encountered unsupported needles column, 
found {}", name,
                     needles_column->get_name());
+        }
 
-        if (col_haystack_const && col_needles_vector)
+        if (col_haystack_const && col_needles_vector) {
             return Status::InvalidArgument(
                     "function '{}' doesn't support search with non-constant 
needles "
                     "in constant haystack",
                     name);
+        }
 
-        using ResultType = typename Impl::ResultType;
         auto col_res = ColumnVector<ResultType>::create();
         auto col_offsets = ColumnArray::ColumnOffsets::create();
 
@@ -140,25 +128,8 @@ public:
             return status;
         }
 
-        if (haystack_nullable) {
-            auto column_nullable = 
check_and_get_column<ColumnNullable>(haystack_column.get());
-            auto& null_map = column_nullable->get_null_map_data();
-            for (size_t i = 0; i != input_rows_count; ++i) {
-                if (null_map[i] == 1) {
-                    vec_res[i] = 0;
-                }
-            }
-        }
-
-        if (needles_nullable) {
-            auto column_nullable = 
check_and_get_column<ColumnNullable>(needles_column.get());
-            auto& null_map = column_nullable->get_null_map_data();
-            for (size_t i = 0; i != input_rows_count; ++i) {
-                if (null_map[i] == 1) {
-                    vec_res[i] = 0;
-                }
-            }
-        }
+        handle_nullable_column(haystack_column, vec_res, input_rows_count);
+        handle_nullable_column(needles_column, vec_res, input_rows_count);
 
         block.replace_by_position(result, std::move(col_res));
 
@@ -166,9 +137,25 @@ public:
     }
 
 private:
+    using ResultType = typename Impl::ResultType;
+
     const bool allow_hyperscan_ = true;
     const size_t max_hyperscan_regexp_length_ = 0;       // not limited
     const size_t max_hyperscan_regexp_total_length_ = 0; // not limited
+
+    /// Handles nullable column by setting result to 0 if the input is null
+    void handle_nullable_column(const ColumnPtr& column, 
PaddedPODArray<ResultType>& vec_res,
+                                size_t input_rows_count) const {
+        if (column->is_nullable()) {
+            const auto* column_nullable = assert_cast<const 
ColumnNullable*>(column.get());
+            const auto& null_map = column_nullable->get_null_map_data();
+            for (size_t i = 0; i != input_rows_count; ++i) {
+                if (null_map[i] == 1) {
+                    vec_res[i] = 0;
+                }
+            }
+        }
+    }
 };
 
 /// For more readable instantiations of MultiMatchAnyImpl<>
@@ -187,17 +174,67 @@ struct FunctionMultiMatchAnyImpl {
 
     static auto get_return_type() { return 
std::make_shared<DataTypeNumber<ResultType>>(); }
 
+    /**
+     * Prepares the regular expressions and scratch space for Hyperscan.
+     *
+     * This function takes a vector of needles (substrings to search for) and 
initializes
+     * the regular expressions and scratch space required for Hyperscan, a 
high-performance
+     * regular expression matching library.
+     *
+     */
+    static Status prepare_regexps_and_scratch(const std::vector<StringRef>& 
needles,
+                                              multiregexps::Regexps*& regexps,
+                                              multiregexps::ScratchPtr& 
smart_scratch) {
+        multiregexps::DeferredConstructedRegexpsPtr 
deferred_constructed_regexps =
+                multiregexps::getOrSet</*SaveIndices*/
+                                       FindAnyIndex, 
WithEditDistance>(needles, std::nullopt);
+        regexps = deferred_constructed_regexps->get();
+
+        hs_scratch_t* scratch = nullptr;
+        hs_error_t err = hs_clone_scratch(regexps->getScratch(), &scratch);
+
+        if (err != HS_SUCCESS) {
+            return Status::InternalError("could not clone scratch space for 
vectorscan");
+        }
+
+        smart_scratch.reset(scratch);
+        return Status::OK();
+    }
+
+    /**
+     * Static callback function to handle the match results of the hs_scan 
function.
+     *
+     * This function is called when a matching substring is found while 
scanning with
+     * Hyperscan. It updates the result based on the match information.
+     *
+     */
+    static int on_match([[maybe_unused]] unsigned int id, unsigned long long 
/* from */, // NOLINT
+                        unsigned long long /* to */,                           
          // NOLINT
+                        unsigned int /* flags */, void* context) {
+        if constexpr (FindAnyIndex) {
+            *reinterpret_cast<ResultType*>(context) = id;
+        } else if constexpr (FindAny) {
+            *reinterpret_cast<ResultType*>(context) = 1;
+        }
+        /// Once we hit the callback, there is no need to search for others.
+        return 1;
+    }
+
     static Status vector_constant(const ColumnString::Chars& haystack_data,
                                   const ColumnString::Offsets& 
haystack_offsets,
                                   const Array& needles_arr, 
PaddedPODArray<ResultType>& res,
                                   PaddedPODArray<UInt64>& offsets, bool 
allow_hyperscan,
                                   size_t max_hyperscan_regexp_length,
                                   size_t max_hyperscan_regexp_total_length) {
-        if (!allow_hyperscan) return Status::InvalidArgument("Hyperscan 
functions are disabled");
+        if (!allow_hyperscan) {
+            return Status::InvalidArgument("Hyperscan functions are disabled");
+        }
 
         std::vector<StringRef> needles;
         needles.reserve(needles_arr.size());
-        for (const auto& needle : needles_arr) 
needles.emplace_back(needle.get<StringRef>());
+        for (const auto& needle : needles_arr) {
+            needles.emplace_back(needle.get<StringRef>());
+        }
 
         res.resize(haystack_offsets.size());
 
@@ -206,44 +243,26 @@ struct FunctionMultiMatchAnyImpl {
             return Status::OK();
         }
 
-        multiregexps::DeferredConstructedRegexpsPtr 
deferred_constructed_regexps =
-                multiregexps::getOrSet</*SaveIndices*/ FindAnyIndex, 
WithEditDistance>(
-                        needles, std::nullopt);
-        multiregexps::Regexps* regexps = deferred_constructed_regexps->get();
-
-        hs_scratch_t* scratch = nullptr;
-        hs_error_t err = hs_clone_scratch(regexps->getScratch(), &scratch);
+        multiregexps::Regexps* regexps = nullptr;
+        multiregexps::ScratchPtr smart_scratch;
+        RETURN_IF_ERROR(prepare_regexps_and_scratch(needles, regexps, 
smart_scratch));
 
-        if (err != HS_SUCCESS)
-            return Status::InternalError("could not clone scratch space for 
vectorscan");
-
-        multiregexps::ScratchPtr smart_scratch(scratch);
-
-        auto on_match = []([[maybe_unused]] unsigned int id,
-                           unsigned long long /* from */, // NOLINT
-                           unsigned long long /* to */,   // NOLINT
-                           unsigned int /* flags */, void* context) -> int {
-            if constexpr (FindAnyIndex)
-                *reinterpret_cast<ResultType*>(context) = id;
-            else if constexpr (FindAny)
-                *reinterpret_cast<ResultType*>(context) = 1;
-            /// Once we hit the callback, there is no need to search for 
others.
-            return 1;
-        };
         const size_t haystack_offsets_size = haystack_offsets.size();
         UInt64 offset = 0;
         for (size_t i = 0; i < haystack_offsets_size; ++i) {
             UInt64 length = haystack_offsets[i] - offset;
             /// vectorscan restriction.
-            if (length > std::numeric_limits<UInt32>::max())
+            if (length > std::numeric_limits<UInt32>::max()) {
                 return Status::InternalError("too long string to search");
+            }
             /// zero the result, scan, check, update the offset.
             res[i] = 0;
-            err = hs_scan(regexps->getDB(),
-                          reinterpret_cast<const char*>(haystack_data.data()) 
+ offset,
-                          static_cast<unsigned>(length), 0, 
smart_scratch.get(), on_match, &res[i]);
-            if (err != HS_SUCCESS && err != HS_SCAN_TERMINATED)
+            hs_error_t err = hs_scan(
+                    regexps->getDB(), reinterpret_cast<const 
char*>(haystack_data.data()) + offset,
+                    static_cast<unsigned>(length), 0, smart_scratch.get(), 
on_match, &res[i]);
+            if (err != HS_SUCCESS && err != HS_SCAN_TERMINATED) {
                 return Status::InternalError("failed to scan with vectorscan");
+            }
             offset = haystack_offsets[i];
         }
 
@@ -257,20 +276,22 @@ struct FunctionMultiMatchAnyImpl {
                                 PaddedPODArray<ResultType>& res, 
PaddedPODArray<UInt64>& offsets,
                                 bool allow_hyperscan, size_t 
max_hyperscan_regexp_length,
                                 size_t max_hyperscan_regexp_total_length) {
-        if (!allow_hyperscan) return Status::InvalidArgument("Hyperscan 
functions are disabled");
+        if (!allow_hyperscan) {
+            return Status::InvalidArgument("Hyperscan functions are disabled");
+        }
 
         res.resize(haystack_offsets.size());
 
         size_t prev_haystack_offset = 0;
         size_t prev_needles_offset = 0;
 
-        auto& nested_column =
+        const auto& nested_column =
                 
vectorized::check_and_get_column<vectorized::ColumnNullable>(needles_data)
                         ->get_nested_column();
-        const ColumnString* needles_data_string = 
check_and_get_column<ColumnString>(nested_column);
+        const auto* needles_data_string = 
check_and_get_column<ColumnString>(nested_column);
 
         if (!needles_data_string) {
-            return Status::InvalidArgument("needles should be string");
+            return Status::InvalidArgument("needles should be string column");
         }
 
         std::vector<StringRef> needles;
@@ -287,46 +308,27 @@ struct FunctionMultiMatchAnyImpl {
                 continue;
             }
 
-            multiregexps::DeferredConstructedRegexpsPtr 
deferred_constructed_regexps =
-                    multiregexps::getOrSet</*SaveIndices*/ FindAnyIndex, 
WithEditDistance>(
-                            needles, std::nullopt);
-            multiregexps::Regexps* regexps = 
deferred_constructed_regexps->get();
-
-            hs_scratch_t* scratch = nullptr;
-            hs_error_t err = hs_clone_scratch(regexps->getScratch(), &scratch);
-
-            if (err != HS_SUCCESS)
-                return Status::InternalError("could not clone scratch space 
for vectorscan");
-
-            multiregexps::ScratchPtr smart_scratch(scratch);
-
-            auto on_match = []([[maybe_unused]] unsigned int id,
-                               unsigned long long /* from */, // NOLINT
-                               unsigned long long /* to */,   // NOLINT
-                               unsigned int /* flags */, void* context) -> int 
{
-                if constexpr (FindAnyIndex)
-                    *reinterpret_cast<ResultType*>(context) = id;
-                else if constexpr (FindAny)
-                    *reinterpret_cast<ResultType*>(context) = 1;
-                /// Once we hit the callback, there is no need to search for 
others.
-                return 1;
-            };
+            multiregexps::Regexps* regexps = nullptr;
+            multiregexps::ScratchPtr smart_scratch;
+            RETURN_IF_ERROR(prepare_regexps_and_scratch(needles, regexps, 
smart_scratch));
 
             const size_t cur_haystack_length = haystack_offsets[i] - 
prev_haystack_offset;
 
             /// vectorscan restriction.
-            if (cur_haystack_length > std::numeric_limits<UInt32>::max())
+            if (cur_haystack_length > std::numeric_limits<UInt32>::max()) {
                 return Status::InternalError("too long string to search");
+            }
 
             /// zero the result, scan, check, update the offset.
             res[i] = 0;
-            err = hs_scan(
+            hs_error_t err = hs_scan(
                     regexps->getDB(),
                     reinterpret_cast<const char*>(haystack_data.data()) + 
prev_haystack_offset,
                     static_cast<unsigned>(cur_haystack_length), 0, 
smart_scratch.get(), on_match,
                     &res[i]);
-            if (err != HS_SUCCESS && err != HS_SCAN_TERMINATED)
+            if (err != HS_SUCCESS && err != HS_SCAN_TERMINATED) {
                 return Status::InternalError("failed to scan with vectorscan");
+            }
 
             prev_haystack_offset = haystack_offsets[i];
             prev_needles_offset = needles_offsets[i];


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to