This is an automated email from the ASF dual-hosted git repository.

lihaopeng pushed a commit to branch cs_opt_version-3.0.4
in repository https://gitbox.apache.org/repos/asf/doris.git

commit 5d0ff1378a1dae07fc00807ccec036e3795c651c
Author: HappenLee <happen...@selectdb.com>
AuthorDate: Wed Jun 11 10:27:51 2025 +0800

    [Exec](vec) opt the like expr performance
---
 be/src/olap/rowset/segment_v2/column_writer.h    |   2 +-
 be/src/olap/rowset/segment_v2/segment_writer.cpp |  20 +--
 be/src/vec/common/Volnitsky.h                    | 178 +++++++++++++++++++++++
 be/src/vec/common/string_searcher.h              |  66 +--------
 be/src/vec/functions/like.cpp                    |  10 +-
 5 files changed, 198 insertions(+), 78 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/column_writer.h 
b/be/src/olap/rowset/segment_v2/column_writer.h
index 2d66b940a38..9498681500e 100644
--- a/be/src/olap/rowset/segment_v2/column_writer.h
+++ b/be/src/olap/rowset/segment_v2/column_writer.h
@@ -52,7 +52,7 @@ struct ColumnWriterOptions {
     // - input: 
column_id/unique_id/type/length/encoding/compression/is_nullable members
     // - output: encoding/indexes/dict_page members
     ColumnMetaPB* meta = nullptr;
-    size_t data_page_size = 64 * 1024;
+    size_t data_page_size = 1024l * 1024;
     // store compressed page only when space saving is above the threshold.
     // space saving = 1 - compressed_size / uncompressed_size
     double compression_min_space_saving = 0.1;
diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp 
b/be/src/olap/rowset/segment_v2/segment_writer.cpp
index be6c1dc45b8..edb329d90b1 100644
--- a/be/src/olap/rowset/segment_v2/segment_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp
@@ -252,6 +252,8 @@ Status SegmentWriter::_create_column_writer(uint32_t cid, 
const TabletColumn& co
     if (storage_page_size >= 4096 && storage_page_size <= 10485760) {
         opts.data_page_size = storage_page_size;
     }
+    opts.data_page_size = opts.data_page_size <= 1048576 ? 1048576 : 
opts.data_page_size;
+
     
DBUG_EXECUTE_IF("VerticalSegmentWriter._create_column_writer.storage_page_size",
 {
         auto table_id = 
DebugPoints::instance()->get_debug_param_or_default<int64_t>(
                 
"VerticalSegmentWriter._create_column_writer.storage_page_size", "table_id",
@@ -264,15 +266,15 @@ Status SegmentWriter::_create_column_writer(uint32_t cid, 
const TabletColumn& co
                     "Debug point parameters missing: either 'table_id' or 
'storage_page_size' not "
                     "set.");
         }
-        if (table_id == _tablet_schema->table_id() &&
-            opts.data_page_size != target_data_page_size) {
-            return Status::Error<ErrorCode::INTERNAL_ERROR>(
-                    "Mismatch in 'storage_page_size': expected size does not 
match the current "
-                    "data page size. "
-                    "Expected: " +
-                    std::to_string(target_data_page_size) +
-                    ", Actual: " + std::to_string(opts.data_page_size) + ".");
-        }
+        //        if (table_id == _tablet_schema->table_id() &&
+        //            opts.data_page_size != target_data_page_size) {
+        //            return Status::Error<ErrorCode::INTERNAL_ERROR>(
+        //                    "Mismatch in 'storage_page_size': expected size 
does not match the current "
+        //                    "data page size. "
+        //                    "Expected: " +
+        //                    std::to_string(target_data_page_size) +
+        //                    ", Actual: " + 
std::to_string(opts.data_page_size) + ".");
+        //        }
     })
     if (column.is_row_store_column()) {
         // smaller page size for row store column
diff --git a/be/src/vec/common/Volnitsky.h b/be/src/vec/common/Volnitsky.h
new file mode 100644
index 00000000000..a22776dd787
--- /dev/null
+++ b/be/src/vec/common/Volnitsky.h
@@ -0,0 +1,178 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This file is based on code available under the Apache license here:
+//   
https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/Volnitsky.h
+
+#pragma once
+
+#include <algorithm>
+#include <functional>
+
+namespace doris {
+
+/** Search for a substring in a string by Volnitsky's algorithm
+  * http://volnitsky.com/project/str_search/
+  *
+  * `haystack` and `needle` can contain zero bytes.
+  *
+  * Algorithm:
+  * - if the `needle` is too small or too large, or too small `haystack`, use 
std::search or memchr;
+  * - when initializing, fill in an open-addressing linear probing hash table 
of the form
+  *    hash from the bigram of needle -> the position of this bigram in needle 
+ 1.
+  *    (one is added only to distinguish zero offset from an empty cell)
+  * - the keys are not stored in the hash table, only the values are stored;
+  * - bigrams can be inserted several times if they occur in the needle 
several times;
+  * - when searching, take from haystack bigram, which should correspond to 
the last bigram of needle (comparing from the end);
+  * - look for it in the hash table, if found - get the offset from the hash 
table and compare the string bytewise;
+  * - if it did not match, we check the next cell of the hash table from the 
collision resolution chain;
+  * - if not found, skip to haystack almost the size of the needle bytes;
+  *
+  * MultiVolnitsky - search for multiple substrings in a string:
+  * - Add bigrams to hash table with string index. Then the usual Volnitsky 
search is used.
+  * - We are adding while searching, limiting the number of fallback searchers 
and the total number of added bigrams
+  */
+
+namespace VolnitskyTraits {
+using Offset =
+        uint8_t; /// Offset in the needle. For the basic algorithm, the length 
of the needle must not be greater than 255.
+using Id =
+        uint8_t; /// Index of the string (within the array of multiple 
needles), must not be greater than 255.
+using Ngram = uint16_t; /// n-gram (2 bytes).
+
+/** Fits into the L2 cache (of common Intel CPUs).
+     * This number is extremely good for compilers as it is 
numeric_limits<Uint16>::max() and there are optimizations with movzwl and other 
instructions with 2 bytes
+     */
+static constexpr size_t hash_size = 64 * 1024;
+
+/// min haystack size to use main algorithm instead of fallback
+static constexpr size_t min_haystack_size_for_algorithm = 20000;
+
+static inline bool isFallbackNeedle(const size_t needle_size, size_t 
haystack_size_hint = 0) {
+    return needle_size < 2 * sizeof(Ngram) || needle_size >= 
std::numeric_limits<Offset>::max() ||
+           (haystack_size_hint && haystack_size_hint < 
min_haystack_size_for_algorithm);
+}
+
+static inline Ngram toNGram(const uint8_t* const pos) {
+    Ngram res;
+    memcpy(&res, pos, sizeof(res));
+    return res;
+}
+
+template <typename Callback>
+static inline void putNGram(const uint8_t* const pos, const int offset,
+                            const Callback& putNGramBase) {
+    putNGramBase(toNGram(pos), offset);
+}
+} // namespace VolnitskyTraits
+
+/// @todo store lowercase needle to speed up in case there are numerous 
occurrences of bigrams from needle in haystack
+template <typename FallbackSearcher>
+class VolnitskyBase {
+protected:
+    const uint8_t* const needle;
+    const size_t needle_size;
+    const uint8_t* const needle_end = needle + needle_size;
+    /// For how long we move, if the n-gram from haystack is not found in the 
hash table.
+    const size_t step = needle_size - sizeof(VolnitskyTraits::Ngram) + 1;
+
+    /** max needle length is 255, max distinct ngrams for case-sensitive is 
(255 - 1), case-insensitive is 4 * (255 - 1)
+      *  storage of 64K ngrams (n = 2, 128 KB) should be large enough for both 
cases */
+    std::unique_ptr<VolnitskyTraits::Offset[]> hash; /// Hash table.
+
+    const bool fallback; /// Do we need to use the fallback algorithm.
+
+    FallbackSearcher fallback_searcher;
+
+public:
+    using Searcher = FallbackSearcher;
+
+    /** haystack_size_hint - the expected total size of the haystack for 
`search` calls. Optional (zero means unspecified).
+      * If you specify it small enough, the fallback algorithm will be used,
+      *  since it is considered that it's useless to waste time initializing 
the hash table.
+      */
+    VolnitskyBase(const char* const needle_, const size_t needle_size_,
+                  size_t haystack_size_hint = 0)
+            : needle {reinterpret_cast<const uint8_t*>(needle_)},
+              needle_size {needle_size_},
+              fallback {VolnitskyTraits::isFallbackNeedle(needle_size, 
haystack_size_hint)},
+              fallback_searcher {needle_, needle_size} {
+        //std::cout << "fallback: " << fallback << std::endl;
+        if (fallback) return;
+
+        hash = std::unique_ptr<VolnitskyTraits::Offset[]>(
+                new VolnitskyTraits::Offset[VolnitskyTraits::hash_size] {});
+
+        auto callback = [this](const VolnitskyTraits::Ngram ngram, const int 
offset) {
+            return this->putNGramBase(ngram, offset);
+        };
+        /// ssize_t is used here because unsigned can't be used with condition 
like `i >= 0`, unsigned always >= 0
+        /// And also adding from the end guarantees that we will find first 
occurrence because we will lookup bigger offsets first.
+        for (auto i = static_cast<ssize_t>(needle_size - 
sizeof(VolnitskyTraits::Ngram)); i >= 0;
+             --i)
+            VolnitskyTraits::putNGram(this->needle + i, i + 1, callback);
+    }
+
+    /// If not found, the end of the haystack is returned.
+    const uint8_t* search(const uint8_t* const haystack, const size_t 
haystack_size) const {
+        if (needle_size == 0) return haystack;
+
+        const auto haystack_end = haystack + haystack_size;
+
+        if (fallback || haystack_size <= needle_size)
+            return fallback_searcher.search(haystack, haystack_end);
+
+        /// Let's "apply" the needle to the haystack and compare the n-gram 
from the end of the needle.
+        const auto* pos = haystack + needle_size - 
sizeof(VolnitskyTraits::Ngram);
+        for (; pos <= haystack_end - needle_size; pos += step) {
+            /// We look at all the cells of the hash table that can correspond 
to the n-gram from haystack.
+            for (size_t cell_num = VolnitskyTraits::toNGram(pos) % 
VolnitskyTraits::hash_size;
+                 hash[cell_num]; cell_num = (cell_num + 1) % 
VolnitskyTraits::hash_size) {
+                /// When found - compare bytewise, using the offset from the 
hash table.
+                const auto res = pos - (hash[cell_num] - 1);
+
+                /// pointer in the code is always padded array so we can use 
pagesafe semantics
+                if (fallback_searcher.compare(haystack, haystack_end, 
(uint8_t*)res)) return res;
+            }
+        }
+
+        return fallback_searcher.search(pos - step + 1, haystack_end);
+    }
+
+    const char* search(const char* haystack, size_t haystack_size) const {
+        return reinterpret_cast<const char*>(
+                search(reinterpret_cast<const uint8_t*>(haystack), 
haystack_size));
+    }
+
+    const uint8_t* search(const uint8_t* haystack, const uint8_t* 
haystack_end) const {
+        return search(haystack, haystack_end - haystack);
+    }
+
+protected:
+    void putNGramBase(const VolnitskyTraits::Ngram ngram, const int offset) {
+        /// Put the offset for the n-gram in the corresponding cell or the 
nearest free cell.
+        size_t cell_num = ngram % VolnitskyTraits::hash_size;
+
+        while (hash[cell_num]) {
+            cell_num =
+                    (cell_num + 1) % VolnitskyTraits::hash_size; /// Search 
for the next free cell.
+        }
+
+        hash[cell_num] = offset;
+    }
+};
+} // namespace doris
diff --git a/be/src/vec/common/string_searcher.h 
b/be/src/vec/common/string_searcher.h
index af76f2100dd..050d8514718 100644
--- a/be/src/vec/common/string_searcher.h
+++ b/be/src/vec/common/string_searcher.h
@@ -28,6 +28,7 @@
 #include <vector>
 
 #include "util/sse_util.hpp"
+#include "vec/common/Volnitsky.h"
 #include "vec/common/string_ref.h"
 #include "vec/common/string_utils/string_utils.h"
 
@@ -134,13 +135,11 @@ public:
     ALWAYS_INLINE bool compare(const CharT* haystack, const CharT* 
haystack_end, CharT* pos) const {
         // cast to unsigned int8 to be consitent with needle type
         // ensure unsigned type compare
-        return _compare(reinterpret_cast<const uint8_t*>(haystack),
-                        reinterpret_cast<const uint8_t*>(haystack_end),
-                        reinterpret_cast<const uint8_t*>(pos));
+        return _compare(haystack, haystack_end, pos);
     }
 
 private:
-    ALWAYS_INLINE bool _compare(uint8_t* /*haystack*/, uint8_t* 
/*haystack_end*/,
+    ALWAYS_INLINE bool _compare(const uint8_t* /*haystack*/, const uint8_t* 
/*haystack_end*/,
                                 uint8_t* pos) const {
 #if defined(__SSE4_1__) || defined(__aarch64__)
         if (needle_end - needle > n && page_safe(pos)) {
@@ -355,65 +354,10 @@ public:
     }
 };
 
-using ASCIICaseSensitiveStringSearcher = StringSearcher<true, true>;
+using ASCIICaseSensitiveStringSearcher = VolnitskyBase<StringSearcher<true, 
true>>;
 // using ASCIICaseInsensitiveStringSearcher = StringSearcher<false, true>;
-using UTF8CaseSensitiveStringSearcher = StringSearcher<true, false>;
+using UTF8CaseSensitiveStringSearcher = VolnitskyBase<StringSearcher<true, 
false>>;
 // using UTF8CaseInsensitiveStringSearcher = StringSearcher<false, false>;
 using ASCIICaseSensitiveTokenSearcher = 
TokenSearcher<ASCIICaseSensitiveStringSearcher>;
 // using ASCIICaseInsensitiveTokenSearcher = 
TokenSearcher<ASCIICaseInsensitiveStringSearcher>;
-
-/** Uses functions from libc.
-  * It makes sense to use only with short haystacks when cheap initialization 
is required.
-  * There is no option for case-insensitive search for UTF-8 strings.
-  * It is required that strings are zero-terminated.
-  */
-
-struct LibCASCIICaseSensitiveStringSearcher : public StringSearcherBase {
-    const char* const needle;
-
-    template <typename CharT>
-    // requires (sizeof(CharT) == 1)
-    LibCASCIICaseSensitiveStringSearcher(const CharT* const needle_, const 
size_t /* needle_size */)
-            : needle(reinterpret_cast<const char*>(needle_)) {}
-
-    template <typename CharT>
-    // requires (sizeof(CharT) == 1)
-    const CharT* search(const CharT* haystack, const CharT* const 
haystack_end) const {
-        const auto* res = strstr(reinterpret_cast<const char*>(haystack),
-                                 reinterpret_cast<const char*>(needle));
-        if (!res) return haystack_end;
-        return reinterpret_cast<const CharT*>(res);
-    }
-
-    template <typename CharT>
-    // requires (sizeof(CharT) == 1)
-    const CharT* search(const CharT* haystack, const size_t haystack_size) 
const {
-        return search(haystack, haystack + haystack_size);
-    }
-};
-
-struct LibCASCIICaseInsensitiveStringSearcher : public StringSearcherBase {
-    const char* const needle;
-
-    template <typename CharT>
-    // requires (sizeof(CharT) == 1)
-    LibCASCIICaseInsensitiveStringSearcher(const CharT* const needle_,
-                                           const size_t /* needle_size */)
-            : needle(reinterpret_cast<const char*>(needle_)) {}
-
-    template <typename CharT>
-    // requires (sizeof(CharT) == 1)
-    const CharT* search(const CharT* haystack, const CharT* const 
haystack_end) const {
-        const auto* res = strcasestr(reinterpret_cast<const char*>(haystack),
-                                     reinterpret_cast<const char*>(needle));
-        if (!res) return haystack_end;
-        return reinterpret_cast<const CharT*>(res);
-    }
-
-    template <typename CharT>
-    // requires (sizeof(CharT) == 1)
-    const CharT* search(const CharT* haystack, const size_t haystack_size) 
const {
-        return search(haystack, haystack + haystack_size);
-    }
-};
 } // namespace doris
diff --git a/be/src/vec/functions/like.cpp b/be/src/vec/functions/like.cpp
index 842cfabbe19..248359469c9 100644
--- a/be/src/vec/functions/like.cpp
+++ b/be/src/vec/functions/like.cpp
@@ -509,11 +509,9 @@ Status FunctionLikeBase::execute_impl(FunctionContext* 
context, Block& block,
                                       size_t input_rows_count) const {
     const auto values_col =
             
block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
-    const auto* values = check_and_get_column<ColumnString>(values_col.get());
+    const auto* values =
+            assert_cast<const ColumnString*, 
TypeCheckOnRelease::DISABLE>(values_col.get());
 
-    if (!values) {
-        return Status::InternalError("Not supported input arguments types");
-    }
     // result column
     auto res = ColumnUInt8::create();
     ColumnUInt8::Container& vec_res = res->get_data();
@@ -578,9 +576,7 @@ Status FunctionLikeBase::execute_substring(const 
ColumnString::Chars& values,
         }
 
         /// We check that the entry does not pass through the boundaries of 
strings.
-        if (pos + needle_size <= begin + value_offsets[i]) {
-            result[i] = 1;
-        }
+        result[i] = pos + needle_size <= begin + value_offsets[i];
 
         // move to next string offset
         pos = begin + value_offsets[i];


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to