This is an automated email from the ASF dual-hosted git repository. lihaopeng pushed a commit to branch cs_opt_version-3.0.4 in repository https://gitbox.apache.org/repos/asf/doris.git
commit 5d0ff1378a1dae07fc00807ccec036e3795c651c Author: HappenLee <happen...@selectdb.com> AuthorDate: Wed Jun 11 10:27:51 2025 +0800 [Exec](vec) opt the like expr performance --- be/src/olap/rowset/segment_v2/column_writer.h | 2 +- be/src/olap/rowset/segment_v2/segment_writer.cpp | 20 +-- be/src/vec/common/Volnitsky.h | 178 +++++++++++++++++++++++ be/src/vec/common/string_searcher.h | 66 +-------- be/src/vec/functions/like.cpp | 10 +- 5 files changed, 198 insertions(+), 78 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/column_writer.h b/be/src/olap/rowset/segment_v2/column_writer.h index 2d66b940a38..9498681500e 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.h +++ b/be/src/olap/rowset/segment_v2/column_writer.h @@ -52,7 +52,7 @@ struct ColumnWriterOptions { // - input: column_id/unique_id/type/length/encoding/compression/is_nullable members // - output: encoding/indexes/dict_page members ColumnMetaPB* meta = nullptr; - size_t data_page_size = 64 * 1024; + size_t data_page_size = 1024l * 1024; // store compressed page only when space saving is above the threshold. // space saving = 1 - compressed_size / uncompressed_size double compression_min_space_saving = 0.1; diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp b/be/src/olap/rowset/segment_v2/segment_writer.cpp index be6c1dc45b8..edb329d90b1 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp @@ -252,6 +252,8 @@ Status SegmentWriter::_create_column_writer(uint32_t cid, const TabletColumn& co if (storage_page_size >= 4096 && storage_page_size <= 10485760) { opts.data_page_size = storage_page_size; } + opts.data_page_size = opts.data_page_size <= 1048576 ? 1048576 : opts.data_page_size; + DBUG_EXECUTE_IF("VerticalSegmentWriter._create_column_writer.storage_page_size", { auto table_id = DebugPoints::instance()->get_debug_param_or_default<int64_t>( "VerticalSegmentWriter._create_column_writer.storage_page_size", "table_id", @@ -264,15 +266,15 @@ Status SegmentWriter::_create_column_writer(uint32_t cid, const TabletColumn& co "Debug point parameters missing: either 'table_id' or 'storage_page_size' not " "set."); } - if (table_id == _tablet_schema->table_id() && - opts.data_page_size != target_data_page_size) { - return Status::Error<ErrorCode::INTERNAL_ERROR>( - "Mismatch in 'storage_page_size': expected size does not match the current " - "data page size. " - "Expected: " + - std::to_string(target_data_page_size) + - ", Actual: " + std::to_string(opts.data_page_size) + "."); - } + // if (table_id == _tablet_schema->table_id() && + // opts.data_page_size != target_data_page_size) { + // return Status::Error<ErrorCode::INTERNAL_ERROR>( + // "Mismatch in 'storage_page_size': expected size does not match the current " + // "data page size. " + // "Expected: " + + // std::to_string(target_data_page_size) + + // ", Actual: " + std::to_string(opts.data_page_size) + "."); + // } }) if (column.is_row_store_column()) { // smaller page size for row store column diff --git a/be/src/vec/common/Volnitsky.h b/be/src/vec/common/Volnitsky.h new file mode 100644 index 00000000000..a22776dd787 --- /dev/null +++ b/be/src/vec/common/Volnitsky.h @@ -0,0 +1,178 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This file is based on code available under the Apache license here: +// https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/Volnitsky.h + +#pragma once + +#include <algorithm> +#include <functional> + +namespace doris { + +/** Search for a substring in a string by Volnitsky's algorithm + * http://volnitsky.com/project/str_search/ + * + * `haystack` and `needle` can contain zero bytes. + * + * Algorithm: + * - if the `needle` is too small or too large, or too small `haystack`, use std::search or memchr; + * - when initializing, fill in an open-addressing linear probing hash table of the form + * hash from the bigram of needle -> the position of this bigram in needle + 1. + * (one is added only to distinguish zero offset from an empty cell) + * - the keys are not stored in the hash table, only the values are stored; + * - bigrams can be inserted several times if they occur in the needle several times; + * - when searching, take from haystack bigram, which should correspond to the last bigram of needle (comparing from the end); + * - look for it in the hash table, if found - get the offset from the hash table and compare the string bytewise; + * - if it did not match, we check the next cell of the hash table from the collision resolution chain; + * - if not found, skip to haystack almost the size of the needle bytes; + * + * MultiVolnitsky - search for multiple substrings in a string: + * - Add bigrams to hash table with string index. Then the usual Volnitsky search is used. + * - We are adding while searching, limiting the number of fallback searchers and the total number of added bigrams + */ + +namespace VolnitskyTraits { +using Offset = + uint8_t; /// Offset in the needle. For the basic algorithm, the length of the needle must not be greater than 255. +using Id = + uint8_t; /// Index of the string (within the array of multiple needles), must not be greater than 255. +using Ngram = uint16_t; /// n-gram (2 bytes). + +/** Fits into the L2 cache (of common Intel CPUs). + * This number is extremely good for compilers as it is numeric_limits<Uint16>::max() and there are optimizations with movzwl and other instructions with 2 bytes + */ +static constexpr size_t hash_size = 64 * 1024; + +/// min haystack size to use main algorithm instead of fallback +static constexpr size_t min_haystack_size_for_algorithm = 20000; + +static inline bool isFallbackNeedle(const size_t needle_size, size_t haystack_size_hint = 0) { + return needle_size < 2 * sizeof(Ngram) || needle_size >= std::numeric_limits<Offset>::max() || + (haystack_size_hint && haystack_size_hint < min_haystack_size_for_algorithm); +} + +static inline Ngram toNGram(const uint8_t* const pos) { + Ngram res; + memcpy(&res, pos, sizeof(res)); + return res; +} + +template <typename Callback> +static inline void putNGram(const uint8_t* const pos, const int offset, + const Callback& putNGramBase) { + putNGramBase(toNGram(pos), offset); +} +} // namespace VolnitskyTraits + +/// @todo store lowercase needle to speed up in case there are numerous occurrences of bigrams from needle in haystack +template <typename FallbackSearcher> +class VolnitskyBase { +protected: + const uint8_t* const needle; + const size_t needle_size; + const uint8_t* const needle_end = needle + needle_size; + /// For how long we move, if the n-gram from haystack is not found in the hash table. + const size_t step = needle_size - sizeof(VolnitskyTraits::Ngram) + 1; + + /** max needle length is 255, max distinct ngrams for case-sensitive is (255 - 1), case-insensitive is 4 * (255 - 1) + * storage of 64K ngrams (n = 2, 128 KB) should be large enough for both cases */ + std::unique_ptr<VolnitskyTraits::Offset[]> hash; /// Hash table. + + const bool fallback; /// Do we need to use the fallback algorithm. + + FallbackSearcher fallback_searcher; + +public: + using Searcher = FallbackSearcher; + + /** haystack_size_hint - the expected total size of the haystack for `search` calls. Optional (zero means unspecified). + * If you specify it small enough, the fallback algorithm will be used, + * since it is considered that it's useless to waste time initializing the hash table. + */ + VolnitskyBase(const char* const needle_, const size_t needle_size_, + size_t haystack_size_hint = 0) + : needle {reinterpret_cast<const uint8_t*>(needle_)}, + needle_size {needle_size_}, + fallback {VolnitskyTraits::isFallbackNeedle(needle_size, haystack_size_hint)}, + fallback_searcher {needle_, needle_size} { + //std::cout << "fallback: " << fallback << std::endl; + if (fallback) return; + + hash = std::unique_ptr<VolnitskyTraits::Offset[]>( + new VolnitskyTraits::Offset[VolnitskyTraits::hash_size] {}); + + auto callback = [this](const VolnitskyTraits::Ngram ngram, const int offset) { + return this->putNGramBase(ngram, offset); + }; + /// ssize_t is used here because unsigned can't be used with condition like `i >= 0`, unsigned always >= 0 + /// And also adding from the end guarantees that we will find first occurrence because we will lookup bigger offsets first. + for (auto i = static_cast<ssize_t>(needle_size - sizeof(VolnitskyTraits::Ngram)); i >= 0; + --i) + VolnitskyTraits::putNGram(this->needle + i, i + 1, callback); + } + + /// If not found, the end of the haystack is returned. + const uint8_t* search(const uint8_t* const haystack, const size_t haystack_size) const { + if (needle_size == 0) return haystack; + + const auto haystack_end = haystack + haystack_size; + + if (fallback || haystack_size <= needle_size) + return fallback_searcher.search(haystack, haystack_end); + + /// Let's "apply" the needle to the haystack and compare the n-gram from the end of the needle. + const auto* pos = haystack + needle_size - sizeof(VolnitskyTraits::Ngram); + for (; pos <= haystack_end - needle_size; pos += step) { + /// We look at all the cells of the hash table that can correspond to the n-gram from haystack. + for (size_t cell_num = VolnitskyTraits::toNGram(pos) % VolnitskyTraits::hash_size; + hash[cell_num]; cell_num = (cell_num + 1) % VolnitskyTraits::hash_size) { + /// When found - compare bytewise, using the offset from the hash table. + const auto res = pos - (hash[cell_num] - 1); + + /// pointer in the code is always padded array so we can use pagesafe semantics + if (fallback_searcher.compare(haystack, haystack_end, (uint8_t*)res)) return res; + } + } + + return fallback_searcher.search(pos - step + 1, haystack_end); + } + + const char* search(const char* haystack, size_t haystack_size) const { + return reinterpret_cast<const char*>( + search(reinterpret_cast<const uint8_t*>(haystack), haystack_size)); + } + + const uint8_t* search(const uint8_t* haystack, const uint8_t* haystack_end) const { + return search(haystack, haystack_end - haystack); + } + +protected: + void putNGramBase(const VolnitskyTraits::Ngram ngram, const int offset) { + /// Put the offset for the n-gram in the corresponding cell or the nearest free cell. + size_t cell_num = ngram % VolnitskyTraits::hash_size; + + while (hash[cell_num]) { + cell_num = + (cell_num + 1) % VolnitskyTraits::hash_size; /// Search for the next free cell. + } + + hash[cell_num] = offset; + } +}; +} // namespace doris diff --git a/be/src/vec/common/string_searcher.h b/be/src/vec/common/string_searcher.h index af76f2100dd..050d8514718 100644 --- a/be/src/vec/common/string_searcher.h +++ b/be/src/vec/common/string_searcher.h @@ -28,6 +28,7 @@ #include <vector> #include "util/sse_util.hpp" +#include "vec/common/Volnitsky.h" #include "vec/common/string_ref.h" #include "vec/common/string_utils/string_utils.h" @@ -134,13 +135,11 @@ public: ALWAYS_INLINE bool compare(const CharT* haystack, const CharT* haystack_end, CharT* pos) const { // cast to unsigned int8 to be consitent with needle type // ensure unsigned type compare - return _compare(reinterpret_cast<const uint8_t*>(haystack), - reinterpret_cast<const uint8_t*>(haystack_end), - reinterpret_cast<const uint8_t*>(pos)); + return _compare(haystack, haystack_end, pos); } private: - ALWAYS_INLINE bool _compare(uint8_t* /*haystack*/, uint8_t* /*haystack_end*/, + ALWAYS_INLINE bool _compare(const uint8_t* /*haystack*/, const uint8_t* /*haystack_end*/, uint8_t* pos) const { #if defined(__SSE4_1__) || defined(__aarch64__) if (needle_end - needle > n && page_safe(pos)) { @@ -355,65 +354,10 @@ public: } }; -using ASCIICaseSensitiveStringSearcher = StringSearcher<true, true>; +using ASCIICaseSensitiveStringSearcher = VolnitskyBase<StringSearcher<true, true>>; // using ASCIICaseInsensitiveStringSearcher = StringSearcher<false, true>; -using UTF8CaseSensitiveStringSearcher = StringSearcher<true, false>; +using UTF8CaseSensitiveStringSearcher = VolnitskyBase<StringSearcher<true, false>>; // using UTF8CaseInsensitiveStringSearcher = StringSearcher<false, false>; using ASCIICaseSensitiveTokenSearcher = TokenSearcher<ASCIICaseSensitiveStringSearcher>; // using ASCIICaseInsensitiveTokenSearcher = TokenSearcher<ASCIICaseInsensitiveStringSearcher>; - -/** Uses functions from libc. - * It makes sense to use only with short haystacks when cheap initialization is required. - * There is no option for case-insensitive search for UTF-8 strings. - * It is required that strings are zero-terminated. - */ - -struct LibCASCIICaseSensitiveStringSearcher : public StringSearcherBase { - const char* const needle; - - template <typename CharT> - // requires (sizeof(CharT) == 1) - LibCASCIICaseSensitiveStringSearcher(const CharT* const needle_, const size_t /* needle_size */) - : needle(reinterpret_cast<const char*>(needle_)) {} - - template <typename CharT> - // requires (sizeof(CharT) == 1) - const CharT* search(const CharT* haystack, const CharT* const haystack_end) const { - const auto* res = strstr(reinterpret_cast<const char*>(haystack), - reinterpret_cast<const char*>(needle)); - if (!res) return haystack_end; - return reinterpret_cast<const CharT*>(res); - } - - template <typename CharT> - // requires (sizeof(CharT) == 1) - const CharT* search(const CharT* haystack, const size_t haystack_size) const { - return search(haystack, haystack + haystack_size); - } -}; - -struct LibCASCIICaseInsensitiveStringSearcher : public StringSearcherBase { - const char* const needle; - - template <typename CharT> - // requires (sizeof(CharT) == 1) - LibCASCIICaseInsensitiveStringSearcher(const CharT* const needle_, - const size_t /* needle_size */) - : needle(reinterpret_cast<const char*>(needle_)) {} - - template <typename CharT> - // requires (sizeof(CharT) == 1) - const CharT* search(const CharT* haystack, const CharT* const haystack_end) const { - const auto* res = strcasestr(reinterpret_cast<const char*>(haystack), - reinterpret_cast<const char*>(needle)); - if (!res) return haystack_end; - return reinterpret_cast<const CharT*>(res); - } - - template <typename CharT> - // requires (sizeof(CharT) == 1) - const CharT* search(const CharT* haystack, const size_t haystack_size) const { - return search(haystack, haystack + haystack_size); - } -}; } // namespace doris diff --git a/be/src/vec/functions/like.cpp b/be/src/vec/functions/like.cpp index 842cfabbe19..248359469c9 100644 --- a/be/src/vec/functions/like.cpp +++ b/be/src/vec/functions/like.cpp @@ -509,11 +509,9 @@ Status FunctionLikeBase::execute_impl(FunctionContext* context, Block& block, size_t input_rows_count) const { const auto values_col = block.get_by_position(arguments[0]).column->convert_to_full_column_if_const(); - const auto* values = check_and_get_column<ColumnString>(values_col.get()); + const auto* values = + assert_cast<const ColumnString*, TypeCheckOnRelease::DISABLE>(values_col.get()); - if (!values) { - return Status::InternalError("Not supported input arguments types"); - } // result column auto res = ColumnUInt8::create(); ColumnUInt8::Container& vec_res = res->get_data(); @@ -578,9 +576,7 @@ Status FunctionLikeBase::execute_substring(const ColumnString::Chars& values, } /// We check that the entry does not pass through the boundaries of strings. - if (pos + needle_size <= begin + value_offsets[i]) { - result[i] = 1; - } + result[i] = pos + needle_size <= begin + value_offsets[i]; // move to next string offset pos = begin + value_offsets[i]; --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org