(doris) branch master updated: [feature](invert index) match_regexp feature added (#28257)

jianliangqi Tue, 19 Dec 2023 22:30:50 -0800

This is an automated email from the ASF dual-hosted git repository.

jianliangqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git



The following commit(s) were added to refs/heads/master by this push:
     new bcc32b5b265 [feature](invert index) match_regexp feature added (#28257)
bcc32b5b265 is described below

commit bcc32b5b265841caed525300fd72e9732ea1f3b9
Author: zzzxl <[email protected]>
AuthorDate: Wed Dec 20 14:30:35 2023 +0800

    [feature](invert index) match_regexp feature added (#28257)
---
 be/src/exec/olap_common.h                          |   2 +
 be/src/exec/olap_utils.h                           |  13 ++-
 be/src/olap/match_predicate.cpp                    |   3 +
 .../inverted_index/query/conjunction_query.cpp     |   6 +-
 .../inverted_index/query/disjunction_query.cpp     |  17 ++--
 .../inverted_index/query/disjunction_query.h       |   1 -
 .../inverted_index/query/regexp_query.cpp          |  98 +++++++++++++++++++
 .../query/{disjunction_query.h => regexp_query.h}  |  32 +++---
 .../rowset/segment_v2/inverted_index_query_type.h  |   4 +
 .../rowset/segment_v2/inverted_index_reader.cpp    | 108 ++++++++++++++++-----
 .../olap/rowset/segment_v2/inverted_index_reader.h |  16 ++-
 be/src/vec/functions/function_tokenize.cpp         |   8 +-
 be/src/vec/functions/match.cpp                     |  38 ++++----
 be/src/vec/functions/match.h                       |  17 ++++
 .../antlr4/org/apache/doris/nereids/DorisLexer.g4  |   1 +
 .../antlr4/org/apache/doris/nereids/DorisParser.g4 |   2 +-
 fe/fe-core/src/main/cup/sql_parser.cup             |   5 +-
 .../org/apache/doris/analysis/MatchPredicate.java  |  11 +++
 .../doris/nereids/parser/LogicalPlanBuilder.java   |   7 ++
 .../doris/nereids/trees/expressions/Match.java     |   2 +
 .../nereids/trees/expressions/MatchRegexp.java     |  49 ++++++++++
 .../expressions/visitor/ExpressionVisitor.java     |   5 +
 fe/fe-core/src/main/jflex/sql_scanner.flex         |   1 +
 gensrc/thrift/Opcodes.thrift                       |   1 +
 .../inverted_index_p0/test_index_match_regexp.out  |  16 +++
 .../test_index_match_regexp.groovy                 |  89 +++++++++++++++++
 26 files changed, 468 insertions(+), 84 deletions(-)

diff --git a/be/src/exec/olap_common.h b/be/src/exec/olap_common.h
index acf81a48eb4..cdca939c6ed 100644
--- a/be/src/exec/olap_common.h
+++ b/be/src/exec/olap_common.h
@@ -306,6 +306,8 @@ public:
                 condition.__set_condition_op("match_phrase");
             } else if (value.first == MatchType::MATCH_PHRASE_PREFIX) {
                 condition.__set_condition_op("match_phrase_prefix");
+            } else if (value.first == MatchType::MATCH_REGEXP) {
+                condition.__set_condition_op("match_regexp");
             } else if (value.first == MatchType::MATCH_ELEMENT_EQ) {
                 condition.__set_condition_op("match_element_eq");
             } else if (value.first == MatchType::MATCH_ELEMENT_LT) {
diff --git a/be/src/exec/olap_utils.h b/be/src/exec/olap_utils.h
index 5efcc012364..106ded98c7d 100644
--- a/be/src/exec/olap_utils.h
+++ b/be/src/exec/olap_utils.h
@@ -170,6 +170,7 @@ enum class MatchType {
     MATCH_ELEMENT_LE = 6,
     MATCH_ELEMENT_GE = 7,
     MATCH_PHRASE_PREFIX = 8,
+    MATCH_REGEXP = 9,
 };
 
 inline MatchType to_match_type(TExprOpcode::type type) {
@@ -186,6 +187,9 @@ inline MatchType to_match_type(TExprOpcode::type type) {
     case TExprOpcode::type::MATCH_PHRASE_PREFIX:
         return MatchType::MATCH_PHRASE_PREFIX;
         break;
+    case TExprOpcode::type::MATCH_REGEXP:
+        return MatchType::MATCH_REGEXP;
+        break;
     case TExprOpcode::type::MATCH_ELEMENT_EQ:
         return MatchType::MATCH_ELEMENT_EQ;
         break;
@@ -217,6 +221,8 @@ inline MatchType to_match_type(const std::string& condition_op) {
         return MatchType::MATCH_PHRASE;
     } else if (condition_op.compare("match_phrase_prefix") == 0) {
         return MatchType::MATCH_PHRASE_PREFIX;
+    } else if (condition_op.compare("match_regexp") == 0) {
+        return MatchType::MATCH_REGEXP;
     } else if (condition_op.compare("match_element_eq") == 0) {
         return MatchType::MATCH_ELEMENT_EQ;
     } else if (condition_op.compare("match_element_lt") == 0) {
@@ -235,6 +241,7 @@ inline bool is_match_condition(const std::string& op) {
     if (0 == strcasecmp(op.c_str(), "match_any") || 0 == 
strcasecmp(op.c_str(), "match_all") ||
         0 == strcasecmp(op.c_str(), "match_phrase") ||
         0 == strcasecmp(op.c_str(), "match_phrase_prefix") ||
+        0 == strcasecmp(op.c_str(), "match_regexp") ||
         0 == strcasecmp(op.c_str(), "match_element_eq") ||
         0 == strcasecmp(op.c_str(), "match_element_lt") ||
         0 == strcasecmp(op.c_str(), "match_element_gt") ||
@@ -248,9 +255,9 @@ inline bool is_match_condition(const std::string& op) {
 inline bool is_match_operator(const TExprOpcode::type& op_type) {
     return TExprOpcode::MATCH_ANY == op_type || TExprOpcode::MATCH_ALL == 
op_type ||
            TExprOpcode::MATCH_PHRASE == op_type || 
TExprOpcode::MATCH_PHRASE_PREFIX == op_type ||
-           TExprOpcode::MATCH_ELEMENT_EQ == op_type || 
TExprOpcode::MATCH_ELEMENT_LT == op_type ||
-           TExprOpcode::MATCH_ELEMENT_GT == op_type || 
TExprOpcode::MATCH_ELEMENT_LE == op_type ||
-           TExprOpcode::MATCH_ELEMENT_GE == op_type;
+           TExprOpcode::MATCH_REGEXP == op_type || 
TExprOpcode::MATCH_ELEMENT_EQ == op_type ||
+           TExprOpcode::MATCH_ELEMENT_LT == op_type || 
TExprOpcode::MATCH_ELEMENT_GT == op_type ||
+           TExprOpcode::MATCH_ELEMENT_LE == op_type || 
TExprOpcode::MATCH_ELEMENT_GE == op_type;
 }
 
 } // namespace doris
diff --git a/be/src/olap/match_predicate.cpp b/be/src/olap/match_predicate.cpp
index aa4d993a62e..36f167d0d04 100644
--- a/be/src/olap/match_predicate.cpp
+++ b/be/src/olap/match_predicate.cpp
@@ -110,6 +110,9 @@ InvertedIndexQueryType MatchPredicate::_to_inverted_index_query_type(MatchType m
     case MatchType::MATCH_PHRASE_PREFIX:
         ret = InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY;
         break;
+    case MatchType::MATCH_REGEXP:
+        ret = InvertedIndexQueryType::MATCH_REGEXP_QUERY;
+        break;
     case MatchType::MATCH_ELEMENT_EQ:
         ret = InvertedIndexQueryType::EQUAL_QUERY;
         break;
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp
index b77edc79ade..b2448a8fa8e 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp
@@ -38,12 +38,12 @@ ConjunctionQuery::~ConjunctionQuery() {
 }
 
 void ConjunctionQuery::add(const std::wstring& field_name, const 
std::vector<std::string>& terms) {
-    if (terms.size() < 1) {
-        _CLTHROWA(CL_ERR_IllegalArgument, "ConjunctionQuery::add: terms.size() 
< 1");
+    if (terms.empty()) {
+        _CLTHROWA(CL_ERR_IllegalArgument, "ConjunctionQuery::add: terms 
empty");
     }
 
     std::vector<TermIterator> iterators;
-    for (auto& term : terms) {
+    for (const auto& term : terms) {
         std::wstring ws_term = StringUtil::string_to_wstring(term);
         Term* t = _CLNEW Term(field_name.c_str(), ws_term.c_str());
         _terms.push_back(t);
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.cpp
index 07a159b3222..7b797d7b54a 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.cpp
@@ -22,26 +22,25 @@ namespace doris {
 DisjunctionQuery::DisjunctionQuery(IndexReader* reader) : _reader(reader) {}
 
 DisjunctionQuery::~DisjunctionQuery() {
-    for (auto& term : _terms) {
-        if (term) {
-            _CLDELETE(term);
-        }
-    }
     for (auto& term_doc : _term_docs) {
         if (term_doc) {
             _CLDELETE(term_doc);
         }
     }
+    for (auto& term : _terms) {
+        if (term) {
+            _CLDELETE(term);
+        }
+    }
 }
 
 void DisjunctionQuery::add(const std::wstring& field_name, const 
std::vector<std::string>& terms) {
-    if (terms.size() < 1) {
-        _CLTHROWA(CL_ERR_IllegalArgument, "ConjunctionQuery::add: terms.size() 
< 1");
+    if (terms.empty()) {
+        _CLTHROWA(CL_ERR_IllegalArgument, "DisjunctionQuery::add: terms 
empty");
     }
 
-    for (auto& term : terms) {
+    for (const auto& term : terms) {
         std::wstring ws_term = StringUtil::string_to_wstring(term);
-        _wsterms.emplace_back(&ws_term);
         Term* t = _CLNEW Term(field_name.c_str(), ws_term.c_str());
         _terms.push_back(t);
         TermDocs* term_doc = _reader->termDocs(t);
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h
index f42fd69dabc..bb0a837f42a 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h
@@ -39,7 +39,6 @@ public:
 
 private:
     IndexReader* _reader = nullptr;
-    std::vector<std::wstring*> _wsterms;
     std::vector<Term*> _terms;
     std::vector<TermDocs*> _term_docs;
     std::vector<TermIterator> _term_iterators;
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.cpp
new file mode 100644
index 00000000000..83c5401bac0
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.cpp
@@ -0,0 +1,98 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "regexp_query.h"
+
+#include <CLucene/config/repl_wchar.h>
+#include <hs/hs.h>
+
+#include "common/logging.h"
+
+namespace doris::segment_v2 {
+
+RegexpQuery::RegexpQuery(const std::shared_ptr<lucene::search::IndexSearcher>& 
searcher)
+        : _searcher(searcher), query(searcher->getReader()) {}
+
+void RegexpQuery::add(const std::wstring& field_name, const std::string& 
pattern) {
+    hs_database_t* database = nullptr;
+    hs_compile_error_t* compile_err = nullptr;
+    hs_scratch_t* scratch = nullptr;
+
+    if (hs_compile(pattern.data(), HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | 
HS_FLAG_UTF8,
+                   HS_MODE_BLOCK, nullptr, &database, &compile_err) != 
HS_SUCCESS) {
+        LOG(ERROR) << "hyperscan compilation failed: " << compile_err->message;
+        hs_free_compile_error(compile_err);
+        return;
+    }
+
+    if (hs_alloc_scratch(database, &scratch) != HS_SUCCESS) {
+        LOG(ERROR) << "hyperscan could not allocate scratch space.";
+        hs_free_database(database);
+        return;
+    }
+
+    auto on_match = [](unsigned int id, unsigned long long from, unsigned long 
long to,
+                       unsigned int flags, void* context) -> int {
+        *((bool*)context) = true;
+        return 0;
+    };
+
+    Term* term = nullptr;
+    TermEnum* enumerator = nullptr;
+    std::vector<std::string> terms;
+    int32_t count = 0;
+
+    try {
+        enumerator = _searcher->getReader()->terms();
+        while (enumerator->next()) {
+            term = enumerator->term();
+            std::string input = lucene_wcstoutf8string(term->text(), 
term->textLength());
+
+            bool is_match = false;
+            if (hs_scan(database, input.data(), input.size(), 0, scratch, 
on_match,
+                        (void*)&is_match) != HS_SUCCESS) {
+                LOG(ERROR) << "hyperscan match failed: " << input;
+                break;
+            }
+
+            if (is_match) {
+                terms.emplace_back(std::move(input));
+                if (++count >= _max_expansions) {
+                    break;
+                }
+            }
+
+            _CLDECDELETE(term);
+        }
+    }
+    _CLFINALLY({
+        _CLDECDELETE(term);
+        enumerator->close();
+        _CLDELETE(enumerator);
+
+        hs_free_scratch(scratch);
+        hs_free_database(database);
+    })
+
+    query.add(field_name, terms);
+}
+
+void RegexpQuery::search(roaring::Roaring& roaring) {
+    query.search(roaring);
+}
+
+} // namespace doris::segment_v2
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.h
similarity index 59%
copy from be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h
copy to be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.h
index f42fd69dabc..3791ad50d8f 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.h
@@ -17,32 +17,30 @@
 
 #pragma once
 
-#include <CLucene.h>
-#include <CLucene/index/IndexReader.h>
-#include <CLucene/index/IndexVersion.h>
-#include <CLucene/index/Term.h>
-#include <CLucene/search/query/TermIterator.h>
+#include <memory>
 
-#include "roaring/roaring.hh"
+#include "olap/rowset/segment_v2/inverted_index/query/disjunction_query.h"
 
 CL_NS_USE(index)
+CL_NS_USE(search)
 
-namespace doris {
+namespace doris::segment_v2 {
 
-class DisjunctionQuery {
+class RegexpQuery {
 public:
-    DisjunctionQuery(IndexReader* reader);
-    ~DisjunctionQuery();
+    RegexpQuery(const std::shared_ptr<lucene::search::IndexSearcher>& 
searcher);
+    ~RegexpQuery() = default;
 
-    void add(const std::wstring& field_name, const std::vector<std::string>& 
terms);
+    void set_max_expansions(int32_t max_expansions) { _max_expansions = 
max_expansions; }
+
+    void add(const std::wstring& field_name, const std::string& pattern);
     void search(roaring::Roaring& roaring);
 
 private:
-    IndexReader* _reader = nullptr;
-    std::vector<std::wstring*> _wsterms;
-    std::vector<Term*> _terms;
-    std::vector<TermDocs*> _term_docs;
-    std::vector<TermIterator> _term_iterators;
+    std::shared_ptr<lucene::search::IndexSearcher> _searcher;
+
+    int32_t _max_expansions = 50;
+    DisjunctionQuery query;
 };
 
-} // namespace doris
\ No newline at end of file
+} // namespace doris::segment_v2
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_query_type.h b/be/src/olap/rowset/segment_v2/inverted_index_query_type.h
index 6d91c3e2ecf..844cec27b46 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_query_type.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index_query_type.h
@@ -77,6 +77,7 @@ enum class InvertedIndexQueryType {
     MATCH_ALL_QUERY = 6,
     MATCH_PHRASE_QUERY = 7,
     MATCH_PHRASE_PREFIX_QUERY = 8,
+    MATCH_REGEXP_QUERY = 9,
 };
 
 inline std::string query_type_to_string(InvertedIndexQueryType query_type) {
@@ -111,6 +112,9 @@ inline std::string query_type_to_string(InvertedIndexQueryType query_type) {
     case InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY: {
         return "MPHRASEPREFIX";
     }
+    case InvertedIndexQueryType::MATCH_REGEXP_QUERY: {
+        return "MREGEXP";
+    }
     default:
         return "";
     }
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
index a567859a3bb..8a226ac123f 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
@@ -60,6 +60,7 @@
 #include 
"olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h"
 #include "olap/rowset/segment_v2/inverted_index/query/conjunction_query.h"
 #include "olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h"
+#include "olap/rowset/segment_v2/inverted_index/query/regexp_query.h"
 #include "olap/rowset/segment_v2/inverted_index_cache.h"
 #include "olap/rowset/segment_v2/inverted_index_compound_directory.h"
 #include "olap/types.h"
@@ -83,7 +84,8 @@ bool InvertedIndexReader::_is_match_query(InvertedIndexQueryType query_type) {
     return (query_type == InvertedIndexQueryType::MATCH_ANY_QUERY ||
             query_type == InvertedIndexQueryType::MATCH_ALL_QUERY ||
             query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY ||
-            query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY);
+            query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY ||
+            query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY);
 }
 
 bool InvertedIndexReader::indexExists(io::Path& index_file_path) {
@@ -134,10 +136,13 @@ std::unique_ptr<lucene::util::Reader> InvertedIndexReader::create_reader(
     return reader;
 }
 
-std::vector<std::string> InvertedIndexReader::get_analyse_result(
-        lucene::util::Reader* reader, lucene::analysis::Analyzer* analyzer,
-        const std::string& field_name, InvertedIndexQueryType query_type, bool 
drop_duplicates) {
-    std::vector<std::string> analyse_result;
+void InvertedIndexReader::get_analyse_result(std::vector<std::string>& 
analyse_result,
+                                             lucene::util::Reader* reader,
+                                             lucene::analysis::Analyzer* 
analyzer,
+                                             const std::string& field_name,
+                                             InvertedIndexQueryType query_type,
+                                             bool drop_duplicates) {
+    analyse_result.clear();
 
     std::wstring field_ws = std::wstring(field_name.begin(), field_name.end());
     std::unique_ptr<lucene::analysis::TokenStream> token_stream(
@@ -161,8 +166,6 @@ std::vector<std::string> InvertedIndexReader::get_analyse_result(
         std::set<std::string> unrepeated_result(analyse_result.begin(), 
analyse_result.end());
         analyse_result.assign(unrepeated_result.begin(), 
unrepeated_result.end());
     }
-
-    return analyse_result;
 }
 
 Status InvertedIndexReader::read_null_bitmap(InvertedIndexQueryCacheHandle* 
cache_handle,
@@ -239,19 +242,25 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run
     auto index_file_name = InvertedIndexDescriptor::get_index_file_name(
             path.filename(), _index_meta.index_id(), 
_index_meta.get_index_suffix());
     auto index_file_path = index_dir / index_file_name;
-    InvertedIndexCtxSPtr inverted_index_ctx = 
std::make_shared<InvertedIndexCtx>();
-    inverted_index_ctx->parser_type = 
get_inverted_index_parser_type_from_string(
-            get_parser_string_from_properties(_index_meta.properties()));
-    inverted_index_ctx->parser_mode =
-            get_parser_mode_string_from_properties(_index_meta.properties());
-    inverted_index_ctx->char_filter_map =
-            
get_parser_char_filter_map_from_properties(_index_meta.properties());
+
     try {
-        auto analyzer = create_analyzer(inverted_index_ctx.get());
-        auto reader = create_reader(inverted_index_ctx.get(), search_str);
-        inverted_index_ctx->analyzer = analyzer.get();
-        std::vector<std::string> analyse_result =
-                get_analyse_result(reader.get(), analyzer.get(), column_name, 
query_type);
+        std::vector<std::string> analyse_result;
+        if (query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY) {
+            analyse_result.emplace_back(search_str);
+        } else {
+            InvertedIndexCtxSPtr inverted_index_ctx = 
std::make_shared<InvertedIndexCtx>();
+            inverted_index_ctx->parser_type = 
get_inverted_index_parser_type_from_string(
+                    
get_parser_string_from_properties(_index_meta.properties()));
+            inverted_index_ctx->parser_mode =
+                    
get_parser_mode_string_from_properties(_index_meta.properties());
+            inverted_index_ctx->char_filter_map =
+                    
get_parser_char_filter_map_from_properties(_index_meta.properties());
+            auto analyzer = create_analyzer(inverted_index_ctx.get());
+            auto reader = create_reader(inverted_index_ctx.get(), search_str);
+            inverted_index_ctx->analyzer = analyzer.get();
+            get_analyse_result(analyse_result, reader.get(), analyzer.get(), 
column_name,
+                               query_type);
+        }
 
         if (analyse_result.empty()) {
             auto msg = fmt::format(
@@ -261,7 +270,8 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run
             if (query_type == InvertedIndexQueryType::MATCH_ALL_QUERY ||
                 query_type == InvertedIndexQueryType::MATCH_ANY_QUERY ||
                 query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY ||
-                query_type == 
InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY) {
+                query_type == 
InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY ||
+                query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY) {
                 LOG(WARNING) << msg;
                 return Status::OK();
             } else {
@@ -290,7 +300,7 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run
                 str_tokens += " ";
             }
 
-            auto cache = InvertedIndexQueryCache::instance();
+            auto* cache = InvertedIndexQueryCache::instance();
             InvertedIndexQueryCache::CacheKey cache_key;
             cache_key.index_path = index_file_path;
             cache_key.column_name = column_name;
@@ -345,13 +355,49 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run
                 }
             }
             query_match_bitmap = *term_match_bitmap;
+        } else if (query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY) {
+            const std::string& pattern = analyse_result[0];
+
+            std::shared_ptr<roaring::Roaring> term_match_bitmap = nullptr;
+            auto* cache = InvertedIndexQueryCache::instance();
+
+            InvertedIndexQueryCache::CacheKey cache_key;
+            cache_key.index_path = index_file_path;
+            cache_key.column_name = column_name;
+            cache_key.query_type = query_type;
+            cache_key.value = pattern;
+            InvertedIndexQueryCacheHandle cache_handle;
+            if (cache->lookup(cache_key, &cache_handle)) {
+                stats->inverted_index_query_cache_hit++;
+                term_match_bitmap = cache_handle.get_bitmap();
+            } else {
+                stats->inverted_index_query_cache_miss++;
+                InvertedIndexCacheHandle inverted_index_cache_handle;
+                
RETURN_IF_ERROR(InvertedIndexSearcherCache::instance()->get_index_searcher(
+                        _fs, _index_dir.c_str(), _index_file_name, 
&inverted_index_cache_handle,
+                        stats, type(), _has_null));
+                auto searcher_variant = 
inverted_index_cache_handle.get_index_searcher();
+                if (FulltextIndexSearcherPtr* searcher_ptr =
+                            
std::get_if<FulltextIndexSearcherPtr>(&searcher_variant)) {
+                    term_match_bitmap = std::make_shared<roaring::Roaring>();
+
+                    Status res = match_regexp_index_search(stats, 
runtime_state, field_ws, pattern,
+                                                           *searcher_ptr, 
term_match_bitmap);
+                    if (!res.ok()) {
+                        return res;
+                    }
+                }
+                term_match_bitmap->runOptimize();
+                cache->insert(cache_key, term_match_bitmap, &cache_handle);
+            }
+            query_match_bitmap = *term_match_bitmap;
         } else {
             bool first = true;
             for (auto token : analyse_result) {
                 std::shared_ptr<roaring::Roaring> term_match_bitmap = nullptr;
 
                 // try to get term bitmap match result from cache to avoid 
query index on cache hit
-                auto cache = InvertedIndexQueryCache::instance();
+                auto* cache = InvertedIndexQueryCache::instance();
                 // use EQUAL_QUERY type here since cache is for each term/token
                 //auto token = lucene_wcstoutf8string(token_ws.c_str(), 
token_ws.length());
                 std::wstring token_ws = StringUtil::string_to_wstring(token);
@@ -495,6 +541,24 @@ Status FullTextIndexReader::match_phrase_prefix_index_search(
     return Status::OK();
 }
 
+Status FullTextIndexReader::match_regexp_index_search(
+        OlapReaderStatistics* stats, RuntimeState* runtime_state, const 
std::wstring& field_ws,
+        const std::string& pattern, const FulltextIndexSearcherPtr& 
index_searcher,
+        const std::shared_ptr<roaring::Roaring>& term_match_bitmap) {
+    TQueryOptions queryOptions = runtime_state->query_options();
+    try {
+        SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer);
+        RegexpQuery query(index_searcher);
+        query.set_max_expansions(queryOptions.inverted_index_max_expansions);
+        query.add(field_ws, pattern);
+        query.search(*term_match_bitmap);
+    } catch (const CLuceneError& e) {
+        return 
Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>("CLuceneError occured: 
{}",
+                                                                      
e.what());
+    }
+    return Status::OK();
+}
+
 void FullTextIndexReader::check_null_bitmap(const FulltextIndexSearcherPtr& 
index_searcher,
                                             bool& null_bitmap_already_read) {
     // try to reuse index_searcher's directory to read null_bitmap to cache
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h b/be/src/olap/rowset/segment_v2/inverted_index_reader.h
index e14e4bcc47a..67fe0e4ae63 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h
@@ -108,11 +108,12 @@ public:
 
     [[nodiscard]] bool has_null() const { return _has_null; }
 
-    static std::vector<std::string> get_analyse_result(lucene::util::Reader* 
reader,
-                                                       
lucene::analysis::Analyzer* analyzer,
-                                                       const std::string& 
field_name,
-                                                       InvertedIndexQueryType 
query_type,
-                                                       bool drop_duplicates = 
true);
+    static void get_analyse_result(std::vector<std::string>& analyse_result,
+                                   lucene::util::Reader* reader,
+                                   lucene::analysis::Analyzer* analyzer,
+                                   const std::string& field_name, 
InvertedIndexQueryType query_type,
+                                   bool drop_duplicates = true);
+
     static std::unique_ptr<lucene::util::Reader> 
create_reader(InvertedIndexCtx* inverted_index_ctx,
                                                                const 
std::string& value);
     static std::unique_ptr<lucene::analysis::Analyzer> create_analyzer(
@@ -172,6 +173,11 @@ private:
             const FulltextIndexSearcherPtr& index_searcher,
             const std::shared_ptr<roaring::Roaring>& term_match_bitmap);
 
+    Status match_regexp_index_search(OlapReaderStatistics* stats, 
RuntimeState* runtime_state,
+                                     const std::wstring& field_ws, const 
std::string& pattern,
+                                     const FulltextIndexSearcherPtr& 
index_searcher,
+                                     const std::shared_ptr<roaring::Roaring>& 
term_match_bitmap);
+
     void check_null_bitmap(const FulltextIndexSearcherPtr& index_searcher,
                            bool& null_bitmap_already_read);
 };
diff --git a/be/src/vec/functions/function_tokenize.cpp b/be/src/vec/functions/function_tokenize.cpp
index 648f79db305..42f27a116ed 100644
--- a/be/src/vec/functions/function_tokenize.cpp
+++ b/be/src/vec/functions/function_tokenize.cpp
@@ -79,10 +79,10 @@ void FunctionTokenize::_do_tokenize(const ColumnString& src_column_string,
         auto reader = doris::segment_v2::InvertedIndexReader::create_reader(
                 &inverted_index_ctx, tokenize_str.to_string());
 
-        std::vector<std::string> query_tokens =
-                doris::segment_v2::InvertedIndexReader::get_analyse_result(
-                        reader.get(), inverted_index_ctx.analyzer, "tokenize",
-                        
doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY);
+        std::vector<std::string> query_tokens;
+        doris::segment_v2::InvertedIndexReader::get_analyse_result(
+                query_tokens, reader.get(), inverted_index_ctx.analyzer, 
"tokenize",
+                doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY);
         for (auto token : query_tokens) {
             const size_t old_size = column_string_chars.size();
             const size_t split_part_size = token.length();
diff --git a/be/src/vec/functions/match.cpp b/be/src/vec/functions/match.cpp
index 6b8f6a4d8ea..38145342a0b 100644
--- a/be/src/vec/functions/match.cpp
+++ b/be/src/vec/functions/match.cpp
@@ -130,10 +130,10 @@ inline std::vector<std::string> FunctionMatchBase::analyse_data_token(
             auto reader = 
doris::segment_v2::InvertedIndexReader::create_reader(
                     inverted_index_ctx, str_ref.to_string());
 
-            std::vector<std::string> element_tokens =
-                    doris::segment_v2::InvertedIndexReader::get_analyse_result(
-                            reader.get(), inverted_index_ctx->analyzer, 
column_name, query_type,
-                            false);
+            std::vector<std::string> element_tokens;
+            doris::segment_v2::InvertedIndexReader::get_analyse_result(
+                    element_tokens, reader.get(), 
inverted_index_ctx->analyzer, column_name,
+                    query_type, false);
             data_tokens.insert(data_tokens.end(), element_tokens.begin(), 
element_tokens.end());
         }
     } else {
@@ -141,8 +141,9 @@ inline std::vector<std::string> FunctionMatchBase::analyse_data_token(
         auto reader = 
doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx,
                                                                             
str_ref.to_string());
 
-        data_tokens = 
doris::segment_v2::InvertedIndexReader::get_analyse_result(
-                reader.get(), inverted_index_ctx->analyzer, column_name, 
query_type, false);
+        
doris::segment_v2::InvertedIndexReader::get_analyse_result(data_tokens, 
reader.get(),
+                                                                   
inverted_index_ctx->analyzer,
+                                                                   
column_name, query_type, false);
     }
     return data_tokens;
 }
@@ -161,10 +162,10 @@ Status FunctionMatchAny::execute_match(const std::string& 
column_name,
                << inverted_index_parser_type_to_string(parser_type);
     auto reader = 
doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx,
                                                                         
match_query_str);
-    std::vector<std::string> query_tokens =
-            doris::segment_v2::InvertedIndexReader::get_analyse_result(
-                    reader.get(), inverted_index_ctx->analyzer, column_name,
-                    
doris::segment_v2::InvertedIndexQueryType::MATCH_ANY_QUERY);
+    std::vector<std::string> query_tokens;
+    doris::segment_v2::InvertedIndexReader::get_analyse_result(
+            query_tokens, reader.get(), inverted_index_ctx->analyzer, 
column_name,
+            doris::segment_v2::InvertedIndexQueryType::MATCH_ANY_QUERY);
     if (query_tokens.empty()) {
         LOG(WARNING) << fmt::format(
                 "token parser result is empty for query, "
@@ -206,10 +207,10 @@ Status FunctionMatchAll::execute_match(const std::string& 
column_name,
                << inverted_index_parser_type_to_string(parser_type);
     auto reader = 
doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx,
                                                                         
match_query_str);
-    std::vector<std::string> query_tokens =
-            doris::segment_v2::InvertedIndexReader::get_analyse_result(
-                    reader.get(), inverted_index_ctx->analyzer, column_name,
-                    
doris::segment_v2::InvertedIndexQueryType::MATCH_ALL_QUERY);
+    std::vector<std::string> query_tokens;
+    doris::segment_v2::InvertedIndexReader::get_analyse_result(
+            query_tokens, reader.get(), inverted_index_ctx->analyzer, 
column_name,
+            doris::segment_v2::InvertedIndexQueryType::MATCH_ALL_QUERY);
     if (query_tokens.empty()) {
         LOG(WARNING) << fmt::format(
                 "token parser result is empty for query, "
@@ -257,10 +258,10 @@ Status FunctionMatchPhrase::execute_match(const 
std::string& column_name,
                << inverted_index_parser_type_to_string(parser_type);
     auto reader = 
doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx,
                                                                         
match_query_str);
-    std::vector<std::string> query_tokens =
-            doris::segment_v2::InvertedIndexReader::get_analyse_result(
-                    reader.get(), inverted_index_ctx->analyzer, column_name,
-                    
doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY);
+    std::vector<std::string> query_tokens;
+    doris::segment_v2::InvertedIndexReader::get_analyse_result(
+            query_tokens, reader.get(), inverted_index_ctx->analyzer, 
column_name,
+            doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY);
     if (query_tokens.empty()) {
         LOG(WARNING) << fmt::format(
                 "token parser result is empty for query, "
@@ -315,6 +316,7 @@ void register_function_match(SimpleFunctionFactory& 
factory) {
     factory.register_function<FunctionMatchAll>();
     factory.register_function<FunctionMatchPhrase>();
     factory.register_function<FunctionMatchPhrasePrefix>();
+    factory.register_function<FunctionMatchRegexp>();
     factory.register_function<FunctionMatchElementEQ>();
     factory.register_function<FunctionMatchElementLT>();
     factory.register_function<FunctionMatchElementGT>();
diff --git a/be/src/vec/functions/match.h b/be/src/vec/functions/match.h
index ee32ee0eaf2..5ca981e7021 100644
--- a/be/src/vec/functions/match.h
+++ b/be/src/vec/functions/match.h
@@ -145,6 +145,23 @@ public:
     }
 };
 
+class FunctionMatchRegexp : public FunctionMatchBase {
+public:
+    static constexpr auto name = "match_regexp";
+    static FunctionPtr create() { return 
std::make_shared<FunctionMatchRegexp>(); }
+
+    String get_name() const override { return name; }
+
+    Status execute_match(const std::string& column_name, const std::string& 
match_query_str,
+                         size_t input_rows_count, const ColumnString* 
string_col,
+                         InvertedIndexCtx* inverted_index_ctx,
+                         const ColumnArray::Offsets64* array_offsets,
+                         ColumnUInt8::Container& result) const override {
+        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
+                "FunctionMatchRegexp not support execute_match");
+    }
+};
+
 class FunctionMatchElementEQ : public FunctionMatchBase {
 public:
     static constexpr auto name = "match_element_eq";
diff --git a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 
b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4
index eb9cbcc685c..c7e823ac65d 100644
--- a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4
+++ b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4
@@ -346,6 +346,7 @@ MATCH_ELEMENT_LE: 'ELEMENT_LE';
 MATCH_ELEMENT_LT: 'ELEMENT_LT';
 MATCH_PHRASE: 'MATCH_PHRASE';
 MATCH_PHRASE_PREFIX: 'MATCH_PHRASE_PREFIX';
+MATCH_REGEXP: 'MATCH_REGEXP';
 MATERIALIZED: 'MATERIALIZED';
 MAX: 'MAX';
 MAXVALUE: 'MAXVALUE';
diff --git a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 
b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4
index 77e8188131f..0bfc2d313ce 100644
--- a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4
+++ b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4
@@ -594,7 +594,7 @@ rowConstructorItem
 predicate
     : NOT? kind=BETWEEN lower=valueExpression AND upper=valueExpression
     | NOT? kind=(LIKE | REGEXP | RLIKE) pattern=valueExpression
-    | NOT? kind=(MATCH | MATCH_ANY | MATCH_ALL | MATCH_PHRASE | 
MATCH_PHRASE_PREFIX) pattern=valueExpression
+    | NOT? kind=(MATCH | MATCH_ANY | MATCH_ALL | MATCH_PHRASE | 
MATCH_PHRASE_PREFIX | MATCH_REGEXP) pattern=valueExpression
     | NOT? kind=IN LEFT_PAREN query RIGHT_PAREN
     | NOT? kind=IN LEFT_PAREN expression (COMMA expression)* RIGHT_PAREN
     | IS NOT? kind=NULL
diff --git a/fe/fe-core/src/main/cup/sql_parser.cup 
b/fe/fe-core/src/main/cup/sql_parser.cup
index db3f934b6df..838f6d7a757 100644
--- a/fe/fe-core/src/main/cup/sql_parser.cup
+++ b/fe/fe-core/src/main/cup/sql_parser.cup
@@ -482,6 +482,7 @@ terminal String
     KW_MATCH_ALL,
     KW_MATCH_PHRASE,
     KW_MATCH_PHRASE_PREFIX,
+    KW_MATCH_REGEXP,
     KW_MATCH_ELEMENT_EQ,
     KW_MATCH_ELEMENT_LT,
     KW_MATCH_ELEMENT_GT,
@@ -987,7 +988,7 @@ precedence left KW_AND;
 precedence left KW_NOT, NOT;
 precedence left KW_BETWEEN, KW_IN, KW_IS, KW_EXISTS;
 precedence left KW_LIKE, KW_REGEXP;
-precedence left KW_MATCH_ANY, KW_MATCH_ALL, KW_MATCH_PHRASE, 
KW_MATCH_PHRASE_PREFIX, KW_MATCH, KW_MATCH_ELEMENT_EQ, KW_MATCH_ELEMENT_LT, 
KW_MATCH_ELEMENT_GT, KW_MATCH_ELEMENT_LE, KW_MATCH_ELEMENT_GE;
+precedence left KW_MATCH_ANY, KW_MATCH_ALL, KW_MATCH_PHRASE, 
KW_MATCH_PHRASE_PREFIX, KW_MATCH_REGEXP, KW_MATCH, KW_MATCH_ELEMENT_EQ, 
KW_MATCH_ELEMENT_LT, KW_MATCH_ELEMENT_GT, KW_MATCH_ELEMENT_LE, 
KW_MATCH_ELEMENT_GE;
 precedence left EQUAL, LESSTHAN, GREATERTHAN;
 precedence left ADD, SUBTRACT;
 precedence left AT, STAR, DIVIDE, MOD, KW_DIV;
@@ -7170,6 +7171,8 @@ match_predicate ::=
   {: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_PHRASE, e1, 
e2); :}
   | expr:e1 KW_MATCH_PHRASE_PREFIX expr:e2
   {: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_PHRASE_PREFIX, 
e1, e2); :}
+  | expr:e1 KW_MATCH_REGEXP expr:e2
+  {: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_REGEXP, e1, 
e2); :}
   | expr:e1 KW_MATCH_ELEMENT_EQ expr:e2
   {: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_ELEMENT_EQ, e1, 
e2); :}
   | expr:e1 KW_MATCH_ELEMENT_LT expr:e2
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java 
b/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java
index 49a0796c19b..f106aec956c 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java
@@ -51,6 +51,7 @@ public class MatchPredicate extends Predicate {
         MATCH_ALL("MATCH_ALL", "match_all", TExprOpcode.MATCH_ALL),
         MATCH_PHRASE("MATCH_PHRASE", "match_phrase", TExprOpcode.MATCH_PHRASE),
         MATCH_PHRASE_PREFIX("MATCH_PHRASE_PREFIX", "match_phrase_prefix", 
TExprOpcode.MATCH_PHRASE_PREFIX),
+        MATCH_REGEXP("MATCH_REGEXP", "match_regexp", TExprOpcode.MATCH_REGEXP),
         MATCH_ELEMENT_EQ("MATCH_ELEMENT_EQ", "match_element_eq", 
TExprOpcode.MATCH_ELEMENT_EQ),
         MATCH_ELEMENT_LT("MATCH_ELEMENT_LT", "match_element_lt", 
TExprOpcode.MATCH_ELEMENT_LT),
         MATCH_ELEMENT_GT("MATCH_ELEMENT_GT", "match_element_gt", 
TExprOpcode.MATCH_ELEMENT_GT),
@@ -158,6 +159,16 @@ public class MatchPredicate extends Predicate {
                     symbolNotUsed,
                     Lists.<Type>newArrayList(new ArrayType(t), t),
                     Type.BOOLEAN));
+            
functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator(
+                    Operator.MATCH_REGEXP.getName(),
+                    symbolNotUsed,
+                    Lists.<Type>newArrayList(t, t),
+                    Type.BOOLEAN));
+            
functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator(
+                    Operator.MATCH_REGEXP.getName(),
+                    symbolNotUsed,
+                    Lists.<Type>newArrayList(new ArrayType(t), t),
+                    Type.BOOLEAN));
         }
     }
 
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java
 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java
index d0e731b6237..c2c67174489 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java
@@ -231,6 +231,7 @@ import org.apache.doris.nereids.trees.expressions.MatchAll;
 import org.apache.doris.nereids.trees.expressions.MatchAny;
 import org.apache.doris.nereids.trees.expressions.MatchPhrase;
 import org.apache.doris.nereids.trees.expressions.MatchPhrasePrefix;
+import org.apache.doris.nereids.trees.expressions.MatchRegexp;
 import org.apache.doris.nereids.trees.expressions.Mod;
 import org.apache.doris.nereids.trees.expressions.Multiply;
 import org.apache.doris.nereids.trees.expressions.NamedExpression;
@@ -2937,6 +2938,12 @@ public class LogicalPlanBuilder extends 
DorisParserBaseVisitor<Object> {
                         getExpression(ctx.pattern)
                     );
                     break;
+                case DorisParser.MATCH_REGEXP:
+                    outExpression = new MatchRegexp(
+                        valueExpression,
+                        getExpression(ctx.pattern)
+                    );
+                    break;
                 default:
                     throw new ParseException("Unsupported predicate type: " + 
ctx.kind.getText(), ctx);
             }
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/Match.java
 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/Match.java
index cafe2824fa7..976e46830ef 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/Match.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/Match.java
@@ -51,6 +51,8 @@ public abstract class Match extends BinaryOperator implements 
PropagateNullable
                 return Operator.MATCH_PHRASE;
             case "MATCH_PHRASE_PREFIX":
                 return Operator.MATCH_PHRASE_PREFIX;
+            case "MATCH_REGEXP":
+                return Operator.MATCH_REGEXP;
             default:
                 throw new AnalysisException("UnSupported type for match: " + 
symbol);
         }
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/MatchRegexp.java
 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/MatchRegexp.java
new file mode 100644
index 00000000000..6bb55aeb897
--- /dev/null
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/MatchRegexp.java
@@ -0,0 +1,49 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.nereids.trees.expressions;
+
+import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+
+import java.util.List;
+
+/**
+ * Match-regexp expression, e.g. a MATCH_REGEXP '^h\\w*'.
+ */
+public class MatchRegexp extends Match {
+    public MatchRegexp(Expression left, Expression right) {
+        super(ImmutableList.of(left, right), "MATCH_REGEXP");
+    }
+
+    private MatchRegexp(List<Expression> children) {
+        super(children, "MATCH_REGEXP");
+    }
+
+    @Override
+    public MatchRegexp withChildren(List<Expression> children) {
+        Preconditions.checkArgument(children.size() == 2);
+        return new MatchRegexp(children);
+    }
+
+    @Override
+    public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
+        return visitor.visitMatchRegexp(this, context);
+    }
+}
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java
 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java
index b53d22916a1..561648f9000 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java
@@ -61,6 +61,7 @@ import org.apache.doris.nereids.trees.expressions.MatchAll;
 import org.apache.doris.nereids.trees.expressions.MatchAny;
 import org.apache.doris.nereids.trees.expressions.MatchPhrase;
 import org.apache.doris.nereids.trees.expressions.MatchPhrasePrefix;
+import org.apache.doris.nereids.trees.expressions.MatchRegexp;
 import org.apache.doris.nereids.trees.expressions.Mod;
 import org.apache.doris.nereids.trees.expressions.Multiply;
 import org.apache.doris.nereids.trees.expressions.NamedExpression;
@@ -500,6 +501,10 @@ public abstract class ExpressionVisitor<R, C>
         return visitMatch(matchPhrasePrefix, context);
     }
 
+    public R visitMatchRegexp(MatchRegexp matchRegexp, C context) {
+        return visitMatch(matchRegexp, context);
+    }
+
     public R visitAny(Any any, C context) {
         return visit(any, context);
     }
diff --git a/fe/fe-core/src/main/jflex/sql_scanner.flex 
b/fe/fe-core/src/main/jflex/sql_scanner.flex
index 955555c2709..68db0e9bf40 100644
--- a/fe/fe-core/src/main/jflex/sql_scanner.flex
+++ b/fe/fe-core/src/main/jflex/sql_scanner.flex
@@ -321,6 +321,7 @@ import org.apache.doris.qe.SqlModeHelper;
         keywordMap.put("match_all", new 
Integer(SqlParserSymbols.KW_MATCH_ALL));
         keywordMap.put("match_phrase", new 
Integer(SqlParserSymbols.KW_MATCH_PHRASE));
         keywordMap.put("match_phrase_prefix", new 
Integer(SqlParserSymbols.KW_MATCH_PHRASE_PREFIX));
+        keywordMap.put("match_regexp", new 
Integer(SqlParserSymbols.KW_MATCH_REGEXP));
         keywordMap.put("element_eq", new 
Integer(SqlParserSymbols.KW_MATCH_ELEMENT_EQ));
         keywordMap.put("element_lt", new 
Integer(SqlParserSymbols.KW_MATCH_ELEMENT_LT));
         keywordMap.put("element_gt", new 
Integer(SqlParserSymbols.KW_MATCH_ELEMENT_GT));
diff --git a/gensrc/thrift/Opcodes.thrift b/gensrc/thrift/Opcodes.thrift
index 0afa53566d9..72a1d80e036 100644
--- a/gensrc/thrift/Opcodes.thrift
+++ b/gensrc/thrift/Opcodes.thrift
@@ -94,4 +94,5 @@ enum TExprOpcode {
     MATCH_ELEMENT_LE,
     MATCH_ELEMENT_GE,
     MATCH_PHRASE_PREFIX,
+    MATCH_REGEXP,
 }
diff --git a/regression-test/data/inverted_index_p0/test_index_match_regexp.out 
b/regression-test/data/inverted_index_p0/test_index_match_regexp.out
new file mode 100644
index 00000000000..eab27de65ee
--- /dev/null
+++ b/regression-test/data/inverted_index_p0/test_index_match_regexp.out
@@ -0,0 +1,16 @@
+-- This file is automatically generated. You should know what you did if you 
want to edit this
+-- !sql --
+1000
+
+-- !sql --
+54
+
+-- !sql --
+910
+
+-- !sql --
+60
+
+-- !sql --
+38
+
diff --git 
a/regression-test/suites/inverted_index_p0/test_index_match_regexp.groovy 
b/regression-test/suites/inverted_index_p0/test_index_match_regexp.groovy
new file mode 100644
index 00000000000..4c1ee1a5b0b
--- /dev/null
+++ b/regression-test/suites/inverted_index_p0/test_index_match_regexp.groovy
@@ -0,0 +1,89 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+suite("test_index_match_regexp", "p0"){
+    def indexTbName1 = "test_index_match_regexp"
+
+    sql "DROP TABLE IF EXISTS ${indexTbName1}"
+
+    sql """
+      CREATE TABLE ${indexTbName1} (
+      `@timestamp` int(11) NULL COMMENT "",
+      `clientip` varchar(20) NULL COMMENT "",
+      `request` text NULL COMMENT "",
+      `status` int(11) NULL COMMENT "",
+      `size` int(11) NULL COMMENT "",
+      INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" = 
"english") COMMENT ''
+      ) ENGINE=OLAP
+      DUPLICATE KEY(`@timestamp`)
+      COMMENT "OLAP"
+      DISTRIBUTED BY RANDOM BUCKETS 1
+      PROPERTIES (
+      "replication_allocation" = "tag.location.default: 1"
+      );
+    """
+
+    def load_httplogs_data = {table_name, label, read_flag, format_flag, 
file_name, ignore_failure=false,
+                        expected_succ_rows = -1, load_to_single_tablet = 
'true' ->
+        
+        // load the json data
+        streamLoad {
+            table "${table_name}"
+            
+            // set http request header params
+            set 'label', label + "_" + UUID.randomUUID().toString()
+            set 'read_json_by_line', read_flag
+            set 'format', format_flag
+            file file_name // import json file
+            time 10000 // limit inflight 10s
+            if (expected_succ_rows >= 0) {
+                set 'max_filter_ratio', '1'
+            }
+
+            // If a check callback is declared, the default check conditions are 
ignored,
+            // so the callback must verify every condition itself.
+            check { result, exception, startTime, endTime ->
+                       if (ignore_failure && expected_succ_rows < 0) { return }
+                    if (exception != null) {
+                        throw exception
+                    }
+                    log.info("Stream load result: ${result}".toString())
+                    def json = parseJson(result)
+                    assertEquals("success", json.Status.toLowerCase())
+                    if (expected_succ_rows >= 0) {
+                        assertEquals(json.NumberLoadedRows, expected_succ_rows)
+                    } else {
+                        assertEquals(json.NumberTotalRows, 
json.NumberLoadedRows + json.NumberUnselectedRows)
+                        assertTrue(json.NumberLoadedRows > 0 && json.LoadBytes 
> 0)
+                }
+            }
+        }
+    }
+
+    try {
+        load_httplogs_data.call(indexTbName1, 'test_index_match_regexp', 
'true', 'json', 'documents-1000.json')
+
+        qt_sql """ select count() from test_index_match_regexp where request 
match_regexp '^h'; """
+        qt_sql """ select count() from test_index_match_regexp where request 
match_regexp '^team'; """
+        qt_sql """ select count() from test_index_match_regexp where request 
match_regexp 's\$'; """
+        qt_sql """ select count() from test_index_match_regexp where request 
match_regexp 'er\$'; """
+        qt_sql """ select count() from test_index_match_regexp where request 
match_regexp '.*tickets.*'; """
+    } finally {
+        //try_sql("DROP TABLE IF EXISTS ${testTable}")
+    }
+}
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(doris) branch master updated: [feature](invert index) match_regexp feature added (#28257)

Reply via email to