This is an automated email from the ASF dual-hosted git repository.
airborne pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
from 25496122781 [ci](perf) adjust restart doris wait time to 5m (#52181)
add 7b4d330af5c [feature](inverted index) add custom analyzer (#50143)
No new revisions were added by this update.
Summary of changes:
be/src/agent/agent_server.cpp | 13 +
be/src/agent/task_worker_pool.cpp | 27 +
be/src/agent/task_worker_pool.h | 4 +
be/src/common/config.cpp | 2 +
be/src/common/config.h | 2 +
be/src/olap/inverted_index_parser.cpp | 9 +
be/src/olap/inverted_index_parser.h | 7 +
be/src/olap/rowset/segment_v2/column_reader.cpp | 8 +-
...unction_query.h => abstract_analysis_factory.h} | 27 +-
.../inverted_index/analysis_factory_mgr.cpp | 75 +
.../inverted_index/analysis_factory_mgr.h | 51 +
.../inverted_index/analyzer/analyzer.cpp | 148 +-
.../segment_v2/inverted_index/analyzer/analyzer.h | 19 +-
.../analyzer/basic/basic_tokenizer.cpp | 3 +-
.../inverted_index/analyzer/custom_analyzer.cpp | 108 ++
.../inverted_index/analyzer/custom_analyzer.h | 86 +
.../analyzer/custom_analyzer_config.cpp | 62 +
.../analyzer/custom_analyzer_config.h | 76 +
.../inverted_index/query/conjunction_query.cpp | 50 +-
.../inverted_index/query/conjunction_query.h | 6 +-
.../inverted_index/query/disjunction_query.cpp | 24 +-
.../inverted_index/query/disjunction_query.h | 2 +-
.../inverted_index/query/phrase_edge_query.cpp | 20 +-
.../inverted_index/query/phrase_edge_query.h | 2 +-
.../inverted_index/query/phrase_prefix_query.cpp | 41 +-
.../inverted_index/query/phrase_query.cpp | 152 +-
.../segment_v2/inverted_index/query/phrase_query.h | 9 +-
.../phrase_query/ordered_sloppy_phrase_matcher.cpp | 15 +-
.../inverted_index/query/prefix_query.cpp | 30 +-
.../segment_v2/inverted_index/query/prefix_query.h | 5 +-
.../rowset/segment_v2/inverted_index/query/query.h | 19 +-
.../segment_v2/inverted_index/query/query_info.h | 58 +
.../inverted_index/query/regexp_query.cpp | 8 +-
.../inverted_index/query_v2/term_query.cpp | 7 +-
.../inverted_index/query_v2/term_query.h | 10 +-
.../rowset/segment_v2/inverted_index/setting.h | 136 ++
.../token_filter/ascii_folding_filter.cpp | 2020 ++++++++++++++++++++
.../ascii_folding_filter.h} | 35 +-
.../ascii_folding_filter_factory.h} | 32 +-
.../token_filter/lower_case_filter.h | 84 +
.../lower_case_filter_factory.h} | 34 +-
.../token_filter.h} | 28 +-
.../token_filter_factory.h} | 28 +-
.../token_filter/word_delimiter_filter.cpp | 249 +++
.../token_filter/word_delimiter_filter.h | 143 ++
.../token_filter/word_delimiter_filter_factory.h | 201 ++
.../token_filter/word_delimiter_iterator.cpp | 279 +++
.../token_filter/word_delimiter_iterator.h | 88 +
.../segment_v2/inverted_index/token_stream.h | 60 +
.../tokenizer/keyword/keyword_tokenizer.h | 73 +
.../keyword/keyword_tokenizer_factory.h} | 33 +-
.../inverted_index/tokenizer/ngram/char_matcher.h | 110 ++
.../ngram/edge_ngram_tokenizer.h} | 27 +-
.../tokenizer/ngram/edge_ngram_tokenizer_factory.h | 65 +
.../tokenizer/ngram/ngram_tokenizer.cpp | 148 ++
.../tokenizer/ngram/ngram_tokenizer.h | 78 +
.../tokenizer/ngram/ngram_tokenizer_factory.cpp | 83 +
.../tokenizer/ngram/ngram_tokenizer_factory.h | 64 +
.../tokenizer/standard/standard_tokenizer.h | 81 +
.../standard/standard_tokenizer_factory.h} | 31 +-
.../tokenizer/standard/standard_tokenizer_impl.cpp | 1982 +++++++++++++++++++
.../tokenizer/standard/standard_tokenizer_impl.h | 299 +++
.../inverted_index/tokenizer/tokenizer.h | 52 +
.../tokenizer_factory.h} | 28 +-
.../inverted_index/util/docid_set_iterator.h | 2 +-
.../segment_v2/inverted_index/util/term_iterator.h | 41 +-
.../inverted_index/util/term_position_iterator.h | 43 +-
.../inverted_index/util/union_term_iterator.h | 30 +-
.../rowset/segment_v2/inverted_index_reader.cpp | 19 +-
.../rowset/segment_v2/inverted_index_writer.cpp | 37 +-
be/src/runtime/exec_env.h | 8 +
be/src/runtime/exec_env_init.cpp | 5 +
be/src/runtime/index_policy/index_policy_mgr.cpp | 173 ++
be/src/runtime/index_policy/index_policy_mgr.h | 52 +
be/src/vec/exprs/vmatch_predicate.cpp | 1 +
be/src/vec/exprs/vmatch_predicate.h | 2 +-
be/src/vec/functions/function_tokenize.cpp | 97 +-
be/src/vec/functions/function_tokenize.h | 5 +-
be/src/vec/functions/match.cpp | 104 +-
be/src/vec/functions/match.h | 21 +-
.../ananlyzer/custom_analyzer_test.cpp | 357 ++++
.../compaction/util/index_compaction_utils.cpp | 4 +-
.../query/phrase_edge_query_test.cpp | 28 +-
.../query/phrase_prefix_query_test.cpp | 29 +-
.../ordered_sloppy_phrase_matcher_test.cpp | 2 +-
.../inverted_index/query/phrase_query_test.cpp | 70 +-
.../segment_v2/inverted_index/setting_test.cpp | 184 ++
.../ascii_folding_filter_factory_test.cpp | 215 +++
.../lower_case_filter_factory_test.cpp | 109 ++
.../word_delimiter_filter_factory_test.cpp | 147 ++
.../token_filter/word_delimiter_filter_test.cpp | 331 ++++
.../token_filter/word_delimiter_iterator_test.cpp | 267 +++
.../tokenizer/edge_ngram_tokenizer_test.cpp | 241 +++
.../tokenizer/keyword_analyzer_test.cpp | 147 ++
.../tokenizer/ngram_tokenizer_test.cpp | 205 ++
.../tokenizer/standard_tokenizer_factory_test.cpp | 149 ++
.../util/union_term_iterator_test.cpp | 39 +-
.../runtime/index_policy/index_policy_mgr_test.cpp | 179 ++
be/test/vec/function/function_match_test.cpp | 10 +-
...lti_match.cpp => function_multi_match_test.cpp} | 0
.../antlr4/org/apache/doris/nereids/DorisLexer.g4 | 3 +
.../antlr4/org/apache/doris/nereids/DorisParser.g4 | 12 +
fe/fe-core/src/main/cup/sql_parser.cup | 46 +
.../doris/analysis/CreateIndexPolicyStmt.java | 97 +
.../apache/doris/analysis/DropIndexPolicyStmt.java | 78 +
.../apache/doris/analysis/InvertedIndexUtil.java | 25 +-
.../org/apache/doris/analysis/MatchPredicate.java | 5 +
.../apache/doris/analysis/ShowIndexPolicyStmt.java | 68 +
.../main/java/org/apache/doris/catalog/Env.java | 19 +
.../main/java/org/apache/doris/catalog/Index.java | 4 +
.../java/org/apache/doris/common/FeNameFormat.java | 4 +
.../AsciiFoldingTokenFilterValidator.java | 48 +
.../doris/indexpolicy/BasePolicyValidator.java | 52 +
.../doris/indexpolicy/DropIndexPolicyLog.java | 51 +
.../indexpolicy/EdgeNGramTokenizerValidator.java | 106 +
.../org/apache/doris/indexpolicy/IndexPolicy.java | 120 ++
.../apache/doris/indexpolicy/IndexPolicyMgr.java | 430 +++++
.../doris/indexpolicy/IndexPolicyTypeEnum.java | 31 +-
.../indexpolicy/KeywordTokenizerValidator.java | 36 +-
.../indexpolicy/LowerCaseTokenFilterValidator.java | 36 +-
.../doris/indexpolicy/NGramTokenizerValidator.java | 106 +
.../doris/indexpolicy/PolicyPropertyValidator.java | 23 +-
.../indexpolicy/StandardTokenizerValidator.java | 53 +
.../WordDelimiterTokenFilterValidator.java | 120 ++
.../org/apache/doris/journal/JournalEntity.java | 12 +
.../org/apache/doris/master/ReportHandler.java | 99 +-
.../doris/nereids/parser/LogicalPlanBuilder.java | 84 +
.../expressions/functions/scalar/Tokenize.java | 4 +-
.../apache/doris/nereids/trees/plans/PlanType.java | 11 +-
.../plans/commands/CreateIndexAnalyzerCommand.java | 75 +
.../commands/CreateIndexTokenFilterCommand.java | 75 +
.../commands/CreateIndexTokenizerCommand.java | 75 +
.../plans/commands/DropIndexAnalyzerCommand.java | 62 +
.../commands/DropIndexTokenFilterCommand.java | 62 +
.../plans/commands/DropIndexTokenizerCommand.java | 62 +
.../plans/commands/ShowIndexAnalyzerCommand.java | 60 +
.../commands/ShowIndexTokenFilterCommand.java | 60 +
.../plans/commands/ShowIndexTokenizerCommand.java | 60 +
.../trees/plans/visitor/CommandVisitor.java | 54 +
.../java/org/apache/doris/persist/EditLog.java | 20 +
.../org/apache/doris/persist/OperationType.java | 4 +
.../doris/persist/meta/MetaPersistMethod.java | 6 +
.../doris/persist/meta/PersistMetaModules.java | 2 +-
.../main/java/org/apache/doris/qe/DdlExecutor.java | 6 +
.../java/org/apache/doris/qe/ShowExecutor.java | 9 +
.../java/org/apache/doris/task/AgentBatchTask.java | 10 +
.../org/apache/doris/task/PushIndexPolicyTask.java | 63 +
fe/fe-core/src/main/jflex/sql_scanner.flex | 3 +
.../doris/indexpolicy/PolicyValidatorTests.java | 203 ++
gensrc/script/doris_builtins_functions.py | 2 +-
gensrc/thrift/AgentService.thrift | 19 +
gensrc/thrift/Exprs.thrift | 1 +
gensrc/thrift/MasterService.thrift | 1 +
gensrc/thrift/Types.thrift | 1 +
.../analyzer/test_custom_analyzer.out | Bin 0 -> 1217 bytes
.../analyzer/test_custom_analyzer1.out | Bin 0 -> 528 bytes
.../data/inverted_index_p0/test_tokenize.out | Bin 3771 -> 11439 bytes
.../nereids_function_p0/scalar_function/Array.out | Bin 773605 -> 774955 bytes
.../nereids_function_p0/scalar_function/Array1.out | Bin 33282709 -> 33283025 bytes
.../suites/function_p0/test_array_map.groovy | 4 +-
.../analyzer/test_custom_analyzer.groovy | 141 ++
.../analyzer/test_custom_analyzer1.groovy | 72 +
162 files changed, 13272 insertions(+), 832 deletions(-)
copy be/src/olap/rowset/segment_v2/inverted_index/{query/disjunction_query.h
=> abstract_analysis_factory.h} (53%)
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analysis_factory_mgr.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analysis_factory_mgr.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer_config.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer_config.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/query/query_info.h
create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/setting.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/token_filter/ascii_folding_filter.cpp
copy be/src/olap/rowset/segment_v2/inverted_index/{query/disjunction_query.h
=> token_filter/ascii_folding_filter.h} (51%)
copy be/src/olap/rowset/segment_v2/inverted_index/{query/disjunction_query.h
=> token_filter/ascii_folding_filter_factory.h} (54%)
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/token_filter/lower_case_filter.h
copy be/src/olap/rowset/segment_v2/inverted_index/{query_v2/term_query.cpp =>
token_filter/lower_case_filter_factory.h} (53%)
copy be/src/olap/rowset/segment_v2/inverted_index/{query/disjunction_query.h
=> token_filter/token_filter.h} (53%)
copy be/src/olap/rowset/segment_v2/inverted_index/{query/disjunction_query.h
=> token_filter/token_filter_factory.h} (53%)
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/token_filter/word_delimiter_filter.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/token_filter/word_delimiter_filter.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/token_filter/word_delimiter_filter_factory.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/token_filter/word_delimiter_iterator.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/token_filter/word_delimiter_iterator.h
create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/token_stream.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/keyword/keyword_tokenizer.h
copy be/src/olap/rowset/segment_v2/inverted_index/{query_v2/term_query.h =>
tokenizer/keyword/keyword_tokenizer_factory.h} (51%)
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/ngram/char_matcher.h
copy be/src/olap/rowset/segment_v2/inverted_index/{query/disjunction_query.h
=> tokenizer/ngram/edge_ngram_tokenizer.h} (53%)
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/ngram/edge_ngram_tokenizer_factory.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/ngram/ngram_tokenizer.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/ngram/ngram_tokenizer.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/ngram/ngram_tokenizer_factory.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/ngram/ngram_tokenizer_factory.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/standard/standard_tokenizer.h
copy be/src/olap/rowset/segment_v2/inverted_index/{query_v2/term_query.h =>
tokenizer/standard/standard_tokenizer_factory.h} (50%)
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/standard/standard_tokenizer_impl.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/standard/standard_tokenizer_impl.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/tokenizer.h
copy be/src/olap/rowset/segment_v2/inverted_index/{query/disjunction_query.h
=> tokenizer/tokenizer_factory.h} (53%)
create mode 100644 be/src/runtime/index_policy/index_policy_mgr.cpp
create mode 100644 be/src/runtime/index_policy/index_policy_mgr.h
create mode 100644
be/test/olap/rowset/segment_v2/inverted_index/ananlyzer/custom_analyzer_test.cpp
create mode 100644
be/test/olap/rowset/segment_v2/inverted_index/setting_test.cpp
create mode 100644
be/test/olap/rowset/segment_v2/inverted_index/token_filter/ascii_folding_filter_factory_test.cpp
create mode 100644
be/test/olap/rowset/segment_v2/inverted_index/token_filter/lower_case_filter_factory_test.cpp
create mode 100644
be/test/olap/rowset/segment_v2/inverted_index/token_filter/word_delimiter_filter_factory_test.cpp
create mode 100644
be/test/olap/rowset/segment_v2/inverted_index/token_filter/word_delimiter_filter_test.cpp
create mode 100644
be/test/olap/rowset/segment_v2/inverted_index/token_filter/word_delimiter_iterator_test.cpp
create mode 100644
be/test/olap/rowset/segment_v2/inverted_index/tokenizer/edge_ngram_tokenizer_test.cpp
create mode 100644
be/test/olap/rowset/segment_v2/inverted_index/tokenizer/keyword_analyzer_test.cpp
create mode 100644
be/test/olap/rowset/segment_v2/inverted_index/tokenizer/ngram_tokenizer_test.cpp
create mode 100644
be/test/olap/rowset/segment_v2/inverted_index/tokenizer/standard_tokenizer_factory_test.cpp
create mode 100644 be/test/runtime/index_policy/index_policy_mgr_test.cpp
rename be/test/vec/function/{function_multi_match.cpp =>
function_multi_match_test.cpp} (100%)
create mode 100644
fe/fe-core/src/main/java/org/apache/doris/analysis/CreateIndexPolicyStmt.java
create mode 100644
fe/fe-core/src/main/java/org/apache/doris/analysis/DropIndexPolicyStmt.java
create mode 100644
fe/fe-core/src/main/java/org/apache/doris/analysis/ShowIndexPolicyStmt.java
create mode 100644
fe/fe-core/src/main/java/org/apache/doris/indexpolicy/AsciiFoldingTokenFilterValidator.java
create mode 100644
fe/fe-core/src/main/java/org/apache/doris/indexpolicy/BasePolicyValidator.java
create mode 100644
fe/fe-core/src/main/java/org/apache/doris/indexpolicy/DropIndexPolicyLog.java
create mode 100644
fe/fe-core/src/main/java/org/apache/doris/indexpolicy/EdgeNGramTokenizerValidator.java
create mode 100644
fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicy.java
create mode 100644
fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyMgr.java
copy be/src/olap/rowset/segment_v2/inverted_index/query_v2/term_query.cpp =>
fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyTypeEnum.java
(53%)
copy be/src/olap/rowset/segment_v2/inverted_index/query_v2/term_query.cpp =>
fe/fe-core/src/main/java/org/apache/doris/indexpolicy/KeywordTokenizerValidator.java
(54%)
copy be/src/olap/rowset/segment_v2/inverted_index/query_v2/term_query.cpp =>
fe/fe-core/src/main/java/org/apache/doris/indexpolicy/LowerCaseTokenFilterValidator.java
(54%)
create mode 100644
fe/fe-core/src/main/java/org/apache/doris/indexpolicy/NGramTokenizerValidator.java
copy be/src/olap/rowset/segment_v2/inverted_index/query_v2/term_query.cpp =>
fe/fe-core/src/main/java/org/apache/doris/indexpolicy/PolicyPropertyValidator.java
(53%)
create mode 100644
fe/fe-core/src/main/java/org/apache/doris/indexpolicy/StandardTokenizerValidator.java
create mode 100644
fe/fe-core/src/main/java/org/apache/doris/indexpolicy/WordDelimiterTokenFilterValidator.java
create mode 100644
fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/CreateIndexAnalyzerCommand.java
create mode 100644
fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/CreateIndexTokenFilterCommand.java
create mode 100644
fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/CreateIndexTokenizerCommand.java
create mode 100644
fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/DropIndexAnalyzerCommand.java
create mode 100644
fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/DropIndexTokenFilterCommand.java
create mode 100644
fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/DropIndexTokenizerCommand.java
create mode 100644
fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/ShowIndexAnalyzerCommand.java
create mode 100644
fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/ShowIndexTokenFilterCommand.java
create mode 100644
fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/ShowIndexTokenizerCommand.java
create mode 100644
fe/fe-core/src/main/java/org/apache/doris/task/PushIndexPolicyTask.java
create mode 100644
fe/fe-core/src/test/java/org/apache/doris/indexpolicy/PolicyValidatorTests.java
create mode 100644
regression-test/data/inverted_index_p0/analyzer/test_custom_analyzer.out
create mode 100644
regression-test/data/inverted_index_p0/analyzer/test_custom_analyzer1.out
create mode 100644
regression-test/suites/inverted_index_p0/analyzer/test_custom_analyzer.groovy
create mode 100644
regression-test/suites/inverted_index_p0/analyzer/test_custom_analyzer1.groovy
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org