This is an automated email from the ASF dual-hosted git repository.
jianliangqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 6509c3b3d36 [test](index compaction) Add index compaction full flow UT
test (#45746)
6509c3b3d36 is described below
commit 6509c3b3d362e52e9ed9a92041dd44d009352431
Author: qiye <[email protected]>
AuthorDate: Thu Dec 26 21:11:55 2024 +0800
[test](index compaction) Add index compaction full flow UT test (#45746)
1. Add index compaction full flow UT tests
2. Add index compaction performance test, disabled by default.
---
.../index_compaction_performance_test.cpp | 265 ++++++
.../compaction/index_compaction_test.cpp | 912 ++++++++++++++++++++-
.../compaction/util/index_compaction_utils.cpp | 275 +++++--
.../inverted_index/data/sorted_wikipedia-50-1.json | 50 ++
.../inverted_index/data/sorted_wikipedia-50-2.json | 50 ++
5 files changed, 1442 insertions(+), 110 deletions(-)
diff --git
a/be/test/olap/rowset/segment_v2/inverted_index/compaction/index_compaction_performance_test.cpp
b/be/test/olap/rowset/segment_v2/inverted_index/compaction/index_compaction_performance_test.cpp
new file mode 100644
index 00000000000..566680e8b1e
--- /dev/null
+++
b/be/test/olap/rowset/segment_v2/inverted_index/compaction/index_compaction_performance_test.cpp
@@ -0,0 +1,265 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gmock/gmock.h>
+
+#include <filesystem>
+#include <map>
+#include <string>
+
+#include "olap/utils.h"
+#include "util/index_compaction_utils.cpp"
+
+namespace doris {
+
+using namespace doris::vectorized;
+
+constexpr static uint32_t MAX_PATH_LEN = 1024; // size of the buffer passed to getcwd() in SetUp()
+constexpr static std::string_view dest_dir = "./ut_dir/inverted_index_test"; // appended to the cwd to form the test data dir
+constexpr static std::string_view tmp_dir = "./ut_dir/tmp"; // used as-is (relative to the cwd) for the tmp-file dirs
+
+/// Fixture that compacts the same rowsets twice -- once with index compaction
+/// and once without -- prints the elapsed time of each run and checks that the
+/// two resulting inverted index files agree.
+///
+/// The DISABLED_ prefix keeps GoogleTest from running these tests unless
+/// --gtest_also_run_disabled_tests is passed.
+class DISABLED_IndexCompactionPerformanceTest : public ::testing::Test {
+protected:
+    /// Creates the data and tmp directories, registers the tmp-file dirs and a
+    /// StorageEngine with ExecEnv, and sets the config values the tests rely on.
+    void SetUp() override {
+        // absolute dir
+        char buffer[MAX_PATH_LEN];
+        // Fatal on failure: the buffer content is unspecified if getcwd() fails.
+        ASSERT_NE(getcwd(buffer, MAX_PATH_LEN), nullptr);
+        _current_dir = std::string(buffer);
+        // dest_dir starts with "./", so an explicit separator is required;
+        // plain concatenation would yield "<cwd>./ut_dir/..." (a sibling of cwd).
+        _absolute_dir = _current_dir + "/" + std::string(dest_dir);
+        EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_absolute_dir).ok());
+        EXPECT_TRUE(io::global_local_filesystem()->create_directory(_absolute_dir).ok());
+
+        // tmp dir
+        EXPECT_TRUE(io::global_local_filesystem()->delete_directory(tmp_dir).ok());
+        EXPECT_TRUE(io::global_local_filesystem()->create_directory(tmp_dir).ok());
+        std::vector<StorePath> paths;
+        paths.emplace_back(std::string(tmp_dir), 1024000000);
+        auto tmp_file_dirs = std::make_unique<segment_v2::TmpFileDirs>(paths);
+        Status st = tmp_file_dirs->init();
+        EXPECT_TRUE(st.ok()) << st.to_json();
+        ExecEnv::GetInstance()->set_tmp_file_dir(std::move(tmp_file_dirs));
+
+        // storage engine
+        doris::EngineOptions options;
+        auto engine = std::make_unique<StorageEngine>(options);
+        // Keep a raw pointer before ownership moves into ExecEnv below.
+        _engine_ref = engine.get();
+        _data_dir = std::make_unique<DataDir>(*_engine_ref, _absolute_dir);
+        static_cast<void>(_data_dir->update_capacity());
+        ExecEnv::GetInstance()->set_storage_engine(std::move(engine));
+        config::enable_segcompaction = false;
+        config::string_type_length_soft_limit_bytes = 2147483643;
+        config::inverted_index_dict_path =
+                _current_dir + "/be/src/clucene/src/contribs-lib/CLucene/analysis/jieba/dict";
+    }
+
+    /// Removes the directories created for the test and detaches the engine
+    /// from ExecEnv.
+    void TearDown() override {
+        // _tablet is only set by _build_wiki_tablet(); it is still null when a
+        // test (or a failed SetUp) never got that far.
+        if (_tablet != nullptr) {
+            EXPECT_TRUE(io::global_local_filesystem()
+                                ->delete_directory(_tablet->tablet_path())
+                                .ok());
+        }
+        EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_absolute_dir).ok());
+        EXPECT_TRUE(io::global_local_filesystem()->delete_directory(tmp_dir).ok());
+        _engine_ref = nullptr;
+        ExecEnv::GetInstance()->set_storage_engine(nullptr);
+    }
+
+    DISABLED_IndexCompactionPerformanceTest() = default;
+    ~DISABLED_IndexCompactionPerformanceTest() override = default;
+
+    /// Builds the tablet schema (title, content, redirect, namespace; plus the
+    /// DELETE_SIGN column for UNIQUE_KEYS) and initialises _tablet from it.
+    ///
+    /// @param keys_type      key model written into the schema
+    /// @param storage_format inverted index storage format written into the schema
+    /// @param properties     index properties attached to the "content" column's index
+    void _build_wiki_tablet(const KeysType& keys_type,
+                            const InvertedIndexStorageFormatPB& storage_format,
+                            const std::map<std::string, std::string>& properties) {
+        // tablet_schema
+        TabletSchemaPB schema_pb;
+        schema_pb.set_keys_type(keys_type);
+        schema_pb.set_inverted_index_storage_format(storage_format);
+
+        IndexCompactionUtils::construct_column(schema_pb.add_column(), 0, "STRING", "title");
+        // Only "content" gets an inverted index (index id 10001).
+        IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10001,
+                                               "idx_content", 1, "STRING", "content", properties);
+        IndexCompactionUtils::construct_column(schema_pb.add_column(), 2, "STRING", "redirect");
+        IndexCompactionUtils::construct_column(schema_pb.add_column(), 3, "STRING", "namespace");
+        if (keys_type == KeysType::UNIQUE_KEYS) {
+            // unique table must contain the DELETE_SIGN column
+            auto* column_pb = schema_pb.add_column();
+            IndexCompactionUtils::construct_column(column_pb, 4, "TINYINT", DELETE_SIGN);
+            column_pb->set_length(1);
+            column_pb->set_index_length(1);
+            column_pb->set_is_nullable(false);
+        }
+        _tablet_schema = std::make_shared<TabletSchema>();
+        _tablet_schema->init_from_pb(schema_pb);
+
+        // tablet
+        TabletMetaSharedPtr tablet_meta(new TabletMeta(_tablet_schema));
+        if (keys_type == KeysType::UNIQUE_KEYS) {
+            tablet_meta->_enable_unique_key_merge_on_write = true;
+        }
+
+        _tablet = std::make_shared<Tablet>(*_engine_ref, tablet_meta, _data_dir.get());
+        EXPECT_TRUE(_tablet->init().ok());
+    }
+
+    /// Loads every "wikipedia*.json" file from the performance data directory
+    /// into rowsets, compacts them with and without index compaction, prints
+    /// both timings and compares the resulting index files.
+    ///
+    /// Requires _build_wiki_tablet() to have been called first.
+    void _run_normal_wiki_test() {
+        EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok());
+        EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok());
+        std::string data_dir =
+                _current_dir + "/be/test/olap/rowset/segment_v2/inverted_index/data/performance";
+        std::vector<std::string> data_files;
+        for (const auto& entry : std::filesystem::directory_iterator(data_dir)) {
+            if (entry.is_regular_file()) {
+                std::string filename = entry.path().filename().string();
+                if (filename.starts_with("wikipedia") && filename.ends_with(".json")) {
+                    std::cout << "Found file: " << filename << std::endl;
+                    data_files.push_back(entry.path().string());
+                }
+            }
+        }
+
+        // One rowset slot per data file.
+        std::vector<RowsetSharedPtr> rowsets(data_files.size());
+        auto custom_check_build_rowsets = [](const int32_t& size) { EXPECT_EQ(size, 1); };
+        IndexCompactionUtils::build_rowsets<IndexCompactionUtils::WikiDataRow>(
+                _data_dir, _tablet_schema, _tablet, _engine_ref, rowsets, data_files, _inc_id,
+                custom_check_build_rowsets, true);
+
+        // Expectations for the run with index compaction: the single indexed
+        // column (unique id 1) is selected for index compaction.
+        auto custom_check_index = [](const BaseCompaction& compaction,
+                                     const RowsetWriterContext& ctx) {
+            EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(), 1);
+            EXPECT_TRUE(ctx.columns_to_do_index_compaction.size() == 1);
+            EXPECT_TRUE(ctx.columns_to_do_index_compaction.contains(1));
+            EXPECT_TRUE(compaction._output_rowset->num_segments() == 1)
+                    << compaction._output_rowset->num_segments();
+        };
+
+        RowsetSharedPtr output_rowset_index;
+        Status st;
+        {
+            OlapStopWatch watch;
+            st = IndexCompactionUtils::do_compaction(rowsets, _engine_ref, _tablet, true,
+                                                     output_rowset_index, custom_check_index,
+                                                     10000000);
+            std::cout << "index compaction time: " << watch.get_elapse_second() << "s" << std::endl;
+        }
+        // Fatal: the output rowset is dereferenced right below.
+        ASSERT_TRUE(st.ok()) << st.to_string();
+
+        const auto& seg_path = output_rowset_index->segment_path(0);
+        // Fatal: seg_path.value() must not be read when no value is held.
+        ASSERT_TRUE(seg_path.has_value()) << seg_path.error();
+        auto inverted_index_file_reader_index = IndexCompactionUtils::init_index_file_reader(
+                output_rowset_index, seg_path.value(),
+                _tablet_schema->get_inverted_index_storage_format());
+
+        // Expectations for the run without index compaction: no column is
+        // selected for index compaction.
+        auto custom_check_normal = [](const BaseCompaction& compaction,
+                                      const RowsetWriterContext& ctx) {
+            EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(), 1);
+            EXPECT_TRUE(ctx.columns_to_do_index_compaction.size() == 0);
+            EXPECT_TRUE(compaction._output_rowset->num_segments() == 1);
+        };
+
+        RowsetSharedPtr output_rowset_normal;
+        {
+            OlapStopWatch watch;
+            st = IndexCompactionUtils::do_compaction(rowsets, _engine_ref, _tablet, false,
+                                                     output_rowset_normal, custom_check_normal,
+                                                     10000000);
+            std::cout << "normal compaction time: " << watch.get_elapse_second() << "s"
+                      << std::endl;
+        }
+        ASSERT_TRUE(st.ok()) << st.to_string();
+        const auto& seg_path_normal = output_rowset_normal->segment_path(0);
+        ASSERT_TRUE(seg_path_normal.has_value()) << seg_path_normal.error();
+        auto inverted_index_file_reader_normal = IndexCompactionUtils::init_index_file_reader(
+                output_rowset_normal, seg_path_normal.value(),
+                _tablet_schema->get_inverted_index_storage_format());
+
+        // check index file terms
+        // Only index id 10001 exists in this schema, so the loop runs once.
+        for (int idx = 10001; idx < 10002; idx++) {
+            auto dir_idx = inverted_index_file_reader_index->_open(idx, "");
+            ASSERT_TRUE(dir_idx.has_value()) << dir_idx.error();
+            auto dir_normal = inverted_index_file_reader_normal->_open(idx, "");
+            ASSERT_TRUE(dir_normal.has_value()) << dir_normal.error();
+            st = IndexCompactionUtils::check_idx_file_correctness(dir_idx->get(),
+                                                                  dir_normal->get());
+            EXPECT_TRUE(st.ok()) << st.to_string();
+        }
+    }
+
+private:
+    TabletSchemaSPtr _tablet_schema = nullptr;
+    // Non-owning; the engine itself is owned by ExecEnv after SetUp().
+    StorageEngine* _engine_ref = nullptr;
+    std::unique_ptr<DataDir> _data_dir = nullptr;
+    TabletSharedPtr _tablet = nullptr;
+    std::string _absolute_dir;
+    std::string _current_dir;
+    // Passed to build_rowsets(); presumably the starting id for generated rowsets -- TODO confirm.
+    int64_t _inc_id = 1000;
+};
+
+// DUP_KEYS tablet, V2 index storage format, english parser with phrase
+// support and lowercasing enabled.
+TEST_F(DISABLED_IndexCompactionPerformanceTest, tes_wikipedia_dup_v2_english) {
+    const std::map<std::string, std::string> parser_props {
+            {INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_ENGLISH},
+            {INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES},
+            {INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_TRUE},
+    };
+    _build_wiki_tablet(KeysType::DUP_KEYS, InvertedIndexStorageFormatPB::V2, parser_props);
+    _run_normal_wiki_test();
+}
+
+// DUP_KEYS tablet, V2 index storage format, unicode parser with phrase
+// support and lowercasing enabled.
+TEST_F(DISABLED_IndexCompactionPerformanceTest, tes_wikipedia_dup_v2_unicode) {
+    const std::map<std::string, std::string> parser_props {
+            {INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_UNICODE},
+            {INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES},
+            {INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_TRUE},
+    };
+    _build_wiki_tablet(KeysType::DUP_KEYS, InvertedIndexStorageFormatPB::V2, parser_props);
+    _run_normal_wiki_test();
+}
+
+// DUP_KEYS tablet, V2 index storage format, chinese parser with phrase
+// support and lowercasing enabled.
+TEST_F(DISABLED_IndexCompactionPerformanceTest, tes_wikipedia_dup_v2_chinese) {
+    const std::map<std::string, std::string> parser_props {
+            {INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_CHINESE},
+            {INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES},
+            {INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_TRUE},
+    };
+    _build_wiki_tablet(KeysType::DUP_KEYS, InvertedIndexStorageFormatPB::V2, parser_props);
+    _run_normal_wiki_test();
+}
+
+// UNIQUE_KEYS tablet (the fixture enables merge-on-write for it), V2 index
+// storage format, english parser with phrase support and lowercasing enabled.
+TEST_F(DISABLED_IndexCompactionPerformanceTest, tes_wikipedia_mow_v2_english) {
+    const std::map<std::string, std::string> parser_props {
+            {INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_ENGLISH},
+            {INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES},
+            {INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_TRUE},
+    };
+    _build_wiki_tablet(KeysType::UNIQUE_KEYS, InvertedIndexStorageFormatPB::V2, parser_props);
+    _run_normal_wiki_test();
+}
+
+// UNIQUE_KEYS tablet (the fixture enables merge-on-write for it), V2 index
+// storage format, unicode parser with phrase support and lowercasing enabled.
+TEST_F(DISABLED_IndexCompactionPerformanceTest, tes_wikipedia_mow_v2_unicode) {
+    const std::map<std::string, std::string> parser_props {
+            {INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_UNICODE},
+            {INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES},
+            {INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_TRUE},
+    };
+    _build_wiki_tablet(KeysType::UNIQUE_KEYS, InvertedIndexStorageFormatPB::V2, parser_props);
+    _run_normal_wiki_test();
+}
+
+// UNIQUE_KEYS tablet (the fixture enables merge-on-write for it), V2 index
+// storage format, chinese parser with phrase support and lowercasing enabled.
+TEST_F(DISABLED_IndexCompactionPerformanceTest, tes_wikipedia_mow_v2_chinese) {
+    const std::map<std::string, std::string> parser_props {
+            {INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_CHINESE},
+            {INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES},
+            {INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_TRUE},
+    };
+    _build_wiki_tablet(KeysType::UNIQUE_KEYS, InvertedIndexStorageFormatPB::V2, parser_props);
+    _run_normal_wiki_test();
+}
+} // namespace doris
diff --git
a/be/test/olap/rowset/segment_v2/inverted_index/compaction/index_compaction_test.cpp
b/be/test/olap/rowset/segment_v2/inverted_index/compaction/index_compaction_test.cpp
index 264786570e7..64aec3ffa4a 100644
---
a/be/test/olap/rowset/segment_v2/inverted_index/compaction/index_compaction_test.cpp
+++
b/be/test/olap/rowset/segment_v2/inverted_index/compaction/index_compaction_test.cpp
@@ -17,6 +17,7 @@
#include <gmock/gmock.h>
+#include "olap/utils.h"
#include "util/index_compaction_utils.cpp"
namespace doris {
@@ -55,7 +56,25 @@ protected:
_data_dir = std::make_unique<DataDir>(*_engine_ref, _absolute_dir);
static_cast<void>(_data_dir->update_capacity());
ExecEnv::GetInstance()->set_storage_engine(std::move(engine));
+ config::inverted_index_dict_path =
+ _current_dir +
"/be/src/clucene/src/contribs-lib/CLucene/analysis/jieba/dict";
+ }
+    /// Removes the directories created for the test, detaches the storage
+    /// engine from ExecEnv and resets the config values listed below.
+    void TearDown() override {
+        // _tablet is created by _build_tablet() / _build_wiki_tablet() rather
+        // than SetUp(), so it is still null when a test never built a tablet.
+        if (_tablet != nullptr) {
+            EXPECT_TRUE(io::global_local_filesystem()
+                                ->delete_directory(_tablet->tablet_path())
+                                .ok());
+        }
+        EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_absolute_dir).ok());
+        EXPECT_TRUE(io::global_local_filesystem()->delete_directory(tmp_dir).ok());
+        _engine_ref = nullptr;
+        ExecEnv::GetInstance()->set_storage_engine(nullptr);
+        // reset config
+        config::inverted_index_max_buffered_docs = -1;
+        config::compaction_batch_size = -1;
+        config::inverted_index_compaction_enable = false;
+    }
+ IndexCompactionTest() = default; // defaulted; the per-test environment is prepared in SetUp()
+ ~IndexCompactionTest() override = default;
+
+ void _build_tablet() {
// tablet_schema
TabletSchemaPB schema_pb;
schema_pb.set_keys_type(KeysType::DUP_KEYS);
@@ -65,8 +84,10 @@ protected:
"key_index", 0, "INT", "key");
IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10001,
"v1_index", 1, "STRING", "v1");
+ std::map<std::string, std::string> properties;
+ properties.emplace(INVERTED_INDEX_PARSER_KEY,
INVERTED_INDEX_PARSER_UNICODE);
IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10002,
- "v2_index", 2, "STRING", "v2",
true);
+ "v2_index", 2, "STRING", "v2",
properties);
IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10003,
"v3_index", 3, "INT", "v3");
_tablet_schema = std::make_shared<TabletSchema>();
@@ -78,16 +99,625 @@ protected:
_tablet = std::make_shared<Tablet>(*_engine_ref, tablet_meta,
_data_dir.get());
EXPECT_TRUE(_tablet->init().ok());
}
- void TearDown() override {
-
EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok());
-
EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_absolute_dir).ok());
-
EXPECT_TRUE(io::global_local_filesystem()->delete_directory(tmp_dir).ok());
- _engine_ref = nullptr;
- ExecEnv::GetInstance()->set_storage_engine(nullptr);
+
+ void _build_wiki_tablet(const KeysType& keys_type,
+ const InvertedIndexStorageFormatPB&
storage_format) {
+ // tablet_schema
+ TabletSchemaPB schema_pb;
+ schema_pb.set_keys_type(keys_type);
+ schema_pb.set_inverted_index_storage_format(storage_format);
+
+ IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10000,
+ "idx_title", 0, "STRING",
"title",
+ std::map<std::string,
std::string>(), true);
+ // parser = english, support_phrase = true, lower_case = true,
char_filter = none
+ std::map<std::string, std::string> properties;
+ properties.emplace(INVERTED_INDEX_PARSER_KEY,
INVERTED_INDEX_PARSER_ENGLISH);
+ properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY,
+ INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES);
+ properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY,
INVERTED_INDEX_PARSER_TRUE);
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, "");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, "");
+ IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10001,
+ "idx_content_1", 1, "STRING",
"content_1",
+ properties);
+ properties.clear();
+ // parser = english, support_phrase = true, lower_case = true,
char_filter = char_replace
+ properties.emplace(INVERTED_INDEX_PARSER_KEY,
INVERTED_INDEX_PARSER_ENGLISH);
+ properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY,
+ INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES);
+ properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY,
INVERTED_INDEX_PARSER_TRUE);
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE,
"char_replace");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "._");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, " ");
+ IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10002,
+ "idx_content_2", 2, "STRING",
"content_2",
+ properties);
+ properties.clear();
+ // parser = english, support_phrase = true, lower_case = false,
char_filter = none
+ properties.emplace(INVERTED_INDEX_PARSER_KEY,
INVERTED_INDEX_PARSER_ENGLISH);
+ properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY,
+ INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES);
+ properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY,
INVERTED_INDEX_PARSER_FALSE);
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, "");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, "");
+ IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10003,
+ "idx_content_3", 3, "STRING",
"content_3",
+ properties);
+ properties.clear();
+ // parser = english, support_phrase = true, lower_case = false,
char_filter = char_replace
+ properties.emplace(INVERTED_INDEX_PARSER_KEY,
INVERTED_INDEX_PARSER_ENGLISH);
+ properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY,
+ INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES);
+ properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY,
INVERTED_INDEX_PARSER_FALSE);
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE,
"char_replace");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "._");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, " ");
+ IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10004,
+ "idx_content_4", 4, "STRING",
"content_4",
+ properties);
+ properties.clear();
+ // parser = english, support_phrase = false, lower_case = true,
char_filter = none
+ properties.emplace(INVERTED_INDEX_PARSER_KEY,
INVERTED_INDEX_PARSER_ENGLISH);
+ properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY,
+ INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO);
+ properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY,
INVERTED_INDEX_PARSER_TRUE);
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, "");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, "");
+ IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10005,
+ "idx_content_5", 5, "STRING",
"content_5",
+ properties);
+ properties.clear();
+ // parser = english, support_phrase = false, lower_case = true,
char_filter = char_replace
+ properties.emplace(INVERTED_INDEX_PARSER_KEY,
INVERTED_INDEX_PARSER_ENGLISH);
+ properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY,
+ INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO);
+ properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY,
INVERTED_INDEX_PARSER_TRUE);
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE,
"char_replace");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "._");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, " ");
+ IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10006,
+ "idx_content_6", 6, "STRING",
"content_6",
+ properties);
+ properties.clear();
+ // parser = english, support_phrase = false, lower_case = false,
char_filter = none
+ properties.emplace(INVERTED_INDEX_PARSER_KEY,
INVERTED_INDEX_PARSER_ENGLISH);
+ properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY,
+ INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO);
+ properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY,
INVERTED_INDEX_PARSER_FALSE);
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, "");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, "");
+ IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10007,
+ "idx_content_7", 7, "STRING",
"content_7",
+ properties);
+ properties.clear();
+ // parser = english, support_phrase = false, lower_case = false,
char_filter = char_replace
+ properties.emplace(INVERTED_INDEX_PARSER_KEY,
INVERTED_INDEX_PARSER_ENGLISH);
+ properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY,
+ INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO);
+ properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY,
INVERTED_INDEX_PARSER_FALSE);
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE,
"char_replace");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "._");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, " ");
+ IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10008,
+ "idx_content_8", 8, "STRING",
"content_8",
+ properties);
+ properties.clear();
+ // parser = unicode, support_phrase = true, lower_case = true,
char_filter = none
+ properties.emplace(INVERTED_INDEX_PARSER_KEY,
INVERTED_INDEX_PARSER_UNICODE);
+ properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY,
+ INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES);
+ properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY,
INVERTED_INDEX_PARSER_TRUE);
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, "");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, "");
+ IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10009,
+ "idx_content_9", 9, "STRING",
"content_9",
+ properties);
+ properties.clear();
+ // parser = unicode, support_phrase = true, lower_case = true,
char_filter = char_replace
+ properties.emplace(INVERTED_INDEX_PARSER_KEY,
INVERTED_INDEX_PARSER_UNICODE);
+ properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY,
+ INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES);
+ properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY,
INVERTED_INDEX_PARSER_TRUE);
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE,
"char_replace");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "._");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, " ");
+ IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10010,
+ "idx_content_10", 10, "STRING",
"content_10",
+ properties);
+ properties.clear();
+ // parser = unicode, support_phrase = true, lower_case = false,
char_filter = none
+ properties.emplace(INVERTED_INDEX_PARSER_KEY,
INVERTED_INDEX_PARSER_UNICODE);
+ properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY,
+ INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES);
+ properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY,
INVERTED_INDEX_PARSER_FALSE);
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, "");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, "");
+ IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10011,
+ "idx_content_11", 11, "STRING",
"content_11",
+ properties);
+ properties.clear();
+ // parser = unicode, support_phrase = true, lower_case = false,
char_filter = char_replace
+ properties.emplace(INVERTED_INDEX_PARSER_KEY,
INVERTED_INDEX_PARSER_UNICODE);
+ properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY,
+ INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES);
+ properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY,
INVERTED_INDEX_PARSER_FALSE);
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE,
"char_replace");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "._");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, " ");
+ IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10012,
+ "idx_content_12", 12, "STRING",
"content_12",
+ properties);
+ properties.clear();
+ // parser = unicode, support_phrase = false, lower_case = true,
char_filter = none
+ properties.emplace(INVERTED_INDEX_PARSER_KEY,
INVERTED_INDEX_PARSER_UNICODE);
+ properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY,
+ INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO);
+ properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY,
INVERTED_INDEX_PARSER_TRUE);
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, "");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, "");
+ IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10013,
+ "idx_content_13", 13, "STRING",
"content_13",
+ properties);
+ properties.clear();
+ // parser = unicode, support_phrase = false, lower_case = true,
char_filter = char_replace
+ properties.emplace(INVERTED_INDEX_PARSER_KEY,
INVERTED_INDEX_PARSER_UNICODE);
+ properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY,
+ INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO);
+ properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY,
INVERTED_INDEX_PARSER_TRUE);
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE,
"char_replace");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "._");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, " ");
+ IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10014,
+ "idx_content_14", 14, "STRING",
"content_14",
+ properties);
+ properties.clear();
+ // parser = unicode, support_phrase = false, lower_case = false,
char_filter = none
+ properties.emplace(INVERTED_INDEX_PARSER_KEY,
INVERTED_INDEX_PARSER_UNICODE);
+ properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY,
+ INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO);
+ properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY,
INVERTED_INDEX_PARSER_FALSE);
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, "");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, "");
+ IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10015,
+ "idx_content_15", 15, "STRING",
"content_15",
+ properties);
+ properties.clear();
+ // parser = unicode, support_phrase = false, lower_case = false,
char_filter = char_replace
+ properties.emplace(INVERTED_INDEX_PARSER_KEY,
INVERTED_INDEX_PARSER_UNICODE);
+ properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY,
+ INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO);
+ properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY,
INVERTED_INDEX_PARSER_FALSE);
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE,
"char_replace");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "._");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, " ");
+ IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10016,
+ "idx_content_16", 16, "STRING",
"content_16",
+ properties);
+ properties.clear();
+ // parser = chinese, parser_mode = fine_grained, support_phrase =
true, lower_case = true, char_filter = none
+ properties.emplace(INVERTED_INDEX_PARSER_KEY,
INVERTED_INDEX_PARSER_CHINESE);
+ properties.emplace(INVERTED_INDEX_PARSER_MODE_KEY,
INVERTED_INDEX_PARSER_FINE_GRANULARITY);
+ properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY,
+ INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES);
+ properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY,
INVERTED_INDEX_PARSER_TRUE);
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, "");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, "");
+ IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10017,
+ "idx_content_17", 17, "STRING",
"content_17",
+ properties);
+ properties.clear();
+ // parser = chinese, parser_mode = fine_grained, support_phrase =
true, lower_case = true, char_filter = char_replace
+ properties.emplace(INVERTED_INDEX_PARSER_KEY,
INVERTED_INDEX_PARSER_CHINESE);
+ properties.emplace(INVERTED_INDEX_PARSER_MODE_KEY,
INVERTED_INDEX_PARSER_FINE_GRANULARITY);
+ properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY,
+ INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES);
+ properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY,
INVERTED_INDEX_PARSER_TRUE);
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE,
"char_replace");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "._");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, " ");
+ IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10018,
+ "idx_content_18", 18, "STRING",
"content_18",
+ properties);
+ properties.clear();
+ // parser = chinese, parser_mode = fine_grained, support_phrase =
true, lower_case = false, char_filter = none
+ properties.emplace(INVERTED_INDEX_PARSER_KEY,
INVERTED_INDEX_PARSER_CHINESE);
+ properties.emplace(INVERTED_INDEX_PARSER_MODE_KEY,
INVERTED_INDEX_PARSER_FINE_GRANULARITY);
+ properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY,
+ INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES);
+ properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY,
INVERTED_INDEX_PARSER_FALSE);
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, "");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, "");
+ IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10019,
+ "idx_content_19", 19, "STRING",
"content_19",
+ properties);
+ properties.clear();
+ // parser = chinese, parser_mode = fine_grained, support_phrase =
true, lower_case = false, char_filter = char_replace
+ properties.emplace(INVERTED_INDEX_PARSER_KEY,
INVERTED_INDEX_PARSER_CHINESE);
+ properties.emplace(INVERTED_INDEX_PARSER_MODE_KEY,
INVERTED_INDEX_PARSER_FINE_GRANULARITY);
+ properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY,
+ INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES);
+ properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY,
INVERTED_INDEX_PARSER_FALSE);
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE,
"char_replace");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "._");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, " ");
+ IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10020,
+ "idx_content_20", 20, "STRING",
"content_20",
+ properties);
+ properties.clear();
+ // parser = chinese, parser_mode = fine_grained, support_phrase =
false, lower_case = true, char_filter = none
+ properties.emplace(INVERTED_INDEX_PARSER_KEY,
INVERTED_INDEX_PARSER_CHINESE);
+ properties.emplace(INVERTED_INDEX_PARSER_MODE_KEY,
INVERTED_INDEX_PARSER_FINE_GRANULARITY);
+ properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY,
+ INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO);
+ properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY,
INVERTED_INDEX_PARSER_TRUE);
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, "");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, "");
+ IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10021,
+ "idx_content_21", 21, "STRING",
"content_21",
+ properties);
+ properties.clear();
+ // parser = chinese, parser_mode = fine_grained, support_phrase =
false, lower_case = true, char_filter = char_replace
+ properties.emplace(INVERTED_INDEX_PARSER_KEY,
INVERTED_INDEX_PARSER_CHINESE);
+ properties.emplace(INVERTED_INDEX_PARSER_MODE_KEY,
INVERTED_INDEX_PARSER_FINE_GRANULARITY);
+ properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY,
+ INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO);
+ properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY,
INVERTED_INDEX_PARSER_TRUE);
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE,
"char_replace");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "._");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, " ");
+ IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10022,
+ "idx_content_22", 22, "STRING",
"content_22",
+ properties);
+ properties.clear();
+ // parser = chinese, parser_mode = fine_grained, support_phrase =
false, lower_case = false, char_filter = none
+ properties.emplace(INVERTED_INDEX_PARSER_KEY,
INVERTED_INDEX_PARSER_CHINESE);
+ properties.emplace(INVERTED_INDEX_PARSER_MODE_KEY,
INVERTED_INDEX_PARSER_FINE_GRANULARITY);
+ properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY,
+ INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO);
+ properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY,
INVERTED_INDEX_PARSER_FALSE);
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, "");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, "");
+ IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10023,
+ "idx_content_23", 23, "STRING",
"content_23",
+ properties);
+ properties.clear();
+ // parser = chinese, parser_mode = fine_grained, support_phrase =
false, lower_case = false, char_filter = char_replace
+ properties.emplace(INVERTED_INDEX_PARSER_KEY,
INVERTED_INDEX_PARSER_CHINESE);
+ properties.emplace(INVERTED_INDEX_PARSER_MODE_KEY,
INVERTED_INDEX_PARSER_FINE_GRANULARITY);
+ properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY,
+ INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO);
+ properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY,
INVERTED_INDEX_PARSER_FALSE);
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE,
"char_replace");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "._");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, " ");
+ IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10024,
+ "idx_content_24", 24, "STRING",
"content_24",
+ properties);
+ properties.clear();
+ // parser = chinese, parser_mode = coarse_grained, support_phrase =
true, lower_case = true, char_filter = none
+ properties.emplace(INVERTED_INDEX_PARSER_KEY,
INVERTED_INDEX_PARSER_CHINESE);
+ properties.emplace(INVERTED_INDEX_PARSER_MODE_KEY,
+ INVERTED_INDEX_PARSER_COARSE_GRANULARITY);
+ properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY,
+ INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES);
+ properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY,
INVERTED_INDEX_PARSER_TRUE);
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, "");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, "");
+ IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10025,
+ "idx_content_25", 25, "STRING",
"content_25",
+ properties);
+ properties.clear();
+ // parser = chinese, parser_mode = coarse_grained, support_phrase =
true, lower_case = true, char_filter = char_replace
+ properties.emplace(INVERTED_INDEX_PARSER_KEY,
INVERTED_INDEX_PARSER_CHINESE);
+ properties.emplace(INVERTED_INDEX_PARSER_MODE_KEY,
+ INVERTED_INDEX_PARSER_COARSE_GRANULARITY);
+ properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY,
+ INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES);
+ properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY,
INVERTED_INDEX_PARSER_TRUE);
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE,
"char_replace");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "._");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, " ");
+ IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10026,
+ "idx_content_26", 26, "STRING",
"content_26",
+ properties);
+ properties.clear();
+ // parser = chinese, parser_mode = coarse_grained, support_phrase =
true, lower_case = false, char_filter = none
+ properties.emplace(INVERTED_INDEX_PARSER_KEY,
INVERTED_INDEX_PARSER_CHINESE);
+ properties.emplace(INVERTED_INDEX_PARSER_MODE_KEY,
+ INVERTED_INDEX_PARSER_COARSE_GRANULARITY);
+ properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY,
+ INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES);
+ properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY,
INVERTED_INDEX_PARSER_FALSE);
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, "");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, "");
+ IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10027,
+ "idx_content_27", 27, "STRING",
"content_27",
+ properties);
+ properties.clear();
+ // parser = chinese, parser_mode = coarse_grained, support_phrase =
true, lower_case = false, char_filter = char_replace
+ properties.emplace(INVERTED_INDEX_PARSER_KEY,
INVERTED_INDEX_PARSER_CHINESE);
+ properties.emplace(INVERTED_INDEX_PARSER_MODE_KEY,
+ INVERTED_INDEX_PARSER_COARSE_GRANULARITY);
+ properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY,
+ INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES);
+ properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY,
INVERTED_INDEX_PARSER_FALSE);
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE,
"char_replace");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "._");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, " ");
+ IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10028,
+ "idx_content_28", 28, "STRING",
"content_28",
+ properties);
+ properties.clear();
+ // parser = chinese, parser_mode = coarse_grained, support_phrase =
false, lower_case = true, char_filter = none
+ properties.emplace(INVERTED_INDEX_PARSER_KEY,
INVERTED_INDEX_PARSER_CHINESE);
+ properties.emplace(INVERTED_INDEX_PARSER_MODE_KEY,
+ INVERTED_INDEX_PARSER_COARSE_GRANULARITY);
+ properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY,
+ INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO);
+ properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY,
INVERTED_INDEX_PARSER_TRUE);
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, "");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, "");
+ IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10029,
+ "idx_content_29", 29, "STRING",
"content_29",
+ properties);
+ properties.clear();
+ // parser = chinese, parser_mode = coarse_grained, support_phrase =
false, lower_case = true, char_filter = char_replace
+ properties.emplace(INVERTED_INDEX_PARSER_KEY,
INVERTED_INDEX_PARSER_CHINESE);
+ properties.emplace(INVERTED_INDEX_PARSER_MODE_KEY,
+ INVERTED_INDEX_PARSER_COARSE_GRANULARITY);
+ properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY,
+ INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO);
+ properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY,
INVERTED_INDEX_PARSER_TRUE);
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE,
"char_replace");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "._");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, " ");
+ IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10030,
+ "idx_content_30", 30, "STRING",
"content_30",
+ properties);
+ properties.clear();
+ // parser = chinese, parser_mode = coarse_grained, support_phrase =
false, lower_case = false, char_filter = none
+ properties.emplace(INVERTED_INDEX_PARSER_KEY,
INVERTED_INDEX_PARSER_CHINESE);
+ properties.emplace(INVERTED_INDEX_PARSER_MODE_KEY,
+ INVERTED_INDEX_PARSER_COARSE_GRANULARITY);
+ properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY,
+ INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO);
+ properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY,
INVERTED_INDEX_PARSER_FALSE);
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, "");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, "");
+ IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10031,
+ "idx_content_31", 31, "STRING",
"content_31",
+ properties);
+ properties.clear();
+ // parser = chinese, parser_mode = coarse_grained, support_phrase =
false, lower_case = false, char_filter = char_replace
+ properties.emplace(INVERTED_INDEX_PARSER_KEY,
INVERTED_INDEX_PARSER_CHINESE);
+ properties.emplace(INVERTED_INDEX_PARSER_MODE_KEY,
+ INVERTED_INDEX_PARSER_COARSE_GRANULARITY);
+ properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY,
+ INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO);
+ properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY,
INVERTED_INDEX_PARSER_FALSE);
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE,
"char_replace");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "._");
+ properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, " ");
+ IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10032,
+ "idx_content_32", 32, "STRING",
"content_32",
+ properties);
+ properties.clear();
+ // parser = none, ignore_above = 256
+ properties.emplace(INVERTED_INDEX_PARSER_KEY,
INVERTED_INDEX_PARSER_NONE);
+ properties.emplace(INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY, "256");
+ IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10033,
+ "idx_content_33", 33, "STRING",
"content_33",
+ properties);
+ properties.clear();
+ // parser = none, ignore_above = 16383
+ properties.emplace(INVERTED_INDEX_PARSER_KEY,
INVERTED_INDEX_PARSER_NONE);
+ properties.emplace(INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY, "16383");
+ IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10034,
+ "idx_content_34", 34, "STRING",
"content_34",
+ properties);
+
+ IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10035,
+ "idx_redirect", 35, "STRING",
"redirect");
+ IndexCompactionUtils::construct_column(schema_pb.add_column(),
schema_pb.add_index(), 10036,
+ "idx_namespace", 36, "STRING",
"namespace");
+
+ if (keys_type == KeysType::UNIQUE_KEYS) {
+ // unique table must contain the DELETE_SIGN column
+ auto* column_pb = schema_pb.add_column();
+ IndexCompactionUtils::construct_column(column_pb, 37, "TINYINT",
DELETE_SIGN);
+ column_pb->set_length(1);
+ column_pb->set_index_length(1);
+ column_pb->set_is_nullable(false);
+ }
+
+ _tablet_schema = std::make_shared<TabletSchema>();
+ _tablet_schema->init_from_pb(schema_pb);
+
+ // tablet
+ TabletMetaSharedPtr tablet_meta(new TabletMeta(_tablet_schema));
+ if (keys_type == KeysType::UNIQUE_KEYS) {
+ tablet_meta->_enable_unique_key_merge_on_write = true;
+ }
+ _tablet = std::make_shared<Tablet>(*_engine_ref, tablet_meta,
_data_dir.get());
+ EXPECT_TRUE(_tablet->init().ok());
}
- IndexCompactionTest() = default;
- ~IndexCompactionTest() override = default;
+    // Full-flow check on the wikipedia fixture data: build input rowsets, compact them once
+    // with index compaction and once with normal compaction, then compare the inverted index
+    // files of both outputs for every index id in [10000, 10036].
+    //
+    // with_delete                  - when true, a delete-predicate rowset built from
+    //                                `delete_pred` is added to the tablet and to the inputs.
+    // delete_pred                  - delete predicate text; only used when with_delete is true.
+    // max_rows_per_segment         - forwarded unchanged to IndexCompactionUtils::do_compaction.
+    // output_rowset_segment_number - number of segments each output rowset is expected to have.
+    void _run_normal_wiki_test(bool with_delete = false, const std::string& delete_pred = "",
+                               int64_t max_rows_per_segment = 100000,
+                               int output_rowset_segment_number = 1) {
+        // start from an empty tablet directory
+        EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok());
+        EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok());
+        std::string data_file1 =
+                _current_dir +
+                "/be/test/olap/rowset/segment_v2/inverted_index/data/sorted_wikipedia-50-1.json";
+        std::string data_file2 =
+                _current_dir +
+                "/be/test/olap/rowset/segment_v2/inverted_index/data/sorted_wikipedia-50-2.json";
+        // for MOW table to delete (same file as data_file2, loaded a second time)
+        std::string data_file3 =
+                _current_dir +
+                "/be/test/olap/rowset/segment_v2/inverted_index/data/sorted_wikipedia-50-2.json";
+        std::vector<std::string> data_files;
+        data_files.push_back(data_file1);
+        data_files.push_back(data_file2);
+        data_files.push_back(data_file3);
+
+        std::vector<RowsetSharedPtr> rowsets(data_files.size());
+        // unique-key schemas carry one extra column (DELETE_SIGN), hence the "- 1"
+        auto custom_check_build_rowsets = [this](const int32_t& size) {
+            auto keys_type = _tablet_schema->keys_type();
+            if (keys_type == KeysType::UNIQUE_KEYS) {
+                EXPECT_EQ(size, _tablet_schema->num_columns() - 1);
+            } else {
+                EXPECT_EQ(size, _tablet_schema->num_columns());
+            }
+        };
+        IndexCompactionUtils::build_rowsets<IndexCompactionUtils::WikiDataRow>(
+                _data_dir, _tablet_schema, _tablet, _engine_ref, rowsets, data_files, _inc_id,
+                custom_check_build_rowsets, false, 50);
+
+        if (with_delete) {
+            // create delete predicate rowset and add to tablet
+            auto delete_rowset = IndexCompactionUtils::create_delete_predicate_rowset(
+                    _tablet_schema, delete_pred, _inc_id);
+            EXPECT_TRUE(_tablet->add_rowset(delete_rowset).ok());
+            EXPECT_TRUE(_tablet->rowset_map().size() == (data_files.size() + 1));
+            rowsets.push_back(delete_rowset);
+            EXPECT_TRUE(rowsets.size() == (data_files.size() + 1));
+        }
+        // checks applied to the index compaction run: every indexed column is expected to
+        // take part in index compaction
+        auto custom_check_index = [this, output_rowset_segment_number](
+                                          const BaseCompaction& compaction,
+                                          const RowsetWriterContext& ctx) {
+            auto keys_type = _tablet_schema->keys_type();
+            if (keys_type == KeysType::UNIQUE_KEYS) {
+                EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(),
+                          _tablet_schema->num_columns() - 1);
+                EXPECT_EQ(ctx.columns_to_do_index_compaction.size(),
+                          _tablet_schema->num_columns() - 1);
+            } else {
+                EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(),
+                          _tablet_schema->num_columns());
+                EXPECT_EQ(ctx.columns_to_do_index_compaction.size(), _tablet_schema->num_columns());
+            }
+            EXPECT_TRUE(ctx.columns_to_do_index_compaction.contains(0));
+            EXPECT_TRUE(ctx.columns_to_do_index_compaction.contains(1));
+            EXPECT_TRUE(ctx.columns_to_do_index_compaction.contains(2));
+            EXPECT_TRUE(ctx.columns_to_do_index_compaction.contains(3));
+            EXPECT_EQ(compaction._output_rowset->num_segments(), output_rowset_segment_number)
+                    << compaction._output_rowset->num_segments();
+        };
+
+        RowsetSharedPtr output_rowset_index;
+        Status st;
+        {
+            OlapStopWatch watch;
+            st = IndexCompactionUtils::do_compaction(rowsets, _engine_ref, _tablet, true,
+                                                     output_rowset_index, custom_check_index,
+                                                     max_rows_per_segment);
+            std::cout << "index compaction time: " << watch.get_elapse_second() << "s" << std::endl;
+        }
+        EXPECT_TRUE(st.ok()) << st.to_string();
+
+        // checks applied to the normal compaction run: no column may be index-compacted
+        auto custom_check_normal = [this, output_rowset_segment_number](
+                                           const BaseCompaction& compaction,
+                                           const RowsetWriterContext& ctx) {
+            auto keys_type = _tablet_schema->keys_type();
+            if (keys_type == KeysType::UNIQUE_KEYS) {
+                EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(),
+                          _tablet_schema->num_columns() - 1);
+            } else {
+                EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(),
+                          _tablet_schema->num_columns());
+            }
+            EXPECT_TRUE(ctx.columns_to_do_index_compaction.empty());
+            EXPECT_TRUE(compaction._output_rowset->num_segments() == output_rowset_segment_number)
+                    << compaction._output_rowset->num_segments();
+        };
+
+        RowsetSharedPtr output_rowset_normal;
+        {
+            OlapStopWatch watch;
+            st = IndexCompactionUtils::do_compaction(rowsets, _engine_ref, _tablet, false,
+                                                     output_rowset_normal, custom_check_normal,
+                                                     max_rows_per_segment);
+            std::cout << "normal compaction time: " << watch.get_elapse_second() << "s"
+                      << std::endl;
+        }
+        EXPECT_TRUE(st.ok()) << st.to_string();
+
+        auto num_segments_idx = output_rowset_index->num_segments();
+        auto num_segments_normal = output_rowset_normal->num_segments();
+        // The single-segment comparison is only valid when BOTH outputs have exactly one
+        // segment. A chained `a == b == 1` parses as `(a == b) == 1`, which is also true for
+        // e.g. 2 == 2 and would silently skip every segment after the first.
+        const bool both_single_segment = num_segments_idx == 1 && num_segments_normal == 1;
+        for (int idx = 10000; idx < 10037; idx++) {
+            if (both_single_segment) {
+                // check index file terms for single segment
+                const auto& seg_path = output_rowset_index->segment_path(0);
+                EXPECT_TRUE(seg_path.has_value()) << seg_path.error();
+                auto inverted_index_file_reader_index =
+                        IndexCompactionUtils::init_index_file_reader(
+                                output_rowset_index, seg_path.value(),
+                                _tablet_schema->get_inverted_index_storage_format());
+
+                const auto& seg_path_normal = output_rowset_normal->segment_path(0);
+                EXPECT_TRUE(seg_path_normal.has_value()) << seg_path_normal.error();
+                auto inverted_index_file_reader_normal =
+                        IndexCompactionUtils::init_index_file_reader(
+                                output_rowset_normal, seg_path_normal.value(),
+                                _tablet_schema->get_inverted_index_storage_format());
+
+                auto dir_idx = inverted_index_file_reader_index->_open(idx, "");
+                EXPECT_TRUE(dir_idx.has_value()) << dir_idx.error();
+                auto dir_normal = inverted_index_file_reader_normal->_open(idx, "");
+                EXPECT_TRUE(dir_normal.has_value()) << dir_normal.error();
+                st = IndexCompactionUtils::check_idx_file_correctness(dir_idx->get(),
+                                                                      dir_normal->get());
+                EXPECT_TRUE(st.ok()) << st.to_string();
+            } else {
+                // check index file terms for multiple segments
+                std::vector<std::unique_ptr<DorisCompoundReader>> dirs_idx(num_segments_idx);
+                for (int i = 0; i < num_segments_idx; i++) {
+                    const auto& seg_path = output_rowset_index->segment_path(i);
+                    EXPECT_TRUE(seg_path.has_value()) << seg_path.error();
+                    auto inverted_index_file_reader_index =
+                            IndexCompactionUtils::init_index_file_reader(
+                                    output_rowset_index, seg_path.value(),
+                                    _tablet_schema->get_inverted_index_storage_format());
+                    auto dir_idx = inverted_index_file_reader_index->_open(idx, "");
+                    EXPECT_TRUE(dir_idx.has_value()) << dir_idx.error();
+                    dirs_idx[i] = std::move(dir_idx.value());
+                }
+                std::vector<std::unique_ptr<DorisCompoundReader>> dirs_normal(num_segments_normal);
+                for (int i = 0; i < num_segments_normal; i++) {
+                    const auto& seg_path = output_rowset_normal->segment_path(i);
+                    EXPECT_TRUE(seg_path.has_value()) << seg_path.error();
+                    auto inverted_index_file_reader_normal =
+                            IndexCompactionUtils::init_index_file_reader(
+                                    output_rowset_normal, seg_path.value(),
+                                    _tablet_schema->get_inverted_index_storage_format());
+                    auto dir_normal = inverted_index_file_reader_normal->_open(idx, "");
+                    EXPECT_TRUE(dir_normal.has_value()) << dir_normal.error();
+                    dirs_normal[i] = std::move(dir_normal.value());
+                }
+                st = IndexCompactionUtils::check_idx_file_correctness(dirs_idx, dirs_normal);
+                EXPECT_TRUE(st.ok()) << st.to_string();
+            }
+        }
+    }
private:
TabletSchemaSPtr _tablet_schema = nullptr;
@@ -96,9 +726,11 @@ private:
TabletSharedPtr _tablet = nullptr;
std::string _absolute_dir;
std::string _current_dir;
+ int64_t _inc_id = 1000;
};
TEST_F(IndexCompactionTest, tes_write_index_normally) {
+ _build_tablet();
EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok());
EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok());
std::string data_file1 =
@@ -111,8 +743,9 @@ TEST_F(IndexCompactionTest, tes_write_index_normally) {
std::vector<RowsetSharedPtr> rowsets(data_files.size());
auto custom_check_build_rowsets = [](const int32_t& size) {
EXPECT_EQ(size, 4); };
- IndexCompactionUtils::build_rowsets(_data_dir, _tablet_schema, _tablet,
_engine_ref, rowsets,
- data_files,
custom_check_build_rowsets);
+ IndexCompactionUtils::build_rowsets<IndexCompactionUtils::DataRow>(
+ _data_dir, _tablet_schema, _tablet, _engine_ref, rowsets,
data_files, _inc_id,
+ custom_check_build_rowsets);
auto custom_check_index = [](const BaseCompaction& compaction, const
RowsetWriterContext& ctx) {
EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(), 4);
@@ -179,6 +812,7 @@ TEST_F(IndexCompactionTest, tes_write_index_normally) {
}
TEST_F(IndexCompactionTest, test_col_unique_ids_empty) {
+ _build_tablet();
// clear column unique id in tablet index 10001 and rebuild tablet_schema
TabletSchemaPB schema_pb;
_tablet_schema->to_schema_pb(&schema_pb);
@@ -198,8 +832,9 @@ TEST_F(IndexCompactionTest, test_col_unique_ids_empty) {
std::vector<RowsetSharedPtr> rowsets(data_files.size());
auto custom_check_build_rowsets = [](const int32_t& size) {
EXPECT_EQ(size, 3); };
- IndexCompactionUtils::build_rowsets(_data_dir, _tablet_schema, _tablet,
_engine_ref, rowsets,
- data_files,
custom_check_build_rowsets);
+ IndexCompactionUtils::build_rowsets<IndexCompactionUtils::DataRow>(
+ _data_dir, _tablet_schema, _tablet, _engine_ref, rowsets,
data_files, _inc_id,
+ custom_check_build_rowsets);
auto custom_check_index = [](const BaseCompaction& compaction, const
RowsetWriterContext& ctx) {
EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(), 4);
@@ -229,6 +864,7 @@ TEST_F(IndexCompactionTest, test_col_unique_ids_empty) {
}
TEST_F(IndexCompactionTest, test_tablet_index_id_not_equal) {
+ _build_tablet();
// replace unique id from 2 to 1 in tablet index 10002 and rebuild
tablet_schema
TabletSchemaPB schema_pb;
_tablet_schema->to_schema_pb(&schema_pb);
@@ -248,8 +884,9 @@ TEST_F(IndexCompactionTest, test_tablet_index_id_not_equal)
{
std::vector<RowsetSharedPtr> rowsets(data_files.size());
auto custom_check_build_rowsets = [](const int32_t& size) {
EXPECT_EQ(size, 3); };
- IndexCompactionUtils::build_rowsets(_data_dir, _tablet_schema, _tablet,
_engine_ref, rowsets,
- data_files,
custom_check_build_rowsets);
+ IndexCompactionUtils::build_rowsets<IndexCompactionUtils::DataRow>(
+ _data_dir, _tablet_schema, _tablet, _engine_ref, rowsets,
data_files, _inc_id,
+ custom_check_build_rowsets);
auto custom_check_index = [](const BaseCompaction& compaction, const
RowsetWriterContext& ctx) {
EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(), 4);
@@ -279,6 +916,7 @@ TEST_F(IndexCompactionTest, test_tablet_index_id_not_equal)
{
}
TEST_F(IndexCompactionTest, test_tablet_schema_tablet_index_is_null) {
+ _build_tablet();
// set index suffix in tablet index 10001 and rebuild tablet_schema
// simulate the case that index is null, tablet_schema->inverted_index(1)
will return nullptr
TabletSchemaPB schema_pb;
@@ -299,8 +937,9 @@ TEST_F(IndexCompactionTest,
test_tablet_schema_tablet_index_is_null) {
std::vector<RowsetSharedPtr> rowsets(data_files.size());
auto custom_check_build_rowsets = [](const int32_t& size) {
EXPECT_EQ(size, 3); };
- IndexCompactionUtils::build_rowsets(_data_dir, _tablet_schema, _tablet,
_engine_ref, rowsets,
- data_files,
custom_check_build_rowsets);
+ IndexCompactionUtils::build_rowsets<IndexCompactionUtils::DataRow>(
+ _data_dir, _tablet_schema, _tablet, _engine_ref, rowsets,
data_files, _inc_id,
+ custom_check_build_rowsets);
auto custom_check_index = [](const BaseCompaction& compaction, const
RowsetWriterContext& ctx) {
EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(), 4);
@@ -330,6 +969,7 @@ TEST_F(IndexCompactionTest,
test_tablet_schema_tablet_index_is_null) {
}
TEST_F(IndexCompactionTest, test_rowset_schema_tablet_index_is_null) {
+ _build_tablet();
EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok());
EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok());
std::string data_file1 =
@@ -342,8 +982,9 @@ TEST_F(IndexCompactionTest,
test_rowset_schema_tablet_index_is_null) {
std::vector<RowsetSharedPtr> rowsets(data_files.size());
auto custom_check_build_rowsets = [](const int32_t& size) {
EXPECT_EQ(size, 4); };
- IndexCompactionUtils::build_rowsets(_data_dir, _tablet_schema, _tablet,
_engine_ref, rowsets,
- data_files,
custom_check_build_rowsets);
+ IndexCompactionUtils::build_rowsets<IndexCompactionUtils::DataRow>(
+ _data_dir, _tablet_schema, _tablet, _engine_ref, rowsets,
data_files, _inc_id,
+ custom_check_build_rowsets);
auto custom_check_index = [](const BaseCompaction& compaction, const
RowsetWriterContext& ctx) {
EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(), 4);
@@ -375,7 +1016,7 @@ TEST_F(IndexCompactionTest,
test_rowset_schema_tablet_index_is_null) {
_tablet_schema->get_inverted_index_storage_format());
// check index file
- // index 10001 cannot be found in idx file
+ // index 10001 should be found in idx file, it can be produced by normal
compaction
auto dir_idx_compaction = inverted_index_file_reader_index->_open(10001,
"");
EXPECT_TRUE(dir_idx_compaction.has_value()) << dir_idx_compaction.error();
// check index 10001 term stats
@@ -386,6 +1027,7 @@ TEST_F(IndexCompactionTest,
test_rowset_schema_tablet_index_is_null) {
}
TEST_F(IndexCompactionTest, test_tablet_index_properties_not_equal) {
+ _build_tablet();
// add mock property in tablet index 10001 and rebuild tablet_schema
// simulate the case that index properties not equal among input rowsets
TabletSchemaSPtr mock_schema = std::make_shared<TabletSchema>();
@@ -407,8 +1049,9 @@ TEST_F(IndexCompactionTest,
test_tablet_index_properties_not_equal) {
std::vector<RowsetSharedPtr> rowsets(data_files.size());
auto custom_check_build_rowsets = [](const int32_t& size) {
EXPECT_EQ(size, 4); };
- IndexCompactionUtils::build_rowsets(_data_dir, _tablet_schema, _tablet,
_engine_ref, rowsets,
- data_files,
custom_check_build_rowsets);
+ IndexCompactionUtils::build_rowsets<IndexCompactionUtils::DataRow>(
+ _data_dir, _tablet_schema, _tablet, _engine_ref, rowsets,
data_files, _inc_id,
+ custom_check_build_rowsets);
auto custom_check_index = [](const BaseCompaction& compaction, const
RowsetWriterContext& ctx) {
EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(), 4);
@@ -443,6 +1086,7 @@ TEST_F(IndexCompactionTest,
test_tablet_index_properties_not_equal) {
}
TEST_F(IndexCompactionTest, test_is_skip_index_compaction_not_empty) {
+ _build_tablet();
EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok());
EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok());
std::string data_file1 =
@@ -455,8 +1099,9 @@ TEST_F(IndexCompactionTest,
test_is_skip_index_compaction_not_empty) {
std::vector<RowsetSharedPtr> rowsets(data_files.size());
auto custom_check_build_rowsets = [](const int32_t& size) {
EXPECT_EQ(size, 4); };
- IndexCompactionUtils::build_rowsets(_data_dir, _tablet_schema, _tablet,
_engine_ref, rowsets,
- data_files,
custom_check_build_rowsets);
+ IndexCompactionUtils::build_rowsets<IndexCompactionUtils::DataRow>(
+ _data_dir, _tablet_schema, _tablet, _engine_ref, rowsets,
data_files, _inc_id,
+ custom_check_build_rowsets);
auto custom_check_index = [](const BaseCompaction& compaction, const
RowsetWriterContext& ctx) {
EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(), 4);
@@ -491,6 +1136,7 @@ TEST_F(IndexCompactionTest,
test_is_skip_index_compaction_not_empty) {
}
TEST_F(IndexCompactionTest, test_rowset_fs_nullptr) {
+ _build_tablet();
EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok());
EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok());
std::string data_file1 =
@@ -503,8 +1149,9 @@ TEST_F(IndexCompactionTest, test_rowset_fs_nullptr) {
std::vector<RowsetSharedPtr> rowsets(data_files.size());
auto custom_check_build_rowsets = [](const int32_t& size) {
EXPECT_EQ(size, 4); };
- IndexCompactionUtils::build_rowsets(_data_dir, _tablet_schema, _tablet,
_engine_ref, rowsets,
- data_files,
custom_check_build_rowsets);
+ IndexCompactionUtils::build_rowsets<IndexCompactionUtils::DataRow>(
+ _data_dir, _tablet_schema, _tablet, _engine_ref, rowsets,
data_files, _inc_id,
+ custom_check_build_rowsets);
auto custom_check_index = [](const BaseCompaction& compaction, const
RowsetWriterContext& ctx) {
EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(), 4);
@@ -529,6 +1176,7 @@ TEST_F(IndexCompactionTest, test_rowset_fs_nullptr) {
}
TEST_F(IndexCompactionTest, test_input_row_num_zero) {
+ _build_tablet();
EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok());
EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok());
std::string data_file1 =
@@ -541,8 +1189,9 @@ TEST_F(IndexCompactionTest, test_input_row_num_zero) {
std::vector<RowsetSharedPtr> rowsets(data_files.size());
auto custom_check_build_rowsets = [](const int32_t& size) {
EXPECT_EQ(size, 4); };
- IndexCompactionUtils::build_rowsets(_data_dir, _tablet_schema, _tablet,
_engine_ref, rowsets,
- data_files,
custom_check_build_rowsets);
+ IndexCompactionUtils::build_rowsets<IndexCompactionUtils::DataRow>(
+ _data_dir, _tablet_schema, _tablet, _engine_ref, rowsets,
data_files, _inc_id,
+ custom_check_build_rowsets);
auto custom_check_index = [](const BaseCompaction& compaction, const
RowsetWriterContext& ctx) {
EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(), 4);
@@ -582,6 +1231,7 @@ TEST_F(IndexCompactionTest, test_input_row_num_zero) {
}
TEST_F(IndexCompactionTest, test_cols_to_do_index_compaction_empty) {
+ _build_tablet();
// add mock property in tablet index 10001, 10002 and rebuild tablet_schema
// simulate the case that index properties not equal among input rowsets
// the two cols will skip index compaction and make
ctx.columns_to_do_index_compaction empty
@@ -606,8 +1256,9 @@ TEST_F(IndexCompactionTest,
test_cols_to_do_index_compaction_empty) {
std::vector<RowsetSharedPtr> rowsets(data_files.size());
auto custom_check_build_rowsets = [](const int32_t& size) {
EXPECT_EQ(size, 4); };
- IndexCompactionUtils::build_rowsets(_data_dir, _tablet_schema, _tablet,
_engine_ref, rowsets,
- data_files,
custom_check_build_rowsets);
+ IndexCompactionUtils::build_rowsets<IndexCompactionUtils::DataRow>(
+ _data_dir, _tablet_schema, _tablet, _engine_ref, rowsets,
data_files, _inc_id,
+ custom_check_build_rowsets);
auto custom_check_index = [](const BaseCompaction& compaction, const
RowsetWriterContext& ctx) {
EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(), 4);
@@ -644,6 +1295,7 @@ TEST_F(IndexCompactionTest,
test_cols_to_do_index_compaction_empty) {
}
TEST_F(IndexCompactionTest, test_index_compaction_with_delete) {
+ _build_tablet();
EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok());
EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok());
std::string data_file1 =
@@ -656,12 +1308,13 @@ TEST_F(IndexCompactionTest,
test_index_compaction_with_delete) {
std::vector<RowsetSharedPtr> rowsets(data_files.size());
auto custom_check_build_rowsets = [](const int32_t& size) {
EXPECT_EQ(size, 4); };
- IndexCompactionUtils::build_rowsets(_data_dir, _tablet_schema, _tablet,
_engine_ref, rowsets,
- data_files,
custom_check_build_rowsets);
+ IndexCompactionUtils::build_rowsets<IndexCompactionUtils::DataRow>(
+ _data_dir, _tablet_schema, _tablet, _engine_ref, rowsets,
data_files, _inc_id,
+ custom_check_build_rowsets);
// create delete predicate rowset and add to tablet
auto delete_rowset = IndexCompactionUtils::create_delete_predicate_rowset(
- _tablet_schema, "v1='great'", inc_id++);
+ _tablet_schema, "v1='great'", _inc_id);
EXPECT_TRUE(_tablet->add_rowset(delete_rowset).ok());
EXPECT_TRUE(_tablet->rowset_map().size() == 3);
rowsets.push_back(delete_rowset);
@@ -731,4 +1384,197 @@ TEST_F(IndexCompactionTest,
test_index_compaction_with_delete) {
IndexCompactionUtils::check_meta_and_file(output_rowset_normal,
_tablet_schema, query_map);
}
+TEST_F(IndexCompactionTest, tes_wikipedia_dup_v2) {
+ _build_wiki_tablet(KeysType::DUP_KEYS, InvertedIndexStorageFormatPB::V2);
+ _run_normal_wiki_test();
+}
+
+TEST_F(IndexCompactionTest, tes_wikipedia_mow_v2) {
+ _build_wiki_tablet(KeysType::UNIQUE_KEYS,
InvertedIndexStorageFormatPB::V2);
+ _run_normal_wiki_test();
+}
+
+TEST_F(IndexCompactionTest, tes_wikipedia_dup_v2_with_partial_delete) {
+ _build_wiki_tablet(KeysType::DUP_KEYS, InvertedIndexStorageFormatPB::V2);
+ _run_normal_wiki_test(true, "namespace='Adel, OR'");
+}
+
+TEST_F(IndexCompactionTest, tes_wikipedia_mow_v2_with_partial_delete) {
+ _build_wiki_tablet(KeysType::UNIQUE_KEYS,
InvertedIndexStorageFormatPB::V2);
+ _run_normal_wiki_test(true, "namespace='Adel, OR'");
+}
+
+TEST_F(IndexCompactionTest, tes_wikipedia_dup_v2_with_total_delete) {
+ _build_wiki_tablet(KeysType::DUP_KEYS, InvertedIndexStorageFormatPB::V2);
+ std::string delete_pred = "title IS NOT NULL";
+
EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok());
+
EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok());
+ std::string data_file1 =
+ _current_dir +
+
"/be/test/olap/rowset/segment_v2/inverted_index/data/sorted_wikipedia-50-1.json";
+ std::string data_file2 =
+ _current_dir +
+
"/be/test/olap/rowset/segment_v2/inverted_index/data/sorted_wikipedia-50-2.json";
+ // for MOW table to delete
+ std::string data_file3 =
+ _current_dir +
+
"/be/test/olap/rowset/segment_v2/inverted_index/data/sorted_wikipedia-50-2.json";
+ std::vector<std::string> data_files;
+ data_files.push_back(data_file1);
+ data_files.push_back(data_file2);
+ data_files.push_back(data_file3);
+
+ std::vector<RowsetSharedPtr> rowsets(data_files.size());
+ auto custom_check_build_rowsets = [this](const int32_t& size) {
+ EXPECT_EQ(size, _tablet_schema->num_columns());
+ };
+ IndexCompactionUtils::build_rowsets<IndexCompactionUtils::WikiDataRow>(
+ _data_dir, _tablet_schema, _tablet, _engine_ref, rowsets,
data_files, _inc_id,
+ custom_check_build_rowsets, false, 50);
+
+ // create delete predicate rowset and add to tablet
+ auto delete_rowset =
IndexCompactionUtils::create_delete_predicate_rowset(_tablet_schema,
+
delete_pred, _inc_id);
+ EXPECT_TRUE(_tablet->add_rowset(delete_rowset).ok());
+ EXPECT_TRUE(_tablet->rowset_map().size() == (data_files.size() + 1));
+ rowsets.push_back(delete_rowset);
+ EXPECT_TRUE(rowsets.size() == (data_files.size() + 1));
+
+ auto custom_check_index = [this](const BaseCompaction& compaction,
+ const RowsetWriterContext& ctx) {
+ EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(),
+ _tablet_schema->num_columns());
+ EXPECT_TRUE(ctx.columns_to_do_index_compaction.size() ==
_tablet_schema->num_columns());
+ EXPECT_TRUE(ctx.columns_to_do_index_compaction.contains(0));
+ EXPECT_TRUE(ctx.columns_to_do_index_compaction.contains(1));
+ EXPECT_TRUE(ctx.columns_to_do_index_compaction.contains(2));
+ EXPECT_TRUE(ctx.columns_to_do_index_compaction.contains(3));
+ EXPECT_TRUE(compaction._output_rowset->num_segments() == 0);
+ };
+
+ RowsetSharedPtr output_rowset_index;
+ Status st;
+ {
+ OlapStopWatch watch;
+ st = IndexCompactionUtils::do_compaction(rowsets, _engine_ref,
_tablet, true,
+ output_rowset_index,
custom_check_index);
+ std::cout << "index compaction time: " << watch.get_elapse_second() <<
"s" << std::endl;
+ }
+ EXPECT_TRUE(st.ok()) << st.to_string();
+
+ auto custom_check_normal = [this](const BaseCompaction& compaction,
+ const RowsetWriterContext& ctx) {
+ EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(),
+ _tablet_schema->num_columns());
+ EXPECT_TRUE(ctx.columns_to_do_index_compaction.size() == 0);
+ EXPECT_TRUE(compaction._output_rowset->num_segments() == 0);
+ };
+
+ RowsetSharedPtr output_rowset_normal;
+ {
+ OlapStopWatch watch;
+ st = IndexCompactionUtils::do_compaction(rowsets, _engine_ref,
_tablet, false,
+ output_rowset_normal,
custom_check_normal);
+ std::cout << "normal compaction time: " << watch.get_elapse_second()
<< "s" << std::endl;
+ }
+ EXPECT_TRUE(st.ok()) << st.to_string();
+}
+
+TEST_F(IndexCompactionTest, tes_wikipedia_mow_v2_with_total_delete) {
+ _build_wiki_tablet(KeysType::UNIQUE_KEYS,
InvertedIndexStorageFormatPB::V2);
+ std::string delete_pred = "title IS NOT NULL";
+
EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok());
+
EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok());
+ std::string data_file1 =
+ _current_dir +
+
"/be/test/olap/rowset/segment_v2/inverted_index/data/sorted_wikipedia-50-1.json";
+ std::string data_file2 =
+ _current_dir +
+
"/be/test/olap/rowset/segment_v2/inverted_index/data/sorted_wikipedia-50-2.json";
+ // for MOW table to delete
+ std::string data_file3 =
+ _current_dir +
+
"/be/test/olap/rowset/segment_v2/inverted_index/data/sorted_wikipedia-50-2.json";
+ std::vector<std::string> data_files;
+ data_files.push_back(data_file1);
+ data_files.push_back(data_file2);
+ data_files.push_back(data_file3);
+
+ std::vector<RowsetSharedPtr> rowsets(data_files.size());
+ auto custom_check_build_rowsets = [this](const int32_t& size) {
+ EXPECT_EQ(size, _tablet_schema->num_columns() - 1);
+ };
+ IndexCompactionUtils::build_rowsets<IndexCompactionUtils::WikiDataRow>(
+ _data_dir, _tablet_schema, _tablet, _engine_ref, rowsets,
data_files, _inc_id,
+ custom_check_build_rowsets, false, 50);
+
+ // create delete predicate rowset and add to tablet
+ auto delete_rowset =
IndexCompactionUtils::create_delete_predicate_rowset(_tablet_schema,
+
delete_pred, _inc_id);
+ EXPECT_TRUE(_tablet->add_rowset(delete_rowset).ok());
+ EXPECT_TRUE(_tablet->rowset_map().size() == (data_files.size() + 1));
+ rowsets.push_back(delete_rowset);
+ EXPECT_TRUE(rowsets.size() == (data_files.size() + 1));
+
+ auto custom_check_index = [this](const BaseCompaction& compaction,
+ const RowsetWriterContext& ctx) {
+ EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(),
+ _tablet_schema->num_columns() - 1);
+ EXPECT_EQ(ctx.columns_to_do_index_compaction.size(),
_tablet_schema->num_columns() - 1);
+ EXPECT_TRUE(ctx.columns_to_do_index_compaction.contains(0));
+ EXPECT_TRUE(ctx.columns_to_do_index_compaction.contains(1));
+ EXPECT_TRUE(ctx.columns_to_do_index_compaction.contains(2));
+ EXPECT_TRUE(ctx.columns_to_do_index_compaction.contains(3));
+ EXPECT_TRUE(compaction._output_rowset->num_segments() == 0);
+ };
+
+ RowsetSharedPtr output_rowset_index;
+ Status st;
+ {
+ OlapStopWatch watch;
+ st = IndexCompactionUtils::do_compaction(rowsets, _engine_ref,
_tablet, true,
+ output_rowset_index,
custom_check_index);
+ std::cout << "index compaction time: " << watch.get_elapse_second() <<
"s" << std::endl;
+ }
+ EXPECT_TRUE(st.ok()) << st.to_string();
+
+ auto custom_check_normal = [this](const BaseCompaction& compaction,
+ const RowsetWriterContext& ctx) {
+ EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(),
+ _tablet_schema->num_columns() - 1);
+ EXPECT_TRUE(ctx.columns_to_do_index_compaction.size() == 0);
+ EXPECT_TRUE(compaction._output_rowset->num_segments() == 0);
+ };
+
+ RowsetSharedPtr output_rowset_normal;
+ {
+ OlapStopWatch watch;
+ st = IndexCompactionUtils::do_compaction(rowsets, _engine_ref,
_tablet, false,
+ output_rowset_normal,
custom_check_normal);
+ std::cout << "normal compaction time: " << watch.get_elapse_second()
<< "s" << std::endl;
+ }
+ EXPECT_TRUE(st.ok()) << st.to_string();
+}
+
+TEST_F(IndexCompactionTest, tes_wikipedia_dup_v2_multiple_dest_segments) {
+ _build_wiki_tablet(KeysType::DUP_KEYS, InvertedIndexStorageFormatPB::V2);
+ _run_normal_wiki_test(false, "", 50, 3);
+}
+
+TEST_F(IndexCompactionTest, tes_wikipedia_mow_v2_multiple_dest_segments) {
+ _build_wiki_tablet(KeysType::UNIQUE_KEYS,
InvertedIndexStorageFormatPB::V2);
+ _run_normal_wiki_test(false, "", 50, 2);
+}
+
+TEST_F(IndexCompactionTest, tes_wikipedia_dup_v2_multiple_src_lucene_segments)
{
+ config::inverted_index_max_buffered_docs = 100;
+ _build_wiki_tablet(KeysType::DUP_KEYS, InvertedIndexStorageFormatPB::V2);
+ _run_normal_wiki_test();
+}
+
+TEST_F(IndexCompactionTest, tes_wikipedia_mow_v2_multiple_src_lucene_segments)
{
+ config::inverted_index_max_buffered_docs = 100;
+ _build_wiki_tablet(KeysType::UNIQUE_KEYS,
InvertedIndexStorageFormatPB::V2);
+ _run_normal_wiki_test();
+}
} // namespace doris
diff --git
a/be/test/olap/rowset/segment_v2/inverted_index/compaction/util/index_compaction_utils.cpp
b/be/test/olap/rowset/segment_v2/inverted_index/compaction/util/index_compaction_utils.cpp
index 530dca8054c..02353fc5441 100644
---
a/be/test/olap/rowset/segment_v2/inverted_index/compaction/util/index_compaction_utils.cpp
+++
b/be/test/olap/rowset/segment_v2/inverted_index/compaction/util/index_compaction_utils.cpp
@@ -21,6 +21,7 @@
#include <iomanip>
#include <iostream>
#include <memory>
+#include <nlohmann/json.hpp>
#include <sstream>
#include <vector>
@@ -38,7 +39,6 @@
namespace doris {
-static int64_t inc_id = 1000;
const static std::string expected_output =
"Max Docs: 2000\n"
"Num Docs: 2000\n"
@@ -76,8 +76,18 @@ class IndexCompactionUtils {
std::string url;
int num;
};
+ struct WikiDataRow {
+ std::string title;
+ std::string content;
+ std::string redirect;
+ std::string space;
+ };
+
+ template <typename T>
+ static std::vector<T> read_data(const std::string& file_name);
- static std::vector<DataRow> read_data(const std::string file_name) {
+ template <>
+ std::vector<DataRow> read_data<DataRow>(const std::string& file_name) {
std::ifstream file(file_name);
EXPECT_TRUE(file.is_open());
@@ -103,6 +113,38 @@ class IndexCompactionUtils {
return data;
}
+ template <>
+ std::vector<WikiDataRow> read_data<WikiDataRow>(const std::string&
file_name) {
+ std::ifstream file(file_name);
+ EXPECT_TRUE(file.is_open());
+
+ std::vector<WikiDataRow> data;
+ std::string line;
+
+ while (std::getline(file, line)) {
+ if (line.empty()) {
+ continue;
+ }
+ // catch parse exception and continue
+ try {
+ nlohmann::json j = nlohmann::json::parse(line);
+ WikiDataRow row;
+ row.title = j.value("title", "null");
+ row.content = j.value("content", "null");
+ row.redirect = j.value("redirect", "null");
+ row.space = j.value("space", "null");
+
+ data.emplace_back(std::move(row));
+ } catch (const std::exception& e) {
+ std::cout << "parse json error: " << e.what() << std::endl;
+ continue;
+ }
+ }
+
+ file.close();
+ return data;
+ }
+
static bool query_bkd(const TabletIndex* index,
std::shared_ptr<InvertedIndexFileReader>&
inverted_index_file_reader,
const std::vector<int>& query_data,
@@ -233,85 +275,82 @@ class IndexCompactionUtils {
r->close();
_CLLDELETE(r);
}
- static Status check_idx_file_correctness(lucene::store::Directory*
index_reader,
- lucene::store::Directory*
tmp_index_reader) {
- lucene::index::IndexReader* idx_reader =
lucene::index::IndexReader::open(index_reader);
- lucene::index::IndexReader* tmp_idx_reader =
- lucene::index::IndexReader::open(tmp_index_reader);
-
+ static Status check_idx_file_correctness_impl(lucene::index::IndexReader*
idx_reader,
+ lucene::index::IndexReader*
normal_idx_reader) {
// compare numDocs
- if (idx_reader->numDocs() != tmp_idx_reader->numDocs()) {
+ if (idx_reader->numDocs() != normal_idx_reader->numDocs()) {
return Status::InternalError(
"index compaction correctness check failed, numDocs not
equal, idx_numDocs={}, "
- "tmp_idx_numDocs={}",
- idx_reader->numDocs(), tmp_idx_reader->numDocs());
+ "normal_idx_numDocs={}",
+ idx_reader->numDocs(), normal_idx_reader->numDocs());
}
lucene::index::TermEnum* term_enum = idx_reader->terms();
- lucene::index::TermEnum* tmp_term_enum = tmp_idx_reader->terms();
+ lucene::index::TermEnum* normal_term_enum = normal_idx_reader->terms();
lucene::index::TermDocs* term_docs = nullptr;
- lucene::index::TermDocs* tmp_term_docs = nullptr;
+ lucene::index::TermDocs* normal_term_docs = nullptr;
// iterate TermEnum
- while (term_enum->next() && tmp_term_enum->next()) {
+ while (term_enum->next() && normal_term_enum->next()) {
std::string token =
lucene_wcstoutf8string(term_enum->term(false)->text(),
term_enum->term(false)->textLength());
std::string field = lucene_wcstoutf8string(
term_enum->term(false)->field(),
lenOfString(term_enum->term(false)->field()));
- std::string tmp_token = lucene_wcstoutf8string(
- tmp_term_enum->term(false)->text(),
tmp_term_enum->term(false)->textLength());
- std::string tmp_field =
- lucene_wcstoutf8string(tmp_term_enum->term(false)->field(),
-
lenOfString(tmp_term_enum->term(false)->field()));
+ std::string normal_token =
+
lucene_wcstoutf8string(normal_term_enum->term(false)->text(),
+
normal_term_enum->term(false)->textLength());
+ std::string normal_field =
+
lucene_wcstoutf8string(normal_term_enum->term(false)->field(),
+
lenOfString(normal_term_enum->term(false)->field()));
// compare token and field
- if (field != tmp_field) {
+ if (field != normal_field) {
return Status::InternalError(
"index compaction correctness check failed, fields not
equal, field={}, "
- "tmp_field={}",
+ "normal_field={}",
field, field);
}
- if (token != tmp_token) {
+ if (token != normal_token) {
return Status::InternalError(
"index compaction correctness check failed, tokens not
equal, token={}, "
- "tmp_token={}",
- token, tmp_token);
+ "normal_token={}",
+ token, normal_token);
}
// get term's docId and freq
term_docs = idx_reader->termDocs(term_enum->term(false));
- tmp_term_docs =
tmp_idx_reader->termDocs(tmp_term_enum->term(false));
+ normal_term_docs =
normal_idx_reader->termDocs(normal_term_enum->term(false));
// compare term's docId and freq
- while (term_docs->next() && tmp_term_docs->next()) {
- if (term_docs->doc() != tmp_term_docs->doc() ||
- term_docs->freq() != tmp_term_docs->freq()) {
+ while (term_docs->next() && normal_term_docs->next()) {
+ if (term_docs->doc() != normal_term_docs->doc() ||
+ term_docs->freq() != normal_term_docs->freq()) {
return Status::InternalError(
"index compaction correctness check failed, docId
or freq not equal, "
- "docId={}, tmp_docId={}, freq={}, tmp_freq={}",
- term_docs->doc(), tmp_term_docs->doc(),
term_docs->freq(),
- tmp_term_docs->freq());
+ "docId={}, normal_docId={}, freq={},
normal_freq={}",
+ term_docs->doc(), normal_term_docs->doc(),
term_docs->freq(),
+ normal_term_docs->freq());
}
}
// check if there are remaining docs
- if (term_docs->next() || tmp_term_docs->next()) {
+ if (term_docs->next() || normal_term_docs->next()) {
return Status::InternalError(
"index compaction correctness check failed, number of
docs not equal for "
- "term={}, tmp_term={}",
- token, tmp_token);
+ "term={}, normal_term={}",
+ token, normal_token);
}
if (term_docs) {
term_docs->close();
_CLLDELETE(term_docs);
}
- if (tmp_term_docs) {
- tmp_term_docs->close();
- _CLLDELETE(tmp_term_docs);
+ if (normal_term_docs) {
+ normal_term_docs->close();
+ _CLLDELETE(normal_term_docs);
}
}
// check if there are remaining terms
- if (term_enum->next() || tmp_term_enum->next()) {
+ if (term_enum->next() || normal_term_enum->next()) {
return Status::InternalError(
"index compaction correctness check failed, number of
terms not equal");
}
@@ -319,27 +358,61 @@ class IndexCompactionUtils {
term_enum->close();
_CLLDELETE(term_enum);
}
- if (tmp_term_enum) {
- tmp_term_enum->close();
- _CLLDELETE(tmp_term_enum);
+ if (normal_term_enum) {
+ normal_term_enum->close();
+ _CLLDELETE(normal_term_enum);
}
if (idx_reader) {
idx_reader->close();
_CLLDELETE(idx_reader);
}
- if (tmp_idx_reader) {
- tmp_idx_reader->close();
- _CLLDELETE(tmp_idx_reader);
+ if (normal_idx_reader) {
+ normal_idx_reader->close();
+ _CLLDELETE(normal_idx_reader);
}
return Status::OK();
}
+ static Status check_idx_file_correctness(lucene::store::Directory*
index_reader,
+ lucene::store::Directory*
normal_index_reader) {
+ lucene::index::IndexReader* idx_reader =
lucene::index::IndexReader::open(index_reader);
+ lucene::index::IndexReader* normal_idx_reader =
+ lucene::index::IndexReader::open(normal_index_reader);
+
+ return check_idx_file_correctness_impl(idx_reader, normal_idx_reader);
+ }
+
+ static Status check_idx_file_correctness(
+ const std::vector<std::unique_ptr<DorisCompoundReader>>&
index_readers,
+ const std::vector<std::unique_ptr<DorisCompoundReader>>&
normal_index_readers) {
+ ValueArray<lucene::index::IndexReader*> readers(index_readers.size());
+ for (int i = 0; i < index_readers.size(); i++) {
+ lucene::index::IndexReader* idx_reader =
+ lucene::index::IndexReader::open(index_readers[i].get());
+ readers[i] = idx_reader;
+ }
+ ValueArray<lucene::index::IndexReader*>
normal_readers(normal_index_readers.size());
+ for (int i = 0; i < normal_index_readers.size(); i++) {
+ lucene::index::IndexReader* normal_idx_reader =
+
lucene::index::IndexReader::open(normal_index_readers[i].get());
+ normal_readers[i] = normal_idx_reader;
+ }
+
+ auto* idx_reader = new lucene::index::MultiReader(&readers, true);
+ auto* normal_idx_reader = new
lucene::index::MultiReader(&normal_readers, true);
+
+ return check_idx_file_correctness_impl(idx_reader, normal_idx_reader);
+ }
+
static Status do_compaction(
const std::vector<RowsetSharedPtr>& rowsets, StorageEngine*
engine_ref,
const TabletSharedPtr& tablet, bool is_index_compaction,
RowsetSharedPtr& rowset_ptr,
const std::function<void(const BaseCompaction&, const
RowsetWriterContext&)>
- custom_check = nullptr) {
+ custom_check = nullptr,
+ int64_t max_rows_per_segment = 100000) {
config::inverted_index_compaction_enable = is_index_compaction;
+ // control max rows in one block
+ config::compaction_batch_size = max_rows_per_segment;
// only base compaction can handle delete predicate
BaseCompaction compaction(*engine_ref, tablet);
compaction._input_rowsets = std::move(rowsets);
@@ -349,12 +422,13 @@ class IndexCompactionUtils {
create_input_rowsets_readers(compaction, input_rs_readers);
RowsetWriterContext ctx;
+ ctx.max_rows_per_segment = max_rows_per_segment;
RETURN_IF_ERROR(compaction.construct_output_rowset_writer(ctx));
compaction._stats.rowid_conversion =
compaction._rowid_conversion.get();
RETURN_IF_ERROR(Merger::vertical_merge_rowsets(
tablet, compaction.compaction_type(),
*(compaction._cur_tablet_schema),
- input_rs_readers, compaction._output_rs_writer.get(), 100000,
5,
+ input_rs_readers, compaction._output_rs_writer.get(),
max_rows_per_segment - 1, 5,
&compaction._stats));
const auto& dst_writer =
@@ -409,36 +483,41 @@ class IndexCompactionUtils {
}
static RowsetSharedPtr create_delete_predicate_rowset(const
TabletSchemaSPtr& schema,
- std::string pred,
int64_t version) {
+ std::string pred,
int64& inc_id) {
DeletePredicatePB del_pred;
del_pred.add_sub_predicates(pred);
del_pred.set_version(1);
RowsetMetaSharedPtr rsm(new RowsetMeta());
- init_rs_meta(rsm, version, version);
+ init_rs_meta(rsm, inc_id, inc_id);
RowsetId id;
- id.init(version);
+ id.init(inc_id);
rsm->set_rowset_id(id);
rsm->set_delete_predicate(std::move(del_pred));
rsm->set_tablet_schema(schema);
+ inc_id++;
return std::make_shared<BetaRowset>(schema, rsm, "");
}
static void construct_column(ColumnPB* column_pb, TabletIndexPB*
tablet_index, int64_t index_id,
const std::string& index_name, int32_t
col_unique_id,
const std::string& column_type, const
std::string& column_name,
- bool parser = false) {
+ const std::map<std::string, std::string>&
properties =
+ std::map<std::string, std::string>(),
+ bool is_key = false) {
column_pb->set_unique_id(col_unique_id);
column_pb->set_name(column_name);
column_pb->set_type(column_type);
- column_pb->set_is_key(false);
+ column_pb->set_is_key(is_key);
column_pb->set_is_nullable(true);
tablet_index->set_index_id(index_id);
tablet_index->set_index_name(index_name);
tablet_index->set_index_type(IndexType::INVERTED);
tablet_index->add_col_unique_id(col_unique_id);
- if (parser) {
- auto* properties = tablet_index->mutable_properties();
- (*properties)[INVERTED_INDEX_PARSER_KEY] =
INVERTED_INDEX_PARSER_UNICODE;
+ if (!properties.empty()) {
+ auto* pros = tablet_index->mutable_properties();
+ for (const auto& [key, value] : properties) {
+ (*pros)[key] = value;
+ }
}
}
@@ -521,7 +600,8 @@ class IndexCompactionUtils {
static RowsetWriterContext rowset_writer_context(const
std::unique_ptr<DataDir>& data_dir,
const TabletSchemaSPtr&
schema,
- const std::string&
tablet_path) {
+ const std::string&
tablet_path, int64& inc_id,
+ int64
max_rows_per_segment = 200) {
RowsetWriterContext context;
RowsetId rowset_id;
rowset_id.init(inc_id);
@@ -532,23 +612,28 @@ class IndexCompactionUtils {
context.tablet_schema = schema;
context.tablet_path = tablet_path;
context.version = Version(inc_id, inc_id);
- context.max_rows_per_segment = 200;
+ context.max_rows_per_segment = max_rows_per_segment;
inc_id++;
return context;
}
+ template <typename T>
static void build_rowsets(const std::unique_ptr<DataDir>& data_dir,
const TabletSchemaSPtr& schema, const
TabletSharedPtr& tablet,
StorageEngine* engine_ref,
std::vector<RowsetSharedPtr>& rowsets,
- const std::vector<std::string>& data_files,
- const std::function<void(const int32_t&)>
custom_check = nullptr) {
- std::vector<std::vector<DataRow>> data;
- for (auto file : data_files) {
- data.emplace_back(read_data(file));
+ const std::vector<std::string>& data_files,
int64& inc_id,
+ const std::function<void(const int32_t&)>
custom_check = nullptr,
+ const bool& is_performance = false,
+ int64 max_rows_per_segment = 200) {
+ std::vector<std::vector<T>> data;
+ for (const auto& file : data_files) {
+ data.emplace_back(read_data<T>(file));
}
for (int i = 0; i < data.size(); i++) {
const auto& res = RowsetFactory::create_rowset_writer(
- *engine_ref, rowset_writer_context(data_dir, schema,
tablet->tablet_path()),
+ *engine_ref,
+ rowset_writer_context(data_dir, schema,
tablet->tablet_path(), inc_id,
+ max_rows_per_segment),
false);
EXPECT_TRUE(res.has_value()) << res.error();
const auto& rowset_writer = res.value();
@@ -556,24 +641,58 @@ class IndexCompactionUtils {
vectorized::Block block = schema->create_block();
auto columns = block.mutate_columns();
for (const auto& row : data[i]) {
- vectorized::Field key = int32_t(row.key);
- vectorized::Field v1(row.word);
- vectorized::Field v2(row.url);
- vectorized::Field v3 = int32_t(row.num);
- columns[0]->insert(key);
- columns[1]->insert(v1);
- columns[2]->insert(v2);
- columns[3]->insert(v3);
+ if constexpr (std::is_same_v<T, DataRow>) {
+ vectorized::Field key = int32_t(row.key);
+ vectorized::Field v1(row.word);
+ vectorized::Field v2(row.url);
+ vectorized::Field v3 = int32_t(row.num);
+ columns[0]->insert(key);
+ columns[1]->insert(v1);
+ columns[2]->insert(v2);
+ columns[3]->insert(v3);
+ } else if constexpr (std::is_same_v<T, WikiDataRow>) {
+ vectorized::Field title(row.title);
+ vectorized::Field content(row.content);
+ vectorized::Field redirect(row.redirect);
+ vectorized::Field space(row.space);
+ columns[0]->insert(title);
+ if (is_performance) {
+ columns[1]->insert(content);
+ columns[2]->insert(redirect);
+ columns[3]->insert(space);
+ if (schema->keys_type() == UNIQUE_KEYS) {
+ uint8_t num = 0;
+ columns[4]->insert_data((const char*)&num,
sizeof(num));
+ }
+ } else {
+ for (int j = 1; j < 35; j++) {
+ columns[j]->insert(content);
+ }
+ columns[35]->insert(redirect);
+ columns[36]->insert(space);
+ if (schema->keys_type() == UNIQUE_KEYS) {
+ uint8_t num = 0;
+ columns[37]->insert_data((const char*)&num,
sizeof(num));
+ }
+ }
+ }
}
- EXPECT_TRUE(rowset_writer->add_block(&block).ok());
- EXPECT_TRUE(rowset_writer->flush().ok());
+
+ Status st = rowset_writer->add_block(&block);
+ EXPECT_TRUE(st.ok()) << st.to_string();
+ st = rowset_writer->flush();
+ EXPECT_TRUE(st.ok()) << st.to_string();
const auto& dst_writer =
dynamic_cast<BaseBetaRowsetWriter*>(rowset_writer.get());
check_idx_file_writer_closed(dst_writer, true);
- EXPECT_TRUE(rowset_writer->build(rowsets[i]).ok());
- EXPECT_TRUE(tablet->add_rowset(rowsets[i]).ok());
- EXPECT_TRUE(rowsets[i]->num_segments() == 5);
+ st = rowset_writer->build(rowsets[i]);
+ EXPECT_TRUE(st.ok()) << st.to_string();
+ st = tablet->add_rowset(rowsets[i]);
+ EXPECT_TRUE(st.ok()) << st.to_string();
+ EXPECT_TRUE(rowsets[i]->num_segments() ==
+ (rowsets[i]->num_rows() / max_rows_per_segment))
+ << rowsets[i]->num_segments();
// check rowset meta and file
for (int seg_id = 0; seg_id < rowsets[i]->num_segments();
seg_id++) {
@@ -583,7 +702,8 @@ class IndexCompactionUtils {
const auto& file_name = fmt::format("{}/{}_{}.idx",
rowsets[i]->tablet_path(),
rowsets[i]->rowset_id().to_string(), seg_id);
int64_t file_size = 0;
- EXPECT_TRUE(fs->file_size(file_name, &file_size).ok());
+ Status st = fs->file_size(file_name, &file_size);
+ EXPECT_TRUE(st.ok()) << st.to_string();
EXPECT_EQ(index_info.index_size(), file_size);
const auto& seg_path = rowsets[i]->segment_path(seg_id);
@@ -593,7 +713,8 @@ class IndexCompactionUtils {
auto inverted_index_file_reader =
std::make_shared<InvertedIndexFileReader>(
fs, std::string(index_file_path_prefix),
schema->get_inverted_index_storage_format(),
index_info);
- EXPECT_TRUE(inverted_index_file_reader->init().ok());
+ st = inverted_index_file_reader->init();
+ EXPECT_TRUE(st.ok()) << st.to_string();
const auto& dirs =
inverted_index_file_reader->get_all_directories();
EXPECT_TRUE(dirs.has_value());
if (custom_check) {
diff --git
a/be/test/olap/rowset/segment_v2/inverted_index/data/sorted_wikipedia-50-1.json
b/be/test/olap/rowset/segment_v2/inverted_index/data/sorted_wikipedia-50-1.json
new file mode 100644
index 00000000000..4cbdc10850f
--- /dev/null
+++
b/be/test/olap/rowset/segment_v2/inverted_index/data/sorted_wikipedia-50-1.json
@@ -0,0 +1,50 @@
+{"title":"102.2 Smooth FM","content":"{{About|the defunct GMG Radio station
which played adult contemporary music|the current station which plays
oldies|102.2 Smooth Radio}}\n{{Use British English|date=May 2015}}\n{{Use dmy
dates|date=December 2023}}\n{{Infobox radio station\n| logo = SmoothFM
london.png\n| logo_size = 100px\n| name = 102.2 Smooth FM (London)
(defunct)\n| airdate = 7 June 2005\n| frequency = 102.2 [[megahertz|MHz]]\n|
area = [[Greater London]] (FM),<br / [...]
+{"title":"1932 Prussian coup d'état","content":"{{Short description|Takeover
by Weimar chancellor Franz von Papen}}\n{{Infobox civil conflict\n| title =
1932 Prussian coup d'état\n| subtitle =\n| partof = [[Weimar Republic#Reasons
for failure|failure of Weimar Republic]]\n| image = Bundesarchiv Bild
102-13680, Berlin, Verordnung über Ausnahmezustand.jpg\n| caption = The
Emergency Decree of President von Hindenburg (Berlin, July 1932)\n| date = 20
July 1932\n| place = [[Free State of Prus [...]
+{"title":"3467 Bernheim","redirect":"List of minor planets: 3001–4000"}
+{"title":"509 Harbourfront","content":"{{short description|Streetcar route in
Toronto, Canada}}\n{{Use dmy dates|date=June 2022}}\n{{Infobox rail line\n|
box_width = auto\n| name = 509
Harbourfront\n| color = \n| logo
= TTC.svg\n| logo_width = 75\n| logo_alt
= \n| image = Streetcar 4407 Queens Quay
West at Harbourfront [...]
+{"title":"A Midsummer Night's Gene","content":"{{Short description|1997 novel
by Andrew Harman}}\n{{infobox book | <!-- See [[Wikipedia:WikiProject Novels]]
or [[Wikipedia:WikiProject Books]] -->\n| name = A Midsummer Night's
Gene\n| title_orig = \n| translator = \n| image = A Midsummer
Night's Gene.jpg\n| caption = First edition\n| author = [[Andrew
Harman]]\n| illustrator = \n| cover_artist = \n| country = United
Kingdom\n| language = English [...]
+{"title":"A Sides Win: Singles 1992–2005","content":"{{Infobox album\n| name
= A Sides Win: Singles 1992–2005\n| type = greatest\n| artist =
[[Sloan (band)|Sloan]]\n| cover = Sloan asideswin.png\n| alt =\n|
released = {{Start date|2005|5|3}}\n| recorded = 1992–2005\n| venue
=\n| studio =\n| genre = [[Rock and roll|Rock]]\n| length =
54:54\n| label = [[Sony BMG Music Entertainment|Sony / BMG]]
{{small|([[Canada]])}}<br />[[Koch Entert [...]
+{"title":"Aalesund University College","content":"{{Use dmy dates|date=July
2020}}\n{{coord|62|28|19.87|N|6|14|8.58|E|type:edu_region:NO_dim:1100|display=title}}\n{{Infobox
university\n| name = Aalesund University College\n|
native_name = Høgskolen i Ålesund\n| latin_name = \n|
image = [[File:A-3-rgb.png|200px]]\n| motto =
\n| established = 1994\n| type = [[Public
University]]\n| rector [...]
+{"title":"Abbun d'bishmayya","redirect":"Lord's Prayer"}
+{"title":"Abraxas Foundation","redirect":"Boyd Rice"}
+{"title":"Academy of the Asturian Language","content":"{{Short
description|Asturian Institution}}\n{{Use dmy dates|date=October
2013}}\n[[File:Academia de la Llingua Asturiana 1.jpg|thumb|350px|Current
headquarters at [[Oviedo]]]]\n[[File:WIKITONGUES- Victor speaking
Asturian.webm|thumb|Victor speaking Asturian.]]\nThe '''Academia de la Llingua
Asturiana''' or '''Academy of the Asturian Language''' (ALLA) is an Official
Institution<ref>Official Decret of Asturian Regional Council 33/1980 [...]
+{"title":"Ada, MI","redirect":"Ada Township, Michigan"}
+{"title":"Adams Township, Clinton County, OH","redirect":"Adams Township,
Clinton County, Ohio"}
+{"title":"Adams, Adams County, WI","redirect":"Adams, Adams County, Wisconsin"}
+{"title":"Adel, OR","redirect":"Adel, Oregon"}
+{"title":"Afognak, AK","redirect":"Afognak, Alaska"}
+{"title":"Age fabrication","content":"{{short description|Misrepresenting a
person's age}}\n{{refimprove|date=May 2022}}\n{{Use mdy dates|date=April
2018}}\n'''Age fabrication''' occurs when people deliberately misrepresent
their true age. This is usually done with intent to garner privileges or
[[Social status|status]] that would not otherwise be available to that person
(e.g. a minor misrepresenting their age in order to garner the privileges given
to adults). It may be done through th [...]
+{"title":"Agenda, Ashland County, WI","redirect":"Agenda, Wisconsin"}
+{"title":"Agua Dulce, CA","redirect":"Agua Dulce, California"}
+{"title":"Aguanga, CA","redirect":"Aguanga, California"}
+{"title":"Aleksandra Kollontai","redirect":"Alexandra Kollontai"}
+{"title":"Aleksandra Mikhailovna Kollontai","redirect":"Alexandra Kollontai"}
+{"title":"Aleksei Nikolaevich Tolstoy","redirect":"Aleksey Nikolayevich
Tolstoy"}
+{"title":"All Mod Cons","content":"{{about|the album by The Jam|the television
episode|All Mod Cons (Minder)}}\n{{EngvarB|date=May 2014}}\n{{Use dmy
dates|date=March 2021}}\n\n{{Infobox album\n| name = All Mod Cons\n| type
= Album\n| artist = [[the Jam]]\n| cover =
The_Jam_-_All_Mod_Cons.jpg\n| alt =\n| released = 3 November 1978\n|
recorded = 4 July – 17 August 1978\n| venue =\n| studio = [[RAK
Studios|RAK]] and [[Eden Studios|Eden]], London\n| g [...]
+{"title":"Ana Palacio","content":"{{short description|Spanish
politician}}\n{{family name hatnote|de Palacio|del Valle
Lersundi|lang=Spanish}}\n{{Infobox officeholder\n| honorific-prefix = [[The
Most Excellent]]\n| name = Ana Palacio\n| image = Ana
Palacio.jpg\n| caption = Palacio in 2004\n| office = [[Ministry of
Foreign Affairs (Spain)|Minister of Foreign Affairs]]\n| term_start = July
20, 2002\n| term_end = April 18, 2004\n| predecessor = [[Jose [...]
+{"title":"Anatoli Lunacharsky","redirect":"Anatoly Lunacharsky"}
+{"title":"Anatolij Vasil'evich Lunacharskij","redirect":"Anatoly Lunacharsky"}
+{"title":"Andrew Harman","content":"{{Use dmy dates|date=February
2018}}\n{{Use British English|date=February 2018}}\n'''Andrew Harman''' (born
1964) is an author from the United Kingdom known for writing pun-filled and
farcical [[fantasy fiction]].\n\n== Life ==\nAndrew Harman studied
[[biochemistry]] at the [[University of York]], being a member of [[Wentworth
College]].\n\nSince 2000, Harman has moved on from writing to create YAY Games,
a UK independent publisher of board and card ga [...]
+{"title":"Angarsk","content":"{{Short description|City in Irkutsk Oblast,
Russia}}\n{{Use mdy dates|date=May 2011}}\n{{Expand
Russian|topic=geo|date=April 2020}}\n{{Infobox Russian inhabited
locality\n|en_name=Angarsk\n|ru_name=Ангарск\n|image_skyline=Angarsk_car_Volga_GAZ-21_(25720495842).jpg\n|image_caption=city
center\n|coordinates =
{{coord|52|33|N|103|54|E|display=inline,title}}\n|map_label_position=top\n|image_coa=Coat
of Arms of Angarsk (Irkutsk oblast).png\n|coa_caption=\n|image_ [...]
+{"title":"Animal rescue group","content":"{{short description|Rescue
organization is dedicated to pet adoption}}\n{{about|pet rescue|other
uses|Rescue|and|Rescue (disambiguation)|and|Animal rescue
(disambiguation)}}\n{{Multiple issues|\n{{more footnotes|date=January
2014}}\n{{Original research|date=August 2020}}\n}}\nAn '''animal rescue
group''' or '''animal rescue organization''' is a group dedicated to [[pet
adoption]]. These groups take unwanted, abandoned, abused, or [[feral|stray]]
[...]
+{"title":"Annapolis—Kings","content":"<!--uncomment if needed ''For the
current|defunct federal|provincial electoral district, see [[Annapolis—Kings
(Nova Scotia federal electoral district)]]'' --->\n{{Infobox Canada electoral
district\n| name = Annapolis—Kings\n| province = Nova
Scotia\n| image = \n| caption = \n| fed-status
= defunct\n| fed-district-number = \n| fed-created = 1947\n|
fed-abolished = 1952\n| fed- [...]
+{"title":"Anne Teresa De Keersmaeker","content":"{{Use dmy dates|date=August
2023}}\n[[File:Anne Teresa De Keersmaeker 2016.jpg|thumb|Anne Teresa De
Keersmaeker in 2016]]\n'''Anne Teresa, Baroness De Keersmaeker'''
({{IPA-nl|ˈɑnə teːˈreːzaː dəˈkeːrsmaːkər}}, born 1960 in [[Mechelen]], Belgium,
grew up in Wemmel) is a [[contemporary dance]] choreographer. The dance company
constructed around her, {{ill|Rosas (dance ensemble)|fr|Compagnie Rosas}}, was
in residence at [[La Monnaie]] in [[Br [...]
+{"title":"Anne Teresa de Keersmaeker","redirect":"Anne Teresa De Keersmaeker"}
+{"title":"Antoine Chaudet","redirect":"Antoine-Denis Chaudet"}
+{"title":"Apostatic selection","content":"{{short description|Process in
evolutionary theory}}\n{{Use dmy dates|date=July 2016}}\n'''Apostatic
selection''' is a form of negative [[frequency-dependent selection]]. It
describes the survival of individual [[prey]] animals that are different
(through [[mutation]]) from their species in a way that makes it more likely
for them to be ignored by their [[predator]]s. It operates on [[polymorphism
(biology)|polymorphic]] species, species which ha [...]
+{"title":"Architecture Without Architects","content":"{{Short description|1964
book by Bernard Rudofsky}}\n{{italic title}}\n[[image:Architecture without
Architects cover.JPG|thumb|right|200px|''Architecture Without Architects''
cover]]\n\n'''''Architecture Without Architects: A Short Introduction to
Non-Pedigreed Architecture''''' is a book based on the [[New York City|NYC]]
[[Museum of Modern Art|MoMA]] exhibition of the same name by [[Bernard
Rudofsky]] originally published in 1964. I [...]
+{"title":"Argenteuil—Deux-Montagnes","redirect":"Argenteuil—Papineau—Mirabel"}
+{"title":"Army of the Pharaohs","content":"{{short description|American hip
hop group}}\n\n{{about|the U.S. hip hop group|the military and history
topic|military of ancient Egypt}}\n{{Use mdy dates|date=March
2021}}\n\n{{Infobox musical artist\n| name = Army of the Pharaohs\n|
image = AOTP 2014.jpg\n| image_upright = 1.1\n| caption =
The group in 2014\n| origin = [[Philadelphia]], Pennsylvania, U.S.\n|
genre = [[Hip hop music|Hip hop]], [ [...]
+{"title":"Ashland Global","content":"{{short description|American chemical
company}}\n{{Infobox company\n| name = Ashland Global Holdings,
Inc.\n| logo = Ashland 4color process.png\n| logo_size =
250px\n| type = [[Public company|Public]]\n| traded_as =
{{NYSE|ASH}}<br />[[List of S&P 400 companies|S&P 400 Component]]\n| foundation
= 1924\n| location = [[Wilmington, Delaware]], [[United States|
U.S.]]\n| key_people = [[ [...]
+{"title":"Ashland Oil","redirect":"Ashland Global"}
+{"title":"At The Circus","redirect":"At the Circus"}
+{"title":"Automobles","redirect":"Car"}
+{"title":"Banshu","redirect":"Banshū"}
+{"title":"Banshu Province","redirect":"Harima Province"}
+{"title":"Battle River—Camrose","content":"<!--uncomment if needed ''For the
current|defunct federal|provincial electoral district, see [[Battle
River–Camrose (federal electoral district)]]'' --->\n'''Battle River—Camrose'''
was a federal [[electoral district (Canada)|electoral district]] in
[[Alberta]], Canada, that was represented in the [[House of Commons of Canada]]
from 1953 to 1968.\n\nThis riding was created in 1952 from parts of [[Battle
River (electoral district)|Battle River]], [...]
+{"title":"Battle of Pacocha","content":"{{EngvarB|date=July 2014}}\n{{Use dmy
dates|date=July 2014}}\n\n{{Infobox military conflict|\n| conflict = Battle
of Pacocha\n| image = Combate de Pacocha.jpg\n| image_size = 300px\n|
caption = ''The Naval Combat in the Pacific between HMS SHAH and HMS
AMETHYST and the Peruvian Rebel Ironclad Turret Ram HUASCAR on May 29th 1877'',
[[William Frederick Mitchell]]\n| date = 29 May 1877\n| place =
Off [[Ilo, Peru|Ylo]], [[Pac [...]
+{"title":"Beant Singh (assassin)","content":"{{Short description|Sikh
bodyguard and assassin of Indian Prime Minister Indira Gandhi}}\n{{Other
uses|Beant Singh (disambiguation)}}\n{{more citations needed|date=September
2015}}\n{{Use dmy dates|date=June 2020}}\n{{Infobox person\n| name
= Beant Singh\n| image = Photograph of Beant Singh, one of two
assassins of Indira Gandhi.jpg\n| caption = \n| native_name =
\n| birth_name = Beant Singh [...]
+{"title":"Bedford railway station","content":"{{Short description|Railway
station in Bedfordshire, England}}\n{{about|the station in Bedford,
Bedfordshire, England|the proposed station in Bedford, Virginia|Bedford station
(Virginia)}}\n{{Use dmy dates|date=March 2015}}\n{{Use British
English|date=March 2015}}\n{{Infobox station\n| name = Bedford\n|
symbol_location = gb\n| symbol = rail\n| image = Bedford railway station MMB 06
222022.jpg\n| address = \n| borough = [[Bedford]], [[Borough [...]
+{"title":"Ben Cheney","content":"{{more citations needed|date=December
2010}}\n{{Use mdy dates|date=August 2023}} \n{{Infobox person\n| name = Ben
Cheney\n| birth_name = Ben Bradbury Cheney\n| birth_date = {{birth
date|1905|03|24}}\n| birth_place = [[Lima, Montana]], U.S.\n| death_date =
{{death date and age|1971|05|18|1905|03|24}}\n| death_place = [[Tacoma,
Washington]], U.S.\n}}\n'''Ben Bradbury Cheney''' (March 24, 1905 – May 18,
1971) was an American businessman and sports enthusiast [...]
+{"title":"Beneventan Script","redirect":"Beneventan script"}
+{"title":"Bengt Gabrielsson, Greve Oxenstierna","redirect":"Bengt Gabrielsson
Oxenstierna"}
diff --git
a/be/test/olap/rowset/segment_v2/inverted_index/data/sorted_wikipedia-50-2.json
b/be/test/olap/rowset/segment_v2/inverted_index/data/sorted_wikipedia-50-2.json
new file mode 100644
index 00000000000..859f55797be
--- /dev/null
+++
b/be/test/olap/rowset/segment_v2/inverted_index/data/sorted_wikipedia-50-2.json
@@ -0,0 +1,50 @@
+{"title":"100 Rifles","content":"{{short description|1969 American Western
film}}\n{{Use American English|date=October 2021}}\n{{Use mdy dates|date=July
2020}}\n{{Infobox film\n| name = 100 Rifles\n| image = 100
Rifles (movie poster).jpg\n| caption = Theatrical release poster\n|
director = [[Tom Gries]]\n| producer = [[Marvin Schwartz]]\n|
screenplay = [[Clair Huffaker]]<br>Tom Gries\n| based_on = {{based
on|''The Californio''<br>1967 novel [...]
+{"title":"4-By The Beatles (EP)","redirect":"4 by the Beatles"}
+{"title":"Abitibi (electoral
district)","redirect":"Abitibi—Baie-James—Nunavik—Eeyou"}
+{"title":"Adblock Plus","content":"{{Distinguish|AdBlock}}\n{{short
description|Content-filtering and ad blocking browser extension}}\n{{Use mdy
dates|date=March 2022}}\n{{Infobox software\n| name = Adblock
Plus\n| logo = Adblock Plus 2014 Logo.svg\n| screenshot
= Adblock-plus-1.2-en-preferences-add-exception-xfwm4.png\n| caption
= Preferences dialog box of Adblock Plus showing a group of filters\n|
developer = Eyeo GmbH<r [...]
+{"title":"Aframomum melegueta","redirect":"Grains of paradise"}
+{"title":"Alexander Stamboliski","redirect":"Aleksandar Stamboliyski"}
+{"title":"Always Be My Baby","content":"{{Short description|1996 single by
Mariah Carey}}\n{{About||the Sara Evans country song|You'll Always Be My
Baby||Always Be My Maybe (disambiguation) {{!}}Always Be My Maybe}}\n{{Use
American English|date=September 2020}}\n{{Use mdy dates|date=December
2017}}\n{{Infobox song\n| name = Always Be My Baby\n| cover =
Always Be My Baby (Mariah Carey single - cover art).jpg\n| alt = A
black-and-white photo of Carey smiling from [...]
+{"title":"Annapolis Valley (electoral district)","redirect":"Kings—Hants"}
+{"title":"Anyox","content":"{{Short description|Ghost town in British
Columbia, Canada}}\n[[File:Anyox British Columbia
1911.jpg|200px|thumb|right|Anyox, British Columbia]]\n'''Anyox''' was a small
company-owned mining town in [[British Columbia]],
Canada.<ref>{{BCGNIS|36025|Anyox}}</ref> Today it is a [[ghost town]],
abandoned and largely destroyed. It is located on the shores of Granby Bay in
coastal [[Observatory Inlet]], about {{convert|60|km|mi|0|abbr=off}} southeast
of (but without [...]
+{"title":"Argenteuil—Papineau","redirect":"Argenteuil—Papineau—Mirabel"}
+{"title":"Art of Ancient Greece","redirect":"Ancient Greek art"}
+{"title":"BSG 75","redirect":"Battlestar Galactica (fictional spacecraft)"}
+{"title":"BSG-75","redirect":"Battlestar Galactica (fictional spacecraft)"}
+{"title":"BSG75","redirect":"Battlestar Galactica (fictional spacecraft)"}
+{"title":"Bangor & Aroostook","redirect":"Bangor and Aroostook Railroad"}
+{"title":"Barrie—Simcoe—Bradford","redirect":"Barrie (federal electoral
district)"}
+{"title":"Bas-Richelieu—Nicolet—Becancour","redirect":"Bécancour—Nicolet—Saurel"}
+{"title":"Battleford—Kindersley","content":"'''Battleford—Kindersley''' was a
federal [[electoral district (Canada)|electoral district]] (riding) n
[[Saskatchewan]], Canada, that was represented in the [[House of Commons of
Canada]] from 1968 to 1979.\n\nThis [[Riding (division)|riding]] was created in
1966 from parts of [[Kindersley (electoral district)|Kindersley]], [[The
Battlefords (federal electoral district)|The Battlefords]] and
[[Rosetown—Biggar (federal electoral district)|Roset [...]
+{"title":"Beaches (federal electoral district)","content":"{{for|the defunct
provincial electoral district|Beaches (provincial electoral
district)}}\n{{Infobox Canada electoral district\n| province =
Ontario\n| image = Beaches riding.png\n| caption =
Beaches in relation to other electoral districts in Toronto\n| fed-status
= defunct\n| fed-district-number = \n| fed-created = 1976\n|
fed-abolished = 1987\n| fed-election-first = [...]
+{"title":"Beauport—Montmorency—Côte-de-Beaupré—Île-d'Orléans","redirect":"Montmorency
(federal electoral district)"}
+{"title":"Beauport—Montmorency—Orléans","redirect":"Montmorency (federal
electoral district)"}
+{"title":"Beaver River (federal electoral district)","content":"<!--uncomment
if needed ''For the current|defunct federal|provincial electoral district, see
[[Beaver River (federal electoral district)]]'' --->\n'''Beaver River''' was a
federal [[electoral district (Canada)|electoral district]] represented in the
[[House of Commons of Canada]] from 1988 to 1997.\n\nIt was located in the
[[provinces and territories of Canada|province]] of [[Alberta]]. This riding
was created in 1987, and w [...]
+{"title":"Belarus at the 2000 Summer Olympics","content":"{{Use dmy
dates|date=September 2021}}\n{{infobox country at games\n| NOC = BLR\n| NOCname
= [[Belarus Olympic Committee]]\n| games = Summer Olympics\n| year = 2000\n|
flagcaption = \n| oldcode = \n| website = {{url|www.noc.by }} {{in
lang|ru|en}}\n| location = [[Sydney]]\n| competitors = 139 (72 men and 67
women)\n| sports = 20\n| flagbearer = [[Sergey Lishtvan]]\n| rank = 23\n| gold
= 3\n| silver = 3\n| bronze = 11\n| offici [...]
+{"title":"Bella Ciao","redirect":"Bella ciao"}
+{"title":"Bernard Fokke","content":"{{Short description|17th-century
Frisian-born captain for the Dutch East India Company.}}\n'''Bernard''' or
'''Barend Fokke''', sometimes known as '''Barend Fockesz''', was a
17th-century, [[Frisians|Frisian]]-born [[Captain (nautical)|captain]] for the
[[Dutch East India Company]]. He was renowned for the uncanny speed of his
trips from the [[Dutch Republic]] to [[Java (island)|Java]]. For example, in
1678, he traveled the distance in 3 months and 4 ( [...]
+{"title":"Berosus","content":"'''Berosus''' may refer to:\n*In Greek
mythology:\n**Berosus, father of Tanais by [[Lysippe (Amazon)]]\n**Berosus,
father of the [[Sibyl]] Sabbe by Erymanthe\n*[[Berossus]] (3rd century BC),
Hellenistic-era Babylonian writer and astronomer\n*[[Berosus (beetle)]], a
genus of beetles of the family [[Hydrophilidae]]\n*[[Berosus (crater)]], a
lunar crater\n\n{{disambig}}"}
+{"title":"Berthier—Montcalm","content":"<!--uncomment if needed ''For the
current|defunct federal|provincial electoral district, see [[Berthier—Montcalm
(federal electoral district)]]'' --->\n{{Infobox Canada electoral district\n|
name = Berthier—Montcalm\n| province = Quebec\n| image
= \n| caption = \n| fed-status = defunct\n|
fed-district-number = \n| fed-created = 1987\n| fed-abolished =
2003\n| fed-election-first = 1988\n| [...]
+{"title":"Billie Dove","content":"{{Short description|American actress
(1903–1997)}}\n{{Use American English|date=July 2020}}\n{{Use mdy
dates|date=August 2014}}\n{{Infobox person\n| name = Billie
Dove\n| birth_name = Bertha Eugenie Bohny\n| image =
Billy Dove portrait photograph with roses (retouched).jpg\n| caption
= Dove in 1920\n| birth_date = {{Birth date|1903|5|14|mf=yes}}\n|
birth_place = New York City, U.S.\n| dea [...]
+{"title":"Blainville—Deux-Montagne","redirect":"Blainville—Deux-Montagnes"}
+{"title":"Blainville—Deux-Montagnes","content":"{{Use Canadian
English|date=January 2023}}\n{{Infobox Canada electoral district\n| name =
Blainville—Deux-Montagnes\n| province = Quebec\n| image
= \n| caption = \n| fed-status = defunct\n|
fed-district-number = \n| fed-created = 1976\n| fed-abolished =
1996\n| fed-election-first = 1979\n| fed-election-last = 1993\n| fed-rep
= \n| fed-rep-party = \n| demo-pop-r [...]
+{"title":"Blow fly","redirect":"Blowfly"}
+{"title":"Bombing of Vietnam's Dikes","redirect":"Proposed bombing of
Vietnam's dikes"}
+{"title":"Bombing of the dikes","redirect":"Proposed bombing of Vietnam's
dikes"}
+{"title":"Bombing of the dykes","redirect":"Proposed bombing of Vietnam's
dikes"}
+{"title":"Bonaventure—Îles-de-la-Madeleine","redirect":"Bonaventure (federal
electoral district)"}
+{"title":"Bonavista—Trinity—Conception","content":"{{Infobox Canada electoral
district\n| province = Newfoundland and Labrador\n| image
= \n| caption = \n| fed-status = defunct\n| fed-created
= 1966\n| fed-abolished = 2003\n| fed-election-first = 1968\n|
fed-election-last = 2002 by-election\n}}\n<!--uncomment if needed ''For the
current|defunct federal|provincial electoral district, see
[[Bonavista—Trinity—Conception (elector [...]
+{"title":"Boston & Maine","redirect":"Boston and Maine Railroad"}
+{"title":"Boston-area streetcar lines/old","redirect":"Boston-area streetcar
lines"}
+{"title":"Brampton (federal electoral district)","content":"''For the defunct
provincial electoral district, see [[Brampton (provincial electoral
district)]].''\n{{Infobox Canada electoral district\n| name =
Brampton\n| province = Ontario\n| image = \n| caption
= \n| fed-status = defunct\n| fed-district-number = \n|
fed-created = 1987\n| fed-abolished = 1996\n| fed-election-first
= 1988\n| fed-election-last = 1 [...]
+{"title":"Brampton Centre (federal electoral district)","content":"{{short
description|Federal electoral district in Ontario, Canada}}\n{{use mdy
dates|date=October 2021}}\n{{for|the future provincial electoral
district|Brampton Centre (provincial electoral district)}}\n{{Infobox Canada
electoral district\n| province = Ontario\n| image =
Brampton Centre 2015.svg\n| caption = Brampton Centre in relation
to other [[Greater Toronto Area]] districts\n| fe [...]
+{"title":"Brampton West—Mississauga (federal electoral
district)","content":"{{for|the defunct provincial electoral district|Brampton
West—Mississauga (provincial electoral district)}}\n{{Infobox Canada electoral
district\n| province = Ontario\n| image =
[[File:Brampton West-Mississauga (riding map).png|250px]]\n| caption
= Map of the riding\n| fed-status = defunct\n| fed-district-number =
\n| fed-created = 1996\n| fed-abolished [...]
+{"title":"Brampton—Georgetown","content":"<!--- uncomment if needed ''For the
current|defunct federal|provincial electoral district, see
[[Brampton–Georgetown (electoral district)]]'' --->\n{{Infobox Canada electoral
district\n| name = Brampton—Georgetown\n| province =
Ontario\n| image = \n| caption = \n| fed-status
= defunct\n| fed-district-number = \n| fed-created = 1976\n|
fed-abolished = 1987\n| fed-election-f [...]
+{"title":"Bras d'Or (electoral district)","redirect":"Cape Breton—Canso"}
+{"title":"Brossard—La-Prairie","redirect":"Brossard—La Prairie"}
+{"title":"Bruce—Grey","redirect":"Bruce—Grey—Owen Sound (federal electoral
district)"}
+{"title":"Burin—St. George's","content":"{{Infobox Canada electoral
district\n| province = Newfoundland and Labrador\n| image
= \n| caption = \n| fed-status = defunct\n| fed-created
= 1976\n| fed-abolished = 2003\n| fed-election-first = 1979\n|
fed-election-last = 2004\n}}\n<!--uncomment if needed ''For the
current|defunct federal|provincial electoral district, see [[Burin—St. George's
(electoral district)]]'' --->\n'''Burin— [...]
+{"title":"Burnaby (federal electoral district)","content":"{{for|the
historical provincial electoral district of the same name|Burnaby (provincial
electoral district)}}\n{{Infobox Canada electoral district\n| name = Burnaby\n|
province = British Columbia\n| image = \n| caption
= \n| fed-status = defunct\n| fed-district-number = \n|
fed-created = 1976\n| fed-abolished = 1987\n| fed-election-first
= 1979\n| fed-election-last = [...]
+{"title":"Burnaby—Kingsway","content":"{{Infobox Canada electoral district\n|
province = British Columbia\n| image = \n| caption
= \n| fed-status = defunct\n| fed-district-number = \n|
fed-created = 1987\n| fed-abolished = 1996\n| fed-election-first
= 1988\n| fed-election-last = 1993\n| fed-rep = \n| fed-rep-link
= \n| fed-rep-party = \n| fed-rep-party-link = \n| demo-pop-ref
= \n| demo-area-r [...]
+{"title":"Bussells","redirect":"Bussell family"}
+{"title":"Cape Breton Highlands—Canso","content":"<!--uncomment if needed
''For the current|defunct federal|provincial electoral district, see [[Cape
Breton Highlands—Canso (electoral district)]]'' --->\n{{Infobox Canada
electoral district\n| name = Cape Breton Highlands—Canso\n|
province = Nova Scotia\n| image = \n| caption
= \n| fed-status = defunct\n| fed-district-number = \n| fed-created
= 1966\n| fed-abolished [...]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]