This is an automated email from the ASF dual-hosted git repository.
kxiao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 6ac0bfeceb [Feature](inverted index) add unicode parser for inverted index (#21035)
6ac0bfeceb is described below
commit 6ac0bfecebc12da011bb63ee1f1ae3f445f108ee
Author: airborne12 <[email protected]>
AuthorDate: Wed Jun 21 20:14:06 2023 +0800
[Feature](inverted index) add unicode parser for inverted index (#21035)
---
be/src/clucene | 2 +-
be/src/olap/inverted_index_parser.cpp | 6 +++++-
be/src/olap/inverted_index_parser.h | 2 ++
.../olap/rowset/segment_v2/inverted_index_reader.cpp | 5 +++++
.../olap/rowset/segment_v2/inverted_index_writer.cpp | 14 ++++++++++----
docs/en/docs/data-table/index/inverted-index.md | 16 +++++++++-------
docs/zh-CN/docs/data-table/index/inverted-index.md | 19 ++++++++++---------
.../org/apache/doris/analysis/InvertedIndexUtil.java | 6 ++++--
.../data/inverted_index_p0/test_chinese_analyzer.out | 6 ------
.../inverted_index_p0/test_chinese_analyzer.groovy | 2 --
10 files changed, 46 insertions(+), 32 deletions(-)
diff --git a/be/src/clucene b/be/src/clucene
index 60f5eab7ac..103e88a8a3 160000
--- a/be/src/clucene
+++ b/be/src/clucene
@@ -1 +1 @@
-Subproject commit 60f5eab7ac6294493a2e7e290297000c3c39875c
+Subproject commit 103e88a8a3b24da9ae2a0d9908a3ceb3f7808a61
diff --git a/be/src/olap/inverted_index_parser.cpp b/be/src/olap/inverted_index_parser.cpp
index d1e04e9df1..b0ab8c9d1a 100644
--- a/be/src/olap/inverted_index_parser.cpp
+++ b/be/src/olap/inverted_index_parser.cpp
@@ -27,6 +27,8 @@ std::string
inverted_index_parser_type_to_string(InvertedIndexParserType parser_
return INVERTED_INDEX_PARSER_NONE;
case InvertedIndexParserType::PARSER_STANDARD:
return INVERTED_INDEX_PARSER_STANDARD;
+ case InvertedIndexParserType::PARSER_UNICODE:
+ return INVERTED_INDEX_PARSER_UNICODE;
case InvertedIndexParserType::PARSER_ENGLISH:
return INVERTED_INDEX_PARSER_ENGLISH;
case InvertedIndexParserType::PARSER_CHINESE:
@@ -44,6 +46,8 @@ InvertedIndexParserType
get_inverted_index_parser_type_from_string(const std::st
return InvertedIndexParserType::PARSER_NONE;
} else if (parser_str_lower == INVERTED_INDEX_PARSER_STANDARD) {
return InvertedIndexParserType::PARSER_STANDARD;
+ } else if (parser_str_lower == INVERTED_INDEX_PARSER_UNICODE) {
+ return InvertedIndexParserType::PARSER_UNICODE;
} else if (parser_str_lower == INVERTED_INDEX_PARSER_ENGLISH) {
return InvertedIndexParserType::PARSER_ENGLISH;
} else if (parser_str_lower == INVERTED_INDEX_PARSER_CHINESE) {
@@ -67,7 +71,7 @@ std::string get_parser_mode_string_from_properties(
if (properties.find(INVERTED_INDEX_PARSER_MODE_KEY) != properties.end()) {
return properties.at(INVERTED_INDEX_PARSER_MODE_KEY);
} else {
- return INVERTED_INDEX_PARSER_FINE_GRANULARITY;
+ return INVERTED_INDEX_PARSER_COARSE_GRANULARITY;
}
}
diff --git a/be/src/olap/inverted_index_parser.h
b/be/src/olap/inverted_index_parser.h
index 87e2ef991a..eb4c414308 100644
--- a/be/src/olap/inverted_index_parser.h
+++ b/be/src/olap/inverted_index_parser.h
@@ -29,6 +29,7 @@ enum class InvertedIndexParserType {
PARSER_STANDARD = 2,
PARSER_ENGLISH = 3,
PARSER_CHINESE = 4,
+ PARSER_UNICODE = 5,
};
struct InvertedIndexCtx {
@@ -46,6 +47,7 @@ const std::string INVERTED_INDEX_PARSER_KEY = "parser";
const std::string INVERTED_INDEX_PARSER_UNKNOWN = "unknown";
const std::string INVERTED_INDEX_PARSER_NONE = "none";
const std::string INVERTED_INDEX_PARSER_STANDARD = "standard";
+const std::string INVERTED_INDEX_PARSER_UNICODE = "unicode";
const std::string INVERTED_INDEX_PARSER_ENGLISH = "english";
const std::string INVERTED_INDEX_PARSER_CHINESE = "chinese";
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
index f198cea229..c311bcec8f 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
@@ -103,6 +103,11 @@ std::vector<std::wstring>
InvertedIndexReader::get_analyse_result(
analyzer =
std::make_shared<lucene::analysis::standard::StandardAnalyzer>();
reader.reset(
(new lucene::util::StringReader(std::wstring(value.begin(),
value.end()).c_str())));
+ } else if (analyser_type == InvertedIndexParserType::PARSER_UNICODE) {
+ analyzer =
std::make_shared<lucene::analysis::standard::StandardAnalyzer>();
+ reader.reset(new lucene::util::SimpleInputStreamReader(
+ new lucene::util::AStringReader(value.c_str()),
+ lucene::util::SimpleInputStreamReader::UTF8));
} else if (analyser_type == InvertedIndexParserType::PARSER_CHINESE) {
auto chinese_analyzer =
std::make_shared<lucene::analysis::LanguageBasedAnalyzer>(L"chinese", false);
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
index 8e1dfc0b05..ab5d3548df 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
@@ -154,7 +154,8 @@ public:
_doc = std::make_unique<lucene::document::Document>();
_dir.reset(DorisCompoundDirectory::getDirectory(_fs,
index_path.c_str(), true));
- if (_parser_type == InvertedIndexParserType::PARSER_STANDARD) {
+ if (_parser_type == InvertedIndexParserType::PARSER_STANDARD ||
+ _parser_type == InvertedIndexParserType::PARSER_UNICODE) {
_analyzer =
std::make_unique<lucene::analysis::standard::StandardAnalyzer>();
} else if (_parser_type == InvertedIndexParserType::PARSER_ENGLISH) {
_analyzer =
std::make_unique<lucene::analysis::SimpleAnalyzer<char>>();
@@ -163,10 +164,10 @@ public:
chinese_analyzer->setLanguage(L"chinese");
chinese_analyzer->initDict(config::inverted_index_dict_path);
auto mode =
get_parser_mode_string_from_properties(_index_meta->properties());
- if (mode == INVERTED_INDEX_PARSER_COARSE_GRANULARITY) {
- chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::Default);
- } else {
+ if (mode == INVERTED_INDEX_PARSER_FINE_GRANULARITY) {
chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::All);
+ } else {
+ chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::Default);
}
_analyzer.reset(chinese_analyzer);
} else {
@@ -222,6 +223,11 @@ public:
if (_parser_type == InvertedIndexParserType::PARSER_ENGLISH ||
_parser_type == InvertedIndexParserType::PARSER_CHINESE) {
new_char_token_stream(field_value_data, field_value_size, _field);
+ } else if (_parser_type == InvertedIndexParserType::PARSER_UNICODE) {
+ auto stringReader = _CLNEW lucene::util::SimpleInputStreamReader(
+ new lucene::util::AStringReader(field_value_data, field_value_size),
+ lucene::util::SimpleInputStreamReader::UTF8);
+ _field->setValue(stringReader);
} else {
new_field_value(field_value_data, field_value_size, _field);
}
diff --git a/docs/en/docs/data-table/index/inverted-index.md
b/docs/en/docs/data-table/index/inverted-index.md
index 6b3ad7a647..57216d8ad4 100644
--- a/docs/en/docs/data-table/index/inverted-index.md
+++ b/docs/en/docs/data-table/index/inverted-index.md
@@ -52,7 +52,7 @@ The features for inverted index is as follows:
- add fulltext search on text(string, varchar, char) field
- MATCH_ALL matches all keywords, MATCH_ANY matches any keywords
- support fulltext on array of text field
- - support english and chinese word parser
+ - support english, chinese and mixed unicode word parser
- accelerate normal equal, range query, replacing bitmap index in the future
- suport =, !=, >, >=, <, <= on text, numeric, datetime types
- suport =, !=, >, >=, <, <= on array of text, numeric, datetime types
@@ -74,10 +74,12 @@ The features for inverted index is as follows:
- missing stands for no parser, the whole field is considered to be a
term
- "english" stands for english parser
- "chinese" stands for chinese parser
+ - "unicode" stands for mixed-type word segmentation suitable for situations with a mix of Chinese and English. It can segment email prefixes and suffixes, IP addresses, and mixed characters and numbers, and can also segment Chinese characters into 1-gram.
+
- "parser_mode" is utilized to set the tokenizer/parser type for Chinese
word segmentation.
- in "fine_grained" mode, the system will meticulously tokenize each
possible segment.
- in "coarse_grained" mode, the system follows the maximization
principle, performing accurate and comprehensive tokenization.
- - default mode is "fine_grained".
+ - default mode is "coarse_grained".
- "support_phrase" is utilized to specify if the index requires support
for phrase mode.
- "true" indicates that support is needed.
- "false" indicates that support is not needed.
@@ -88,10 +90,10 @@ The features for inverted index is as follows:
CREATE TABLE table_name
(
columns_difinition,
- INDEX idx_name1(column_name1) USING INVERTED [PROPERTIES("parser" =
"english|chinese")] [COMMENT 'your comment']
- INDEX idx_name2(column_name2) USING INVERTED [PROPERTIES("parser" =
"english|chinese")] [COMMENT 'your comment']
+ INDEX idx_name1(column_name1) USING INVERTED [PROPERTIES("parser" =
"english|chinese|unicode")] [COMMENT 'your comment']
+ INDEX idx_name2(column_name2) USING INVERTED [PROPERTIES("parser" =
"english|chinese|unicode")] [COMMENT 'your comment']
INDEX idx_name3(column_name3) USING INVERTED [PROPERTIES("parser" =
"chinese", "parser_mode" = "fine_grained|coarse_grained")] [COMMENT 'your
comment']
- INDEX idx_name4(column_name4) USING INVERTED [PROPERTIES("parser" =
"english|chinese", "support_phrase" = "true|false")] [COMMENT 'your comment']
+ INDEX idx_name4(column_name4) USING INVERTED [PROPERTIES("parser" =
"english|chinese|unicode", "support_phrase" = "true|false")] [COMMENT 'your
comment']
)
table_properties;
```
@@ -99,9 +101,9 @@ table_properties;
- add an inverted index to existed table
```sql
-- syntax 1
-CREATE INDEX idx_name ON table_name(column_name) USING INVERTED
[PROPERTIES("parser" = "english|chinese")] [COMMENT 'your comment'];
+CREATE INDEX idx_name ON table_name(column_name) USING INVERTED
[PROPERTIES("parser" = "english|chinese|unicode")] [COMMENT 'your comment'];
-- syntax 2
-ALTER TABLE table_name ADD INDEX idx_name(column_name) USING INVERTED
[PROPERTIES("parser" = "english|chinese")] [COMMENT 'your comment'];
+ALTER TABLE table_name ADD INDEX idx_name(column_name) USING INVERTED
[PROPERTIES("parser" = "english|chinese|unicode")] [COMMENT 'your comment'];
```
- drop an inverted index
diff --git a/docs/zh-CN/docs/data-table/index/inverted-index.md
b/docs/zh-CN/docs/data-table/index/inverted-index.md
index 768e29f856..3ac4992519 100644
--- a/docs/zh-CN/docs/data-table/index/inverted-index.md
+++ b/docs/zh-CN/docs/data-table/index/inverted-index.md
@@ -52,7 +52,7 @@ Doris倒排索引的功能简要介绍如下:
- 增加了字符串类型的全文检索
- 支持字符串全文检索,包括同时匹配多个关键字MATCH_ALL、匹配任意一个关键字MATCH_ANY、匹配短语词组MATCH_PHRASE
- 支持字符串数组类型的全文检索
- - 支持英文、中文分词
+ - 支持英文、中文以及混合类型分词
- 加速普通等值、范围查询,覆盖bitmap索引的功能,未来会代替bitmap索引
- 支持字符串、数值、日期时间类型的 =, !=, >, >=, <, <= 快速过滤
- 支持字符串、数字、日期时间数组类型的 =, !=, >, >=, <, <=
@@ -72,11 +72,12 @@ Doris倒排索引的功能简要介绍如下:
- parser指定分词器
- 默认不指定代表不分词
- english是英文分词,适合被索引列是英文的情况,用空格和标点符号分词,性能高
- - chinese是中文分词,适合被索引列有中文或者中英文混合的情况,采用jieba分词库,性能比english分词低
+ - chinese是中文分词,适合被索引列有中文或者中英文混合的情况,性能比english分词低
+ - unicode是混合类型分词,适用于中英文混合的情况。它能够对邮箱前缀和后缀、IP地址以及字符数字混合进行分词,并且可以对中文字符进行1-gram分词。
- parser_mode用于指定中文分词的模式
- fine_grained模式,系统将对可以进行分词的部分都进行详尽的分词处理
- coarse_grained模式,系统则依据最大化原则,执行精确且全面的分词操作
- - 默认find_grained模式
+ - 默认coarse_grained模式
- support_phrase用于指定索引是否需要支持短语模式
- true为需要
- false为不需要
@@ -87,10 +88,10 @@ Doris倒排索引的功能简要介绍如下:
CREATE TABLE table_name
(
columns_difinition,
- INDEX idx_name1(column_name1) USING INVERTED [PROPERTIES("parser" =
"english|chinese")] [COMMENT 'your comment']
- INDEX idx_name2(column_name2) USING INVERTED [PROPERTIES("parser" =
"english|chinese")] [COMMENT 'your comment']
+ INDEX idx_name1(column_name1) USING INVERTED [PROPERTIES("parser" =
"english|unicode|chinese")] [COMMENT 'your comment']
+ INDEX idx_name2(column_name2) USING INVERTED [PROPERTIES("parser" =
"english|unicode|chinese")] [COMMENT 'your comment']
INDEX idx_name3(column_name3) USING INVERTED [PROPERTIES("parser" =
"chinese", "parser_mode" = "fine_grained|coarse_grained")] [COMMENT 'your
comment']
- INDEX idx_name4(column_name4) USING INVERTED [PROPERTIES("parser" =
"english|chinese", "support_phrase" = "true|false")] [COMMENT 'your comment']
+ INDEX idx_name4(column_name4) USING INVERTED [PROPERTIES("parser" =
"english|unicode|chinese", "support_phrase" = "true|false")] [COMMENT 'your
comment']
)
table_properties;
```
@@ -98,9 +99,9 @@ table_properties;
- 已有表增加倒排索引
```sql
-- 语法1
-CREATE INDEX idx_name ON table_name(column_name) USING INVERTED
[PROPERTIES("parser" = "english|chinese")] [COMMENT 'your comment'];
+CREATE INDEX idx_name ON table_name(column_name) USING INVERTED
[PROPERTIES("parser" = "english|unicode|chinese")] [COMMENT 'your comment'];
-- 语法2
-ALTER TABLE table_name ADD INDEX idx_name(column_name) USING INVERTED
[PROPERTIES("parser" = "english|chinese")] [COMMENT 'your comment'];
+ALTER TABLE table_name ADD INDEX idx_name(column_name) USING INVERTED
[PROPERTIES("parser" = "english|unicode|chinese")] [COMMENT 'your comment'];
```
- 删除倒排索引
@@ -149,7 +150,7 @@ USE test_inverted_index;
-- 创建表的同时创建了comment的倒排索引idx_comment
-- USING INVERTED 指定索引类型是倒排索引
--- PROPERTIES("parser" = "english") 指定采用english分词,还支持"chinese"中文分词,如果不指定"parser"参数表示不分词
+-- PROPERTIES("parser" = "english") 指定采用english分词,还支持"chinese"中文分词和"unicode"中英文混合分词,如果不指定"parser"参数表示不分词
CREATE TABLE hackernews_1m
(
`id` BIGINT,
diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
index 8b8c57b95b..294f71dff6 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
@@ -28,6 +28,7 @@ public class InvertedIndexUtil {
public static String INVERTED_INDEX_PARSER_UNKNOWN = "unknown";
public static String INVERTED_INDEX_PARSER_NONE = "none";
public static String INVERTED_INDEX_PARSER_STANDARD = "standard";
+ public static String INVERTED_INDEX_PARSER_UNICODE = "unicode";
public static String INVERTED_INDEX_PARSER_ENGLISH = "english";
public static String INVERTED_INDEX_PARSER_CHINESE = "chinese";
@@ -53,8 +54,9 @@ public class InvertedIndexUtil {
if (colType.isStringType()) {
if (!(parser.equals(INVERTED_INDEX_PARSER_NONE)
|| parser.equals(INVERTED_INDEX_PARSER_STANDARD)
- || parser.equals(INVERTED_INDEX_PARSER_ENGLISH)
- || parser.equals(INVERTED_INDEX_PARSER_CHINESE))) {
+ || parser.equals(INVERTED_INDEX_PARSER_UNICODE)
+ || parser.equals(INVERTED_INDEX_PARSER_ENGLISH)
+ || parser.equals(INVERTED_INDEX_PARSER_CHINESE))) {
throw new AnalysisException("INVERTED index parser: " + parser
+ " is invalid for column: " + indexColName + " of type "
+ colType);
}
diff --git a/regression-test/data/inverted_index_p0/test_chinese_analyzer.out
b/regression-test/data/inverted_index_p0/test_chinese_analyzer.out
index 71489df784..dfb1cd2ccb 100644
--- a/regression-test/data/inverted_index_p0/test_chinese_analyzer.out
+++ b/regression-test/data/inverted_index_p0/test_chinese_analyzer.out
@@ -11,18 +11,12 @@
-- !sql --
1 我来到北京清华大学
--- !sql --
-1 我来到北京清华大学
-
-- !sql --
3 人民可以得到更多实惠
-- !sql --
2 我爱你中国
--- !sql --
-1 我来到北京清华大学
-
-- !sql --
-- !sql --
diff --git
a/regression-test/suites/inverted_index_p0/test_chinese_analyzer.groovy
b/regression-test/suites/inverted_index_p0/test_chinese_analyzer.groovy
index f779e0bfce..1acf8ffa6a 100644
--- a/regression-test/suites/inverted_index_p0/test_chinese_analyzer.groovy
+++ b/regression-test/suites/inverted_index_p0/test_chinese_analyzer.groovy
@@ -48,7 +48,6 @@ suite("test_chinese_analyzer"){
sql "INSERT INTO $indexTblName VALUES (1, '我来到北京清华大学'), (2, '我爱你中国'), (3,
'人民可以得到更多实惠');"
qt_sql "SELECT * FROM $indexTblName WHERE c MATCH '我爱你' ORDER BY id;"
- qt_sql "SELECT * FROM $indexTblName WHERE c MATCH '我' ORDER BY id;"
qt_sql "SELECT * FROM $indexTblName WHERE c MATCH '清华' ORDER BY id;"
qt_sql "SELECT * FROM $indexTblName WHERE c MATCH '大学' ORDER BY id;"
qt_sql "SELECT * FROM $indexTblName WHERE c MATCH '清华大学' ORDER BY id;"
@@ -74,7 +73,6 @@ suite("test_chinese_analyzer"){
sql "INSERT INTO $indexTblName2 VALUES (1, '我来到北京清华大学'), (2, '我爱你中国'), (3,
'人民可以得到更多实惠');"
qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '我爱你' ORDER BY id;"
- qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '我' ORDER BY id;"
qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '清华' ORDER BY id;"
qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '大学' ORDER BY id;"
qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '清华大学' ORDER BY id;"
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]