This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 90b4e127e3 [Feature](inverted index) add parser_mode properties for 
inverted index parser (#20116)
90b4e127e3 is described below

commit 90b4e127e3e53cea3b1620e2c205cd0f00ffe7ff
Author: airborne12 <[email protected]>
AuthorDate: Mon May 29 23:21:52 2023 +0800

    [Feature](inverted index) add parser_mode properties for inverted index 
parser (#20116)
    
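    Add a `parser_mode` property for the inverted index parser. Supported
    values are "fine_grained" (used when the property is omitted) and
    "coarse_grained"; in this change the mode is only read when setting up
    the Chinese analyzer. Usage: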
    ```
    CREATE TABLE `inverted` (
      `FIELD0` text NULL,
      `FIELD1` text NULL,
      `FIELD2` text NULL,
      `FIELD3` text NULL,
      INDEX idx_name1 (`FIELD0`) USING INVERTED PROPERTIES("parser" = 
"chinese", "parser_mode" = "fine_grained") COMMENT '',
      INDEX idx_name2 (`FIELD1`) USING INVERTED PROPERTIES("parser" = 
"chinese", "parser_mode" = "coarse_grained") COMMENT ''
    ) ENGINE=OLAP;
    ```
---
 be/src/clucene                                     |  2 +-
 be/src/olap/inverted_index_parser.cpp              |  8 +++
 be/src/olap/inverted_index_parser.h                |  6 ++
 be/src/olap/rowset/segment_v2/column_reader.cpp    | 10 +--
 .../rowset/segment_v2/inverted_index_reader.cpp    | 74 ++++++++++---------
 .../olap/rowset/segment_v2/inverted_index_reader.h | 63 +++++++----------
 .../rowset/segment_v2/inverted_index_writer.cpp    | 14 ++--
 be/src/olap/tablet_schema.h                        |  8 +++
 .../inverted_index_p0/test_chinese_analyzer.out    | 35 +++++++++
 .../inverted_index_p0/test_chinese_analyzer.groovy | 82 ++++++++++++++++++++++
 10 files changed, 217 insertions(+), 85 deletions(-)

diff --git a/be/src/clucene b/be/src/clucene
index 3e493ab995..6033b8c33c 160000
--- a/be/src/clucene
+++ b/be/src/clucene
@@ -1 +1 @@
-Subproject commit 3e493ab99573cea5a7ed57f52d8fd9e03e2c17a9
+Subproject commit 6033b8c33c08fd45575d2799f93973d9ebd032ea
diff --git a/be/src/olap/inverted_index_parser.cpp 
b/be/src/olap/inverted_index_parser.cpp
index 9407b52ee4..e920a4a930 100644
--- a/be/src/olap/inverted_index_parser.cpp
+++ b/be/src/olap/inverted_index_parser.cpp
@@ -62,4 +62,12 @@ std::string get_parser_string_from_properties(
     }
 }
 
+std::string get_parser_mode_string_from_properties(
+        const std::map<std::string, std::string>& properties) {
+    if (properties.find(INVERTED_INDEX_PARSER_MODE_KEY) != properties.end()) {
+        return properties.at(INVERTED_INDEX_PARSER_MODE_KEY);
+    } else {
+        return INVERTED_INDEX_PARSER_FINE_GRANULARITY;
+    }
+}
 } // namespace doris
diff --git a/be/src/olap/inverted_index_parser.h 
b/be/src/olap/inverted_index_parser.h
index 0c870aa355..d36950e514 100644
--- a/be/src/olap/inverted_index_parser.h
+++ b/be/src/olap/inverted_index_parser.h
@@ -30,6 +30,10 @@ enum class InvertedIndexParserType {
     PARSER_CHINESE = 4,
 };
 
+const std::string INVERTED_INDEX_PARSER_MODE_KEY = "parser_mode";
+const std::string INVERTED_INDEX_PARSER_FINE_GRANULARITY = "fine_grained";
+const std::string INVERTED_INDEX_PARSER_COARSE_GRANULARITY = "coarse_grained";
+
 const std::string INVERTED_INDEX_PARSER_KEY = "parser";
 const std::string INVERTED_INDEX_PARSER_UNKNOWN = "unknown";
 const std::string INVERTED_INDEX_PARSER_NONE = "none";
@@ -42,5 +46,7 @@ std::string 
inverted_index_parser_type_to_string(InvertedIndexParserType parser_
 InvertedIndexParserType get_inverted_index_parser_type_from_string(const 
std::string& parser_str);
 
 std::string get_parser_string_from_properties(const std::map<std::string, 
std::string>& properties);
+std::string get_parser_mode_string_from_properties(
+        const std::map<std::string, std::string>& properties);
 
 } // namespace doris
diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp 
b/be/src/olap/rowset/segment_v2/column_reader.cpp
index e55fc2a595..ed21161b71 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -230,7 +230,7 @@ Status ColumnReader::new_inverted_index_iterator(const 
TabletIndex* index_meta,
                                                  InvertedIndexIterator** 
iterator) {
     RETURN_IF_ERROR(_ensure_inverted_index_loaded(index_meta));
     if (_inverted_index) {
-        RETURN_IF_ERROR(_inverted_index->new_iterator(index_meta, stats, 
iterator));
+        RETURN_IF_ERROR(_inverted_index->new_iterator(stats, iterator));
     }
     return Status::OK();
 }
@@ -479,15 +479,15 @@ Status ColumnReader::_load_inverted_index_index(const 
TabletIndex* index_meta) {
     if (is_string_type(type)) {
         if (parser_type != InvertedIndexParserType::PARSER_NONE) {
             _inverted_index.reset(new FullTextIndexReader(
-                    _file_reader->fs(), _file_reader->path().native(), 
index_meta->index_id()));
+                    _file_reader->fs(), _file_reader->path().native(), 
index_meta));
             return Status::OK();
         } else {
             _inverted_index.reset(new StringTypeInvertedIndexReader(
-                    _file_reader->fs(), _file_reader->path().native(), 
index_meta->index_id()));
+                    _file_reader->fs(), _file_reader->path().native(), 
index_meta));
         }
     } else if (is_numeric_type(type)) {
-        _inverted_index.reset(new BkdIndexReader(_file_reader->fs(), 
_file_reader->path().native(),
-                                                 index_meta->index_id()));
+        _inverted_index.reset(
+                new BkdIndexReader(_file_reader->fs(), 
_file_reader->path().native(), index_meta));
     } else {
         _inverted_index.reset();
     }
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
index 97a87c4309..544620e68f 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
@@ -92,8 +92,8 @@ Status 
InvertedIndexReader::read_null_bitmap(InvertedIndexQueryCacheHandle* cach
         // try to get query bitmap result from cache and return immediately on 
cache hit
         io::Path path(_path);
         auto index_dir = path.parent_path();
-        auto index_file_name =
-                InvertedIndexDescriptor::get_index_file_name(path.filename(), 
_index_id);
+        auto index_file_name = 
InvertedIndexDescriptor::get_index_file_name(path.filename(),
+                                                                            
_index_meta.index_id());
         auto index_file_path = index_dir / index_file_name;
         InvertedIndexQueryCache::CacheKey cache_key {
                 index_file_path, "", InvertedIndexQueryType::UNKNOWN_QUERY, 
L"null_bitmap"};
@@ -140,11 +140,13 @@ Status 
InvertedIndexReader::read_null_bitmap(InvertedIndexQueryCacheHandle* cach
 }
 
 std::vector<std::wstring> FullTextIndexReader::get_analyse_result(
-        const std::wstring& field_name, const std::string& value, 
InvertedIndexQueryType query_type,
-        InvertedIndexParserType analyser_type) {
+        const std::wstring& field_name, const std::string& value,
+        InvertedIndexQueryType query_type) {
     std::vector<std::wstring> analyse_result;
     std::shared_ptr<lucene::analysis::Analyzer> analyzer;
     std::unique_ptr<lucene::util::Reader> reader;
+    auto analyser_type = get_inverted_index_parser_type_from_string(
+            get_parser_string_from_properties(_index_meta.properties()));
     if (analyser_type == InvertedIndexParserType::PARSER_STANDARD) {
         analyzer = 
std::make_shared<lucene::analysis::standard::StandardAnalyzer>();
         reader.reset(
@@ -153,10 +155,18 @@ std::vector<std::wstring> 
FullTextIndexReader::get_analyse_result(
         auto chinese_analyzer =
                 
std::make_shared<lucene::analysis::LanguageBasedAnalyzer>(L"chinese", false);
         chinese_analyzer->initDict(config::inverted_index_dict_path);
+        auto mode = 
get_parser_mode_string_from_properties(_index_meta.properties());
+        if (mode == INVERTED_INDEX_PARSER_COARSE_GRANULARITY) {
+            chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::Default);
+        } else {
+            chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::All);
+        }
         analyzer = chinese_analyzer;
-        reader.reset(new lucene::util::SimpleInputStreamReader(
-                new lucene::util::AStringReader(value.c_str()),
-                lucene::util::SimpleInputStreamReader::UTF8));
+        reader.reset(_CLNEW lucene::util::SStringReader<char>(value.c_str(), 
strlen(value.c_str()),
+                                                              false));
+        //reader.reset(new lucene::util::SimpleInputStreamReader(
+        //        new lucene::util::AStringReader(value.c_str()),
+        //        lucene::util::SimpleInputStreamReader::UTF8));
     } else {
         // default
         analyzer = std::make_shared<lucene::analysis::SimpleAnalyzer<TCHAR>>();
@@ -189,15 +199,14 @@ std::vector<std::wstring> 
FullTextIndexReader::get_analyse_result(
     return analyse_result;
 }
 
-Status FullTextIndexReader::new_iterator(const TabletIndex* index_meta, 
OlapReaderStatistics* stats,
+Status FullTextIndexReader::new_iterator(OlapReaderStatistics* stats,
                                          InvertedIndexIterator** iterator) {
-    *iterator = new InvertedIndexIterator(index_meta, stats, this);
+    *iterator = new InvertedIndexIterator(stats, this);
     return Status::OK();
 }
 
 Status FullTextIndexReader::query(OlapReaderStatistics* stats, const 
std::string& column_name,
                                   const void* query_value, 
InvertedIndexQueryType query_type,
-                                  InvertedIndexParserType analyser_type,
                                   roaring::Roaring* bit_map) {
     SCOPED_RAW_TIMER(&stats->inverted_index_query_timer);
 
@@ -207,14 +216,16 @@ Status FullTextIndexReader::query(OlapReaderStatistics* 
stats, const std::string
 
     io::Path path(_path);
     auto index_dir = path.parent_path();
-    auto index_file_name = 
InvertedIndexDescriptor::get_index_file_name(path.filename(), _index_id);
+    auto index_file_name =
+            InvertedIndexDescriptor::get_index_file_name(path.filename(), 
_index_meta.index_id());
     auto index_file_path = index_dir / index_file_name;
 
     std::unique_ptr<lucene::search::Query> query;
     std::wstring field_ws = std::wstring(column_name.begin(), 
column_name.end());
+
     try {
         std::vector<std::wstring> analyse_result =
-                get_analyse_result(field_ws, search_str, query_type, 
analyser_type);
+                get_analyse_result(field_ws, search_str, query_type);
 
         if (analyse_result.empty()) {
             LOG(WARNING) << "invalid input query_str: " << search_str
@@ -332,17 +343,15 @@ InvertedIndexReaderType FullTextIndexReader::type() {
     return InvertedIndexReaderType::FULLTEXT;
 }
 
-Status StringTypeInvertedIndexReader::new_iterator(const TabletIndex* 
index_meta,
-                                                   OlapReaderStatistics* stats,
+Status StringTypeInvertedIndexReader::new_iterator(OlapReaderStatistics* stats,
                                                    InvertedIndexIterator** 
iterator) {
-    *iterator = new InvertedIndexIterator(index_meta, stats, this);
+    *iterator = new InvertedIndexIterator(stats, this);
     return Status::OK();
 }
 
 Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats,
                                             const std::string& column_name, 
const void* query_value,
                                             InvertedIndexQueryType query_type,
-                                            InvertedIndexParserType 
analyser_type,
                                             roaring::Roaring* bit_map) {
     SCOPED_RAW_TIMER(&stats->inverted_index_query_timer);
 
@@ -362,7 +371,8 @@ Status 
StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats,
 
     io::Path path(_path);
     auto index_dir = path.parent_path();
-    auto index_file_name = 
InvertedIndexDescriptor::get_index_file_name(path.filename(), _index_id);
+    auto index_file_name =
+            InvertedIndexDescriptor::get_index_file_name(path.filename(), 
_index_meta.index_id());
     auto index_file_path = index_dir / index_file_name;
 
     // try to get query bitmap result from cache and return immediately on 
cache hit
@@ -451,12 +461,12 @@ InvertedIndexReaderType 
StringTypeInvertedIndexReader::type() {
 }
 
 BkdIndexReader::BkdIndexReader(io::FileSystemSPtr fs, const std::string& path,
-                               const uint32_t uniq_id)
-        : InvertedIndexReader(fs, path, uniq_id), _compoundReader(nullptr) {
+                               const TabletIndex* index_meta)
+        : InvertedIndexReader(fs, path, index_meta), _compoundReader(nullptr) {
     io::Path io_path(_path);
     auto index_dir = io_path.parent_path();
-    auto index_file_name =
-            InvertedIndexDescriptor::get_index_file_name(io_path.filename(), 
_index_id);
+    auto index_file_name = 
InvertedIndexDescriptor::get_index_file_name(io_path.filename(),
+                                                                        
index_meta->index_id());
 
     // check index file existence
     auto index_file = index_dir / index_file_name;
@@ -469,9 +479,8 @@ BkdIndexReader::BkdIndexReader(io::FileSystemSPtr fs, const 
std::string& path,
             config::inverted_index_read_buffer_size);
 }
 
-Status BkdIndexReader::new_iterator(const TabletIndex* index_meta, 
OlapReaderStatistics* stats,
-                                    InvertedIndexIterator** iterator) {
-    *iterator = new InvertedIndexIterator(index_meta, stats, this);
+Status BkdIndexReader::new_iterator(OlapReaderStatistics* stats, 
InvertedIndexIterator** iterator) {
+    *iterator = new InvertedIndexIterator(stats, this);
     return Status::OK();
 }
 
@@ -511,7 +520,7 @@ Status BkdIndexReader::bkd_query(OlapReaderStatistics* 
stats, const std::string&
 
 Status BkdIndexReader::try_query(OlapReaderStatistics* stats, const 
std::string& column_name,
                                  const void* query_value, 
InvertedIndexQueryType query_type,
-                                 InvertedIndexParserType analyser_type, 
uint32_t* count) {
+                                 uint32_t* count) {
     uint64_t start = UnixMillis();
     auto visitor = std::make_unique<InvertedIndexVisitor>(nullptr, query_type, 
true);
     std::shared_ptr<lucene::util::bkd::bkd_reader> r;
@@ -537,12 +546,13 @@ Status BkdIndexReader::try_query(OlapReaderStatistics* 
stats, const std::string&
 
 Status BkdIndexReader::query(OlapReaderStatistics* stats, const std::string& 
column_name,
                              const void* query_value, InvertedIndexQueryType 
query_type,
-                             InvertedIndexParserType analyser_type, 
roaring::Roaring* bit_map) {
+                             roaring::Roaring* bit_map) {
     SCOPED_RAW_TIMER(&stats->inverted_index_query_timer);
 
     io::Path path(_path);
     auto index_dir = path.parent_path();
-    auto index_file_name = 
InvertedIndexDescriptor::get_index_file_name(path.filename(), _index_id);
+    auto index_file_name =
+            InvertedIndexDescriptor::get_index_file_name(path.filename(), 
_index_meta.index_id());
     auto index_file_path = index_dir / index_file_name;
     // std::string query_str {(const char *)query_value};
 
@@ -820,8 +830,7 @@ Status 
InvertedIndexIterator::read_from_inverted_index(const std::string& column
         }
     }
 
-    RETURN_IF_ERROR(
-            _reader->query(_stats, column_name, query_value, query_type, 
_analyser_type, bit_map));
+    RETURN_IF_ERROR(_reader->query(_stats, column_name, query_value, 
query_type, bit_map));
     return Status::OK();
 }
 
@@ -835,16 +844,11 @@ Status 
InvertedIndexIterator::try_read_from_inverted_index(const std::string& co
         query_type == InvertedIndexQueryType::LESS_EQUAL_QUERY ||
         query_type == InvertedIndexQueryType::LESS_THAN_QUERY ||
         query_type == InvertedIndexQueryType::EQUAL_QUERY) {
-        RETURN_IF_ERROR(_reader->try_query(_stats, column_name, query_value, 
query_type,
-                                           _analyser_type, count));
+        RETURN_IF_ERROR(_reader->try_query(_stats, column_name, query_value, 
query_type, count));
     }
     return Status::OK();
 }
 
-InvertedIndexParserType 
InvertedIndexIterator::get_inverted_index_analyser_type() const {
-    return _analyser_type;
-}
-
 InvertedIndexReaderType 
InvertedIndexIterator::get_inverted_index_reader_type() const {
     return _reader->type();
 }
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h 
b/be/src/olap/rowset/segment_v2/inverted_index_reader.h
index a9e263357d..80c653f418 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h
@@ -78,19 +78,18 @@ enum class InvertedIndexQueryType {
 class InvertedIndexReader {
 public:
     explicit InvertedIndexReader(io::FileSystemSPtr fs, const std::string& 
path,
-                                 const uint32_t index_id)
-            : _fs(std::move(fs)), _path(path), _index_id(index_id) {}
+                                 const TabletIndex* index_meta)
+            : _fs(std::move(fs)), _path(path), _index_meta(*index_meta) {}
     virtual ~InvertedIndexReader() = default;
 
     // create a new column iterator. Client should delete returned iterator
-    virtual Status new_iterator(const TabletIndex* index_meta, 
OlapReaderStatistics* stats,
-                                InvertedIndexIterator** iterator) = 0;
+    virtual Status new_iterator(OlapReaderStatistics* stats, 
InvertedIndexIterator** iterator) = 0;
     virtual Status query(OlapReaderStatistics* stats, const std::string& 
column_name,
                          const void* query_value, InvertedIndexQueryType 
query_type,
-                         InvertedIndexParserType analyser_type, 
roaring::Roaring* bit_map) = 0;
+                         roaring::Roaring* bit_map) = 0;
     virtual Status try_query(OlapReaderStatistics* stats, const std::string& 
column_name,
                              const void* query_value, InvertedIndexQueryType 
query_type,
-                             InvertedIndexParserType analyser_type, uint32_t* 
count) = 0;
+                             uint32_t* count) = 0;
 
     Status read_null_bitmap(InvertedIndexQueryCacheHandle* cache_handle,
                             lucene::store::Directory* dir = nullptr);
@@ -98,56 +97,53 @@ public:
     virtual InvertedIndexReaderType type() = 0;
     bool indexExists(io::Path& index_file_path);
 
-    uint32_t get_index_id() const { return _index_id; }
+    uint32_t get_index_id() const { return _index_meta.index_id(); }
 
 protected:
     bool _is_match_query(InvertedIndexQueryType query_type);
     friend class InvertedIndexIterator;
     io::FileSystemSPtr _fs;
     std::string _path;
-    uint32_t _index_id;
+    TabletIndex _index_meta;
 };
 
 class FullTextIndexReader : public InvertedIndexReader {
 public:
     explicit FullTextIndexReader(io::FileSystemSPtr fs, const std::string& 
path,
-                                 const int64_t uniq_id)
-            : InvertedIndexReader(std::move(fs), path, uniq_id) {}
+                                 const TabletIndex* index_meta)
+            : InvertedIndexReader(std::move(fs), path, index_meta) {}
     ~FullTextIndexReader() override = default;
 
-    Status new_iterator(const TabletIndex* index_meta, OlapReaderStatistics* 
stats,
-                        InvertedIndexIterator** iterator) override;
+    Status new_iterator(OlapReaderStatistics* stats, InvertedIndexIterator** 
iterator) override;
     Status query(OlapReaderStatistics* stats, const std::string& column_name,
                  const void* query_value, InvertedIndexQueryType query_type,
-                 InvertedIndexParserType analyser_type, roaring::Roaring* 
bit_map) override;
+                 roaring::Roaring* bit_map) override;
     Status try_query(OlapReaderStatistics* stats, const std::string& 
column_name,
                      const void* query_value, InvertedIndexQueryType 
query_type,
-                     InvertedIndexParserType analyser_type, uint32_t* count) 
override {
+                     uint32_t* count) override {
         return Status::Error<ErrorCode::NOT_IMPLEMENTED_ERROR>();
     }
 
     InvertedIndexReaderType type() override;
     std::vector<std::wstring> get_analyse_result(const std::wstring& 
field_name,
                                                  const std::string& value,
-                                                 InvertedIndexQueryType 
query_type,
-                                                 InvertedIndexParserType 
analyser_type);
+                                                 InvertedIndexQueryType 
query_type);
 };
 
 class StringTypeInvertedIndexReader : public InvertedIndexReader {
 public:
     explicit StringTypeInvertedIndexReader(io::FileSystemSPtr fs, const 
std::string& path,
-                                           const int64_t uniq_id)
-            : InvertedIndexReader(std::move(fs), path, uniq_id) {}
+                                           const TabletIndex* index_meta)
+            : InvertedIndexReader(std::move(fs), path, index_meta) {}
     ~StringTypeInvertedIndexReader() override = default;
 
-    Status new_iterator(const TabletIndex* index_meta, OlapReaderStatistics* 
stats,
-                        InvertedIndexIterator** iterator) override;
+    Status new_iterator(OlapReaderStatistics* stats, InvertedIndexIterator** 
iterator) override;
     Status query(OlapReaderStatistics* stats, const std::string& column_name,
                  const void* query_value, InvertedIndexQueryType query_type,
-                 InvertedIndexParserType analyser_type, roaring::Roaring* 
bit_map) override;
+                 roaring::Roaring* bit_map) override;
     Status try_query(OlapReaderStatistics* stats, const std::string& 
column_name,
                      const void* query_value, InvertedIndexQueryType 
query_type,
-                     InvertedIndexParserType analyser_type, uint32_t* count) 
override {
+                     uint32_t* count) override {
         return Status::Error<ErrorCode::NOT_IMPLEMENTED_ERROR>();
     }
     InvertedIndexReaderType type() override;
@@ -189,7 +185,8 @@ public:
 
 class BkdIndexReader : public InvertedIndexReader {
 public:
-    explicit BkdIndexReader(io::FileSystemSPtr fs, const std::string& path, 
const uint32_t uniq_id);
+    explicit BkdIndexReader(io::FileSystemSPtr fs, const std::string& path,
+                            const TabletIndex* index_meta);
     ~BkdIndexReader() override {
         if (_compoundReader != nullptr) {
             _compoundReader->close();
@@ -198,15 +195,14 @@ public:
         }
     }
 
-    Status new_iterator(const TabletIndex* index_meta, OlapReaderStatistics* 
stats,
-                        InvertedIndexIterator** iterator) override;
+    Status new_iterator(OlapReaderStatistics* stats, InvertedIndexIterator** 
iterator) override;
 
     Status query(OlapReaderStatistics* stats, const std::string& column_name,
                  const void* query_value, InvertedIndexQueryType query_type,
-                 InvertedIndexParserType analyser_type, roaring::Roaring* 
bit_map) override;
+                 roaring::Roaring* bit_map) override;
     Status try_query(OlapReaderStatistics* stats, const std::string& 
column_name,
                      const void* query_value, InvertedIndexQueryType 
query_type,
-                     InvertedIndexParserType analyser_type, uint32_t* count) 
override;
+                     uint32_t* count) override;
     Status bkd_query(OlapReaderStatistics* stats, const std::string& 
column_name,
                      const void* query_value, InvertedIndexQueryType 
query_type,
                      std::shared_ptr<lucene::util::bkd::bkd_reader>& r,
@@ -223,13 +219,8 @@ private:
 
 class InvertedIndexIterator {
 public:
-    InvertedIndexIterator(const TabletIndex* index_meta, OlapReaderStatistics* 
stats,
-                          InvertedIndexReader* reader)
-            : _index_meta(index_meta), _stats(stats), _reader(reader) {
-        // TODO xk maybe change interface to use index
-        _analyser_type = get_inverted_index_parser_type_from_string(
-                get_parser_string_from_properties(_index_meta->properties()));
-    }
+    InvertedIndexIterator(OlapReaderStatistics* stats, InvertedIndexReader* 
reader)
+            : _stats(stats), _reader(reader) {}
 
     Status read_from_inverted_index(const std::string& column_name, const 
void* query_value,
                                     InvertedIndexQueryType query_type, 
uint32_t segment_num_rows,
@@ -242,15 +233,11 @@ public:
         return _reader->read_null_bitmap(cache_handle, dir);
     }
 
-    InvertedIndexParserType get_inverted_index_analyser_type() const;
-
     InvertedIndexReaderType get_inverted_index_reader_type() const;
 
 private:
-    const TabletIndex* _index_meta;
     OlapReaderStatistics* _stats;
     InvertedIndexReader* _reader;
-    InvertedIndexParserType _analyser_type;
 };
 
 } // namespace segment_v2
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
index 49e221212c..253a187fa4 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
@@ -154,6 +154,12 @@ public:
             auto chinese_analyzer = _CLNEW 
lucene::analysis::LanguageBasedAnalyzer();
             chinese_analyzer->setLanguage(L"chinese");
             chinese_analyzer->initDict(config::inverted_index_dict_path);
+            auto mode = 
get_parser_mode_string_from_properties(_index_meta->properties());
+            if (mode == INVERTED_INDEX_PARSER_COARSE_GRANULARITY) {
+                
chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::Default);
+            } else {
+                chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::All);
+            }
             _analyzer.reset(chinese_analyzer);
         } else {
             // ANALYSER_NOT_SET, ANALYSER_NONE use default SimpleAnalyzer
@@ -199,13 +205,9 @@ public:
     }
 
     void new_fulltext_field(const char* field_value_data, size_t 
field_value_size) {
-        if (_parser_type == InvertedIndexParserType::PARSER_ENGLISH) {
+        if (_parser_type == InvertedIndexParserType::PARSER_ENGLISH ||
+            _parser_type == InvertedIndexParserType::PARSER_CHINESE) {
             new_char_token_stream(field_value_data, field_value_size, _field);
-        } else if (_parser_type == InvertedIndexParserType::PARSER_CHINESE) {
-            auto stringReader = _CLNEW lucene::util::SimpleInputStreamReader(
-                    new lucene::util::AStringReader(field_value_data, 
field_value_size),
-                    lucene::util::SimpleInputStreamReader::UTF8);
-            _field->setValue(stringReader);
         } else {
             new_field_value(field_value_data, field_value_size, _field);
         }
diff --git a/be/src/olap/tablet_schema.h b/be/src/olap/tablet_schema.h
index 3dad8ba875..a488ae82fd 100644
--- a/be/src/olap/tablet_schema.h
+++ b/be/src/olap/tablet_schema.h
@@ -152,6 +152,7 @@ class TabletSchema;
 
 class TabletIndex {
 public:
+    TabletIndex() = default;
     void init_from_thrift(const TOlapTableIndex& index, const TabletSchema& 
tablet_schema);
     void init_from_thrift(const TOlapTableIndex& index, const 
std::vector<int32_t>& column_uids);
     void init_from_pb(const TabletIndexPB& index);
@@ -176,6 +177,13 @@ public:
 
         return 0;
     }
+    TabletIndex(const TabletIndex& other) {
+        _index_id = other._index_id;
+        _index_name = other._index_name;
+        _index_type = other._index_type;
+        _col_unique_ids = other._col_unique_ids;
+        _properties = other._properties;
+    }
 
 private:
     int64_t _index_id;
diff --git a/regression-test/data/inverted_index_p0/test_chinese_analyzer.out 
b/regression-test/data/inverted_index_p0/test_chinese_analyzer.out
new file mode 100644
index 0000000000..71489df784
--- /dev/null
+++ b/regression-test/data/inverted_index_p0/test_chinese_analyzer.out
@@ -0,0 +1,35 @@
+-- This file is automatically generated. You should know what you did if you 
want to edit this
+-- !sql --
+2      我爱你中国
+
+-- !sql --
+1      我来到北京清华大学
+
+-- !sql --
+1      我来到北京清华大学
+
+-- !sql --
+1      我来到北京清华大学
+
+-- !sql --
+1      我来到北京清华大学
+
+-- !sql --
+3      人民可以得到更多实惠
+
+-- !sql --
+2      我爱你中国
+
+-- !sql --
+1      我来到北京清华大学
+
+-- !sql --
+
+-- !sql --
+
+-- !sql --
+1      我来到北京清华大学
+
+-- !sql --
+3      人民可以得到更多实惠
+
diff --git 
a/regression-test/suites/inverted_index_p0/test_chinese_analyzer.groovy 
b/regression-test/suites/inverted_index_p0/test_chinese_analyzer.groovy
new file mode 100644
index 0000000000..f779e0bfce
--- /dev/null
+++ b/regression-test/suites/inverted_index_p0/test_chinese_analyzer.groovy
@@ -0,0 +1,82 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+suite("test_chinese_analyzer"){
+    // prepare test table
+
+
+    def timeout = 60000
+    def delta_time = 1000
+    def alter_res = "null"
+    def useTime = 0
+
+    def indexTblName = "chinese_analyzer_test"
+
+    sql "DROP TABLE IF EXISTS ${indexTblName}"
+    // create 1 replica table
+    sql """
+       CREATE TABLE IF NOT EXISTS ${indexTblName}(
+               `id`int(11)NULL,
+               `c` text NULL,
+               INDEX c_idx(`c`) USING INVERTED 
PROPERTIES("parser"="chinese","parser_mode"="fine_grained") COMMENT ''
+       ) ENGINE=OLAP
+       DUPLICATE KEY(`id`)
+       COMMENT 'OLAP'
+       DISTRIBUTED BY HASH(`id`) BUCKETS 1
+       PROPERTIES(
+               "replication_allocation" = "tag.location.default: 1"
+       );
+    """
+    
+    def var_result = sql "show variables"
+    logger.info("show variales result: " + var_result )
+
+    sql "INSERT INTO $indexTblName VALUES (1, '我来到北京清华大学'), (2, '我爱你中国'), (3, 
'人民可以得到更多实惠');"
+    qt_sql "SELECT * FROM $indexTblName WHERE c MATCH '我爱你' ORDER BY id;"
+    qt_sql "SELECT * FROM $indexTblName WHERE c MATCH '我' ORDER BY id;"
+    qt_sql "SELECT * FROM $indexTblName WHERE c MATCH '清华' ORDER BY id;"
+    qt_sql "SELECT * FROM $indexTblName WHERE c MATCH '大学' ORDER BY id;"
+    qt_sql "SELECT * FROM $indexTblName WHERE c MATCH '清华大学' ORDER BY id;"
+    qt_sql "SELECT * FROM $indexTblName WHERE c MATCH '人民' ORDER BY id;"
+
+    def indexTblName2 = "chinese_analyzer_test2"
+
+    sql "DROP TABLE IF EXISTS ${indexTblName2}"
+    // create 1 replica table
+    sql """
+       CREATE TABLE IF NOT EXISTS ${indexTblName2}(
+               `id`int(11)NULL,
+               `c` text NULL,
+               INDEX c_idx(`c`) USING INVERTED 
PROPERTIES("parser"="chinese","parser_mode"="coarse_grained") COMMENT ''
+       ) ENGINE=OLAP
+       DUPLICATE KEY(`id`)
+       COMMENT 'OLAP'
+       DISTRIBUTED BY HASH(`id`) BUCKETS 1
+       PROPERTIES(
+                "replication_allocation" = "tag.location.default: 1"
+        );
+    """
+
+    sql "INSERT INTO $indexTblName2 VALUES (1, '我来到北京清华大学'), (2, '我爱你中国'), (3, 
'人民可以得到更多实惠');"
+    qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '我爱你' ORDER BY id;"
+    qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '我' ORDER BY id;"
+    qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '清华' ORDER BY id;"
+    qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '大学' ORDER BY id;"
+    qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '清华大学' ORDER BY id;"
+    qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '人民' ORDER BY id;"
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to