This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 2b0b87be27e [fix](inverted index) Fix empty string MATCH on keyword
index returning wrong results (#60500)
2b0b87be27e is described below
commit 2b0b87be27e9cdbedecae23988fe9130b336d4f0
Author: Jack <[email protected]>
AuthorDate: Thu Feb 5 10:43:18 2026 +0800
[fix](inverted index) Fix empty string MATCH on keyword index returning
wrong results (#60500)
## Proposed changes
Fix empty string MATCH on keyword index returning wrong results.
The multi-analyzer feature commit (2c950e140a5) incorrectly added an
empty string check that prevented `MATCH ''` from finding rows with
empty string values in keyword indexes.
For a keyword index (no tokenization), the empty string is a valid exact-match
value and should be matchable. The previous code incorrectly skipped
empty strings with the comment "empty query should match nothing", which
is wrong for keyword indexes.
## Problem
```sql
-- Table with keyword index (no parser)
CREATE TABLE test (id INT, col TEXT, INDEX idx(col) USING INVERTED);
INSERT INTO test VALUES (1, ''), (2, 'data');
-- Before fix: returns 0 (WRONG!)
-- After fix: returns 1 (CORRECT!)
SELECT count() FROM test WHERE col MATCH '';
```
## Changes
This fix removes the empty string check for keyword index paths in:
- `be/src/vec/functions/match.cpp` (slow path, including the fallback taken when the analyzer is null)
- `be/src/olap/rowset/segment_v2/inverted_index_reader.cpp` (index path)
- `be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp` (`get_analyse_result`, in the branch taken when no analysis is needed)
Added regression test `test_empty_string_match.groovy` to cover:
- Empty string match on keyword index (both index and slow paths)
- Empty string match on tokenized index (should return 0)
- match_any and match_all with empty string
## Check List (For Author)
- Test
- [x] Regression test
- [x] Unit Test
- [ ] Manual test
- [ ] No need to test
- Behavior changed:
- [x] Yes. `MATCH ''` on keyword index now correctly matches rows with
empty string values.
- Does this need documentation?
- [ ] No.
---
.../inverted_index/analyzer/analyzer.cpp | 6 +-
.../rowset/segment_v2/inverted_index_reader.cpp | 7 +-
be/src/vec/functions/match.cpp | 13 ++--
.../inverted_index_p0/test_empty_string_match.out | 26 ++++++++
.../test_empty_string_match.groovy | 78 ++++++++++++++++++++++
5 files changed, 115 insertions(+), 15 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp
index ec9d81f1503..cad3837d081 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp
@@ -187,10 +187,10 @@ std::vector<TermInfo>
InvertedIndexAnalyzer::get_analyse_result(
std::vector<TermInfo> InvertedIndexAnalyzer::get_analyse_result(
const std::string& search_str, const std::map<std::string,
std::string>& properties) {
if (!should_analyzer(properties)) {
+ // Keyword index: all strings (including empty) are valid tokens for
exact match.
+ // Empty string is a valid value in keyword index and should be
matchable.
std::vector<TermInfo> result;
- if (!search_str.empty()) {
- result.emplace_back(search_str);
- }
+ result.emplace_back(search_str);
return result;
}
InvertedIndexAnalyzerConfig config;
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
index f34a546a105..fecf5b64462 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
@@ -315,10 +315,9 @@ Status FullTextIndexReader::query(const
IndexQueryContextPtr& context,
} else {
SCOPED_RAW_TIMER(&context->stats->inverted_index_analyzer_timer);
if (analyzer_ctx != nullptr && !analyzer_ctx->should_tokenize()) {
- // Don't add empty string as token - empty query should match
nothing
- if (!search_str.empty()) {
- query_info.term_infos.emplace_back(search_str);
- }
+ // Keyword index: all strings (including empty) are valid
tokens for exact match.
+ // Empty string is a valid value in keyword index and should
be matchable.
+ query_info.term_infos.emplace_back(search_str);
} else if (analyzer_ctx != nullptr && analyzer_ctx->analyzer !=
nullptr) {
// Use analyzer from query context for consistent behavior
across all segments.
// This ensures that the query uses the same analyzer settings
(e.g., lowercase)
diff --git a/be/src/vec/functions/match.cpp b/be/src/vec/functions/match.cpp
index c3a2ec9fce9..502a636b8c8 100644
--- a/be/src/vec/functions/match.cpp
+++ b/be/src/vec/functions/match.cpp
@@ -205,20 +205,17 @@ std::vector<TermInfo>
FunctionMatchBase::analyse_query_str_token(
// - PARSER_NONE: no tokenization (keyword/exact match)
// - Other parsers: tokenize using the analyzer
if (!analyzer_ctx->should_tokenize()) {
- // Keyword index or no tokenization needed
- // Don't add empty string as token - empty query should match nothing
- if (!match_query_str.empty()) {
- query_tokens.emplace_back(match_query_str);
- }
+ // Keyword index: all strings (including empty) are valid tokens for
exact match.
+ // Empty string is a valid value in keyword index and should be
matchable.
+ query_tokens.emplace_back(match_query_str);
return query_tokens;
}
// Safety check: if analyzer is nullptr but tokenization is expected, fall
back to no tokenization
if (analyzer_ctx->analyzer == nullptr) {
VLOG_DEBUG << "Analyzer is nullptr, falling back to no tokenization";
- if (!match_query_str.empty()) {
- query_tokens.emplace_back(match_query_str);
- }
+ // For fallback case, also allow empty strings to be matched
+ query_tokens.emplace_back(match_query_str);
return query_tokens;
}
diff --git a/regression-test/data/inverted_index_p0/test_empty_string_match.out
b/regression-test/data/inverted_index_p0/test_empty_string_match.out
new file mode 100644
index 00000000000..c05432b680b
--- /dev/null
+++ b/regression-test/data/inverted_index_p0/test_empty_string_match.out
@@ -0,0 +1,26 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !keyword_index_path --
+1
+3
+
+-- !keyword_slow_path --
+1
+3
+
+-- !english_index_path --
+0
+
+-- !english_slow_path --
+0
+
+-- !keyword_nonempty --
+2
+
+-- !match_any_empty --
+1
+3
+
+-- !match_all_empty --
+1
+3
+
diff --git
a/regression-test/suites/inverted_index_p0/test_empty_string_match.groovy
b/regression-test/suites/inverted_index_p0/test_empty_string_match.groovy
new file mode 100644
index 00000000000..798e0100f1b
--- /dev/null
+++ b/regression-test/suites/inverted_index_p0/test_empty_string_match.groovy
@@ -0,0 +1,78 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_empty_string_match", "p0") {
+ def tableName = "test_empty_string_match"
+
+ sql "DROP TABLE IF EXISTS ${tableName}"
+ sql """
+ CREATE TABLE ${tableName} (
+ id INT,
+ keyword_col TEXT DEFAULT '',
+ english_col TEXT DEFAULT '',
+ INDEX keyword_idx(keyword_col) USING INVERTED COMMENT 'keyword
index',
+ INDEX english_idx(english_col) USING INVERTED PROPERTIES("parser"
= "english") COMMENT 'english parser'
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES("replication_allocation" = "tag.location.default: 1");
+ """
+
+ sql """
+ INSERT INTO ${tableName} VALUES
+ (1, '', 'hello world'),
+ (2, 'test', ''),
+ (3, '', ''),
+ (4, 'data', 'some text');
+ """
+
+ sql "SET enable_common_expr_pushdown = true"
+
+ // Test 1: Empty string match on keyword index (index path)
+ // Should match rows where keyword_col is empty string (rows 1 and 3)
+ sql "SET enable_inverted_index_query = true"
+ qt_keyword_index_path """SELECT id FROM ${tableName} WHERE keyword_col
match '' ORDER BY id"""
+
+ // Test 2: Empty string match on keyword index (slow path)
+ // Should also match rows where keyword_col is empty string
+ sql "SET enable_inverted_index_query = false"
+ sql "SET enable_match_without_inverted_index = true"
+ qt_keyword_slow_path """SELECT id FROM ${tableName} WHERE keyword_col
match '' ORDER BY id"""
+
+ // Test 3: Empty string match on tokenized index (index path)
+ // Should return a count of 0 because empty string tokenizes to nothing
+ sql "SET enable_inverted_index_query = true"
+ qt_english_index_path """SELECT count() FROM ${tableName} WHERE
english_col match ''"""
+
+ // Test 4: Empty string match on tokenized index (slow path)
+ // Should also return a count of 0
+ sql "SET enable_inverted_index_query = false"
+ qt_english_slow_path """SELECT count() FROM ${tableName} WHERE english_col
match ''"""
+
+ // Test 5: Non-empty string match on keyword index should work as before
+ sql "SET enable_inverted_index_query = true"
+ qt_keyword_nonempty """SELECT id FROM ${tableName} WHERE keyword_col match
'test' ORDER BY id"""
+
+ // Test 6: Verify match_any with empty string on keyword index
+ sql "SET enable_inverted_index_query = false"
+ qt_match_any_empty """SELECT id FROM ${tableName} WHERE keyword_col
match_any '' ORDER BY id"""
+
+ // Test 7: Verify match_all with empty string on keyword index
+ qt_match_all_empty """SELECT id FROM ${tableName} WHERE keyword_col
match_all '' ORDER BY id"""
+
+ sql "DROP TABLE IF EXISTS ${tableName}"
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]