This is an automated email from the ASF dual-hosted git repository.
airborne pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 9a9b3b27171 [fix](search) Upgrade query type for variant subcolumns
with analyzer-based indexes (#60782)
9a9b3b27171 is described below
commit 9a9b3b27171e5a6c21063f99ec02652659dd023d
Author: Jack <[email protected]>
AuthorDate: Wed Feb 18 18:26:05 2026 +0800
[fix](search) Upgrade query type for variant subcolumns with analyzer-based
indexes (#60782)
### What problem does this PR solve?
Issue Number: N/A (the PR template placeholder `#xxx` was never filled in)
Related PR: #60654
Problem Summary:
Follow-up fix for #60654 (SearchDslParser refactoring).
When FE resolved a variant subcolumn field pattern to a specific
analyzer-based index and sent its `index_properties` via
`TSearchFieldBinding`, the BE `FieldReaderResolver` still used
`EQUAL_QUERY` for TERM clauses. This caused `select_best_reader` to pick
the `STRING_TYPE` reader (untokenized index directory) instead of the
`FULLTEXT` reader, so tokenized search terms never matched.
**Root cause**: For variant subcolumns with analyzer-based indexes,
`EQUAL_QUERY` opens the wrong (untokenized) index directory. The query
type needs to be upgraded to `MATCH_ANY_QUERY` so `select_best_reader`
picks the correct FULLTEXT reader.
**Fix**: In `FieldReaderResolver::resolve()`, when the field is a
variant subcolumn and the FE-provided `index_properties` indicate an
analyzer-based index (`should_analyzer()` returns true), automatically
upgrade `EQUAL_QUERY` to `MATCH_ANY_QUERY` before calling
`select_best_reader`. Also reuse the `fb_it` iterator to avoid a
redundant map lookup.
---
be/src/vec/functions/function_search.cpp | 23 +++-
.../test_search_variant_dual_index_reader.out | 23 ++++
.../test_search_variant_dual_index_reader.groovy | 133 +++++++++++++++++++++
3 files changed, 176 insertions(+), 3 deletions(-)
diff --git a/be/src/vec/functions/function_search.cpp
b/be/src/vec/functions/function_search.cpp
index 26d38a32934..2b5e8d9d305 100644
--- a/be/src/vec/functions/function_search.cpp
+++ b/be/src/vec/functions/function_search.cpp
@@ -108,10 +108,27 @@ Status FieldReaderResolver::resolve(const std::string&
field_name,
"iterator for field '{}' is not InvertedIndexIterator",
field_name);
}
+ // For variant subcolumns, FE resolves the field pattern to a specific
index and sends
+ // its index_properties via TSearchFieldBinding. When FE picks an
analyzer-based index,
+ // upgrade EQUAL_QUERY to MATCH_ANY_QUERY so select_best_reader picks the
FULLTEXT reader
+ // instead of STRING_TYPE. Without this, TERM clauses from lucene-mode DSL
would open the
+ // wrong (untokenized) index directory and tokenized search terms would
never match.
+ InvertedIndexQueryType effective_query_type = query_type;
+ auto fb_it = _field_binding_map.find(field_name);
+ if (is_variant_sub && fb_it != _field_binding_map.end() &&
+ fb_it->second->__isset.index_properties &&
!fb_it->second->index_properties.empty()) {
+ if (inverted_index::InvertedIndexAnalyzer::should_analyzer(
+ fb_it->second->index_properties) &&
+ effective_query_type == InvertedIndexQueryType::EQUAL_QUERY) {
+ effective_query_type = InvertedIndexQueryType::MATCH_ANY_QUERY;
+ }
+ }
+
Result<InvertedIndexReaderPtr> reader_result;
const auto& column_type = data_it->second.second;
if (column_type) {
- reader_result = inverted_iterator->select_best_reader(column_type,
query_type, "");
+ reader_result =
+ inverted_iterator->select_best_reader(column_type,
effective_query_type, "");
} else {
reader_result = inverted_iterator->select_best_reader("");
}
@@ -165,11 +182,11 @@ Status FieldReaderResolver::resolve(const std::string&
field_name,
resolved.stored_field_name = stored_field_name;
resolved.stored_field_wstr =
StringHelper::to_wstring(resolved.stored_field_name);
resolved.column_type = column_type;
- resolved.query_type = query_type;
+ resolved.query_type = effective_query_type;
resolved.inverted_reader = inverted_reader;
resolved.lucene_reader = reader_holder;
// Prefer FE-provided index_properties (needed for variant subcolumn
field_pattern matching)
- auto fb_it = _field_binding_map.find(field_name);
+ // Reuse fb_it from earlier lookup above.
if (fb_it != _field_binding_map.end() &&
fb_it->second->__isset.index_properties &&
!fb_it->second->index_properties.empty()) {
resolved.index_properties = fb_it->second->index_properties;
diff --git
a/regression-test/data/search/test_search_variant_dual_index_reader.out
b/regression-test/data/search/test_search_variant_dual_index_reader.out
new file mode 100644
index 00000000000..d4f038ba490
--- /dev/null
+++ b/regression-test/data/search/test_search_variant_dual_index_reader.out
@@ -0,0 +1,23 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !dual_index_basic --
+1
+3
+
+-- !dual_index_and --
+3
+
+-- !dual_index_other_field --
+4
+
+-- !dual_index_field_syntax --
+2
+3
+
+-- !dual_index_case_insensitive --
+1
+3
+
+-- !dual_index_match_baseline --
+1
+3
+
diff --git
a/regression-test/suites/search/test_search_variant_dual_index_reader.groovy
b/regression-test/suites/search/test_search_variant_dual_index_reader.groovy
new file mode 100644
index 00000000000..6e1e297af92
--- /dev/null
+++ b/regression-test/suites/search/test_search_variant_dual_index_reader.groovy
@@ -0,0 +1,133 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/**
+ * Regression test for variant subcolumn with dual inverted indexes on the
same field pattern.
+ *
+ * Bug scenario: When a variant column has two indexes on the same
field_pattern (e.g. "string_*"):
+ * - idx_no_analyzer: no parser -> STRING_TYPE reader (untokenized)
+ * - idx_with_analyzer: parser=xxx -> FULLTEXT reader (tokenized)
+ *
+ * FE correctly resolves the field to the analyzer-based index and sends its
index_properties
+ * via TSearchFieldBinding. However, BE's FieldReaderResolver::resolve() called
+ * select_best_reader(column_type, EQUAL_QUERY, "") which preferred
STRING_TYPE for EQUAL_QUERY.
+ * This opened the untokenized index directory, so tokenized search terms
never matched.
+ *
+ * Fix: For variant subcolumns, when FE provides index_properties indicating
an analyzer,
+ * upgrade EQUAL_QUERY to MATCH_ANY_QUERY before reader selection so the
FULLTEXT reader is chosen.
+ *
+ * Before fix: search() returns empty (wrong reader selected)
+ * After fix: search() returns matching rows (correct FULLTEXT reader
selected)
+ */
+suite("test_search_variant_dual_index_reader") {
+ def tableName = "test_variant_dual_index_reader"
+
+ sql """ set enable_match_without_inverted_index = false """
+ sql """ set enable_common_expr_pushdown = true """
+ sql """ set default_variant_enable_typed_paths_to_sparse = false """
+
+ sql "DROP TABLE IF EXISTS ${tableName}"
+
+ // Create table with variant column and TWO indexes on the same
field_pattern:
+ // one without analyzer (STRING_TYPE) and one with analyzer (FULLTEXT).
+ // This is the exact scenario that triggers the bug.
+ sql """
+ CREATE TABLE ${tableName} (
+ `id` INT NOT NULL,
+ `props` variant<
+ MATCH_NAME_GLOB 'string_*' : string,
+ properties("variant_max_subcolumns_count" = "100")
+ > NULL,
+ INDEX idx_no_analyzer (props) USING INVERTED PROPERTIES(
+ "field_pattern" = "string_*"
+ ),
+ INDEX idx_with_analyzer (props) USING INVERTED PROPERTIES(
+ "parser" = "unicode",
+ "field_pattern" = "string_*",
+ "lower_case" = "true"
+ )
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`id`)
+ DISTRIBUTED BY HASH(`id`) BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1",
+ "disable_auto_compaction" = "true"
+ )
+ """
+
+ sql """INSERT INTO ${tableName} VALUES
+ (1, '{"string_8": "admin user"}'),
+ (2, '{"string_8": "readonly access"}'),
+ (3, '{"string_8": "admin access granted"}'),
+ (4, '{"string_1": "hello world"}'),
+ (5, '{"string_8": "guest only"}')
+ """
+
+ sql "sync"
+ Thread.sleep(5000)
+
+ // Test 1: Basic tokenized search on variant subcolumn with dual indexes.
+ // "admin" should match rows 1 and 3 via the FULLTEXT reader (tokenized).
+ // Before fix: returns empty because EQUAL_QUERY selects STRING_TYPE
reader.
+ // After fix: returns rows 1, 3 because MATCH_ANY_QUERY selects FULLTEXT
reader.
+ qt_dual_index_basic """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM
${tableName}
+ WHERE search('admin',
'{"default_field":"props.string_8","mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // Test 2: Multi-term AND search. Both "admin" and "access" must match.
+ // Before fix: empty. After fix: row 3.
+ qt_dual_index_and """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM
${tableName}
+ WHERE search('admin access',
'{"default_field":"props.string_8","mode":"lucene","default_operator":"AND"}')
+ ORDER BY id
+ """
+
+ // Test 3: Search on a different subcolumn matching the same field_pattern.
+ // Ensures the fix works across different subcolumns under the same
pattern.
+ qt_dual_index_other_field """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM
${tableName}
+ WHERE search('hello',
'{"default_field":"props.string_1","mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // Test 4: Field-qualified syntax with dual indexes.
+ qt_dual_index_field_syntax """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM
${tableName}
+ WHERE search('props.string_8:access', '{"mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // Test 5: Case-insensitive search (lowercase index).
+ // "ADMIN" should match "admin user" and "admin access granted".
+ qt_dual_index_case_insensitive """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM
${tableName}
+ WHERE search('ADMIN',
'{"default_field":"props.string_8","mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // Test 6: Verify MATCH_ANY also works as baseline (uses MATCH query type
directly,
+ // so it always picks FULLTEXT reader — this should work both before and
after the fix).
+ qt_dual_index_match_baseline """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM
${tableName}
+ WHERE props['string_8'] MATCH_ANY 'admin'
+ ORDER BY id
+ """
+
+ sql "DROP TABLE IF EXISTS ${tableName}"
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]