This is an automated email from the ASF dual-hosted git repository.
airborne pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new b4509371589 [refactor](search) Refactor SearchDslParser to single-phase ANTLR parsing and fix ES compatibility issues (#60654)
b4509371589 is described below
commit b450937158974dbe8d72a281ba14ea59f7a3ce52
Author: Jack <[email protected]>
AuthorDate: Wed Feb 18 10:54:56 2026 +0800
[refactor](search) Refactor SearchDslParser to single-phase ANTLR parsing and fix ES compatibility issues (#60654)
### What problem does this PR solve?
Problem Summary:
The `search()` function's DSL parser had multiple ES compatibility
issues and used a two-phase parsing approach (manual pre-parse + ANTLR)
that was error-prone. This PR refactors the parser and fixes several
bugs:
1. **SearchDslParser refactoring**: Consolidated from two-phase (manual
pre-parse + ANTLR) to single-phase ANTLR parsing. The ANTLR grammar now
handles all DSL syntax directly, eliminating the fragile manual
pre-parse layer. This fixes issues with operator precedence, grouping,
and edge cases.
2. **ANTLR grammar improvements**: Updated `SearchLexer.g4` and
`SearchParser.g4` to properly handle quoted phrases, field-qualified
expressions, prefix/wildcard/regexp patterns, range queries, and boolean
operators with correct precedence.
3. **minimum_should_match pipeline**: Added `default_operator` and
`minimum_should_match` fields to `TSearchParam` thrift, passing them
from FE `SearchPredicate` through to BE `function_search`. When
`minimum_should_match > 0`, the BE uses `OccurBooleanQuery` for proper
Lucene-style boolean query semantics (see the sketch after this list).
4. **Wildcard/Prefix/Regexp case-sensitivity**: WILDCARD and PREFIX
patterns are now lowercased when the index has a parser and
`lower_case=true` (matching ES query_string normalizer behavior). REGEXP
patterns are NOT lowercased, matching ES regex behavior where patterns
bypass analysis (see the sketch after this list).
5. **MATCH_ALL_DOCS support**: Added `MATCH_ALL_DOCS` clause type for
standalone `*` queries and pure NOT query rewrites. Enhanced `AllQuery`
with deferred `max_doc` from `context.segment_num_rows` and nullable
field support via `NullableScorer`.
6. **BE fixes**:
- `regexp_weight._max_expansions`: Changed from 50 to 0 (unlimited) to
prevent PREFIX queries from missing documents
- `occur_boolean_weight`: Fixed swap→append bug when all SHOULD clauses
must match, preserving existing MUST scorers
- Variant subcolumn `index_properties` propagation for proper analyzer
selection
- `lower_case` default handling: inverted index `lower_case` now defaults
to `"true"` unless a custom analyzer is configured (custom analyzers
control lowercasing via their own token filters)
---
.../inverted_index/query_v2/all_query/all_query.h | 49 ++-
.../boolean_query/occur_boolean_weight.cpp | 7 +-
.../query_v2/regexp_query/regexp_weight.h | 4 +-
be/src/olap/tablet_schema.cpp | 14 +-
be/src/vec/functions/function_search.cpp | 108 ++++-
be/src/vec/functions/function_search.h | 12 +-
.../occur_boolean_query_real_index_test.cpp | 9 +-
.../query_v2/occur_boolean_query_test.cpp | 18 +-
.../query_v2/regexp_wildcard_lowercase_test.cpp | 228 +++++++++++
be/test/vec/function/function_search_test.cpp | 4 +-
.../org/apache/doris/analysis/SearchPredicate.java | 31 ++
.../glue/translator/ExpressionTranslator.java | 18 +-
.../functions/scalar/SearchDslParser.java | 339 ++++++++++------
.../apache/doris/analysis/SearchPredicateTest.java | 136 +++++++
.../functions/scalar/SearchDslParserTest.java | 439 +++++++++++++++++++--
gensrc/thrift/Exprs.thrift | 3 +
.../data/search/test_search_lucene_mode.out | 4 +
.../data/search/test_search_multi_field.out | 4 +-
.../data/search/test_search_regexp_lowercase.out | 39 ++
.../test_search_variant_subcolumn_analyzer.out | 30 ++
.../suites/search/test_search_lucene_mode.groovy | 9 +-
.../suites/search/test_search_multi_field.groovy | 2 +
.../search/test_search_regexp_lowercase.groovy | 153 +++++++
.../test_search_variant_subcolumn_analyzer.groovy | 175 ++++++++
24 files changed, 1610 insertions(+), 225 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/all_query/all_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/all_query/all_query.h
index cd73860d46f..aa17338e2b1 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/all_query/all_query.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/all_query/all_query.h
@@ -19,7 +19,9 @@
#include <algorithm>
#include <memory>
+#include <string>
+#include "olap/rowset/segment_v2/inverted_index/query_v2/nullable_scorer.h"
#include "olap/rowset/segment_v2/inverted_index/query_v2/query.h"
#include "olap/rowset/segment_v2/inverted_index/query_v2/scorer.h"
#include "olap/rowset/segment_v2/inverted_index/query_v2/weight.h"
@@ -34,14 +36,14 @@ using AllScorerPtr = std::shared_ptr<AllScorer>;
using AllWeightPtr = std::shared_ptr<AllWeight>;
using AllQueryPtr = std::shared_ptr<AllQuery>;
+/// Scorer that matches all documents [0, max_doc).
+/// Mirrors Lucene's MatchAllDocsQuery scorer with ConstantScoreWeight:
+/// returns a constant score of 1.0 when scoring is enabled, 0.0 otherwise.
class AllScorer : public Scorer {
public:
- explicit AllScorer(uint32_t max_doc) : _max_doc(max_doc) {
- if (_max_doc == 0) {
- _doc = TERMINATED;
- } else {
- _doc = 0;
- }
+ AllScorer(uint32_t max_doc, bool enable_scoring)
+ : _max_doc(max_doc), _score(enable_scoring ? 1.0F : 0.0F) {
+ _doc = (_max_doc == 0) ? TERMINATED : 0;
}
~AllScorer() override = default;
@@ -72,41 +74,60 @@ public:
return _doc;
}
- float score() override { return 1.0F; }
+ float score() override { return _score; }
uint32_t size_hint() const override { return _max_doc; }
private:
uint32_t _max_doc = 0;
uint32_t _doc = TERMINATED;
+ float _score;
};
+/// Weight for AllQuery. Analogous to Lucene's ConstantScoreWeight used by MatchAllDocsQuery.
class AllWeight : public Weight {
public:
- explicit AllWeight(uint32_t max_doc) : _max_doc(max_doc) {}
+ explicit AllWeight(bool enable_scoring) : _enable_scoring(enable_scoring) {}
+
+ AllWeight(std::wstring field, bool nullable, bool enable_scoring)
+ : _field(std::move(field)), _nullable(nullable), _enable_scoring(enable_scoring) {}
~AllWeight() override = default;
ScorerPtr scorer(const QueryExecutionContext& context) override {
- return std::make_shared<AllScorer>(_max_doc);
+ auto inner = std::make_shared<AllScorer>(context.segment_num_rows, _enable_scoring);
+ if (_nullable && context.null_resolver != nullptr) {
+ std::string logical = logical_field_or_fallback(context, "", _field);
+ return make_nullable_scorer(std::move(inner), logical, context.null_resolver);
+ }
+ return inner;
}
private:
- uint32_t _max_doc = 0;
+ std::wstring _field;
+ bool _nullable = false;
+ bool _enable_scoring = false;
};
+/// Query that matches all documents, analogous to Lucene's MatchAllDocsQuery.
+/// Uses constant scoring (score = 1.0) like Lucene's ConstantScoreWeight.
class AllQuery : public Query {
public:
- explicit AllQuery(uint32_t max_doc) : _max_doc(max_doc) {}
+ AllQuery() = default;
+ AllQuery(std::wstring field, bool nullable) : _field(std::move(field)), _nullable(nullable) {}
~AllQuery() override = default;
- WeightPtr weight(bool /*enable_scoring*/) override {
- return std::make_shared<AllWeight>(_max_doc);
+ WeightPtr weight(bool enable_scoring) override {
+ if (!_field.empty()) {
+ return std::make_shared<AllWeight>(_field, _nullable, enable_scoring);
+ }
+ return std::make_shared<AllWeight>(enable_scoring);
}
private:
- uint32_t _max_doc = 0;
+ std::wstring _field;
+ bool _nullable = false;
};
} // namespace doris::segment_v2::inverted_index::query_v2
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/boolean_query/occur_boolean_weight.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/boolean_query/occur_boolean_weight.cpp
index e92a32fbe94..9b828708798 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/boolean_query/occur_boolean_weight.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/boolean_query/occur_boolean_weight.cpp
@@ -112,6 +112,7 @@ std::optional<CombinationMethod> OccurBooleanWeight<ScoreCombinerPtrT>::build_sh
} else if (adjusted_minimum == 1) {
return Required {scorer_union(std::move(should_scorers), combiner)};
} else if (adjusted_minimum == num_of_should_scorers) {
+ // All SHOULD clauses must match - move them to must_scorers (append, not swap)
for (auto& scorer : should_scorers) {
must_scorers.push_back(std::move(scorer));
}
@@ -137,7 +138,7 @@ ScorerPtr OccurBooleanWeight<ScoreCombinerPtrT>::effective_must_scorer(
std::vector<ScorerPtr> must_scorers, size_t must_num_all_scorers) {
if (must_scorers.empty()) {
if (must_num_all_scorers > 0) {
- return std::make_shared<AllScorer>(_max_doc);
+ return std::make_shared<AllScorer>(_max_doc, _enable_scoring);
}
return nullptr;
}
@@ -152,10 +153,10 @@ SpecializedScorer OccurBooleanWeight<ScoreCombinerPtrT>::effective_should_scorer
if (_enable_scoring) {
std::vector<ScorerPtr> scorers;
scorers.push_back(into_box_scorer(std::move(should_scorer), combiner));
- scorers.push_back(std::make_shared<AllScorer>(_max_doc));
+ scorers.push_back(std::make_shared<AllScorer>(_max_doc, _enable_scoring));
return make_buffered_union(std::move(scorers), combiner);
} else {
- return std::make_shared<AllScorer>(_max_doc);
+ return std::make_shared<AllScorer>(_max_doc, _enable_scoring);
}
}
return should_scorer;
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_weight.h b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_weight.h
index b58d124ed11..f9959ff0d8c 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_weight.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_weight.h
@@ -48,7 +48,9 @@ private:
std::string _pattern;
bool _enable_scoring = false;
bool _nullable = true;
- int32_t _max_expansions = 50;
+ // Set to 0 to disable limit (ES has no default limit for prefix queries)
+ // The limit prevents collecting too many terms, but can cause incorrect results
+ int32_t _max_expansions = 0;
};
} // namespace doris::segment_v2::inverted_index::query_v2
diff --git a/be/src/olap/tablet_schema.cpp b/be/src/olap/tablet_schema.cpp
index d79945f1f89..c1f9ff085ec 100644
--- a/be/src/olap/tablet_schema.cpp
+++ b/be/src/olap/tablet_schema.cpp
@@ -39,6 +39,7 @@
#include "olap/inverted_index_parser.h"
#include "olap/olap_common.h"
#include "olap/olap_define.h"
+#include "olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h"
#include "olap/tablet_column_object_pool.h"
#include "olap/types.h"
#include "olap/utils.h"
@@ -955,9 +956,16 @@ void TabletIndex::to_schema_pb(TabletIndexPB* index) const {
DBUG_EXECUTE_IF("tablet_schema.to_schema_pb", { return; })
- // lowercase by default
- if (!_properties.empty()) {
- if (!_properties.contains(INVERTED_INDEX_PARSER_LOWERCASE_KEY)) {
+ // Only add lower_case=true default for built-in analyzers/parsers, NOT for custom analyzers
+ // Custom analyzer: lower_case is determined by analyzer's internal token filter
+ if (!_properties.empty() && !_properties.contains(INVERTED_INDEX_PARSER_LOWERCASE_KEY)) {
+ bool has_parser = _properties.contains(INVERTED_INDEX_PARSER_KEY) ||
+ _properties.contains(INVERTED_INDEX_PARSER_KEY_ALIAS);
+ std::string analyzer_name = get_analyzer_name_from_properties(_properties);
+ bool is_builtin = analyzer_name.empty() ||
+ segment_v2::inverted_index::InvertedIndexAnalyzer::is_builtin_analyzer(
+ analyzer_name);
+ if (has_parser || is_builtin) {
(*index->mutable_properties())[INVERTED_INDEX_PARSER_LOWERCASE_KEY] =
INVERTED_INDEX_PARSER_TRUE;
}
diff --git a/be/src/vec/functions/function_search.cpp b/be/src/vec/functions/function_search.cpp
index 6f737146915..26d38a32934 100644
--- a/be/src/vec/functions/function_search.cpp
+++ b/be/src/vec/functions/function_search.cpp
@@ -37,6 +37,7 @@
#include "olap/rowset/segment_v2/index_query_context.h"
#include "olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h"
#include "olap/rowset/segment_v2/inverted_index/query/query_helper.h"
+#include "olap/rowset/segment_v2/inverted_index/query_v2/all_query/all_query.h"
#include "olap/rowset/segment_v2/inverted_index/query_v2/bit_set_query/bit_set_query.h"
#include "olap/rowset/segment_v2/inverted_index/query_v2/boolean_query/boolean_query_builder.h"
#include "olap/rowset/segment_v2/inverted_index/query_v2/boolean_query/operator.h"
@@ -48,6 +49,7 @@
#include "olap/rowset/segment_v2/inverted_index/util/string_helper.h"
#include "olap/rowset/segment_v2/inverted_index_iterator.h"
#include "olap/rowset/segment_v2/inverted_index_reader.h"
+#include "util/string_util.h"
#include "vec/columns/column_const.h"
#include "vec/core/columns_with_type_and_name.h"
#include "vec/data_types/data_type_string.h"
@@ -166,7 +168,14 @@ Status FieldReaderResolver::resolve(const std::string& field_name,
resolved.query_type = query_type;
resolved.inverted_reader = inverted_reader;
resolved.lucene_reader = reader_holder;
- resolved.index_properties = inverted_reader->get_index_properties();
+ // Prefer FE-provided index_properties (needed for variant subcolumn field_pattern matching)
+ auto fb_it = _field_binding_map.find(field_name);
+ if (fb_it != _field_binding_map.end() &&
fb_it->second->__isset.index_properties &&
+ !fb_it->second->index_properties.empty()) {
+ resolved.index_properties = fb_it->second->index_properties;
+ } else {
+ resolved.index_properties = inverted_reader->get_index_properties();
+ }
resolved.binding_key = binding_key;
resolved.analyzer_key =
normalize_analyzer_key(build_analyzer_key_from_properties(resolved.index_properties));
@@ -217,10 +226,22 @@ Status FunctionSearch::evaluate_inverted_index_with_search_param(
FieldReaderResolver resolver(data_type_with_names, iterators, context,
search_param.field_bindings);
+ // Extract default_operator from TSearchParam (default: "or")
+ std::string default_operator = "or";
+ if (search_param.__isset.default_operator &&
!search_param.default_operator.empty()) {
+ default_operator = search_param.default_operator;
+ }
+ // Extract minimum_should_match from TSearchParam (-1 means not set)
+ int32_t minimum_should_match = -1;
+ if (search_param.__isset.minimum_should_match) {
+ minimum_should_match = search_param.minimum_should_match;
+ }
+
query_v2::QueryPtr root_query;
std::string root_binding_key;
RETURN_IF_ERROR(build_query_recursive(search_param.root, context,
resolver, &root_query,
- &root_binding_key));
+ &root_binding_key, default_operator,
+ minimum_should_match));
if (root_query == nullptr) {
LOG(INFO) << "search: Query tree resolved to empty query, dsl:"
<< search_param.original_dsl;
@@ -429,7 +450,9 @@ Status FunctionSearch::build_query_recursive(const TSearchClause& clause,
const
std::shared_ptr<IndexQueryContext>& context,
FieldReaderResolver& resolver,
inverted_index::query_v2::QueryPtr* out,
- std::string* binding_key) const {
+ std::string* binding_key,
+ const std::string&
default_operator,
+ int32_t minimum_should_match)
const {
DCHECK(out != nullptr);
*out = nullptr;
if (binding_key) {
@@ -438,6 +461,12 @@ Status FunctionSearch::build_query_recursive(const TSearchClause& clause,
const std::string& clause_type = clause.clause_type;
+ // Handle MATCH_ALL_DOCS - matches all documents in the segment
+ if (clause_type == "MATCH_ALL_DOCS") {
+ *out = std::make_shared<query_v2::AllQuery>();
+ return Status::OK();
+ }
+
// Handle OCCUR_BOOLEAN - Lucene-style boolean query with
MUST/SHOULD/MUST_NOT
if (clause_type == "OCCUR_BOOLEAN") {
auto builder =
segment_v2::inverted_index::query_v2::create_occur_boolean_query_builder();
@@ -452,7 +481,8 @@ Status FunctionSearch::build_query_recursive(const TSearchClause& clause,
query_v2::QueryPtr child_query;
std::string child_binding_key;
RETURN_IF_ERROR(build_query_recursive(child_clause, context,
resolver, &child_query,
- &child_binding_key));
+ &child_binding_key,
default_operator,
+ minimum_should_match));
// Determine occur type from child clause
query_v2::Occur occur = query_v2::Occur::MUST; // default
@@ -483,7 +513,8 @@ Status FunctionSearch::build_query_recursive(const TSearchClause& clause,
query_v2::QueryPtr child_query;
std::string child_binding_key;
RETURN_IF_ERROR(build_query_recursive(child_clause, context,
resolver, &child_query,
- &child_binding_key));
+ &child_binding_key,
default_operator,
+ minimum_should_match));
// Add all children including empty BitSetQuery
// BooleanQuery will handle the logic:
// - AND with empty bitmap → result is empty
@@ -497,14 +528,17 @@ Status FunctionSearch::build_query_recursive(const TSearchClause& clause,
return Status::OK();
}
- return build_leaf_query(clause, context, resolver, out, binding_key);
+ return build_leaf_query(clause, context, resolver, out, binding_key,
default_operator,
+ minimum_should_match);
}
Status FunctionSearch::build_leaf_query(const TSearchClause& clause,
const
std::shared_ptr<IndexQueryContext>& context,
FieldReaderResolver& resolver,
inverted_index::query_v2::QueryPtr*
out,
- std::string* binding_key) const {
+ std::string* binding_key,
+ const std::string& default_operator,
+ int32_t minimum_should_match) const {
DCHECK(out != nullptr);
*out = nullptr;
if (binding_key) {
@@ -576,7 +610,27 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause,
return Status::OK();
}
- auto builder =
create_operator_boolean_query_builder(query_v2::OperatorType::OP_OR);
+ // When minimum_should_match is specified, use OccurBooleanQuery
+ // ES behavior: msm only applies to SHOULD clauses
+ if (minimum_should_match > 0) {
+ auto builder =
+
segment_v2::inverted_index::query_v2::create_occur_boolean_query_builder();
+ builder->set_minimum_number_should_match(minimum_should_match);
+ query_v2::Occur occur = (default_operator == "and") ?
query_v2::Occur::MUST
+ :
query_v2::Occur::SHOULD;
+ for (const auto& term_info : term_infos) {
+ std::wstring term_wstr =
StringHelper::to_wstring(term_info.get_single_term());
+ builder->add(make_term_query(term_wstr), occur);
+ }
+ *out = builder->build();
+ return Status::OK();
+ }
+
+ // Use default_operator to determine how to combine tokenized terms
+ query_v2::OperatorType op_type = (default_operator == "and")
+ ?
query_v2::OperatorType::OP_AND
+ :
query_v2::OperatorType::OP_OR;
+ auto builder = create_operator_boolean_query_builder(op_type);
for (const auto& term_info : term_infos) {
std::wstring term_wstr =
StringHelper::to_wstring(term_info.get_single_term());
builder->add(make_term_query(term_wstr), binding.binding_key);
@@ -716,20 +770,50 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause,
return Status::OK();
}
if (clause_type == "PREFIX") {
- *out = std::make_shared<query_v2::WildcardQuery>(context,
field_wstr, value);
+ // Apply lowercase only if:
+ // 1. There's a parser/analyzer (otherwise lower_case has no
effect on indexing)
+ // 2. lower_case is explicitly set to "true"
+ bool has_parser =
inverted_index::InvertedIndexAnalyzer::should_analyzer(
+ binding.index_properties);
+ std::string lowercase_setting =
+
get_parser_lowercase_from_properties(binding.index_properties);
+ bool should_lowercase = has_parser && (lowercase_setting ==
INVERTED_INDEX_PARSER_TRUE);
+ std::string pattern = should_lowercase ? to_lower(value) : value;
+ *out = std::make_shared<query_v2::WildcardQuery>(context,
field_wstr, pattern);
VLOG_DEBUG << "search: PREFIX clause processed, field=" <<
field_name << ", pattern='"
- << value << "'";
+ << pattern << "' (original='" << value << "',
has_parser=" << has_parser
+ << ", lower_case=" << lowercase_setting << ")";
return Status::OK();
}
if (clause_type == "WILDCARD") {
- *out = std::make_shared<query_v2::WildcardQuery>(context,
field_wstr, value);
+ // Standalone wildcard "*" matches all non-null values for this
field
+ // Consistent with ES query_string behavior where field:* becomes
FieldExistsQuery
+ if (value == "*") {
+ *out = std::make_shared<query_v2::AllQuery>(field_wstr, true);
+ VLOG_DEBUG << "search: WILDCARD '*' converted to
AllQuery(nullable=true), field="
+ << field_name;
+ return Status::OK();
+ }
+ // Apply lowercase only if:
+ // 1. There's a parser/analyzer (otherwise lower_case has no
effect on indexing)
+ // 2. lower_case is explicitly set to "true"
+ bool has_parser =
inverted_index::InvertedIndexAnalyzer::should_analyzer(
+ binding.index_properties);
+ std::string lowercase_setting =
+
get_parser_lowercase_from_properties(binding.index_properties);
+ bool should_lowercase = has_parser && (lowercase_setting ==
INVERTED_INDEX_PARSER_TRUE);
+ std::string pattern = should_lowercase ? to_lower(value) : value;
+ *out = std::make_shared<query_v2::WildcardQuery>(context,
field_wstr, pattern);
VLOG_DEBUG << "search: WILDCARD clause processed, field=" <<
field_name << ", pattern='"
- << value << "'";
+ << pattern << "' (original='" << value << "',
has_parser=" << has_parser
+ << ", lower_case=" << lowercase_setting << ")";
return Status::OK();
}
if (clause_type == "REGEXP") {
+ // ES-compatible: regex patterns are NOT lowercased (case-sensitive matching)
+ // This matches ES query_string behavior where regex patterns bypass analysis
*out = std::make_shared<query_v2::RegexpQuery>(context,
field_wstr, value);
VLOG_DEBUG << "search: REGEXP clause processed, field=" <<
field_name << ", pattern='"
<< value << "'";
diff --git a/be/src/vec/functions/function_search.h b/be/src/vec/functions/function_search.h
index 944f07dd1b6..d8b7c08fac6 100644
--- a/be/src/vec/functions/function_search.h
+++ b/be/src/vec/functions/function_search.h
@@ -64,11 +64,12 @@ public:
_iterators(iterators),
_context(std::move(context)),
_field_bindings(field_bindings) {
- // Build a lookup map for quick variant subcolumn checks
+ // Build lookup maps for quick access
for (const auto& binding : _field_bindings) {
if (binding.__isset.is_variant_subcolumn &&
binding.is_variant_subcolumn) {
_variant_subcolumn_fields.insert(binding.field_name);
}
+ _field_binding_map[binding.field_name] = &binding;
}
}
@@ -114,6 +115,7 @@ private:
const std::unordered_map<std::string, IndexIterator*>& _iterators;
std::shared_ptr<IndexQueryContext> _context;
std::vector<TSearchFieldBinding> _field_bindings;
+ std::unordered_map<std::string, const TSearchFieldBinding*>
_field_binding_map;
std::unordered_set<std::string> _variant_subcolumn_fields;
std::unordered_map<std::string, FieldReaderBinding> _cache;
std::vector<std::shared_ptr<lucene::index::IndexReader>> _readers;
@@ -182,13 +184,15 @@ public:
Status build_query_recursive(const TSearchClause& clause,
const std::shared_ptr<IndexQueryContext>&
context,
FieldReaderResolver& resolver,
- inverted_index::query_v2::QueryPtr* out,
- std::string* binding_key) const;
+ inverted_index::query_v2::QueryPtr* out,
std::string* binding_key,
+ const std::string& default_operator,
+ int32_t minimum_should_match) const;
Status build_leaf_query(const TSearchClause& clause,
const std::shared_ptr<IndexQueryContext>& context,
FieldReaderResolver& resolver,
inverted_index::query_v2::QueryPtr* out,
- std::string* binding_key) const;
+ std::string* binding_key, const std::string&
default_operator,
+ int32_t minimum_should_match) const;
Status collect_all_field_nulls(const TSearchClause& clause,
const std::unordered_map<std::string,
IndexIterator*>& iterators,
diff --git a/be/test/olap/rowset/segment_v2/inverted_index/query_v2/occur_boolean_query_real_index_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/query_v2/occur_boolean_query_real_index_test.cpp
index 4fc01f43e1d..0088b88dd4d 100644
--- a/be/test/olap/rowset/segment_v2/inverted_index/query_v2/occur_boolean_query_real_index_test.cpp
+++ b/be/test/olap/rowset/segment_v2/inverted_index/query_v2/occur_boolean_query_real_index_test.cpp
@@ -136,8 +136,7 @@ TEST_F(OccurBooleanQueryRealIndexTest, NotPhraseQuery) {
auto phrase_query = std::make_shared<PhraseQuery>(context, field,
term_infos);
- uint32_t max_doc = reader_holder->maxDoc();
- auto all_query = std::make_shared<AllQuery>(max_doc);
+ auto all_query = std::make_shared<AllQuery>();
std::vector<std::pair<Occur, QueryPtr>> clauses;
clauses.emplace_back(Occur::SHOULD, all_query);
@@ -255,8 +254,7 @@ TEST_F(OccurBooleanQueryRealIndexTest, NotPhraseQueryNonExistent) {
auto phrase_query = std::make_shared<PhraseQuery>(context, field,
term_infos);
- uint32_t max_doc = reader_holder->maxDoc();
- auto all_query = std::make_shared<AllQuery>(max_doc);
+ auto all_query = std::make_shared<AllQuery>();
std::vector<std::pair<Occur, QueryPtr>> clauses;
clauses.emplace_back(Occur::SHOULD, all_query);
@@ -309,8 +307,7 @@ TEST_F(OccurBooleanQueryRealIndexTest, NotPhraseQueryExcludesPartial) {
auto phrase_query = std::make_shared<PhraseQuery>(context, field,
term_infos);
- uint32_t max_doc = reader_holder->maxDoc();
- auto all_query = std::make_shared<AllQuery>(max_doc);
+ auto all_query = std::make_shared<AllQuery>();
std::vector<std::pair<Occur, QueryPtr>> clauses;
clauses.emplace_back(Occur::SHOULD, all_query);
diff --git a/be/test/olap/rowset/segment_v2/inverted_index/query_v2/occur_boolean_query_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/query_v2/occur_boolean_query_test.cpp
index 244ddfb8dcc..7d885ecce19 100644
--- a/be/test/olap/rowset/segment_v2/inverted_index/query_v2/occur_boolean_query_test.cpp
+++ b/be/test/olap/rowset/segment_v2/inverted_index/query_v2/occur_boolean_query_test.cpp
@@ -874,7 +874,7 @@ TEST_F(OccurBooleanQueryTest, AllQueryWithMustClause) {
std::vector<std::pair<Occur, QueryPtr>> clauses;
clauses.emplace_back(Occur::MUST, std::make_shared<MockQuery>(must_docs));
- clauses.emplace_back(Occur::MUST, std::make_shared<AllQuery>(100));
+ clauses.emplace_back(Occur::MUST, std::make_shared<AllQuery>());
OccurBooleanQuery query(std::move(clauses));
auto weight = query.weight(false);
@@ -891,7 +891,7 @@ TEST_F(OccurBooleanQueryTest, AllQueryWithShouldClause) {
std::vector<std::pair<Occur, QueryPtr>> clauses;
clauses.emplace_back(Occur::SHOULD,
std::make_shared<MockQuery>(should_docs));
- clauses.emplace_back(Occur::SHOULD, std::make_shared<AllQuery>(50));
+ clauses.emplace_back(Occur::SHOULD, std::make_shared<AllQuery>());
OccurBooleanQuery query(std::move(clauses));
auto weight = query.weight(false);
@@ -909,7 +909,7 @@ TEST_F(OccurBooleanQueryTest, AllQueryWithMustNotClause) {
auto must_not_docs = std::vector<uint32_t> {10, 20, 30, 40, 50};
std::vector<std::pair<Occur, QueryPtr>> clauses;
- clauses.emplace_back(Occur::MUST, std::make_shared<AllQuery>(100));
+ clauses.emplace_back(Occur::MUST, std::make_shared<AllQuery>());
clauses.emplace_back(Occur::MUST_NOT,
std::make_shared<MockQuery>(must_not_docs));
OccurBooleanQuery query(std::move(clauses));
@@ -930,8 +930,8 @@ TEST_F(OccurBooleanQueryTest, MultipleAllQueriesWithMust) {
std::vector<std::pair<Occur, QueryPtr>> clauses;
clauses.emplace_back(Occur::MUST, std::make_shared<MockQuery>(must_docs));
- clauses.emplace_back(Occur::MUST, std::make_shared<AllQuery>(100));
- clauses.emplace_back(Occur::MUST, std::make_shared<AllQuery>(100));
+ clauses.emplace_back(Occur::MUST, std::make_shared<AllQuery>());
+ clauses.emplace_back(Occur::MUST, std::make_shared<AllQuery>());
OccurBooleanQuery query(std::move(clauses));
auto weight = query.weight(false);
@@ -945,7 +945,7 @@ TEST_F(OccurBooleanQueryTest, AllQueryOnlyMust) {
_ctx.segment_num_rows = 50;
std::vector<std::pair<Occur, QueryPtr>> clauses;
- clauses.emplace_back(Occur::MUST, std::make_shared<AllQuery>(50));
+ clauses.emplace_back(Occur::MUST, std::make_shared<AllQuery>());
OccurBooleanQuery query(std::move(clauses));
auto weight = query.weight(false);
@@ -967,7 +967,7 @@ TEST_F(OccurBooleanQueryTest, AllQueryWithMustAndShouldMinMatch) {
std::vector<std::pair<Occur, QueryPtr>> clauses;
clauses.emplace_back(Occur::MUST, std::make_shared<MockQuery>(must_docs));
- clauses.emplace_back(Occur::MUST, std::make_shared<AllQuery>(100));
+ clauses.emplace_back(Occur::MUST, std::make_shared<AllQuery>());
clauses.emplace_back(Occur::SHOULD,
std::make_shared<MockQuery>(should1_docs));
clauses.emplace_back(Occur::SHOULD,
std::make_shared<MockQuery>(should2_docs));
@@ -1014,7 +1014,7 @@ TEST_F(OccurBooleanQueryTest, ShouldOnlyWithAllQueryMinShouldMatch) {
std::vector<std::pair<Occur, QueryPtr>> clauses;
clauses.emplace_back(Occur::SHOULD,
std::make_shared<MockQuery>(should_docs));
- clauses.emplace_back(Occur::SHOULD, std::make_shared<AllQuery>(50));
+ clauses.emplace_back(Occur::SHOULD, std::make_shared<AllQuery>());
OccurBooleanQuery query(std::move(clauses), 2);
auto weight = query.weight(false);
@@ -1031,7 +1031,7 @@ TEST_F(OccurBooleanQueryTest, ShouldOnlyAllQueryScoring) {
std::vector<std::pair<Occur, QueryPtr>> clauses;
clauses.emplace_back(Occur::SHOULD,
std::make_shared<MockQuery>(std::vector<uint32_t> {1,
2}, 2.0F));
- clauses.emplace_back(Occur::SHOULD, std::make_shared<AllQuery>(10));
+ clauses.emplace_back(Occur::SHOULD, std::make_shared<AllQuery>());
OccurBooleanQuery query(std::move(clauses));
auto weight = query.weight(true);
diff --git a/be/test/olap/rowset/segment_v2/inverted_index/query_v2/regexp_wildcard_lowercase_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/query_v2/regexp_wildcard_lowercase_test.cpp
new file mode 100644
index 00000000000..f25ed8db8f0
--- /dev/null
+++ b/be/test/olap/rowset/segment_v2/inverted_index/query_v2/regexp_wildcard_lowercase_test.cpp
@@ -0,0 +1,228 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <roaring/roaring.hh>
+#include <string>
+#include <vector>
+
+#include "io/fs/local_file_system.h"
+#include "olap/rowset/segment_v2/index_query_context.h"
+#include "olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer.h"
+#include "olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_query.h"
+#include "olap/rowset/segment_v2/inverted_index/query_v2/wildcard_query/wildcard_query.h"
+#include "olap/rowset/segment_v2/inverted_index/util/string_helper.h"
+
+CL_NS_USE(search)
+CL_NS_USE(store)
+CL_NS_USE(index)
+
+namespace doris::segment_v2 {
+
+using namespace inverted_index;
+using namespace inverted_index::query_v2;
+
+// Test that REGEXP queries match directly against the term dictionary (no
lowercasing),
+// while WILDCARD queries are expected to receive already-lowercased patterns
from function_search.cpp.
+//
+// This test creates an index with lowercased terms (simulating
parser=english, lower_case=true)
+// and verifies:
+// 1. REGEXP with uppercase pattern does NOT match lowercased terms
(ES-compatible)
+// 2. REGEXP with lowercase pattern DOES match lowercased terms
+// 3. WILDCARD with lowercase pattern DOES match lowercased terms
+class RegexpWildcardLowercaseTest : public testing::Test {
+public:
+ const std::string kTestDir = "./ut_dir/regexp_wildcard_lowercase_test";
+
+ void SetUp() override {
+ auto st = io::global_local_filesystem()->delete_directory(kTestDir);
+ ASSERT_TRUE(st.ok()) << st;
+ st = io::global_local_filesystem()->create_directory(kTestDir);
+ ASSERT_TRUE(st.ok()) << st;
+ // Create index with lowercased terms (simulating lower_case=true
analyzer)
+ create_test_index("title", kTestDir);
+ }
+
+ void TearDown() override {
+
EXPECT_TRUE(io::global_local_filesystem()->delete_directory(kTestDir).ok());
+ }
+
+private:
+ void create_test_index(const std::string& field_name, const std::string&
dir) {
+ // Simulate data that was indexed with lower_case=true:
+ // Original data: "ABC DEF", "abc def", "Apple Banana", "cherry date"
+ // After english analyzer with lower_case=true, terms are all lowercase
+ std::vector<std::string> test_data = {"abc def", "abc def", "apple
banana", "cherry date"};
+
+ // Use standard tokenizer (which lowercases by default)
+ CustomAnalyzerConfig::Builder builder;
+ builder.with_tokenizer_config("standard", {});
+ auto custom_analyzer_config = builder.build();
+ auto custom_analyzer =
CustomAnalyzer::build_custom_analyzer(custom_analyzer_config);
+
+ auto* indexwriter =
+ _CLNEW lucene::index::IndexWriter(dir.c_str(),
custom_analyzer.get(), true);
+ indexwriter->setMaxBufferedDocs(100);
+ indexwriter->setRAMBufferSizeMB(-1);
+ indexwriter->setMaxFieldLength(0x7FFFFFFFL);
+ indexwriter->setMergeFactor(1000000000);
+ indexwriter->setUseCompoundFile(false);
+
+ auto char_string_reader =
std::make_shared<lucene::util::SStringReader<char>>();
+
+ auto* doc = _CLNEW lucene::document::Document();
+ int32_t field_config = lucene::document::Field::STORE_NO;
+ field_config |= lucene::document::Field::INDEX_NONORMS;
+ field_config |= lucene::document::Field::INDEX_TOKENIZED;
+ auto field_name_w = std::wstring(field_name.begin(), field_name.end());
+ auto* field = _CLNEW lucene::document::Field(field_name_w.c_str(),
field_config);
+ field->setOmitTermFreqAndPositions(false);
+ doc->add(*field);
+
+ for (const auto& data : test_data) {
+ char_string_reader->init(data.data(), data.size(), false);
+ auto* stream = custom_analyzer->reusableTokenStream(field->name(),
char_string_reader);
+ field->setValue(stream);
+ indexwriter->addDocument(doc);
+ }
+
+ indexwriter->close();
+ _CLLDELETE(indexwriter);
+ _CLLDELETE(doc);
+ }
+};
+
+static std::shared_ptr<lucene::index::IndexReader> make_shared_reader(
+ lucene::index::IndexReader* raw_reader) {
+ return {raw_reader, [](lucene::index::IndexReader* reader) {
+ if (reader != nullptr) {
+ reader->close();
+ _CLDELETE(reader);
+ }
+ }};
+}
+
+static std::vector<uint32_t> execute_query(const std::string& test_dir, const
std::wstring& field,
+ const std::shared_ptr<Query>&
query) {
+ auto* dir = FSDirectory::getDirectory(test_dir.c_str());
+ auto reader_holder =
make_shared_reader(lucene::index::IndexReader::open(dir, true));
+
+ auto weight = query->weight(false);
+
+ QueryExecutionContext exec_ctx;
+ exec_ctx.segment_num_rows = reader_holder->maxDoc();
+ exec_ctx.readers = {reader_holder};
+ exec_ctx.field_reader_bindings.emplace(field, reader_holder);
+
+ auto scorer = weight->scorer(exec_ctx);
+ std::vector<uint32_t> matched_docs;
+ if (scorer) {
+ uint32_t doc = scorer->doc();
+ while (doc != TERMINATED) {
+ matched_docs.push_back(doc);
+ doc = scorer->advance();
+ }
+ }
+
+ _CLDECDELETE(dir);
+ return matched_docs;
+}
+
+// REGEXP with uppercase pattern should NOT match lowercased index terms.
+// This is consistent with ES query_string regex behavior.
+TEST_F(RegexpWildcardLowercaseTest, RegexpUppercasePatternNoMatch) {
+ auto context = std::make_shared<IndexQueryContext>();
+ std::wstring field = StringHelper::to_wstring("title");
+
+ // Pattern "AB.*" should NOT match "abc" (uppercase vs lowercase)
+ auto query = std::make_shared<RegexpQuery>(context, field, "AB.*");
+ auto matched = execute_query(kTestDir, field, query);
+
+ EXPECT_EQ(matched.size(), 0)
+ << "Uppercase regex 'AB.*' should not match lowercased terms
'abc'";
+}
+
+// REGEXP with lowercase pattern SHOULD match lowercased index terms.
+TEST_F(RegexpWildcardLowercaseTest, RegexpLowercasePatternMatches) {
+ auto context = std::make_shared<IndexQueryContext>();
+ std::wstring field = StringHelper::to_wstring("title");
+
+ // Pattern "ab.*" should match "abc" (both lowercase)
+ auto query = std::make_shared<RegexpQuery>(context, field, "ab.*");
+ auto matched = execute_query(kTestDir, field, query);
+
+ // Docs 0 and 1 contain "abc", docs 2 and 3 don't
+ EXPECT_EQ(matched.size(), 2) << "Lowercase regex 'ab.*' should match
lowercased terms 'abc'";
+}
+
+// WILDCARD with lowercase pattern SHOULD match.
+// In function_search.cpp, WILDCARD patterns are lowercased before being
passed here.
+TEST_F(RegexpWildcardLowercaseTest, WildcardLowercasePatternMatches) {
+ auto context = std::make_shared<IndexQueryContext>();
+ std::wstring field = StringHelper::to_wstring("title");
+
+ // Pattern "ab*" (already lowercased by function_search.cpp) should match
"abc"
+ auto query = std::make_shared<WildcardQuery>(context, field, "ab*");
+ auto matched = execute_query(kTestDir, field, query);
+
+ EXPECT_EQ(matched.size(), 2) << "Lowercase wildcard 'ab*' should match
lowercased terms 'abc'";
+}
+
+// WILDCARD with uppercase pattern should NOT match lowercased index terms
+// (but in practice, function_search.cpp lowercases before passing to
WildcardQuery).
+TEST_F(RegexpWildcardLowercaseTest, WildcardUppercasePatternNoMatch) {
+ auto context = std::make_shared<IndexQueryContext>();
+ std::wstring field = StringHelper::to_wstring("title");
+
+ // Pattern "AB*" should NOT match "abc" at the WildcardQuery level
+ auto query = std::make_shared<WildcardQuery>(context, field, "AB*");
+ auto matched = execute_query(kTestDir, field, query);
+
+ EXPECT_EQ(matched.size(), 0) << "Uppercase wildcard 'AB*' should not match
lowercased terms";
+}
+
+// REGEXP with a more complex pattern
+TEST_F(RegexpWildcardLowercaseTest, RegexpComplexPatternMatches) {
+ auto context = std::make_shared<IndexQueryContext>();
+ std::wstring field = StringHelper::to_wstring("title");
+
+ // Pattern "ch.*y" should match "cherry" (lowercased)
+ auto query = std::make_shared<RegexpQuery>(context, field, "ch.*y");
+ auto matched = execute_query(kTestDir, field, query);
+
+ EXPECT_EQ(matched.size(), 1) << "Regex 'ch.*y' should match 'cherry' in
doc 3";
+ if (!matched.empty()) {
+ EXPECT_EQ(matched[0], 3);
+ }
+}
+
+// WILDCARD matching all terms with '*'
+TEST_F(RegexpWildcardLowercaseTest, WildcardStarMatchesAll) {
+ auto context = std::make_shared<IndexQueryContext>();
+ std::wstring field = StringHelper::to_wstring("title");
+
+ // Pattern "a*" should match "abc" and "apple"
+ auto query = std::make_shared<WildcardQuery>(context, field, "a*");
+ auto matched = execute_query(kTestDir, field, query);
+
+ // Docs 0,1 have "abc", doc 2 has "apple", doc 3 has no "a*" terms
+ EXPECT_EQ(matched.size(), 3) << "Wildcard 'a*' should match docs with
'abc' and 'apple'";
+}
+
+} // namespace doris::segment_v2
diff --git a/be/test/vec/function/function_search_test.cpp b/be/test/vec/function/function_search_test.cpp
index 64b64b0d667..4daa48f662a 100644
--- a/be/test/vec/function/function_search_test.cpp
+++ b/be/test/vec/function/function_search_test.cpp
@@ -1716,8 +1716,8 @@ TEST_F(FunctionSearchTest, TestBuildLeafQueryPhrase) {
inverted_index::query_v2::QueryPtr out;
std::string out_binding_key;
- Status st =
- function_search->build_leaf_query(clause, context, resolver, &out,
&out_binding_key);
+ Status st = function_search->build_leaf_query(clause, context, resolver,
&out, &out_binding_key,
+ "OR", 0);
EXPECT_TRUE(st.ok());
auto phrase_query =
std::dynamic_pointer_cast<inverted_index::query_v2::PhraseQuery>(out);
diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/SearchPredicate.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/SearchPredicate.java
index b53386206e9..2cd1035e298 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/SearchPredicate.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/SearchPredicate.java
@@ -17,6 +17,7 @@
package org.apache.doris.analysis;
+import org.apache.doris.catalog.Index;
import org.apache.doris.catalog.Type;
import
org.apache.doris.nereids.trees.expressions.functions.scalar.SearchDslParser;
import
org.apache.doris.nereids.trees.expressions.functions.scalar.SearchDslParser.QsPlan;
@@ -33,7 +34,9 @@ import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.util.ArrayList;
+import java.util.Collections;
import java.util.List;
+import java.util.Map;
import java.util.stream.IntStream;
/**
@@ -45,11 +48,18 @@ public class SearchPredicate extends Predicate {
private final String dslString;
private final QsPlan qsPlan;
+ private final List<Index> fieldIndexes;
public SearchPredicate(String dslString, QsPlan qsPlan, List<Expr>
children, boolean nullable) {
+ this(dslString, qsPlan, children, Collections.emptyList(), nullable);
+ }
+
+ public SearchPredicate(String dslString, QsPlan qsPlan, List<Expr>
children,
+ List<Index> fieldIndexes, boolean nullable) {
super();
this.dslString = dslString;
this.qsPlan = qsPlan;
+ this.fieldIndexes = fieldIndexes != null ? fieldIndexes :
Collections.emptyList();
this.type = Type.BOOLEAN;
// Add children (SlotReferences)
@@ -63,6 +73,7 @@ public class SearchPredicate extends Predicate {
super(other);
this.dslString = other.dslString;
this.qsPlan = other.qsPlan;
+ this.fieldIndexes = other.fieldIndexes;
}
@Override
@@ -183,10 +194,30 @@ public class SearchPredicate extends Predicate {
thriftBinding.setSlotIndex(i); // fallback to position
}
+ // Set index properties from FE Index lookup (needed for variant
subcolumn analyzer)
+ if (i < fieldIndexes.size() && fieldIndexes.get(i) != null) {
+ Map<String, String> properties =
fieldIndexes.get(i).getProperties();
+ if (properties != null && !properties.isEmpty()) {
+ thriftBinding.setIndexProperties(properties);
+ LOG.debug("buildThriftParam: field='{}'
index_properties={}",
+ fieldPath, properties);
+ }
+ }
+
bindings.add(thriftBinding);
}
param.setFieldBindings(bindings);
+ // Set default_operator for BE to use when tokenizing TERM queries
+ if (qsPlan.getDefaultOperator() != null) {
+ param.setDefaultOperator(qsPlan.getDefaultOperator());
+ }
+
+ // Set minimum_should_match for BE to use when tokenizing TERM queries
in Lucene mode
+ if (qsPlan.getMinimumShouldMatch() != null) {
+ param.setMinimumShouldMatch(qsPlan.getMinimumShouldMatch());
+ }
+
return param;
}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java
index a437e315371..c489576c7eb 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java
@@ -656,16 +656,32 @@ public class ExpressionTranslator extends DefaultExpressionVisitor<Expr, PlanTra
public Expr visitSearchExpression(SearchExpression searchExpression,
PlanTranslatorContext context) {
List<Expr> slotChildren = new ArrayList<>();
+ List<Index> fieldIndexes = new ArrayList<>();
// Convert slot reference children from Nereids to Analysis
for (Expression slotExpr : searchExpression.getSlotChildren()) {
Expr translatedSlot = slotExpr.accept(this, context);
slotChildren.add(translatedSlot);
+
+ // Look up the inverted index for each field (needed for variant
subcolumn analyzer)
+ Index invertedIndex = null;
+ if (slotExpr instanceof SlotReference) {
+ SlotReference slot = (SlotReference) slotExpr;
+ OlapTable olapTbl = getOlapTableDirectly(slot);
+ if (olapTbl != null) {
+ Column column = slot.getOriginalColumn().orElse(null);
+ if (column != null) {
+ invertedIndex = olapTbl.getInvertedIndex(column,
slot.getSubPath());
+ }
+ }
+ }
+ fieldIndexes.add(invertedIndex);
}
// Create SearchPredicate with proper slot children for BE "action on
slot" detection
SearchPredicate searchPredicate = new
SearchPredicate(searchExpression.getDslString(),
- searchExpression.getQsPlan(), slotChildren,
searchExpression.nullable());
+ searchExpression.getQsPlan(), slotChildren, fieldIndexes,
+ searchExpression.nullable());
return searchPredicate;
}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParser.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParser.java
index 61e253d710e..fbaba2b6e5d 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParser.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParser.java
@@ -22,6 +22,7 @@ import org.apache.doris.nereids.search.SearchParser;
import org.apache.doris.nereids.search.SearchParserBaseVisitor;
import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonSetter;
import com.fasterxml.jackson.core.JsonProcessingException;
@@ -110,23 +111,30 @@ public class SearchDslParser {
String defaultField = searchOptions.getDefaultField();
String defaultOperator = searchOptions.getDefaultOperator();
+ QsPlan plan;
// Use Lucene mode parser if specified
if (searchOptions.isLuceneMode()) {
// Multi-field + Lucene mode: first expand DSL, then parse with
Lucene semantics
if (searchOptions.isMultiFieldMode()) {
- return parseDslMultiFieldLuceneMode(dsl,
searchOptions.getFields(),
+ plan = parseDslMultiFieldLuceneMode(dsl,
searchOptions.getFields(),
defaultOperator, searchOptions);
- }
- return parseDslLuceneMode(dsl, defaultField, defaultOperator,
searchOptions);
- }
-
- // Multi-field mode parsing (standard mode)
- if (searchOptions.isMultiFieldMode()) {
- return parseDslMultiFieldMode(dsl, searchOptions.getFields(),
defaultOperator, searchOptions);
- }
-
- // Standard mode parsing
- return parseDslStandardMode(dsl, defaultField, defaultOperator);
+ } else {
+ plan = parseDslLuceneMode(dsl, defaultField, defaultOperator,
searchOptions);
+ }
+ } else if (searchOptions.isMultiFieldMode()) {
+ // Multi-field mode parsing (standard mode)
+ plan = parseDslMultiFieldMode(dsl, searchOptions.getFields(),
defaultOperator, searchOptions);
+ } else {
+ // Standard mode parsing
+ plan = parseDslStandardMode(dsl, defaultField, defaultOperator);
+ }
+
+ // Wrap plan with options for BE serialization
+ // NOTE: Must use normalizeDefaultOperator() here because BE compares
+ // default_operator case-sensitively against lowercase "and"/"or"
+ return new QsPlan(plan.getRoot(), plan.getFieldBindings(),
+ normalizeDefaultOperator(searchOptions.getDefaultOperator()),
+ searchOptions.getMinimumShouldMatch());
}
/**
@@ -480,6 +488,12 @@ public class SearchDslParser {
}
validateFieldsList(fields);
+ // For multi-field mode (fields.size() > 1), ignore
minimum_should_match.
+ // The expanded DSL creates complex nested boolean structures where msm
+ // semantics become ambiguous. This is a deliberate design decision.
+ final SearchOptions effectiveOptions = fields.size() > 1
+ ? options.withMinimumShouldMatch(null) : options;
+
String trimmedDsl = dsl.trim();
try {
@@ -507,22 +521,15 @@ public class SearchDslParser {
// Build AST using Lucene-mode visitor with first field as
placeholder for bare queries
// Use constructor with override to avoid mutating shared options
object (thread-safety)
- QsLuceneModeAstBuilder visitor = new
QsLuceneModeAstBuilder(options, fields.get(0));
+ QsLuceneModeAstBuilder visitor = new
QsLuceneModeAstBuilder(effectiveOptions, fields.get(0));
QsNode root = visitor.visit(tree);
- // Apply multi-field expansion based on type
- // Pass luceneMode=true since this is Lucene mode parsing
- QsNode expandedRoot;
- if (options.isCrossFieldsMode()) {
- // cross_fields: each term expands to
OCCUR_BOOLEAN(field1:term, field2:term)
- expandedRoot = MultiFieldExpander.expandCrossFields(root,
fields, true);
- } else if (options.isBestFieldsMode()) {
- // best_fields: entire query copied per field, joined with
OCCUR_BOOLEAN
- expandedRoot = MultiFieldExpander.expandBestFields(root,
fields, true);
- } else {
- throw new IllegalStateException(
- "Invalid type value: '" + options.getType() + "'.
Expected 'best_fields' or 'cross_fields'");
- }
+ // In ES query_string, both best_fields and cross_fields use
per-clause expansion
+ // (each clause is independently expanded across fields). The
difference is only
+ // in scoring (dis_max vs blended analysis), which doesn't apply
to Doris since
+ // search() is a boolean filter. So we always use
expandCrossFields here.
+ // Type validation already happened in SearchOptions.setType().
+ QsNode expandedRoot = MultiFieldExpander.expandCrossFields(root,
fields, true);
// Extract field bindings from expanded AST
Set<String> fieldNames = collectFieldNames(expandedRoot);
@@ -532,7 +539,10 @@ public class SearchDslParser {
bindings.add(new QsFieldBinding(fieldName, slotIndex++));
}
- return new QsPlan(expandedRoot, bindings);
+ // Include default_operator and minimum_should_match for BE
+ return new QsPlan(expandedRoot, bindings,
+
normalizeDefaultOperator(effectiveOptions.getDefaultOperator()),
+ effectiveOptions.getMinimumShouldMatch());
} catch (SearchDslSyntaxException e) {
LOG.error("Failed to parse search DSL in multi-field Lucene mode:
'{}'", dsl, e);
@@ -560,7 +570,8 @@ public class SearchDslParser {
AND, // clause1 AND clause2 (standard boolean algebra)
OR, // clause1 OR clause2 (standard boolean algebra)
NOT, // NOT clause (standard boolean algebra)
- OCCUR_BOOLEAN // Lucene-style boolean query with MUST/SHOULD/MUST_NOT
+ OCCUR_BOOLEAN, // Lucene-style boolean query with MUST/SHOULD/MUST_NOT
+ MATCH_ALL_DOCS // Matches all documents (used for pure NOT query
rewriting)
}
/**
@@ -816,6 +827,8 @@ public class SearchDslParser {
if (result == null) {
throw new RuntimeException("Invalid search value");
}
+ // Mark as explicit field - user wrote "field:term" syntax
+ result.setExplicitField(true);
return result;
} finally {
// Restore previous context
@@ -875,6 +888,10 @@ public class SearchDslParser {
}
private QsNode createPrefixNode(String fieldName, String value) {
+ // Standalone * → MATCH_ALL_DOCS (matches ES behavior: field:*
becomes ExistsQuery)
+ if ("*".equals(value)) {
+ return new QsNode(QsClauseType.MATCH_ALL_DOCS, (List<QsNode>)
null);
+ }
return new QsNode(QsClauseType.PREFIX, fieldName,
unescapeTermValue(value));
}
@@ -996,11 +1013,28 @@ public class SearchDslParser {
@JsonProperty("fieldBindings")
private final List<QsFieldBinding> fieldBindings;
+ @JsonProperty("defaultOperator")
+ private final String defaultOperator;
+
+ @JsonProperty("minimumShouldMatch")
+ private final Integer minimumShouldMatch;
+
@JsonCreator
public QsPlan(@JsonProperty("root") QsNode root,
@JsonProperty("fieldBindings") List<QsFieldBinding>
fieldBindings) {
+ this(root, fieldBindings, null, null);
+ }
+
+ public QsPlan(QsNode root, List<QsFieldBinding> fieldBindings, String
defaultOperator) {
+ this(root, fieldBindings, defaultOperator, null);
+ }
+
+ public QsPlan(QsNode root, List<QsFieldBinding> fieldBindings, String
defaultOperator,
+ Integer minimumShouldMatch) {
this.root = Objects.requireNonNull(root, "root cannot be null");
this.fieldBindings = fieldBindings != null ? new
ArrayList<>(fieldBindings) : new ArrayList<>();
+ this.defaultOperator = defaultOperator;
+ this.minimumShouldMatch = minimumShouldMatch;
}
public QsNode getRoot() {
@@ -1011,6 +1045,14 @@ public class SearchDslParser {
return Collections.unmodifiableList(fieldBindings);
}
+ public String getDefaultOperator() {
+ return defaultOperator;
+ }
+
+ public Integer getMinimumShouldMatch() {
+ return minimumShouldMatch;
+ }
+
/**
* Parse QsPlan from JSON string
*/
@@ -1036,7 +1078,7 @@ public class SearchDslParser {
@Override
public int hashCode() {
- return Objects.hash(root, fieldBindings);
+ return Objects.hash(root, fieldBindings, defaultOperator,
minimumShouldMatch);
}
@Override
@@ -1049,7 +1091,9 @@ public class SearchDslParser {
}
QsPlan qsPlan = (QsPlan) o;
return Objects.equals(root, qsPlan.getRoot())
- && Objects.equals(fieldBindings,
qsPlan.getFieldBindings());
+ && Objects.equals(fieldBindings, qsPlan.getFieldBindings())
+ && Objects.equals(defaultOperator,
qsPlan.getDefaultOperator())
+ && Objects.equals(minimumShouldMatch,
qsPlan.getMinimumShouldMatch());
}
}
@@ -1081,6 +1125,15 @@ public class SearchDslParser {
@JsonProperty("minimumShouldMatch")
private final Integer minimumShouldMatch;
+ /**
+ * Whether the field was explicitly specified in the DSL syntax (e.g.,
title:music)
+ * vs assigned from default field for bare queries (e.g., music).
+ * Used internally by MultiFieldExpander to avoid expanding explicit
field prefixes.
+ * Not serialized to JSON since it's only needed during FE-side AST
expansion.
+ */
+ @JsonIgnore
+ private boolean explicitField;
+
/**
* Constructor for JSON deserialization
*
@@ -1185,6 +1238,23 @@ public class SearchDslParser {
return minimumShouldMatch;
}
+ /**
+ * Returns whether the field was explicitly specified in the DSL
syntax.
+ */
+ public boolean isExplicitField() {
+ return explicitField;
+ }
+
+ /**
+ * Sets whether the field was explicitly specified in the DSL syntax.
+ * @param explicitField true if field was explicitly specified (e.g.,
title:music)
+ * @return this node for method chaining
+ */
+ public QsNode setExplicitField(boolean explicitField) {
+ this.explicitField = explicitField;
+ return this;
+ }
+
/**
* Sets the occur type for this node.
* @param occur the occur type (MUST, SHOULD, MUST_NOT)
@@ -1319,51 +1389,23 @@ public class SearchDslParser {
* @return Expanded AST
*/
public static QsNode expandBestFields(QsNode root, List<String>
fields) {
- return expandBestFields(root, fields, false);
- }
-
- /**
- * Expand AST using best_fields strategy with optional Lucene mode.
- * @param root The AST root node
- * @param fields List of fields to expand across
- * @param luceneMode If true, use Lucene-style OCCUR_BOOLEAN; if
false, use standard OR
- */
- public static QsNode expandBestFields(QsNode root, List<String>
fields, boolean luceneMode) {
if (fields == null || fields.isEmpty()) {
return root;
}
if (fields.size() == 1) {
- // Single field - just set the field on all leaf nodes
return setFieldOnLeaves(root, fields.get(0), fields);
}
- // Use the explicit luceneMode parameter only - don't infer from
node properties
- boolean isLuceneMode = luceneMode;
-
- // Create a copy of the entire AST for each field
+ // Non-lucene mode (used by parseDslMultiFieldMode for multi_match
semantics):
+ // Copy entire AST per field, join with OR.
+ // Example: "hello AND world" with fields=[title,content] becomes
+ // (title:hello AND title:world) OR (content:hello AND
content:world)
List<QsNode> fieldTrees = new ArrayList<>();
for (String field : fields) {
QsNode copy = deepCopyWithField(root, field, fields);
- // In Lucene mode, set SHOULD on each field tree
- if (isLuceneMode) {
- copy.setOccur(QsOccur.SHOULD);
- }
fieldTrees.add(copy);
}
-
- // In Lucene mode, create OCCUR_BOOLEAN instead of OR
- if (isLuceneMode) {
- // Preserve minimum_should_match from root if it has one
- Integer minShouldMatch = root.getMinimumShouldMatch();
- if (minShouldMatch == null) {
- // Default: at least 1 field should match
- minShouldMatch = 1;
- }
- return new QsNode(QsClauseType.OCCUR_BOOLEAN, fieldTrees,
minShouldMatch);
- } else {
- // Standard mode: join with OR
- return new QsNode(QsClauseType.OR, fieldTrees);
- }
+ return new QsNode(QsClauseType.OR, fieldTrees);
}
/**
@@ -1371,13 +1413,15 @@ public class SearchDslParser {
* Always returns a new copy or new node structure, never the original
node.
*/
private static QsNode expandNodeCrossFields(QsNode node, List<String>
fields, boolean luceneMode) {
+ // MATCH_ALL_DOCS matches all documents regardless of field -
don't expand
+ if (node.getType() == QsClauseType.MATCH_ALL_DOCS) {
+ return new QsNode(QsClauseType.MATCH_ALL_DOCS, (List<QsNode>)
null);
+ }
+
// Check if this is a leaf node (no children)
if (isLeafNode(node)) {
- // Check if the node has an explicit field that's NOT in the
fields list
- // If so, don't expand but still return a copy
- String nodeField = node.getField();
- if (nodeField != null && !nodeField.isEmpty() &&
!fields.contains(nodeField)) {
- // Explicit field not in expansion list - return a copy
preserving all fields
+ // If the user explicitly wrote "field:term" syntax, respect
it - don't expand
+ if (node.isExplicitField()) {
return new QsNode(
node.getType(),
node.getField(),
@@ -1450,17 +1494,13 @@ public class SearchDslParser {
* Always returns a new copy, never the original node.
*/
private static QsNode deepCopyWithField(QsNode node, String field,
List<String> fields) {
+ // MATCH_ALL_DOCS matches all documents regardless of field -
don't set field
+ if (node.getType() == QsClauseType.MATCH_ALL_DOCS) {
+ return new QsNode(QsClauseType.MATCH_ALL_DOCS, (List<QsNode>)
null);
+ }
if (isLeafNode(node)) {
- // Check if the node has an explicit field that's NOT in the
fields list
- String nodeField = node.getField();
- String targetField;
- if (nodeField != null && !nodeField.isEmpty() &&
!fields.contains(nodeField)) {
- // Explicit field not in expansion list - preserve
original field
- targetField = nodeField;
- } else {
- // Use new field
- targetField = field;
- }
+ // If the user explicitly wrote "field:term" syntax, preserve
original field
+ String targetField = node.isExplicitField() ? node.getField()
: field;
// Create a complete copy of the leaf node
QsNode copy = new QsNode(
@@ -1471,6 +1511,7 @@ public class SearchDslParser {
node.getOccur(),
node.getMinimumShouldMatch()
);
+ copy.setExplicitField(node.isExplicitField());
return copy;
}
@@ -1500,16 +1541,13 @@ public class SearchDslParser {
* Always returns a new copy, never the original node.
*/
private static QsNode setFieldOnLeaves(QsNode node, String field,
List<String> fields) {
+ // MATCH_ALL_DOCS matches all documents regardless of field -
don't set field
+ if (node.getType() == QsClauseType.MATCH_ALL_DOCS) {
+ return new QsNode(QsClauseType.MATCH_ALL_DOCS, (List<QsNode>)
null);
+ }
if (isLeafNode(node)) {
- // Check if the node has an explicit field that's NOT in the
fields list
- String nodeField = node.getField();
- String targetField;
- if (nodeField != null && !nodeField.isEmpty() &&
!fields.contains(nodeField)) {
- // Explicit field not in expansion list - preserve
original field
- targetField = nodeField;
- } else {
- targetField = field;
- }
+ // If the user explicitly wrote "field:term" syntax, preserve
original field
+ String targetField = node.isExplicitField() ? node.getField()
: field;
// Create complete copy
return new QsNode(
@@ -1676,6 +1714,21 @@ public class SearchDslParser {
return "cross_fields".equals(type);
}
+ /**
+ * Create a copy of this SearchOptions with a different
minimum_should_match value.
+ * Used for ES compatibility in multi-field mode where msm is ignored.
+ */
+ public SearchOptions withMinimumShouldMatch(Integer newMsm) {
+ SearchOptions copy = new SearchOptions();
+ copy.defaultField = this.defaultField;
+ copy.defaultOperator = this.defaultOperator;
+ copy.mode = this.mode;
+ copy.minimumShouldMatch = newMsm;
+ copy.fields = this.fields != null ? new ArrayList<>(this.fields) :
null;
+ copy.type = this.type;
+ return copy;
+ }
+
/**
* Validate the options after deserialization.
* Checks for:
@@ -1793,7 +1846,10 @@ public class SearchDslParser {
bindings.add(new QsFieldBinding(fieldName, slotIndex++));
}
- return new QsPlan(root, bindings);
+ // Include default_operator and minimum_should_match for BE
+ return new QsPlan(root, bindings,
+ normalizeDefaultOperator(defaultOperator),
+ options.getMinimumShouldMatch());
} catch (SearchDslSyntaxException e) {
// Syntax error in DSL - user input issue
@@ -1831,6 +1887,7 @@ public class SearchDslParser {
private String currentFieldName = null;
// Override for default field - used in multi-field mode to avoid
mutating options
private final String overrideDefaultField;
+ private int nestingLevel = 0;
public QsLuceneModeAstBuilder(SearchOptions options) {
this.options = options;
@@ -1894,11 +1951,17 @@ public class SearchDslParser {
if (terms.size() == 1) {
TermWithOccur singleTerm = terms.get(0);
if (singleTerm.isNegated) {
- // Single negated term - must wrap in OCCUR_BOOLEAN for BE
to handle MUST_NOT
+ // Single negated term - rewrite to:
SHOULD(MATCH_ALL_DOCS) + MUST_NOT(term)
+ // This ensures proper Lucene semantics: match all docs
EXCEPT those matching the term
singleTerm.node.setOccur(QsOccur.MUST_NOT);
+
+ QsNode matchAllNode = new
QsNode(QsClauseType.MATCH_ALL_DOCS, (List<QsNode>) null);
+ matchAllNode.setOccur(QsOccur.SHOULD);
+
List<QsNode> children = new ArrayList<>();
+ children.add(matchAllNode);
children.add(singleTerm.node);
- return new QsNode(QsClauseType.OCCUR_BOOLEAN, children, 0);
+ return new QsNode(QsClauseType.OCCUR_BOOLEAN, children, 1);
}
// Single non-negated term - return directly without wrapper
return singleTerm.node;
@@ -1908,37 +1971,32 @@ public class SearchDslParser {
applyLuceneBooleanLogic(terms);
// Determine minimum_should_match
- Integer minShouldMatch = options.getMinimumShouldMatch();
+ // Only use explicit option at top level; nested clauses use
default logic
+ Integer minShouldMatch = (nestingLevel == 0) ?
options.getMinimumShouldMatch() : null;
if (minShouldMatch == null) {
// Default: 0 if there are MUST clauses, 1 if only SHOULD
+ // This matches Lucene BooleanQuery default behavior
boolean hasMust = terms.stream().anyMatch(t -> t.occur ==
QsOccur.MUST);
boolean hasMustNot = terms.stream().anyMatch(t -> t.occur ==
QsOccur.MUST_NOT);
minShouldMatch = (hasMust || hasMustNot) ? 0 : 1;
}
- // Filter out SHOULD clauses if minimum_should_match=0 and there
are MUST clauses
final int finalMinShouldMatch = minShouldMatch;
- if (minShouldMatch == 0) {
- boolean hasMust = terms.stream().anyMatch(t -> t.occur ==
QsOccur.MUST);
- if (hasMust) {
- terms = terms.stream()
- .filter(t -> t.occur != QsOccur.SHOULD)
- .collect(Collectors.toList());
- }
- }
-
- if (terms.isEmpty()) {
- throw new RuntimeException("All terms filtered out in Lucene
boolean logic");
- }
if (terms.size() == 1) {
TermWithOccur remainingTerm = terms.get(0);
if (remainingTerm.occur == QsOccur.MUST_NOT) {
- // Single MUST_NOT term - must wrap in OCCUR_BOOLEAN for
BE to handle
+ // Single MUST_NOT term - rewrite to:
SHOULD(MATCH_ALL_DOCS) + MUST_NOT(term)
+ // This ensures proper Lucene semantics: match all docs
EXCEPT those matching the term
remainingTerm.node.setOccur(QsOccur.MUST_NOT);
+
+ QsNode matchAllNode = new
QsNode(QsClauseType.MATCH_ALL_DOCS, (List<QsNode>) null);
+ matchAllNode.setOccur(QsOccur.SHOULD);
+
List<QsNode> children = new ArrayList<>();
+ children.add(matchAllNode);
children.add(remainingTerm.node);
- return new QsNode(QsClauseType.OCCUR_BOOLEAN, children, 0);
+ return new QsNode(QsClauseType.OCCUR_BOOLEAN, children, 1);
}
return remainingTerm.node;
}
@@ -2026,8 +2084,14 @@ public class SearchDslParser {
QsNode node;
if (atomCtx.clause() != null) {
- // Parenthesized clause - visit recursively
- node = visit(atomCtx.clause());
+ // Parenthesized clause - visit recursively with incremented
nesting level
+ // This ensures nested clauses don't use top-level
minimum_should_match
+ nestingLevel++;
+ try {
+ node = visit(atomCtx.clause());
+ } finally {
+ nestingLevel--;
+ }
} else if (atomCtx.fieldQuery() != null) {
// Field query with explicit field prefix
node = visit(atomCtx.fieldQuery());
@@ -2048,14 +2112,23 @@ public class SearchDslParser {
/**
* Apply Lucene boolean logic to determine final MUST/SHOULD/MUST_NOT
for each term.
* <p>
- * Rules (processed left-to-right):
- * 1. First term: MUST (due to default_operator=AND)
- * 2. AND introduces: marks preceding and current as MUST
- * 3. OR introduces: marks preceding and current as SHOULD
- * 4. NOT modifier: marks current as MUST_NOT
- * 5. AND after MUST_NOT: the MUST_NOT term is not affected, current
becomes MUST
+ * Faithfully replicates Lucene QueryParserBase.addClause() semantics:
+ * - Processes terms left-to-right with NO operator precedence (AND/OR
are equal)
+ * - Each conjunction affects at most the immediately preceding term
+ * <p>
+ * With OR_OPERATOR (default_operator=OR):
+ * - First term / no conjunction: SHOULD
+ * - AND: preceding becomes MUST, current MUST
+ * - OR: current SHOULD (preceding unchanged)
+ * <p>
+ * With AND_OPERATOR (default_operator=AND):
+ * - First term / no conjunction: MUST
+ * - AND: preceding becomes MUST, current MUST
+ * - OR: preceding becomes SHOULD, current SHOULD
*/
private void applyLuceneBooleanLogic(List<TermWithOccur> terms) {
+ boolean useAnd =
"AND".equalsIgnoreCase(options.getDefaultOperator());
+
for (int i = 0; i < terms.size(); i++) {
TermWithOccur current = terms.get(i);
@@ -2063,36 +2136,44 @@ public class SearchDslParser {
// NOT modifier - mark as MUST_NOT
current.occur = QsOccur.MUST_NOT;
- // OR + NOT: preceding becomes SHOULD (if not already
MUST_NOT)
- if (current.introducedByOr && i > 0) {
+ if (current.introducedByAnd && i > 0) {
+ // AND + NOT: AND still makes preceding MUST
+ TermWithOccur prev = terms.get(i - 1);
+ if (prev.occur != QsOccur.MUST_NOT) {
+ prev.occur = QsOccur.MUST;
+ }
+ } else if (current.introducedByOr && i > 0 && useAnd) {
+ // OR + NOT with AND_OPERATOR: preceding becomes SHOULD
TermWithOccur prev = terms.get(i - 1);
if (prev.occur != QsOccur.MUST_NOT) {
prev.occur = QsOccur.SHOULD;
}
}
+ // OR + NOT with OR_OPERATOR: no change to preceding
} else if (current.introducedByAnd) {
- // AND introduces: both preceding and current are MUST
+ // AND: preceding becomes MUST, current MUST
current.occur = QsOccur.MUST;
if (i > 0) {
TermWithOccur prev = terms.get(i - 1);
- // Don't change MUST_NOT to MUST
if (prev.occur != QsOccur.MUST_NOT) {
prev.occur = QsOccur.MUST;
}
}
} else if (current.introducedByOr) {
- // OR introduces: both preceding and current are SHOULD
+ // OR: current is SHOULD
current.occur = QsOccur.SHOULD;
- if (i > 0) {
+ // Only change preceding to SHOULD if default_operator=AND
+ // (Lucene: OR_OPERATOR + CONJ_OR does NOT modify
preceding)
+ if (useAnd && i > 0) {
TermWithOccur prev = terms.get(i - 1);
- // Don't change MUST_NOT to SHOULD
if (prev.occur != QsOccur.MUST_NOT) {
prev.occur = QsOccur.SHOULD;
}
}
} else {
- // First term: MUST (default_operator=AND)
- current.occur = QsOccur.MUST;
+ // First term or implicit conjunction (no explicit AND/OR)
+ // Lucene: SHOULD for OR_OPERATOR, MUST for AND_OPERATOR
+ current.occur = useAnd ? QsOccur.MUST : QsOccur.SHOULD;
}
}
}
@@ -2218,7 +2299,10 @@ public class SearchDslParser {
currentFieldName = fieldPath;
try {
- return visit(ctx.searchValue());
+ QsNode result = visit(ctx.searchValue());
+ // Mark as explicit field - user wrote "field:term" syntax
+ result.setExplicitField(true);
+ return result;
} finally {
currentFieldName = previousFieldName;
}
@@ -2242,7 +2326,12 @@ public class SearchDslParser {
return new QsNode(QsClauseType.TERM, fieldName,
unescapeTermValue(ctx.TERM().getText()));
}
if (ctx.PREFIX() != null) {
- return new QsNode(QsClauseType.PREFIX, fieldName,
unescapeTermValue(ctx.PREFIX().getText()));
+ String prefixText = ctx.PREFIX().getText();
+ // Standalone * → MATCH_ALL_DOCS (matches ES behavior: field:*
becomes ExistsQuery)
+ if ("*".equals(prefixText)) {
+ return new QsNode(QsClauseType.MATCH_ALL_DOCS,
(List<QsNode>) null);
+ }
+ return new QsNode(QsClauseType.PREFIX, fieldName,
unescapeTermValue(prefixText));
}
if (ctx.WILDCARD() != null) {
return new QsNode(QsClauseType.WILDCARD, fieldName,
unescapeTermValue(ctx.WILDCARD().getText()));
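
Editor's note: the new applyLuceneBooleanLogic contract above is easiest to see in
isolation. The sketch below is illustrative only; the Term/Occur names are invented
stand-ins, not the parser's TermWithOccur/QsOccur types. It re-implements the
documented left-to-right rule: no AND/OR precedence, each conjunction touching at
most the immediately preceding term, and MUST_NOT never overwritten later.

import java.util.Arrays;
import java.util.List;

// Standalone sketch of the occur-assignment rule documented in
// applyLuceneBooleanLogic. Illustrative names only.
public class LuceneOccurSketch {
    enum Occur { MUST, SHOULD, MUST_NOT }

    static final class Term {
        final String text;
        final boolean byAnd;   // introduced by AND
        final boolean byOr;    // introduced by OR
        final boolean negated; // NOT modifier
        Occur occur;

        Term(String text, boolean byAnd, boolean byOr, boolean negated) {
            this.text = text;
            this.byAnd = byAnd;
            this.byOr = byOr;
            this.negated = negated;
        }
    }

    // Left-to-right, no precedence: a conjunction only affects the
    // current term and (sometimes) the one directly before it.
    static void assignOccurs(List<Term> terms, boolean useAnd) {
        for (int i = 0; i < terms.size(); i++) {
            Term cur = terms.get(i);
            if (cur.negated) {
                cur.occur = Occur.MUST_NOT;
                if (cur.byAnd && i > 0) {
                    promote(terms.get(i - 1), Occur.MUST);
                } else if (cur.byOr && i > 0 && useAnd) {
                    promote(terms.get(i - 1), Occur.SHOULD);
                }
            } else if (cur.byAnd) {
                cur.occur = Occur.MUST;
                if (i > 0) {
                    promote(terms.get(i - 1), Occur.MUST);
                }
            } else if (cur.byOr) {
                cur.occur = Occur.SHOULD;
                if (useAnd && i > 0) {
                    promote(terms.get(i - 1), Occur.SHOULD);
                }
            } else {
                // First term or implicit conjunction (no explicit AND/OR).
                cur.occur = useAnd ? Occur.MUST : Occur.SHOULD;
            }
        }
    }

    // A MUST_NOT term is never flipped by a following AND/OR.
    static void promote(Term prev, Occur occur) {
        if (prev.occur != Occur.MUST_NOT) {
            prev.occur = occur;
        }
    }

    public static void main(String[] args) {
        // "a AND b OR c" with default_operator=OR  ->  +a +b c
        List<Term> terms = Arrays.asList(
                new Term("a", false, false, false),
                new Term("b", true, false, false),
                new Term("c", false, true, false));
        assignOccurs(terms, false);
        terms.forEach(t -> System.out.println(t.text + " -> " + t.occur));
    }
}

Running it on "a AND b OR c" with default_operator=OR prints MUST(a), MUST(b),
SHOULD(c), which is exactly what testLuceneModeAndOrMixed asserts below.
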
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/analysis/SearchPredicateTest.java
b/fe/fe-core/src/test/java/org/apache/doris/analysis/SearchPredicateTest.java
index 65c6a750766..62c45e9a60d 100644
---
a/fe/fe-core/src/test/java/org/apache/doris/analysis/SearchPredicateTest.java
+++
b/fe/fe-core/src/test/java/org/apache/doris/analysis/SearchPredicateTest.java
@@ -17,10 +17,12 @@
package org.apache.doris.analysis;
+import org.apache.doris.catalog.Index;
import org.apache.doris.catalog.Type;
import
org.apache.doris.nereids.trees.expressions.functions.scalar.SearchDslParser;
import org.apache.doris.thrift.TExprNode;
import org.apache.doris.thrift.TExprNodeType;
+import org.apache.doris.thrift.TSearchFieldBinding;
import org.apache.doris.thrift.TSearchParam;
import org.junit.jupiter.api.Assertions;
@@ -28,7 +30,9 @@ import org.junit.jupiter.api.Test;
import java.util.Arrays;
import java.util.Collections;
+import java.util.HashMap;
import java.util.List;
+import java.util.Map;
/**
@@ -252,4 +256,136 @@ public class SearchPredicateTest {
Assertions.assertNotNull(thriftNode.search_param);
Assertions.assertEquals(dsl, thriftNode.search_param.original_dsl);
}
+
+ @Test
+ public void testFieldIndexesPassedToThrift() {
+ // Simulate a variant subcolumn search where FE passes index properties
+ String dsl = "data.string_8:admin";
+
+ SearchDslParser.QsNode root = new SearchDslParser.QsNode(
+ SearchDslParser.QsClauseType.TERM, "data.string_8", "admin");
+ List<SearchDslParser.QsFieldBinding> bindings = Arrays.asList(
+ new SearchDslParser.QsFieldBinding("data.string_8", 0));
+ SearchDslParser.QsPlan plan = new SearchDslParser.QsPlan(root,
bindings);
+
+ SlotRef dataSlot = createTestSlotRef("data");
+ List<Expr> children = Arrays.asList(dataSlot);
+
+ // Create an Index with analyzer properties (simulates field_pattern
matched index)
+ Map<String, String> indexProps = new HashMap<>();
+ indexProps.put("parser", "unicode");
+ indexProps.put("lower_case", "true");
+ Index invertedIndex = new Index(1L, "idx_text", Arrays.asList("data"),
+
org.apache.doris.nereids.trees.plans.commands.info.IndexDefinition.IndexType.INVERTED,
indexProps, "");
+
+ List<Index> fieldIndexes = Arrays.asList(invertedIndex);
+
+ SearchPredicate predicate = new SearchPredicate(dsl, plan, children,
fieldIndexes, true);
+
+ TExprNode thriftNode = new TExprNode();
+ predicate.toThrift(thriftNode);
+
+ TSearchParam param = thriftNode.search_param;
+ Assertions.assertNotNull(param);
+ Assertions.assertEquals(1, param.field_bindings.size());
+
+ TSearchFieldBinding binding = param.field_bindings.get(0);
+ Assertions.assertEquals("data.string_8", binding.field_name);
+ Assertions.assertTrue(binding.is_variant_subcolumn);
+ Assertions.assertEquals("data", binding.parent_field_name);
+ Assertions.assertEquals("string_8", binding.subcolumn_path);
+
+ // Verify index_properties are set
+ Assertions.assertNotNull(binding.index_properties);
+ Assertions.assertEquals("unicode",
binding.index_properties.get("parser"));
+ Assertions.assertEquals("true",
binding.index_properties.get("lower_case"));
+ }
+
+ @Test
+ public void testFieldIndexesNullDoesNotSetProperties() {
+ String dsl = "title:hello";
+ SearchDslParser.QsPlan plan = createTestPlan();
+ SlotRef titleSlot = createTestSlotRef("title");
+ List<Expr> children = Arrays.asList(titleSlot);
+
+ // Pass null Index in the fieldIndexes list
+ List<Index> fieldIndexes = Arrays.asList((Index) null);
+
+ SearchPredicate predicate = new SearchPredicate(dsl, plan, children,
fieldIndexes, true);
+
+ TExprNode thriftNode = new TExprNode();
+ predicate.toThrift(thriftNode);
+
+ TSearchParam param = thriftNode.search_param;
+ TSearchFieldBinding binding = param.field_bindings.get(0);
+
+ // index_properties should not be set when Index is null
+ Assertions.assertFalse(binding.isSetIndexProperties());
+ }
+
+ @Test
+ public void testFieldIndexesEmptyListBackwardCompatible() {
+ // Verify that using the old constructor (without fieldIndexes) still
works
+ String dsl = "title:hello";
+ SearchDslParser.QsPlan plan = createTestPlan();
+ SlotRef titleSlot = createTestSlotRef("title");
+ List<Expr> children = Arrays.asList(titleSlot);
+
+ // Constructor without fieldIndexes
+ SearchPredicate predicate = new SearchPredicate(dsl, plan, children,
true);
+
+ TExprNode thriftNode = new TExprNode();
+ predicate.toThrift(thriftNode);
+
+ TSearchParam param = thriftNode.search_param;
+ TSearchFieldBinding binding = param.field_bindings.get(0);
+
+ // index_properties should not be set
+ Assertions.assertFalse(binding.isSetIndexProperties());
+ }
+
+ @Test
+ public void testMultipleFieldsWithMixedIndexes() {
+ String dsl = "title:hello AND data.string_8:admin";
+
+ SearchDslParser.QsNode leftChild = new SearchDslParser.QsNode(
+ SearchDslParser.QsClauseType.TERM, "title", "hello");
+ SearchDslParser.QsNode rightChild = new SearchDslParser.QsNode(
+ SearchDslParser.QsClauseType.TERM, "data.string_8", "admin");
+ SearchDslParser.QsNode root = new SearchDslParser.QsNode(
+ SearchDslParser.QsClauseType.AND, Arrays.asList(leftChild,
rightChild));
+
+ List<SearchDslParser.QsFieldBinding> fieldBindings = Arrays.asList(
+ new SearchDslParser.QsFieldBinding("title", 0),
+ new SearchDslParser.QsFieldBinding("data.string_8", 1));
+ SearchDslParser.QsPlan plan = new SearchDslParser.QsPlan(root,
fieldBindings);
+
+ List<Expr> children = Arrays.asList(
+ createTestSlotRef("title"),
+ createTestSlotRef("data"));
+
+ // First field has no index, second has index with analyzer
+ Map<String, String> indexProps = new HashMap<>();
+ indexProps.put("parser", "unicode");
+ indexProps.put("lower_case", "true");
+ Index variantIndex = new Index(1L, "idx_text", Arrays.asList("data"),
+
org.apache.doris.nereids.trees.plans.commands.info.IndexDefinition.IndexType.INVERTED,
indexProps, "");
+
+ List<Index> fieldIndexes = Arrays.asList(null, variantIndex);
+
+ SearchPredicate predicate = new SearchPredicate(dsl, plan, children,
fieldIndexes, true);
+
+ TExprNode thriftNode = new TExprNode();
+ predicate.toThrift(thriftNode);
+
+ TSearchParam param = thriftNode.search_param;
+ Assertions.assertEquals(2, param.field_bindings.size());
+
+ // First field: no index_properties
+
Assertions.assertFalse(param.field_bindings.get(0).isSetIndexProperties());
+
+ // Second field: has index_properties
+
Assertions.assertTrue(param.field_bindings.get(1).isSetIndexProperties());
+ Assertions.assertEquals("unicode",
param.field_bindings.get(1).index_properties.get("parser"));
+ }
}
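
Editor's note: the new tests above read the propagated analyzer settings straight
off the thrift struct. As a minimal sketch of the consuming side (the real consumer
is the BE, written in C++; Java is used here only for consistency with the rest of
this change, and the helper itself is invented), the propagated properties can be
thought of like this, using only the accessors already exercised in the tests:

import java.util.Collections;
import java.util.Map;

import org.apache.doris.thrift.TSearchFieldBinding;

// Invented helper: sketches how the parser / lower_case properties that
// SearchPredicate.toThrift() attaches to each TSearchFieldBinding can be
// consumed on the other side of the wire.
final class BindingPropertiesSketch {
    static Map<String, String> propertiesOf(TSearchFieldBinding binding) {
        // Bindings built without a matching Index carry no properties at all.
        return binding.isSetIndexProperties()
                ? binding.index_properties
                : Collections.<String, String>emptyMap();
    }

    static String parserOf(TSearchFieldBinding binding) {
        // e.g. "unicode" for the variant subcolumn case above;
        // null means no analyzer is configured for the field.
        return propertiesOf(binding).get("parser");
    }
}
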
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParserTest.java
b/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParserTest.java
index d293433eb1a..214f309bded 100644
---
a/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParserTest.java
+++
b/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParserTest.java
@@ -664,29 +664,39 @@ public class SearchDslParserTest {
@Test
public void testLuceneModeAndOrMixed() {
// Test: "a AND b OR c" in Lucene mode with minimum_should_match=0
- // Expected: +a (SHOULD terms discarded because MUST exists)
+ // Lucene addClause semantics (left-to-right, no precedence,
default_operator=OR):
+ // a(CONJ_NONE)→SHOULD, b(CONJ_AND)→prev MUST, b MUST,
c(CONJ_OR)→SHOULD (prev unchanged)
+ // Result: [MUST(a), MUST(b), SHOULD(c)] with msm=0
+ // ES: +a +b c (SHOULD(c) kept, not filtered — msm=0 means
optional, not removed)
String dsl = "field:a AND field:b OR field:c";
String options = "{\"mode\":\"lucene\",\"minimum_should_match\":0}";
QsPlan plan = SearchDslParser.parseDsl(dsl, options);
Assertions.assertNotNull(plan);
- // With minimum_should_match=0 and MUST clauses present, SHOULD is
discarded
- // Only "a" remains with MUST
- Assertions.assertEquals(QsClauseType.TERM, plan.getRoot().getType());
- Assertions.assertEquals("field", plan.getRoot().getField());
- Assertions.assertEquals("a", plan.getRoot().getValue());
+ Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN,
plan.getRoot().getType());
+ Assertions.assertEquals(3, plan.getRoot().getChildren().size());
+
+ QsNode nodeA = plan.getRoot().getChildren().get(0);
+ Assertions.assertEquals("a", nodeA.getValue());
+ Assertions.assertEquals(SearchDslParser.QsOccur.MUST,
nodeA.getOccur());
+
+ QsNode nodeB = plan.getRoot().getChildren().get(1);
+ Assertions.assertEquals("b", nodeB.getValue());
+ Assertions.assertEquals(SearchDslParser.QsOccur.MUST,
nodeB.getOccur());
+
+ QsNode nodeC = plan.getRoot().getChildren().get(2);
+ Assertions.assertEquals("c", nodeC.getValue());
+ Assertions.assertEquals(SearchDslParser.QsOccur.SHOULD,
nodeC.getOccur());
}
@Test
public void testLuceneModeAndOrNotMixed() {
// Test: "a AND b OR NOT c AND d" in Lucene mode
- // Expected processing:
- // - a: MUST (first term, default_operator=AND)
- // - b: MUST (AND introduces)
- // - c: MUST_NOT (OR + NOT, but OR makes preceding SHOULD, NOT makes
current MUST_NOT)
- // - d: MUST (AND introduces)
- // With minimum_should_match=0: b becomes SHOULD and is discarded
- // Result: +a -c +d
+ // Lucene addClause semantics (left-to-right, no precedence):
+ // a(CONJ_NONE)→SHOULD, b(CONJ_AND)→prev MUST, b MUST,
+ // NOT c(CONJ_OR, MOD_NOT)→MUST_NOT (prev unchanged with
OR_OPERATOR),
+ // d(CONJ_AND)→prev(c) skip (MUST_NOT), d MUST
+ // Result: [MUST(a), MUST(b), MUST_NOT(c), MUST(d)] = +a +b -c +d
String dsl = "field:a AND field:b OR NOT field:c AND field:d";
String options = "{\"mode\":\"lucene\",\"minimum_should_match\":0}";
QsPlan plan = SearchDslParser.parseDsl(dsl, options);
@@ -694,19 +704,22 @@ public class SearchDslParserTest {
Assertions.assertNotNull(plan);
Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN,
plan.getRoot().getType());
- // Should have 3 children: a(MUST), c(MUST_NOT), d(MUST)
- // b is filtered out because it becomes SHOULD
- Assertions.assertEquals(3, plan.getRoot().getChildren().size());
+ // Should have 4 children: a(MUST), b(MUST), c(MUST_NOT), d(MUST)
+ Assertions.assertEquals(4, plan.getRoot().getChildren().size());
QsNode nodeA = plan.getRoot().getChildren().get(0);
Assertions.assertEquals("a", nodeA.getValue());
Assertions.assertEquals(SearchDslParser.QsOccur.MUST,
nodeA.getOccur());
- QsNode nodeC = plan.getRoot().getChildren().get(1);
+ QsNode nodeB = plan.getRoot().getChildren().get(1);
+ Assertions.assertEquals("b", nodeB.getValue());
+ Assertions.assertEquals(SearchDslParser.QsOccur.MUST,
nodeB.getOccur());
+
+ QsNode nodeC = plan.getRoot().getChildren().get(2);
Assertions.assertEquals("c", nodeC.getValue());
Assertions.assertEquals(SearchDslParser.QsOccur.MUST_NOT,
nodeC.getOccur());
- QsNode nodeD = plan.getRoot().getChildren().get(2);
+ QsNode nodeD = plan.getRoot().getChildren().get(3);
Assertions.assertEquals("d", nodeD.getValue());
Assertions.assertEquals(SearchDslParser.QsOccur.MUST,
nodeD.getOccur());
}
@@ -714,33 +727,58 @@ public class SearchDslParserTest {
@Test
public void testLuceneModeWithDefaultField() {
// Test: Lucene mode with default field expansion
+ // Lucene addClause semantics with default_operator=AND (AND_OPERATOR):
+ // aterm(CONJ_NONE)→MUST, bterm(CONJ_AND)→prev MUST, bterm MUST,
+ // cterm(CONJ_OR)→SHOULD + prev(bterm) becomes SHOULD (AND_OPERATOR
+ CONJ_OR)
+ // Result: [MUST(aterm), SHOULD(bterm), SHOULD(cterm)] with msm=0
+ // ES: +aterm bterm cterm
String dsl = "aterm AND bterm OR cterm";
- // Now default_field and default_operator are inside the options JSON
String options =
"{\"default_field\":\"firstname\",\"default_operator\":\"and\","
+ "\"mode\":\"lucene\",\"minimum_should_match\":0}";
QsPlan plan = SearchDslParser.parseDsl(dsl, options);
Assertions.assertNotNull(plan);
- // With minimum_should_match=0, only aterm (MUST) remains
- Assertions.assertEquals(QsClauseType.TERM, plan.getRoot().getType());
- Assertions.assertEquals("firstname", plan.getRoot().getField());
- Assertions.assertEquals("aterm", plan.getRoot().getValue());
+ Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN,
plan.getRoot().getType());
+ Assertions.assertEquals(3, plan.getRoot().getChildren().size());
+
+ QsNode nodeA = plan.getRoot().getChildren().get(0);
+ Assertions.assertEquals("firstname", nodeA.getField());
+ Assertions.assertEquals("aterm", nodeA.getValue());
+ Assertions.assertEquals(SearchDslParser.QsOccur.MUST,
nodeA.getOccur());
+
+ QsNode nodeB = plan.getRoot().getChildren().get(1);
+ Assertions.assertEquals("bterm", nodeB.getValue());
+ Assertions.assertEquals(SearchDslParser.QsOccur.SHOULD,
nodeB.getOccur());
+
+ QsNode nodeC = plan.getRoot().getChildren().get(2);
+ Assertions.assertEquals("cterm", nodeC.getValue());
+ Assertions.assertEquals(SearchDslParser.QsOccur.SHOULD,
nodeC.getOccur());
}
@Test
public void testLuceneModeNotOperator() {
// Test: "NOT a" in Lucene mode
- // In Lucene mode, single NOT produces OCCUR_BOOLEAN with a MUST_NOT
child
- // (wrapped for BE to handle the negation properly)
+ // Pure NOT queries are rewritten to: SHOULD(MATCH_ALL_DOCS) +
MUST_NOT(term)
+ // with minimum_should_match=1, following ES/Lucene semantics where
pure NOT
+ // should return all documents EXCEPT those matching the NOT clause
String dsl = "NOT field:a";
String options = "{\"mode\":\"lucene\"}";
QsPlan plan = SearchDslParser.parseDsl(dsl, options);
Assertions.assertNotNull(plan);
Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN,
plan.getRoot().getType());
- Assertions.assertEquals(1, plan.getRoot().getChildren().size());
- Assertions.assertEquals(QsClauseType.TERM,
plan.getRoot().getChildren().get(0).getType());
- Assertions.assertEquals(QsOccur.MUST_NOT,
plan.getRoot().getChildren().get(0).getOccur());
+ Assertions.assertEquals(2, plan.getRoot().getChildren().size());
+ Assertions.assertEquals(Integer.valueOf(1),
plan.getRoot().getMinimumShouldMatch());
+
+ // First child: MATCH_ALL_DOCS with SHOULD
+ QsNode matchAllNode = plan.getRoot().getChildren().get(0);
+ Assertions.assertEquals(QsClauseType.MATCH_ALL_DOCS,
matchAllNode.getType());
+ Assertions.assertEquals(QsOccur.SHOULD, matchAllNode.getOccur());
+
+ // Second child: TERM with MUST_NOT
+ QsNode termNode = plan.getRoot().getChildren().get(1);
+ Assertions.assertEquals(QsClauseType.TERM, termNode.getType());
+ Assertions.assertEquals(QsOccur.MUST_NOT, termNode.getOccur());
}
@Test
@@ -817,6 +855,40 @@ public class SearchDslParserTest {
Assertions.assertEquals("First Value", plan.getRoot().getValue());
}
+ @Test
+ public void testEscapedSpaceInBareQueryLuceneMode() {
+ // Test: "Josh\ Brolin" (bare query, no field prefix) in lucene mode
+ // Should be treated as a single term "Josh Brolin", not split into
two terms
+ String dsl = "Josh\\ Brolin";
+ String optionsJson =
"{\"default_field\":\"title\",\"default_operator\":\"AND\","
+ + "\"mode\":\"lucene\",\"minimum_should_match\":0}";
+ QsPlan plan = SearchDslParser.parseDsl(dsl, optionsJson);
+
+ Assertions.assertNotNull(plan);
+ Assertions.assertEquals(QsClauseType.TERM, plan.getRoot().getType());
+ Assertions.assertEquals("title", plan.getRoot().getField());
+ Assertions.assertEquals("Josh Brolin", plan.getRoot().getValue());
+ // defaultOperator must be lowercase for BE case-sensitive comparison
+ Assertions.assertEquals("and", plan.getDefaultOperator());
+ }
+
+ @Test
+ public void testDefaultOperatorNormalization() {
+ // Verify defaultOperator is always normalized to lowercase in the
plan,
+ // regardless of the case used in the options JSON.
+ // BE compares case-sensitively: (default_operator == "and")
+ String dsl = "foo bar";
+ String optionsJson =
"{\"default_field\":\"title\",\"default_operator\":\"AND\","
+ + "\"mode\":\"lucene\"}";
+ QsPlan plan = SearchDslParser.parseDsl(dsl, optionsJson);
+ Assertions.assertEquals("and", plan.getDefaultOperator());
+
+ optionsJson =
"{\"default_field\":\"title\",\"default_operator\":\"OR\","
+ + "\"mode\":\"lucene\"}";
+ plan = SearchDslParser.parseDsl(dsl, optionsJson);
+ Assertions.assertEquals("or", plan.getDefaultOperator());
+ }
+
@Test
public void testEscapedParentheses() {
// Test: \( and \) should be treated as literal characters, not
grouping
@@ -1040,6 +1112,112 @@ public class SearchDslParserTest {
.anyMatch(b -> "category".equals(b.getFieldName())));
}
+ @Test
+ public void testMultiFieldExplicitFieldInFieldsList() {
+ // Bug fix: explicit field prefix should NOT be expanded even when the
field IS in the fields list
+ // ES query_string always respects explicit "field:term" syntax
regardless of the fields parameter.
+ // "title:music AND content:history" with fields=["title","content"]
+ // → title:music AND content:history (NOT expanded to multi-field OR)
+ String dsl = "title:music AND content:history";
+ String options =
"{\"fields\":[\"title\",\"content\"],\"type\":\"cross_fields\"}";
+ QsPlan plan = SearchDslParser.parseDsl(dsl, options);
+
+ Assertions.assertNotNull(plan);
+ Assertions.assertEquals(QsClauseType.AND, plan.getRoot().getType());
+ Assertions.assertEquals(2, plan.getRoot().getChildren().size());
+
+ // First child: title:music - NOT expanded
+ QsNode first = plan.getRoot().getChildren().get(0);
+ Assertions.assertEquals(QsClauseType.TERM, first.getType());
+ Assertions.assertEquals("title", first.getField());
+ Assertions.assertEquals("music", first.getValue());
+
+ // Second child: content:history - NOT expanded
+ QsNode second = plan.getRoot().getChildren().get(1);
+ Assertions.assertEquals(QsClauseType.TERM, second.getType());
+ Assertions.assertEquals("content", second.getField());
+ Assertions.assertEquals("history", second.getValue());
+ }
+
+ @Test
+ public void testMultiFieldExplicitFieldInFieldsListBestFields() {
+ // Same test as above but with best_fields type
+ String dsl = "title:music AND content:history";
+ String options =
"{\"fields\":[\"title\",\"content\"],\"type\":\"best_fields\"}";
+ QsPlan plan = SearchDslParser.parseDsl(dsl, options);
+
+ Assertions.assertNotNull(plan);
+ // best_fields wraps in OR for multi-field, but explicit fields should
be preserved in each copy
+ QsNode root = plan.getRoot();
+ Assertions.assertEquals(QsClauseType.OR, root.getType());
+ Assertions.assertEquals(2, root.getChildren().size());
+
+ // Each OR branch should have AND(title:music, content:history) - both
explicit fields preserved
+ for (QsNode branch : root.getChildren()) {
+ Assertions.assertEquals(QsClauseType.AND, branch.getType());
+ Assertions.assertEquals(2, branch.getChildren().size());
+
+ QsNode titleNode = branch.getChildren().get(0);
+ Assertions.assertEquals("title", titleNode.getField());
+ Assertions.assertEquals("music", titleNode.getValue());
+
+ QsNode contentNode = branch.getChildren().get(1);
+ Assertions.assertEquals("content", contentNode.getField());
+ Assertions.assertEquals("history", contentNode.getValue());
+ }
+ }
+
+ @Test
+ public void testMultiFieldMixedExplicitAndBareQuery() {
+ // "title:football AND american" with fields=["title","content"]
+ // → title:football AND (title:american OR content:american)
+ // title:football should NOT be expanded; "american" (bare) should be
expanded
+ String dsl = "title:football AND american";
+ String options =
"{\"fields\":[\"title\",\"content\"],\"type\":\"cross_fields\"}";
+ QsPlan plan = SearchDslParser.parseDsl(dsl, options);
+
+ Assertions.assertNotNull(plan);
+ Assertions.assertEquals(QsClauseType.AND, plan.getRoot().getType());
+ Assertions.assertEquals(2, plan.getRoot().getChildren().size());
+
+ // First child: title:football - NOT expanded (explicit field)
+ QsNode first = plan.getRoot().getChildren().get(0);
+ Assertions.assertEquals(QsClauseType.TERM, first.getType());
+ Assertions.assertEquals("title", first.getField());
+ Assertions.assertEquals("football", first.getValue());
+
+ // Second child: (title:american OR content:american) - expanded (bare
term)
+ QsNode second = plan.getRoot().getChildren().get(1);
+ Assertions.assertEquals(QsClauseType.OR, second.getType());
+ Assertions.assertEquals(2, second.getChildren().size());
+ }
+
+ @Test
+ public void testMultiFieldLuceneModeExplicitFieldInFieldsList() {
+ // Lucene mode: "title:music AND content:history" with
fields=["title","content"]
+ // Explicit fields should be preserved, not expanded
+ String dsl = "title:music AND content:history";
+ String options =
"{\"fields\":[\"title\",\"content\"],\"default_operator\":\"and\","
+ + "\"mode\":\"lucene\",\"type\":\"cross_fields\"}";
+ QsPlan plan = SearchDslParser.parseDsl(dsl, options);
+
+ Assertions.assertNotNull(plan);
+ QsNode root = plan.getRoot();
+ Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN, root.getType());
+ Assertions.assertEquals(2, root.getChildren().size());
+
+ // Both children should be leaf TERM nodes (not expanded to
OCCUR_BOOLEAN wrappers)
+ QsNode first = root.getChildren().get(0);
+ Assertions.assertEquals(QsClauseType.TERM, first.getType());
+ Assertions.assertEquals("title", first.getField());
+ Assertions.assertEquals("music", first.getValue());
+
+ QsNode second = root.getChildren().get(1);
+ Assertions.assertEquals(QsClauseType.TERM, second.getType());
+ Assertions.assertEquals("content", second.getField());
+ Assertions.assertEquals("history", second.getValue());
+ }
+
@Test
public void testMultiFieldWithWildcard() {
// Test: "hello*" + fields=["title","content"]
@@ -1175,16 +1353,22 @@ public class SearchDslParserTest {
@Test
public void testMultiFieldLuceneModeAndOrMixed() {
// Test: "a AND b OR c" + fields=["title","content"] + lucene mode +
minimum_should_match=0 + cross_fields
- // With Lucene semantics and minimum_should_match=0: SHOULD groups are
discarded
- // Only "a" (MUST) remains - wrapped in OCCUR_BOOLEAN
+ // With no default_operator (default is OR_OPERATOR in Lucene):
+ // a=MUST (promoted by AND), b=MUST (from AND), c=SHOULD (from OR)
+ // With OR_OPERATOR, OR does NOT change preceding term's occur
+ // msm is ignored for multi-field mode, node-level msm defaults to 0
(since MUST exists)
String dsl = "a AND b OR c";
String options =
"{\"fields\":[\"title\",\"content\"],\"mode\":\"lucene\",\"minimum_should_match\":0,\"type\":\"cross_fields\"}";
QsPlan plan = SearchDslParser.parseDsl(dsl, options);
Assertions.assertNotNull(plan);
- // With minimum_should_match=0, only (title:a OR content:a) remains
- // In Lucene mode, this is wrapped as OCCUR_BOOLEAN
+ // Root is OCCUR_BOOLEAN with 3 children: MUST(a), MUST(b), SHOULD(c)
Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN,
plan.getRoot().getType());
+ Assertions.assertEquals(3, plan.getRoot().getChildren().size());
+ // a and b are MUST, c is SHOULD
+ Assertions.assertEquals(QsOccur.MUST,
plan.getRoot().getChildren().get(0).getOccur());
+ Assertions.assertEquals(QsOccur.MUST,
plan.getRoot().getChildren().get(1).getOccur());
+ Assertions.assertEquals(QsOccur.SHOULD,
plan.getRoot().getChildren().get(2).getOccur());
}
@Test
@@ -1236,16 +1420,18 @@ public class SearchDslParserTest {
@Test
public void testMultiFieldLuceneModeMinimumShouldMatchOne() {
- // Test: "a AND b OR c" with minimum_should_match=1 keeps all clauses
+ cross_fields
+ // Test: "a AND b OR c" with minimum_should_match=1 + cross_fields +
multi-field
+ // For multi-field mode (fields.size() > 1), minimum_should_match is
nullified.
+ // Lucene addClause with default_operator=OR: [MUST(a), MUST(b),
SHOULD(c)] msm=0
+ // No SHOULD filtering — all 3 terms kept, each expanded to 2 fields
via cross_fields
String dsl = "a AND b OR c";
String options =
"{\"fields\":[\"title\",\"content\"],\"mode\":\"lucene\",\"minimum_should_match\":1,\"type\":\"cross_fields\"}";
QsPlan plan = SearchDslParser.parseDsl(dsl, options);
Assertions.assertNotNull(plan);
Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN,
plan.getRoot().getType());
- // All 3 groups should be present
+ // 3 terms (a, b, c), each expanded to cross_fields OCCUR_BOOLEAN
Assertions.assertEquals(3, plan.getRoot().getChildren().size());
- Assertions.assertEquals(Integer.valueOf(1),
plan.getRoot().getMinimumShouldMatch());
}
// ============ Tests for type parameter (best_fields vs cross_fields)
============
@@ -1306,13 +1492,53 @@ public class SearchDslParserTest {
@Test
public void testMultiFieldBestFieldsLuceneMode() {
- // Test: best_fields with Lucene mode
+ // Test: best_fields with Lucene mode uses per-clause expansion
(matching ES query_string)
+ // "hello world" with AND → each term independently expanded across
fields:
+ // MUST(SHOULD(title:hello, content:hello)) AND
MUST(SHOULD(title:world, content:world))
String dsl = "hello world";
String options =
"{\"fields\":[\"title\",\"content\"],\"default_operator\":\"and\",\"mode\":\"lucene\",\"type\":\"best_fields\"}";
QsPlan plan = SearchDslParser.parseDsl(dsl, options);
Assertions.assertNotNull(plan);
Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN,
plan.getRoot().getType());
+ // Per-clause expansion: 2 children (one per term), each expanded
across fields
+ Assertions.assertEquals(2, plan.getRoot().getChildren().size());
+ for (QsNode child : plan.getRoot().getChildren()) {
+ // Each child is an OCCUR_BOOLEAN wrapping the per-field expansion
+ Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN,
child.getType());
+ Assertions.assertEquals(2, child.getChildren().size()); // one per
field
+ }
+ }
+
+ @Test
+ public void testMultiFieldBestFieldsLuceneModePerClauseExpansion() {
+ // Test: best_fields with phrase + regex uses per-clause expansion
(not per-field)
+ // ES query_string expands each clause independently across fields:
+ // ("Costner" AND /Li../) → MUST(title:"Costner" |
content:"Costner") AND MUST(title:/Li../ | content:/Li../)
+ // NOT: (title:"Costner" AND title:/Li../) OR (content:"Costner" AND
content:/Li../)
+ String dsl = "\"Costner\" /Li../";
+ String options =
"{\"fields\":[\"title\",\"content\"],\"default_operator\":\"and\",\"mode\":\"lucene\",\"type\":\"best_fields\"}";
+ QsPlan plan = SearchDslParser.parseDsl(dsl, options);
+
+ Assertions.assertNotNull(plan);
+ QsNode root = plan.getRoot();
+ Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN, root.getType());
+ // 2 children: one for phrase "Costner", one for regex /Li../
+ Assertions.assertEquals(2, root.getChildren().size());
+
+ // First child: phrase "Costner" expanded across fields
+ QsNode phraseGroup = root.getChildren().get(0);
+ Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN,
phraseGroup.getType());
+ Assertions.assertEquals(2, phraseGroup.getChildren().size());
+ Assertions.assertEquals(QsClauseType.PHRASE,
phraseGroup.getChildren().get(0).getType());
+ Assertions.assertEquals(QsClauseType.PHRASE,
phraseGroup.getChildren().get(1).getType());
+
+ // Second child: regex /Li../ expanded across fields
+ QsNode regexpGroup = root.getChildren().get(1);
+ Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN,
regexpGroup.getType());
+ Assertions.assertEquals(2, regexpGroup.getChildren().size());
+ Assertions.assertEquals(QsClauseType.REGEXP,
regexpGroup.getChildren().get(0).getType());
+ Assertions.assertEquals(QsClauseType.REGEXP,
regexpGroup.getChildren().get(1).getType());
}
@Test
@@ -1582,4 +1808,145 @@ public class SearchDslParserTest {
Assertions.assertEquals(QsClauseType.TERM, termNode.getType());
Assertions.assertEquals("title", termNode.getField());
}
+
+ // =====================================================================
+ // Tests for quoted phrases combined with implicit operators
+ // =====================================================================
+
+ @Test
+ public void testPhraseWithImplicitOrOperator() {
+ // Test: '"2003 NBA draft" Darrell' with default_operator=OR should
produce same result as
+ // '"2003 NBA draft" OR Darrell'
+ String dsl1 = "\"2003 NBA draft\" Darrell";
+ String dsl2 = "\"2003 NBA draft\" OR Darrell";
+ String options =
"{\"default_field\":\"title\",\"default_operator\":\"OR\","
+ + "\"mode\":\"lucene\",\"minimum_should_match\":0}";
+
+ QsPlan plan1 = SearchDslParser.parseDsl(dsl1, options);
+ QsPlan plan2 = SearchDslParser.parseDsl(dsl2, options);
+
+ Assertions.assertNotNull(plan1);
+ Assertions.assertNotNull(plan2);
+
+ // Both should have the same structure - OCCUR_BOOLEAN with 2 SHOULD
children
+ Assertions.assertEquals(plan2.getRoot().getType(),
plan1.getRoot().getType());
+ Assertions.assertEquals(plan2.getRoot().getChildren().size(),
plan1.getRoot().getChildren().size());
+
+ // Verify the phrase is preserved as PHRASE type, not broken into terms
+ boolean hasPhrase1 = plan1.getRoot().getChildren().stream()
+ .anyMatch(n -> n.getType() == QsClauseType.PHRASE);
+ boolean hasPhrase2 = plan2.getRoot().getChildren().stream()
+ .anyMatch(n -> n.getType() == QsClauseType.PHRASE);
+ Assertions.assertTrue(hasPhrase1, "Plan 1 should contain a PHRASE
node");
+ Assertions.assertTrue(hasPhrase2, "Plan 2 should contain a PHRASE
node");
+ }
+
+ @Test
+ public void testPhraseWithImplicitAndOperator() {
+ // Test: '"hello world" foo' with default_operator=AND
+ String dsl = "\"hello world\" foo";
+ String options =
"{\"default_field\":\"title\",\"default_operator\":\"AND\"}";
+
+ QsPlan plan = SearchDslParser.parseDsl(dsl, options);
+
+ Assertions.assertNotNull(plan);
+ // Should create AND query: title:"hello world" AND title:foo
+ Assertions.assertEquals(QsClauseType.AND, plan.getRoot().getType());
+ Assertions.assertEquals(2, plan.getRoot().getChildren().size());
+
+ // Verify the phrase is preserved
+ boolean hasPhrase = plan.getRoot().getChildren().stream()
+ .anyMatch(n -> n.getType() == QsClauseType.PHRASE);
+ Assertions.assertTrue(hasPhrase, "Should contain a PHRASE node");
+ }
+
+ @Test
+ public void testMultiplePhrases() {
+ // Test: '"hello world" "foo bar"' with default_operator=OR
+ String dsl = "\"hello world\" \"foo bar\"";
+ String options =
"{\"default_field\":\"title\",\"default_operator\":\"OR\"}";
+
+ QsPlan plan = SearchDslParser.parseDsl(dsl, options);
+
+ Assertions.assertNotNull(plan);
+ Assertions.assertEquals(QsClauseType.OR, plan.getRoot().getType());
+ Assertions.assertEquals(2, plan.getRoot().getChildren().size());
+
+ // Both children should be PHRASE type
+ for (QsNode child : plan.getRoot().getChildren()) {
+ Assertions.assertEquals(QsClauseType.PHRASE, child.getType());
+ }
+ }
+
+ // ============ Tests for Standalone Wildcard * ============
+
+ @Test
+ public void testStandaloneWildcardWithAnd() {
+ // Test: "Dollar AND *" should produce: MUST(title:Dollar) AND
MUST(MATCH_ALL_DOCS)
+ // Standalone "*" becomes MATCH_ALL_DOCS (matches ES behavior: field:*
→ ExistsQuery)
+ String dsl = "Dollar AND *";
+ String options =
"{\"default_field\":\"title\",\"default_operator\":\"OR\","
+ + "\"mode\":\"lucene\",\"minimum_should_match\":0}";
+
+ QsPlan plan = SearchDslParser.parseDsl(dsl, options);
+
+ Assertions.assertNotNull(plan);
+ Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN,
plan.getRoot().getType());
+ Assertions.assertEquals(2, plan.getRoot().getChildren().size());
+
+ // Both children should have MUST occur (AND)
+ for (QsNode child : plan.getRoot().getChildren()) {
+ Assertions.assertEquals(QsOccur.MUST, child.getOccur());
+ }
+
+ // One should be TERM (Dollar), one should be MATCH_ALL_DOCS
+ boolean hasTerm = plan.getRoot().getChildren().stream()
+ .anyMatch(n -> n.getType() == QsClauseType.TERM &&
"Dollar".equals(n.getValue()));
+ boolean hasMatchAll = plan.getRoot().getChildren().stream()
+ .anyMatch(n -> n.getType() == QsClauseType.MATCH_ALL_DOCS);
+
+ Assertions.assertTrue(hasTerm, "Should contain TERM node for
'Dollar'");
+ Assertions.assertTrue(hasMatchAll, "Should contain MATCH_ALL_DOCS node
for '*'");
+ }
+
+ @Test
+ public void testStandaloneWildcardAlone() {
+ // Test: "*" alone becomes MATCH_ALL_DOCS (matches ES behavior:
field:* → ExistsQuery)
+ String dsl = "*";
+ String options =
"{\"default_field\":\"title\",\"default_operator\":\"OR\"}";
+
+ QsPlan plan = SearchDslParser.parseDsl(dsl, options);
+
+ Assertions.assertNotNull(plan);
+ Assertions.assertEquals(QsClauseType.MATCH_ALL_DOCS,
plan.getRoot().getType());
+ }
+
+ @Test
+ public void testStandaloneWildcardWithOr() {
+ // Test: "Dollar OR *" should produce: SHOULD(title:Dollar) OR
SHOULD(MATCH_ALL_DOCS)
+ // Standalone "*" becomes MATCH_ALL_DOCS (matches ES behavior: field:*
→ ExistsQuery)
+ String dsl = "Dollar OR *";
+ String options =
"{\"default_field\":\"title\",\"default_operator\":\"OR\","
+ + "\"mode\":\"lucene\",\"minimum_should_match\":0}";
+
+ QsPlan plan = SearchDslParser.parseDsl(dsl, options);
+
+ Assertions.assertNotNull(plan);
+ Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN,
plan.getRoot().getType());
+ Assertions.assertEquals(2, plan.getRoot().getChildren().size());
+
+ // Both children should have SHOULD occur (OR)
+ for (QsNode child : plan.getRoot().getChildren()) {
+ Assertions.assertEquals(QsOccur.SHOULD, child.getOccur());
+ }
+
+ // One should be TERM (Dollar), one should be MATCH_ALL_DOCS
+ boolean hasTerm = plan.getRoot().getChildren().stream()
+ .anyMatch(n -> n.getType() == QsClauseType.TERM &&
"Dollar".equals(n.getValue()));
+ boolean hasMatchAll = plan.getRoot().getChildren().stream()
+ .anyMatch(n -> n.getType() == QsClauseType.MATCH_ALL_DOCS);
+
+ Assertions.assertTrue(hasTerm, "Should contain TERM node for
'Dollar'");
+ Assertions.assertTrue(hasMatchAll, "Should contain MATCH_ALL_DOCS node
for '*'");
+ }
}
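
Editor's note: the testLuceneModeNotOperator change above is the visible half of the
pure-NOT rewrite in QsLuceneModeAstBuilder. A hand-built version of the tree it now
expects (a sketch, reusing only QsNode constructors that already appear in this
diff) makes the shape explicit:

import java.util.ArrayList;
import java.util.List;

import org.apache.doris.nereids.trees.expressions.functions.scalar.SearchDslParser.QsClauseType;
import org.apache.doris.nereids.trees.expressions.functions.scalar.SearchDslParser.QsNode;
import org.apache.doris.nereids.trees.expressions.functions.scalar.SearchDslParser.QsOccur;

// Sketch: the tree the parser now builds for "NOT field:a" in lucene mode.
public class PureNotRewriteSketch {
    public static void main(String[] args) {
        QsNode negated = new QsNode(QsClauseType.TERM, "field", "a");
        negated.setOccur(QsOccur.MUST_NOT);

        QsNode matchAll =
                new QsNode(QsClauseType.MATCH_ALL_DOCS, (List<QsNode>) null);
        matchAll.setOccur(QsOccur.SHOULD);

        List<QsNode> children = new ArrayList<>();
        children.add(matchAll);
        children.add(negated);

        // minimum_should_match = 1: every document satisfies the
        // MATCH_ALL_DOCS SHOULD clause, then the MUST_NOT clause removes
        // the documents matching "a" - i.e. "all docs except a".
        QsNode root = new QsNode(QsClauseType.OCCUR_BOOLEAN, children, 1);
        System.out.println(root.getType() + " with "
                + root.getChildren().size() + " children");
    }
}
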
diff --git a/gensrc/thrift/Exprs.thrift b/gensrc/thrift/Exprs.thrift
index 6eaa5ff8e79..e74e1083243 100644
--- a/gensrc/thrift/Exprs.thrift
+++ b/gensrc/thrift/Exprs.thrift
@@ -261,12 +261,15 @@ struct TSearchFieldBinding {
3: optional string parent_field_name // Parent field name for variant
subcolumns
4: optional string subcolumn_path // Subcolumn path for variant fields
(e.g., "subcolumn" or "sub1.sub2")
5: optional bool is_variant_subcolumn // True if this is a variant
subcolumn access
+ 6: optional map<string, string> index_properties // Index properties
(parser, lower_case, etc.) from FE Index lookup
}
struct TSearchParam {
1: required string original_dsl // Original DSL string for debugging
2: required TSearchClause root // Parsed AST root
3: required list<TSearchFieldBinding> field_bindings // Field to slot
mappings
+ 4: optional string default_operator // "and" or "or" for TERM
tokenization (default: "or")
+ 5: optional i32 minimum_should_match // Minimum number of SHOULD clauses
that must match (for Lucene mode TERM tokenization)
}
// This is essentially a union over the subclasses of Expr.
diff --git a/regression-test/data/search/test_search_lucene_mode.out
b/regression-test/data/search/test_search_lucene_mode.out
index 68d8e6c1279..5eb4346b50c 100644
--- a/regression-test/data/search/test_search_lucene_mode.out
+++ b/regression-test/data/search/test_search_lucene_mode.out
@@ -34,6 +34,10 @@
2 apple banana
-- !lucene_not --
+4 banana cherry
+5 cherry date
+6 date elderberry
+7 fig grape
-- !lucene_and_not --
3 apple
diff --git a/regression-test/data/search/test_search_multi_field.out
b/regression-test/data/search/test_search_multi_field.out
index 4a4923a4c3b..e22811bddde 100644
--- a/regression-test/data/search/test_search_multi_field.out
+++ b/regression-test/data/search/test_search_multi_field.out
@@ -75,13 +75,10 @@
-- !multi_field_lucene_and_or --
1 machine learning basics
-4 machine maintenance
-8 cooking machine reviews
9 machine guide
-- !multi_field_lucene_min_should_1 --
1 machine learning basics
-8 cooking machine reviews
9 machine guide
-- !multi_field_lucene_and_not --
@@ -119,6 +116,7 @@
-- !multi_field_best_fields_lucene --
1 machine learning basics
+9 machine guide
-- !multi_field_cross_fields_lucene --
1 machine learning basics
diff --git a/regression-test/data/search/test_search_regexp_lowercase.out
b/regression-test/data/search/test_search_regexp_lowercase.out
new file mode 100644
index 00000000000..0ae25fc613f
--- /dev/null
+++ b/regression-test/data/search/test_search_regexp_lowercase.out
@@ -0,0 +1,39 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !regexp_uppercase_no_match --
+
+-- !match_regexp_uppercase_no_match --
+
+-- !regexp_lowercase_match --
+1 ABC DEF
+2 abc def
+
+-- !match_regexp_lowercase_match --
+1 ABC DEF
+2 abc def
+
+-- !wildcard_uppercase_match --
+1 ABC DEF
+2 abc def
+
+-- !wildcard_lowercase_match --
+1 ABC DEF
+2 abc def
+
+-- !regexp_apple_lowercase --
+3 Apple Banana Cherry
+4 apple banana cherry
+
+-- !regexp_apple_uppercase_no_match --
+
+-- !consistency_regexp_cherry --
+3
+4
+
+-- !consistency_match_regexp_cherry --
+3
+4
+
+-- !consistency_regexp_cherry_upper --
+
+-- !consistency_match_regexp_cherry_upper --
+
diff --git
a/regression-test/data/search/test_search_variant_subcolumn_analyzer.out
b/regression-test/data/search/test_search_variant_subcolumn_analyzer.out
new file mode 100644
index 00000000000..d1eff343b82
--- /dev/null
+++ b/regression-test/data/search/test_search_variant_subcolumn_analyzer.out
@@ -0,0 +1,30 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !search_variant_analyzer_basic --
+1
+3
+
+-- !match_variant_baseline --
+1
+3
+
+-- !search_variant_analyzer_multi --
+3
+
+-- !search_variant_analyzer_other_field --
+4
+
+-- !search_variant_analyzer_field_syntax --
+2
+5
+
+-- !search_variant_analyzer_lowercase --
+1
+3
+
+-- !search_variant_analyzer_phrase --
+1
+
+-- !search_variant_direct_index --
+1
+3
+
diff --git a/regression-test/suites/search/test_search_lucene_mode.groovy
b/regression-test/suites/search/test_search_lucene_mode.groovy
index 8e9d4edb7e3..8e95a27a377 100644
--- a/regression-test/suites/search/test_search_lucene_mode.groovy
+++ b/regression-test/suites/search/test_search_lucene_mode.groovy
@@ -137,12 +137,9 @@ suite("test_search_lucene_mode") {
"""
// ============ Test 7: Lucene mode NOT operator (pure negative query)
============
- // 'NOT a' in Lucene mode produces a pure MUST_NOT query.
- // IMPORTANT: In Lucene/ES semantics, a pure negative query (only
MUST_NOT, no MUST/SHOULD)
- // returns EMPTY results because there's no positive clause to match
against.
- // This is correct Lucene behavior - to get "all except X", you need:
- // match_all AND NOT X (i.e., a positive clause combined with negation)
- // Expected: empty result (correct Lucene semantics)
+ // 'NOT a' in Lucene mode is rewritten to: SHOULD(MATCH_ALL_DOCS) +
MUST_NOT(a)
+ // This matches all documents EXCEPT those containing the negated term.
+ // Expected: all docs without "apple" in title (4, 5, 6, 7)
qt_lucene_not """
SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
FROM ${tableName}
diff --git a/regression-test/suites/search/test_search_multi_field.groovy
b/regression-test/suites/search/test_search_multi_field.groovy
index f71db33f2b0..cc94a31bc14 100644
--- a/regression-test/suites/search/test_search_multi_field.groovy
+++ b/regression-test/suites/search/test_search_multi_field.groovy
@@ -277,6 +277,8 @@ suite("test_search_multi_field") {
"""
// ============ Test 21: best_fields with Lucene mode ============
+ // In lucene mode, best_fields uses per-clause expansion (matching ES
query_string),
+ // so id=1 and id=9 both match (each term may match in a different field)
qt_multi_field_best_fields_lucene """
SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
FROM ${tableName}
diff --git a/regression-test/suites/search/test_search_regexp_lowercase.groovy
b/regression-test/suites/search/test_search_regexp_lowercase.groovy
new file mode 100644
index 00000000000..957027c2610
--- /dev/null
+++ b/regression-test/suites/search/test_search_regexp_lowercase.groovy
@@ -0,0 +1,153 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// DORIS-24464: search() REGEXP with lower_case=true should be consistent with
match_regexp
+// Regex patterns are NOT lowercased (matching ES query_string behavior).
+// Wildcard patterns ARE lowercased (matching ES query_string normalizer
behavior).
+
+suite("test_search_regexp_lowercase") {
+ def tableName = "search_regexp_lowercase_test"
+
+ sql "DROP TABLE IF EXISTS ${tableName}"
+
+ sql """
+ CREATE TABLE ${tableName} (
+ a INT,
+ title VARCHAR(512) NOT NULL,
+ INDEX idx_title (title) USING INVERTED PROPERTIES("lower_case" =
"true", "parser" = "english", "support_phrase" = "true")
+ ) ENGINE=OLAP
+ DUPLICATE KEY(a)
+ DISTRIBUTED BY HASH(a) BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1"
+ )
+ """
+
+ sql "INSERT INTO ${tableName} VALUES(1, 'ABC DEF')"
+ sql "INSERT INTO ${tableName} VALUES(2, 'abc def')"
+ sql "INSERT INTO ${tableName} VALUES(3, 'Apple Banana Cherry')"
+ sql "INSERT INTO ${tableName} VALUES(4, 'apple banana cherry')"
+
+ // Wait for data to be ready
+ Thread.sleep(5000)
+
+ //
=========================================================================
+ // Test 1: REGEXP with uppercase pattern should NOT match lowercased terms
+ // (ES-compatible behavior: regex patterns are not analyzed/lowercased)
+ //
=========================================================================
+
+ // search() REGEXP with uppercase pattern - should return 0 rows
+ // because indexed terms are lowercased (abc, def) but pattern AB.* is
case-sensitive
+ qt_regexp_uppercase_no_match """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ * FROM
${tableName}
+ WHERE search('/AB.*/',
'{"default_field":"title","default_operator":"AND","mode":"lucene",
"minimum_should_match": 0}')
+ ORDER BY a
+ """
+
+ // match_regexp with uppercase pattern - should also return 0 rows
+ qt_match_regexp_uppercase_no_match """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ * FROM
${tableName}
+ WHERE title match_regexp 'AB.*'
+ ORDER BY a
+ """
+
+ //
=========================================================================
+ // Test 2: REGEXP with lowercase pattern SHOULD match lowercased terms
+ //
=========================================================================
+
+ // search() REGEXP with lowercase pattern - should match both rows with
"abc"
+ qt_regexp_lowercase_match """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ * FROM
${tableName}
+ WHERE search('/ab.*/',
'{"default_field":"title","default_operator":"AND","mode":"lucene",
"minimum_should_match": 0}')
+ ORDER BY a
+ """
+
+ // match_regexp with lowercase pattern - should also match
+ qt_match_regexp_lowercase_match """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ * FROM
${tableName}
+ WHERE title match_regexp 'ab.*'
+ ORDER BY a
+ """
+
+ //
=========================================================================
+ // Test 3: WILDCARD with uppercase pattern should match (wildcards ARE
lowercased)
+ //
=========================================================================
+
+ // search() WILDCARD with uppercase - should match because wildcard
patterns are lowercased
+ qt_wildcard_uppercase_match """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ * FROM
${tableName}
+ WHERE search('AB*',
'{"default_field":"title","default_operator":"AND","mode":"lucene",
"minimum_should_match": 0}')
+ ORDER BY a
+ """
+
+ // search() WILDCARD with lowercase - should also match
+ qt_wildcard_lowercase_match """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ * FROM
${tableName}
+ WHERE search('ab*',
'{"default_field":"title","default_operator":"AND","mode":"lucene",
"minimum_should_match": 0}')
+ ORDER BY a
+ """
+
+ //
=========================================================================
+ // Test 4: More complex REGEXP patterns
+ //
=========================================================================
+
+ // Lowercase regex that matches "apple" - should match rows 3 and 4
+ qt_regexp_apple_lowercase """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ * FROM
${tableName}
+ WHERE search('/app.*/',
'{"default_field":"title","default_operator":"AND","mode":"lucene",
"minimum_should_match": 0}')
+ ORDER BY a
+ """
+
+ // Uppercase regex "App.*" should NOT match (terms are lowercased as
"apple")
+ qt_regexp_apple_uppercase_no_match """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ * FROM
${tableName}
+ WHERE search('/App.*/',
'{"default_field":"title","default_operator":"AND","mode":"lucene",
"minimum_should_match": 0}')
+ ORDER BY a
+ """
+
+ //
=========================================================================
+ // Test 5: REGEXP consistency with match_regexp for various patterns
+ //
=========================================================================
+
+ // Both should return same results for lowercase pattern
+ qt_consistency_regexp_cherry """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ a FROM
${tableName}
+ WHERE search('/cher.*/',
'{"default_field":"title","default_operator":"AND","mode":"lucene",
"minimum_should_match": 0}')
+ ORDER BY a
+ """
+
+ qt_consistency_match_regexp_cherry """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ a FROM
${tableName}
+ WHERE title match_regexp 'cher.*'
+ ORDER BY a
+ """
+
+ // Both should return 0 rows for uppercase pattern
+ qt_consistency_regexp_cherry_upper """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ a FROM
${tableName}
+ WHERE search('/CHER.*/',
'{"default_field":"title","default_operator":"AND","mode":"lucene",
"minimum_should_match": 0}')
+ ORDER BY a
+ """
+
+ qt_consistency_match_regexp_cherry_upper """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ a FROM ${tableName}
+ WHERE title match_regexp 'CHER.*'
+ ORDER BY a
+ """
+
+ sql "DROP TABLE IF EXISTS ${tableName}"
+}
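
The case-sensitivity rule these tests exercise can be stated in one line: WILDCARD and PREFIX patterns follow the index's lower_case normalization (so 'AB*' matches lowercased terms), while REGEXP patterns are passed through untouched (so '/App.*/' finds nothing once terms are lowercased). The Java sketch below is illustrative only; the names PatternKind and normalizePattern are hypothetical and are not part of the Doris codebase.

    import java.util.Locale;

    enum PatternKind { WILDCARD, PREFIX, REGEXP }

    final class PatternNormalizationSketch {
        // Illustrative only: WILDCARD/PREFIX patterns follow the index's
        // lower_case normalization; REGEXP patterns bypass it entirely.
        static String normalizePattern(PatternKind kind, String pattern, boolean indexLowercases) {
            if (kind == PatternKind.REGEXP) {
                return pattern; // '/App.*/' keeps 'App.*' and will not match lowercased terms
            }
            return indexLowercases ? pattern.toLowerCase(Locale.ROOT) : pattern; // 'AB*' -> 'ab*'
        }
    }
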
diff --git a/regression-test/suites/search/test_search_variant_subcolumn_analyzer.groovy b/regression-test/suites/search/test_search_variant_subcolumn_analyzer.groovy
new file mode 100644
index 00000000000..d14cf15f7a3
--- /dev/null
+++ b/regression-test/suites/search/test_search_variant_subcolumn_analyzer.groovy
@@ -0,0 +1,175 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/**
+ * Test search() function with variant subcolumn and field_pattern index.
+ *
+ * This test verifies that the analyzer (parser) from field_pattern-matched indexes
+ * is correctly applied when using search() on variant subcolumns.
+ *
+ * Bug: When using search() on variant subcolumns with field_pattern indexes,
+ * the analyzer was not applied because FE did not pass index properties to BE.
+ * This caused exact-match-only behavior instead of tokenized matching.
+ *
+ * Fix: FE now looks up the Index for each field in SearchExpression and passes
+ * the index_properties via TSearchFieldBinding to BE.
+ */
+suite("test_search_variant_subcolumn_analyzer") {
+ def tableName = "test_variant_subcolumn_analyzer"
+
+ sql """ set enable_match_without_inverted_index = false """
+ sql """ set enable_common_expr_pushdown = true """
+ sql """ set default_variant_enable_typed_paths_to_sparse = false """
+
+ sql "DROP TABLE IF EXISTS ${tableName}"
+
+ // Create table with variant column using predefined field pattern and field_pattern index
+ sql """
+ CREATE TABLE ${tableName} (
+ `id` INT NOT NULL,
+ `data` variant<
+ MATCH_NAME_GLOB 'string_*' : string,
+ properties("variant_max_subcolumns_count" = "100")
+ > NULL,
+ INDEX idx_text (data) USING INVERTED PROPERTIES(
+ "parser" = "unicode",
+ "field_pattern" = "string_*",
+ "lower_case" = "true"
+ )
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`id`)
+ DISTRIBUTED BY HASH(`id`) BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1",
+ "disable_auto_compaction" = "true"
+ )
+ """
+
+ // Insert test data
+ sql """INSERT INTO ${tableName} VALUES
+ (1, '{"string_8": "admin only"}'),
+ (2, '{"string_8": "user access"}'),
+ (3, '{"string_8": "admin access granted"}'),
+ (4, '{"string_1": "hello world"}'),
+ (5, '{"string_8": "readonly user"}'),
+ (6, '{"number_1": 42}')
+ """
+
+ // Wait for data to be flushed and indexes built
+ sql "sync"
+ Thread.sleep(5000)
+
+ // Test 1: search() with default_field on variant subcolumn matching field_pattern
+ // "admin" should match "admin only" and "admin access granted" because the unicode
+ // parser tokenizes them into ["admin", "only"] and ["admin", "access", "granted"]
+ qt_search_variant_analyzer_basic """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName}
+ WHERE search('admin', '{"default_field":"data.string_8","mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // Test 2: Verify MATCH also works (as a baseline)
+ qt_match_variant_baseline """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName}
+ WHERE data['string_8'] MATCH_ANY 'admin'
+ ORDER BY id
+ """
+
+ // Test 3: Multi-term search should also work with tokenization
+ qt_search_variant_analyzer_multi """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName}
+ WHERE search('admin access', '{"default_field":"data.string_8","mode":"lucene","default_operator":"AND"}')
+ ORDER BY id
+ """
+
+ // Test 4: Search on a different subcolumn matching the same field_pattern
+ qt_search_variant_analyzer_other_field """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName}
+ WHERE search('hello', '{"default_field":"data.string_1","mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // Test 5: Search with field-qualified syntax on variant subcolumn
+ qt_search_variant_analyzer_field_syntax """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName}
+ WHERE search('data.string_8:user', '{"mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // Test 6: Verify lowercase is applied (search for "ADMIN" should match "admin only")
+ qt_search_variant_analyzer_lowercase """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName}
+ WHERE search('ADMIN', '{"default_field":"data.string_8","mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // Test 7: Phrase search on variant subcolumn with analyzer
+ qt_search_variant_analyzer_phrase """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName}
+ WHERE search('"admin only"', '{"default_field":"data.string_8","mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // Clean up
+ sql "DROP TABLE IF EXISTS ${tableName}"
+
+ // Test Case 2: Variant with direct named field and field_pattern index for comparison
+ def tableName2 = "test_variant_direct_index"
+
+ sql "DROP TABLE IF EXISTS ${tableName2}"
+
+ sql """
+ CREATE TABLE ${tableName2} (
+ `id` INT NOT NULL,
+ `data` variant<
+ 'name' : string,
+ properties("variant_max_subcolumns_count" = "10")
+ > NULL,
+ INDEX idx_text (data) USING INVERTED PROPERTIES(
+ "parser" = "unicode",
+ "field_pattern" = "name",
+ "lower_case" = "true"
+ )
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`id`)
+ DISTRIBUTED BY HASH(`id`) BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1",
+ "disable_auto_compaction" = "true"
+ )
+ """
+
+ sql """INSERT INTO ${tableName2} VALUES
+ (1, '{"name": "admin only"}'),
+ (2, '{"name": "user access"}'),
+ (3, '{"name": "admin access granted"}')
+ """
+
+ sql "sync"
+ Thread.sleep(5000)
+
+ // Test 8: search() on variant subcolumn with named field_pattern (direct match)
+ qt_search_variant_direct_index """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName2}
+ WHERE search('admin', '{"default_field":"data.name","mode":"lucene"}')
+ ORDER BY id
+ """
+
+ sql "DROP TABLE IF EXISTS ${tableName2}"
+
+ logger.info("All variant subcolumn analyzer tests completed!")
+}
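
The fix exercised by this suite depends on the FE resolving, per search field, which inverted index (and therefore which analyzer properties) applies; for variant subcolumns that resolution goes through the index's field_pattern glob (e.g. "string_*" covering "data.string_8"). The following is a rough Java sketch of such a lookup under assumed names (FieldPatternLookup, propertiesFor); it is not the actual FE code, and the real SearchPredicate/TSearchFieldBinding plumbing is not reproduced here.

    import java.util.Map;
    import java.util.Optional;

    final class FieldPatternLookup {
        // Hypothetical registry: field_pattern glob -> inverted index properties
        // (e.g. "string_*" -> {parser=unicode, lower_case=true}).
        private final Map<String, Map<String, String>> indexesByPattern;

        FieldPatternLookup(Map<String, Map<String, String>> indexesByPattern) {
            this.indexesByPattern = indexesByPattern;
        }

        // Resolve index properties for a variant subcolumn path such as "data.string_8".
        Optional<Map<String, String>> propertiesFor(String subcolumnPath) {
            String leaf = subcolumnPath.substring(subcolumnPath.lastIndexOf('.') + 1);
            return indexesByPattern.entrySet().stream()
                    .filter(e -> matchesGlob(e.getKey(), leaf))
                    .map(Map.Entry::getValue)
                    .findFirst();
        }

        // Translate a simple '*' glob into a regex; only '*' is treated as a wildcard.
        private static boolean matchesGlob(String glob, String name) {
            String regex = ("\\Q" + glob + "\\E").replace("*", "\\E.*\\Q");
            return name.matches(regex);
        }
    }
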
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]