This is an automated email from the ASF dual-hosted git repository.
airborne pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 574b84fa9bc [fix](search) Fix slash character in search query_string
terms (#61599)
574b84fa9bc is described below
commit 574b84fa9bc8530c38912a1c21f2417a7b30fa9d
Author: Jack <[email protected]>
AuthorDate: Mon Mar 23 14:51:29 2026 +0800
[fix](search) Fix slash character in search query_string terms (#61599)
### What problem does this PR solve?
Issue Number: close #xxx
Related PR: #xxx
Problem Summary:
The ANTLR lexer in the search() DSL parser excluded `/` from
`TERM_CHAR`, causing terms like `AC/DC` to be incorrectly tokenized. The
slash was silently skipped by ANTLR's default error recovery, splitting
`AC/DC` into two separate terms `AC` and `DC` instead of treating it as
a single term.
This caused inconsistent behavior compared to Elasticsearch's
query_string parsing, where `AC\/DC` (escaped slash) is handled as a
single analyzed term.
**Fix**: Add `/` to the `TERM_CHAR` fragment in `SearchLexer.g4`. This
allows `/` to appear within terms (e.g., `AC/DC` -> single term) while
regex patterns like `/[a-z]+/` still work correctly since `/` remains
excluded from `TERM_START_CHAR`.
---
.../org/apache/doris/analysis/SearchLexer.g4 | 1 +
.../functions/scalar/SearchDslParserTest.java | 75 +++++++++++++
.../data/search/test_search_slash_in_term.out | 32 ++++++
.../suites/search/test_search_slash_in_term.groovy | 125 +++++++++++++++++++++
4 files changed, 233 insertions(+)
diff --git a/fe/fe-core/src/main/antlr4/org/apache/doris/analysis/SearchLexer.g4 b/fe/fe-core/src/main/antlr4/org/apache/doris/analysis/SearchLexer.g4
index 15ee0eaeb36..7b691a61337 100644
--- a/fe/fe-core/src/main/antlr4/org/apache/doris/analysis/SearchLexer.g4
+++ b/fe/fe-core/src/main/antlr4/org/apache/doris/analysis/SearchLexer.g4
@@ -32,6 +32,7 @@ fragment TERM_CHAR
: TERM_START_CHAR
| '-'
| '+'
+ | '/'
;
fragment QUOTED_CHAR
diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParserTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParserTest.java
index 6dc16a1da7a..c078e569121 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParserTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParserTest.java
@@ -1094,6 +1094,81 @@ public class SearchDslParserTest {
Assertions.assertEquals("path\\to\\file", plan.getRoot().getValue());
}
+ @Test
+ public void testSlashInTerm() {
+ // DORIS-24624: slash within a term should be treated as a regular character
+ // e.g., AC/DC should parse as a single term, not trigger regex parsing
+ String dsl = "title:AC/DC";
+ QsPlan plan = SearchDslParser.parseDsl(dsl);
+
+ Assertions.assertNotNull(plan);
+ Assertions.assertEquals(QsClauseType.TERM, plan.getRoot().getType());
+ Assertions.assertEquals("title", plan.getRoot().getField());
+ Assertions.assertEquals("AC/DC", plan.getRoot().getValue());
+ }
+
+ @Test
+ public void testSlashInTermBareQuery() {
+ // DORIS-24624: slash within a bare term (using default_field)
+ String dsl = "AC/DC";
+ QsPlan plan = SearchDslParser.parseDsl(dsl, "{\"default_field\":\"title\",\"default_operator\":\"OR\"}");
+
+ Assertions.assertNotNull(plan);
+ Assertions.assertEquals(QsClauseType.TERM, plan.getRoot().getType());
+ Assertions.assertEquals("title", plan.getRoot().getField());
+ Assertions.assertEquals("AC/DC", plan.getRoot().getValue());
+ }
+
+ @Test
+ public void testSlashInTermLuceneMode() {
+ // DORIS-24624: slash within a bare term in Lucene mode
+ String dsl = "AC/DC";
+ QsPlan plan = SearchDslParser.parseDsl(dsl,
+ "{\"default_field\":\"title\",\"default_operator\":\"OR\",\"minimum_should_match\":0}");
+
+ Assertions.assertNotNull(plan);
+ Assertions.assertEquals(QsClauseType.TERM, plan.getRoot().getType());
+ Assertions.assertEquals("title", plan.getRoot().getField());
+ Assertions.assertEquals("AC/DC", plan.getRoot().getValue());
+ }
+
+ @Test
+ public void testEscapedSlashInTerm() {
+ // Escaped slash should also work and produce same result as unescaped
+ String dsl = "title:AC\\/DC";
+ QsPlan plan = SearchDslParser.parseDsl(dsl);
+
+ Assertions.assertNotNull(plan);
+ Assertions.assertEquals(QsClauseType.TERM, plan.getRoot().getType());
+ Assertions.assertEquals("title", plan.getRoot().getField());
+ // After unescape: AC\/DC -> AC/DC
+ Assertions.assertEquals("AC/DC", plan.getRoot().getValue());
+ }
+
+ @Test
+ public void testMultipleSlashesInTerm() {
+ // Multiple slashes within a term
+ String dsl = "path:foo/bar/baz";
+ QsPlan plan = SearchDslParser.parseDsl(dsl);
+
+ Assertions.assertNotNull(plan);
+ Assertions.assertEquals(QsClauseType.TERM, plan.getRoot().getType());
+ Assertions.assertEquals("path", plan.getRoot().getField());
+ Assertions.assertEquals("foo/bar/baz", plan.getRoot().getValue());
+ }
+
+ @Test
+ public void testSlashDoesNotBreakRegexp() {
+ // Regex pattern /pattern/ should still work correctly
+ String dsl = "title:/[a-z]+/";
+ QsPlan plan = SearchDslParser.parseDsl(dsl);
+
+ Assertions.assertNotNull(plan);
+ Assertions.assertEquals(QsClauseType.REGEXP, plan.getRoot().getType());
+ Assertions.assertEquals("title", plan.getRoot().getField());
+ Assertions.assertEquals("[a-z]+", plan.getRoot().getValue());
+ }
+
@Test
public void testUppercaseAndOperator() {
// Test: uppercase AND should be treated as operator
diff --git a/regression-test/data/search/test_search_slash_in_term.out b/regression-test/data/search/test_search_slash_in_term.out
new file mode 100644
index 00000000000..3b0d6acd7bf
--- /dev/null
+++ b/regression-test/data/search/test_search_slash_in_term.out
@@ -0,0 +1,32 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !slash_in_term --
+1 AC/DC is a rock band
+2 AC power supply
+3 DC comics
+
+-- !escaped_slash_in_term --
+1 AC/DC is a rock band
+2 AC power supply
+3 DC comics
+
+-- !slash_bare_lucene --
+1 AC/DC is a rock band
+2 AC power supply
+3 DC comics
+
+-- !escaped_slash_bare_lucene --
+1 AC/DC is a rock band
+2 AC power supply
+3 DC comics
+
+-- !multi_slash --
+4 path/to/file
+
+-- !regex_still_works --
+1 AC/DC is a rock band
+
+-- !slash_standard_mode --
+1 AC/DC is a rock band
+2 AC power supply
+3 DC comics
+
diff --git a/regression-test/suites/search/test_search_slash_in_term.groovy b/regression-test/suites/search/test_search_slash_in_term.groovy
new file mode 100644
index 00000000000..0749929f252
--- /dev/null
+++ b/regression-test/suites/search/test_search_slash_in_term.groovy
@@ -0,0 +1,125 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/**
+ * DORIS-24624: Tests for slash (/) character handling in search() function.
+ *
+ * The slash character is used as a regex delimiter in Lucene query_string syntax
+ * (e.g., /pattern/). However, when it appears in the middle of a term (e.g., AC/DC),
+ * it should be treated as a regular character, not as a regex delimiter.
+ *
+ * This test verifies that:
+ * 1. Slash within a term (AC/DC) is parsed correctly as a single term
+ * 2. Escaped slash (AC\/DC) produces the same result
+ * 3. Regex patterns (/pattern/) still work correctly
+ * 4. Both standard and lucene modes handle slashes consistently
+ */
+suite("test_search_slash_in_term", "p0") {
+ def tableName = "search_slash_in_term_test"
+
+ sql """ set enable_common_expr_pushdown = true """
+
+ sql "DROP TABLE IF EXISTS ${tableName}"
+
+ sql """
+ CREATE TABLE ${tableName} (
+ id INT,
+ title VARCHAR(200),
+ content VARCHAR(500),
+ INDEX idx_title(title) USING INVERTED PROPERTIES("parser" = "standard"),
+ INDEX idx_content(content) USING INVERTED PROPERTIES("parser" = "standard")
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES ("replication_allocation" = "tag.location.default: 1")
+ """
+
+ sql """INSERT INTO ${tableName} VALUES
+ (1, 'AC/DC is a rock band', 'rock music'),
+ (2, 'AC power supply', 'electrical engineering'),
+ (3, 'DC comics', 'entertainment'),
+ (4, 'path/to/file', 'file system'),
+ (5, 'a/b/c/d', 'multi slash path'),
+ (6, 'hello world', 'greeting'),
+ (7, 'acdc together', 'no slash')
+ """
+
+ // Wait for index building
+ Thread.sleep(3000)
+
+ // ============ Test 1: Slash in term with field prefix ============
+ // title:AC/DC should parse as single term, standard analyzer tokenizes to "ac" and "dc"
+ // With default OR operator, matches rows containing "ac" or "dc" in title
+ order_qt_slash_in_term """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('title:AC/DC')
+ ORDER BY id
+ """
+
+ // ============ Test 2: Escaped slash should produce same result ============
+ // title:AC\/DC should produce the same result as title:AC/DC
+ // Groovy: \\\\/ -> SQL: \\/ -> DSL: \/ -> unescaped: /
+ order_qt_escaped_slash_in_term """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('title:AC\\\\/DC')
+ ORDER BY id
+ """
+
+ // ============ Test 3: Slash in term with default_field (lucene mode) ============
+ // Bare AC/DC with default_field should work
+ order_qt_slash_bare_lucene """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('AC/DC', '{"default_field":"title","default_operator":"OR","minimum_should_match":0}')
+ ORDER BY id
+ """
+
+ // ============ Test 4: Escaped slash with default_field should match ============
+ order_qt_escaped_slash_bare_lucene """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('AC\\\\/DC', '{"default_field":"title","default_operator":"OR","minimum_should_match":0}')
+ ORDER BY id
+ """
+
+ // ============ Test 5: Multiple slashes in term ============
+ order_qt_multi_slash """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('title:path/to/file')
+ ORDER BY id
+ """
+
+ // ============ Test 6: Regex pattern still works ============
+ // title:/rock/ should be parsed as a regex, not as a term containing slashes
+ order_qt_regex_still_works """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('title:/rock/')
+ ORDER BY id
+ """
+
+ // ============ Test 7: Slash in term with standard mode ============
+ order_qt_slash_standard_mode """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('AC/DC', '{"default_field":"title","mode":"standard"}')
+ ORDER BY id
+ """
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]