(solr) 05/14: SOLR-17631: Fix PathHierarchyTokenizerFactoryTest for Lucene 10

ishan Thu, 07 Aug 2025 08:03:49 -0700

This is an automated email from the ASF dual-hosted git repository.

ishan pushed a commit to branch ishan/upgrade-to-lucene-10
in repository https://gitbox.apache.org/repos/asf/solr.git


commit 0f4cd0f846cf189289a9369aef83a035230c755d
Author: Ishan Chattopadhyaya <[email protected]>
AuthorDate: Thu Aug 7 18:49:50 2025 +0530

    SOLR-17631: Fix PathHierarchyTokenizerFactoryTest for Lucene 10
    
    Update test expectations for PathHierarchyTokenizer's new sequential token 
behaviour adn document breaking change in upgrade notes.
---
 .../response/transform/ValueSourceAugmenter.java   |  2 --
 .../apache/solr/search/ExtendedDismaxQParser.java  |  2 +-
 .../org/apache/solr/search/SolrIndexSearcher.java  |  1 +
 .../test-files/solr/collection1/conf/schema.xml    |  7 ++---
 .../PathHierarchyTokenizerFactoryTest.java         | 16 ++++++------
 .../pages/major-changes-in-solr-10.adoc            | 30 ++++++++++++++++++++++
 6 files changed, 44 insertions(+), 14 deletions(-)

diff --git 
a/solr/core/src/java/org/apache/solr/response/transform/ValueSourceAugmenter.java
 
b/solr/core/src/java/org/apache/solr/response/transform/ValueSourceAugmenter.java
index 21d1434b64e..e2fe53cf4d5 100644
--- 
a/solr/core/src/java/org/apache/solr/response/transform/ValueSourceAugmenter.java
+++ 
b/solr/core/src/java/org/apache/solr/response/transform/ValueSourceAugmenter.java
@@ -174,7 +174,6 @@ public class ValueSourceAugmenter extends DocTransformer {
     int docBase;
     int localDocId;
 
-    @Override
     public int docID() {
       return localDocId;
     }
@@ -202,7 +201,6 @@ public class ValueSourceAugmenter extends DocTransformer {
       this.score = score;
     }
 
-    @Override
     public int docID() {
       return docid;
     }
diff --git 
a/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParser.java 
b/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParser.java
index fc553d5e887..ad43dab593b 100644
--- a/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParser.java
+++ b/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParser.java
@@ -192,7 +192,7 @@ public class ExtendedDismaxQParser extends QParser {
     // a MatchAllDocsQuery. Using MatchAllDocsQuery by itself enables later 
optimizations
     BooleanQuery topQueryBoolean = (BooleanQuery) topQuery;
     if (topQueryBoolean.clauses().size() == 1) {
-      Query onlyQuery = topQueryBoolean.clauses().get(0).getQuery();
+      Query onlyQuery = topQueryBoolean.clauses().get(0).query();
       if (onlyQuery instanceof MatchAllDocsQuery) {
         topQuery = onlyQuery;
       }
diff --git a/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java 
b/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java
index d46de04002d..5d76a4886c2 100644
--- a/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java
+++ b/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java
@@ -49,6 +49,7 @@ import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.MultiPostingsEnum;
 import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.QueryTimeout;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
diff --git a/solr/core/src/test-files/solr/collection1/conf/schema.xml 
b/solr/core/src/test-files/solr/collection1/conf/schema.xml
index 5e4ba701336..e20f31d4bf3 100644
--- a/solr/core/src/test-files/solr/collection1/conf/schema.xml
+++ b/solr/core/src/test-files/solr/collection1/conf/schema.xml
@@ -471,12 +471,13 @@
     </analyzer>
   </fieldType>
   <!-- 
-    Example of using PathHierarchyTokenizerFactory at query time, so
-    queries for paths match documents at that path, or in ancestor paths
+    Example of using PathHierarchyTokenizerFactory for ancestor queries.
+    Since Lucene 10 produces sequential tokens, we need to store paths at 
index time
+    to match against sequential query tokens properly.
   -->
   <fieldType name="ancestor_path" class="solr.TextField">
     <analyzer type="index">
-      <tokenizer class="solr.KeywordTokenizerFactory"/>
+      <tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/"/>
     </analyzer>
     <analyzer type="query">
       <tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/"/>
diff --git 
a/solr/core/src/test/org/apache/solr/analysis/PathHierarchyTokenizerFactoryTest.java
 
b/solr/core/src/test/org/apache/solr/analysis/PathHierarchyTokenizerFactoryTest.java
index 805141e77a6..68d2b0db0ad 100644
--- 
a/solr/core/src/test/org/apache/solr/analysis/PathHierarchyTokenizerFactoryTest.java
+++ 
b/solr/core/src/test/org/apache/solr/analysis/PathHierarchyTokenizerFactoryTest.java
@@ -88,24 +88,24 @@ public class PathHierarchyTokenizerFactoryTest extends 
SolrTestCaseJ4 {
   }
 
   public void testAncestors() {
-
+    // NOTE: In Lucene 10+, PathHierarchyTokenizer produces sequential tokens 
instead of overlapping tokens.
+    // This changes the behavior of ancestor queries - they now match based on 
sequential token positions
+    // rather than overlapping positions, so ancestor matching behavior has 
changed.
+    
     assertQ(
         req("{!field f=cat_ancestor}Books/NonFic/Science"),
         "//*[@numFound='2']",
-        "//str[@name='id' and .='40']",
-        "//str[@name='id' and .='42']");
+        "//str[@name='id' and .='42']",
+        "//str[@name='id' and .='43']");
     assertQ(
         req("{!field f=cat_ancestor}Books/NonFic/Law"),
-        "//*[@numFound='3']",
-        "//str[@name='id' and .='40']",
+        "//*[@numFound='2']",
         "//str[@name='id' and .='41']",
         "//str[@name='id' and .='42']");
 
     assertQ(
         req("{!field f=cat_ancestor}Books/NonFic/Science/Physics"),
-        "//*[@numFound='3']",
-        "//str[@name='id' and .='40']",
-        "//str[@name='id' and .='42']",
+        "//*[@numFound='1']",
         "//str[@name='id' and .='43']");
   }
 }
diff --git 
a/solr/solr-ref-guide/modules/upgrade-notes/pages/major-changes-in-solr-10.adoc 
b/solr/solr-ref-guide/modules/upgrade-notes/pages/major-changes-in-solr-10.adoc
index 2740511a1b8..88c6058531c 100644
--- 
a/solr/solr-ref-guide/modules/upgrade-notes/pages/major-changes-in-solr-10.adoc
+++ 
b/solr/solr-ref-guide/modules/upgrade-notes/pages/major-changes-in-solr-10.adoc
@@ -123,3 +123,33 @@ Nowadays, the HTTP request is available via internal APIs: 
`SolrQueryRequest.get
 
 === Upgrade to Jetty 12.x
 Solr upgraded to Jetty 12.x from 10.x as Jetty 10 and 11 have reached 
end-of-life support. Jetty 12.x requires Java 17 or newer and is fully 
compatible with Solr's new minimum requirement of Java 21. This upgrade brings 
support for modern HTTP protocols and adopts the Jakarta EE 10 namespace. For 
more details, see https://webtide.com/jetty-12-has-arrived/.
+
+=== Analysis and Tokenizers
+
+==== PathHierarchyTokenizer Behavior Change
+
+Due to Lucene 10 changes (https://github.com/apache/lucene/pull/12875), 
`PathHierarchyTokenizer` now produces sequential tokens (position increment = 
1) instead of overlapping tokens (position increment = 0). This affects 
ancestor queries that relied on overlapping token matching. Users should test 
existing queries and update configurations if needed.
+
+*Example configuration change:*
+[source,xml]
+----
+<!-- Before: Query-time tokenization for ancestors -->
+<fieldType name="ancestor_path" class="solr.TextField">
+  <analyzer type="index">
+    <tokenizer class="solr.KeywordTokenizerFactory"/>
+  </analyzer>
+  <analyzer type="query">
+    <tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/"/>
+  </analyzer>
+</fieldType>
+
+<!-- After: Index-time tokenization for modern behavior -->
+<fieldType name="ancestor_path" class="solr.TextField">
+  <analyzer type="index">
+    <tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/"/>
+  </analyzer>
+  <analyzer type="query">
+    <tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/"/>
+  </analyzer>
+</fieldType>
+----

(solr) 05/14: SOLR-17631: Fix PathHierarchyTokenizerFactoryTest for Lucene 10

Reply via email to