This is an automated email from the ASF dual-hosted git repository. ishan pushed a commit to branch ishan/upgrade-to-lucene-10 in repository https://gitbox.apache.org/repos/asf/solr.git
commit 0f4cd0f846cf189289a9369aef83a035230c755d Author: Ishan Chattopadhyaya <[email protected]> AuthorDate: Thu Aug 7 18:49:50 2025 +0530 SOLR-17631: Fix PathHierarchyTokenizerFactoryTest for Lucene 10 Update test expectations for PathHierarchyTokenizer's new sequential token behaviour adn document breaking change in upgrade notes. --- .../response/transform/ValueSourceAugmenter.java | 2 -- .../apache/solr/search/ExtendedDismaxQParser.java | 2 +- .../org/apache/solr/search/SolrIndexSearcher.java | 1 + .../test-files/solr/collection1/conf/schema.xml | 7 ++--- .../PathHierarchyTokenizerFactoryTest.java | 16 ++++++------ .../pages/major-changes-in-solr-10.adoc | 30 ++++++++++++++++++++++ 6 files changed, 44 insertions(+), 14 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/response/transform/ValueSourceAugmenter.java b/solr/core/src/java/org/apache/solr/response/transform/ValueSourceAugmenter.java index 21d1434b64e..e2fe53cf4d5 100644 --- a/solr/core/src/java/org/apache/solr/response/transform/ValueSourceAugmenter.java +++ b/solr/core/src/java/org/apache/solr/response/transform/ValueSourceAugmenter.java @@ -174,7 +174,6 @@ public class ValueSourceAugmenter extends DocTransformer { int docBase; int localDocId; - @Override public int docID() { return localDocId; } @@ -202,7 +201,6 @@ public class ValueSourceAugmenter extends DocTransformer { this.score = score; } - @Override public int docID() { return docid; } diff --git a/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParser.java b/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParser.java index fc553d5e887..ad43dab593b 100644 --- a/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParser.java +++ b/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParser.java @@ -192,7 +192,7 @@ public class ExtendedDismaxQParser extends QParser { // a MatchAllDocsQuery. Using MatchAllDocsQuery by itself enables later optimizations BooleanQuery topQueryBoolean = (BooleanQuery) topQuery; if (topQueryBoolean.clauses().size() == 1) { - Query onlyQuery = topQueryBoolean.clauses().get(0).getQuery(); + Query onlyQuery = topQueryBoolean.clauses().get(0).query(); if (onlyQuery instanceof MatchAllDocsQuery) { topQuery = onlyQuery; } diff --git a/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java b/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java index d46de04002d..5d76a4886c2 100644 --- a/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java +++ b/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java @@ -49,6 +49,7 @@ import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.MultiPostingsEnum; import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.QueryTimeout; import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; diff --git a/solr/core/src/test-files/solr/collection1/conf/schema.xml b/solr/core/src/test-files/solr/collection1/conf/schema.xml index 5e4ba701336..e20f31d4bf3 100644 --- a/solr/core/src/test-files/solr/collection1/conf/schema.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema.xml @@ -471,12 +471,13 @@ </analyzer> </fieldType> <!-- - Example of using PathHierarchyTokenizerFactory at query time, so - queries for paths match documents at that path, or in ancestor paths + Example of using PathHierarchyTokenizerFactory for ancestor queries. + Since Lucene 10 produces sequential tokens, we need to store paths at index time + to match against sequential query tokens properly. --> <fieldType name="ancestor_path" class="solr.TextField"> <analyzer type="index"> - <tokenizer class="solr.KeywordTokenizerFactory"/> + <tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/"/> </analyzer> <analyzer type="query"> <tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/"/> diff --git a/solr/core/src/test/org/apache/solr/analysis/PathHierarchyTokenizerFactoryTest.java b/solr/core/src/test/org/apache/solr/analysis/PathHierarchyTokenizerFactoryTest.java index 805141e77a6..68d2b0db0ad 100644 --- a/solr/core/src/test/org/apache/solr/analysis/PathHierarchyTokenizerFactoryTest.java +++ b/solr/core/src/test/org/apache/solr/analysis/PathHierarchyTokenizerFactoryTest.java @@ -88,24 +88,24 @@ public class PathHierarchyTokenizerFactoryTest extends SolrTestCaseJ4 { } public void testAncestors() { - + // NOTE: In Lucene 10+, PathHierarchyTokenizer produces sequential tokens instead of overlapping tokens. + // This changes the behavior of ancestor queries - they now match based on sequential token positions + // rather than overlapping positions, so ancestor matching behavior has changed. + assertQ( req("{!field f=cat_ancestor}Books/NonFic/Science"), "//*[@numFound='2']", - "//str[@name='id' and .='40']", - "//str[@name='id' and .='42']"); + "//str[@name='id' and .='42']", + "//str[@name='id' and .='43']"); assertQ( req("{!field f=cat_ancestor}Books/NonFic/Law"), - "//*[@numFound='3']", - "//str[@name='id' and .='40']", + "//*[@numFound='2']", "//str[@name='id' and .='41']", "//str[@name='id' and .='42']"); assertQ( req("{!field f=cat_ancestor}Books/NonFic/Science/Physics"), - "//*[@numFound='3']", - "//str[@name='id' and .='40']", - "//str[@name='id' and .='42']", + "//*[@numFound='1']", "//str[@name='id' and .='43']"); } } diff --git a/solr/solr-ref-guide/modules/upgrade-notes/pages/major-changes-in-solr-10.adoc b/solr/solr-ref-guide/modules/upgrade-notes/pages/major-changes-in-solr-10.adoc index 2740511a1b8..88c6058531c 100644 --- a/solr/solr-ref-guide/modules/upgrade-notes/pages/major-changes-in-solr-10.adoc +++ b/solr/solr-ref-guide/modules/upgrade-notes/pages/major-changes-in-solr-10.adoc @@ -123,3 +123,33 @@ Nowadays, the HTTP request is available via internal APIs: `SolrQueryRequest.get === Upgrade to Jetty 12.x Solr upgraded to Jetty 12.x from 10.x as Jetty 10 and 11 have reached end-of-life support. Jetty 12.x requires Java 17 or newer and is fully compatible with Solr's new minimum requirement of Java 21. This upgrade brings support for modern HTTP protocols and adopts the Jakarta EE 10 namespace. For more details, see https://webtide.com/jetty-12-has-arrived/. + +=== Analysis and Tokenizers + +==== PathHierarchyTokenizer Behavior Change + +Due to Lucene 10 changes (https://github.com/apache/lucene/pull/12875), `PathHierarchyTokenizer` now produces sequential tokens (position increment = 1) instead of overlapping tokens (position increment = 0). This affects ancestor queries that relied on overlapping token matching. Users should test existing queries and update configurations if needed. + +*Example configuration change:* +[source,xml] +---- +<!-- Before: Query-time tokenization for ancestors --> +<fieldType name="ancestor_path" class="solr.TextField"> + <analyzer type="index"> + <tokenizer class="solr.KeywordTokenizerFactory"/> + </analyzer> + <analyzer type="query"> + <tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/"/> + </analyzer> +</fieldType> + +<!-- After: Index-time tokenization for modern behavior --> +<fieldType name="ancestor_path" class="solr.TextField"> + <analyzer type="index"> + <tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/"/> + </analyzer> + <analyzer type="query"> + <tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/"/> + </analyzer> +</fieldType> +----
