This is an automated email from the ASF dual-hosted git repository.

janhoy pushed a commit to branch branch_10_0
in repository https://gitbox.apache.org/repos/asf/solr.git


The following commit(s) were added to refs/heads/branch_10_0 by this push:
     new c16f4e0eb0c Remove LowercaseTokenizer (#3863)
c16f4e0eb0c is described below

commit c16f4e0eb0cc1c8add1245dc21a49051a00d5cdd
Author: Jan Høydahl <[email protected]>
AuthorDate: Sun Nov 16 14:38:37 2025 +0100

    Remove LowercaseTokenizer (#3863)
    
    (cherry picked from commit cdeecacbccad954e107e9ee720f0c937fe929028)
---
 ...ove-LowerCaseTokenizer-deprecated-tokenizer.yml |   7 +
 .../apache/solr/analysis/LowerCaseTokenizer.java   | 153 ---------------------
 .../solr/analysis/LowerCaseTokenizerFactory.java   |  82 -----------
 .../solr/collection1/conf/schema-deprecations.xml  |   6 -
 .../solr/analysis/TestDeprecatedFilters.java       |  35 -----
 .../modules/indexing-guide/pages/tokenizers.adoc   |  49 -------
 .../pages/major-changes-in-solr-10.adoc            |   7 +-
 7 files changed, 12 insertions(+), 327 deletions(-)

diff --git a/changelog/unreleased/PR#3863-Remove-LowerCaseTokenizer-deprecated-tokenizer.yml b/changelog/unreleased/PR#3863-Remove-LowerCaseTokenizer-deprecated-tokenizer.yml
new file mode 100644
index 00000000000..e16bd21d8b2
--- /dev/null
+++ b/changelog/unreleased/PR#3863-Remove-LowerCaseTokenizer-deprecated-tokenizer.yml
@@ -0,0 +1,7 @@
+title: Remove LowerCaseTokenizer and LowerCaseTokenizerFactory classes which were deprecated in SOLR-12775
+type: removed
+authors:
+- name: Jan Høydahl
+links:
+- name: PR#3863
+  url: https://github.com/apache/solr/pull/3863
diff --git a/solr/core/src/java/org/apache/solr/analysis/LowerCaseTokenizer.java b/solr/core/src/java/org/apache/solr/analysis/LowerCaseTokenizer.java
deleted file mode 100644
index 41fe0ad8e4d..00000000000
--- a/solr/core/src/java/org/apache/solr/analysis/LowerCaseTokenizer.java
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.solr.analysis;
-
-import java.io.IOException;
-import org.apache.lucene.analysis.CharacterUtils;
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.LetterTokenizer;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.util.AttributeFactory;
-
-/**
- * LowerCaseTokenizer performs the function of LetterTokenizer and LowerCaseFilter together. It
- * divides text at non-letters and converts them to lower case. While it is functionally equivalent
- * to the combination of LetterTokenizer and LowerCaseFilter, there is a performance advantage to
- * doing the two tasks at once, hence this (redundant) implementation.
- *
- * <p>Note: this does a decent job for most European languages, but does a terrible job for some
- * Asian languages, where words are not separated by spaces.
- *
- * @deprecated Use {@link LetterTokenizer} and {@link org.apache.lucene.analysis.LowerCaseFilter}
- */
-@Deprecated
-public final class LowerCaseTokenizer extends Tokenizer {
-
-  /** Construct a new LowerCaseTokenizer. */
-  public LowerCaseTokenizer() {
-    this.maxTokenLen = DEFAULT_MAX_WORD_LEN;
-  }
-
-  /**
-   * Construct a new LowerCaseTokenizer using a given {@link
-   * org.apache.lucene.util.AttributeFactory}.
-   *
-   * @param factory the attribute factory to use for this {@link Tokenizer}
-   */
-  public LowerCaseTokenizer(AttributeFactory factory) {
-    this(factory, DEFAULT_MAX_WORD_LEN);
-  }
-
-  /**
-   * Construct a new LowerCaseTokenizer using a given {@link
-   * org.apache.lucene.util.AttributeFactory}.
-   *
-   * @param factory the attribute factory to use for this {@link Tokenizer}
-   * @param maxTokenLen maximum token length the tokenizer will emit. Must be greater than 0 and
-   *     less than MAX_TOKEN_LENGTH_LIMIT (1024*1024)
-   * @throws IllegalArgumentException if maxTokenLen is invalid.
-   */
-  public LowerCaseTokenizer(AttributeFactory factory, int maxTokenLen) {
-    super(factory);
-    this.maxTokenLen = maxTokenLen;
-  }
-
-  private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0;
-  public static final int DEFAULT_MAX_WORD_LEN = 255;
-  private static final int IO_BUFFER_SIZE = 4096;
-  private final int maxTokenLen;
-
-  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
-
-  private final CharacterUtils.CharacterBuffer ioBuffer =
-      CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE);
-
-  @Override
-  public final boolean incrementToken() throws IOException {
-    clearAttributes();
-    int length = 0;
-    int start = -1; // this variable is always initialized
-    int end = -1;
-    char[] buffer = termAtt.buffer();
-    while (true) {
-      if (bufferIndex >= dataLen) {
-        offset += dataLen;
-        CharacterUtils.fill(ioBuffer, input); // read supplementary char aware with CharacterUtils
-        if (ioBuffer.getLength() == 0) {
-          dataLen = 0; // so next offset += dataLen won't decrement offset
-          if (length > 0) {
-            break;
-          } else {
-            finalOffset = correctOffset(offset);
-            return false;
-          }
-        }
-        dataLen = ioBuffer.getLength();
-        bufferIndex = 0;
-      }
-      // use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based
-      // methods are gone
-      final int c = Character.codePointAt(ioBuffer.getBuffer(), bufferIndex, ioBuffer.getLength());
-      final int charCount = Character.charCount(c);
-      bufferIndex += charCount;
-
-      if (Character.isLetter(c)) { // if it's a token char
-        if (length == 0) { // start of token
-          assert start == -1;
-          start = offset + bufferIndex - charCount;
-          end = start;
-        } else if (length
-            >= buffer.length - 1) { // check if a supplementary could run out of bounds
-          buffer = termAtt.resizeBuffer(2 + length); // make sure a supplementary fits in the buffer
-        }
-        end += charCount;
-        // buffer it, normalized
-        length += Character.toChars(Character.toLowerCase(c), buffer, length);
-        // buffer overflow! make sure to check for >= surrogate pair could break == test
-        if (length >= maxTokenLen) {
-          break;
-        }
-      } else if (length > 0) { // at non-Letter w/ chars
-        break; // return 'em
-      }
-    }
-
-    termAtt.setLength(length);
-    assert start != -1;
-    offsetAtt.setOffset(correctOffset(start), finalOffset = correctOffset(end));
-    return true;
-  }
-
-  @Override
-  public final void end() throws IOException {
-    super.end();
-    // set final offset
-    offsetAtt.setOffset(finalOffset, finalOffset);
-  }
-
-  @Override
-  public void reset() throws IOException {
-    super.reset();
-    bufferIndex = 0;
-    offset = 0;
-    dataLen = 0;
-    finalOffset = 0;
-    ioBuffer.reset(); // make sure to reset the IO buffer!!
-  }
-}
diff --git a/solr/core/src/java/org/apache/solr/analysis/LowerCaseTokenizerFactory.java b/solr/core/src/java/org/apache/solr/analysis/LowerCaseTokenizerFactory.java
deleted file mode 100644
index 66ce856a041..00000000000
--- a/solr/core/src/java/org/apache/solr/analysis/LowerCaseTokenizerFactory.java
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.solr.analysis;
-
-import static org.apache.lucene.analysis.standard.StandardTokenizer.MAX_TOKEN_LENGTH_LIMIT;
-
-import java.util.Map;
-import org.apache.lucene.analysis.TokenizerFactory;
-import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
-import org.apache.lucene.analysis.util.CharTokenizer;
-import org.apache.lucene.util.AttributeFactory;
-
-/**
- * Factory for {@link LowerCaseTokenizer}.
- *
- * <pre class="prettyprint">
- * &lt;fieldType name="text_lwrcase" class="solr.TextField" positionIncrementGap="100"&gt;
- * &lt;analyzer&gt;
- * &lt;tokenizer class="solr.LowerCaseTokenizerFactory" maxTokenLen="256"/&gt;
- * &lt;/analyzer&gt;
- * &lt;/fieldType&gt;</pre>
- *
- * <p>Options:
- *
- * <ul>
- *   <li>maxTokenLen: max token length, should be greater than 0 and less than
- *       MAX_TOKEN_LENGTH_LIMIT (1024*1024). It is rare to need to change this else {@link
- *       CharTokenizer}::DEFAULT_MAX_WORD_LEN
- * </ul>
- *
- * @deprecated Use {@link org.apache.lucene.analysis.core.LetterTokenizerFactory} and {@link
- *     LowerCaseFilterFactory}
- * @lucene.spi {@value #NAME}
- */
-@Deprecated
-public class LowerCaseTokenizerFactory extends TokenizerFactory {
-
-  /** SPI name */
-  public static final String NAME = "lowercase";
-
-  private final int maxTokenLen;
-
-  /** Creates a new LowerCaseTokenizerFactory */
-  public LowerCaseTokenizerFactory(Map<String, String> args) {
-    super(args);
-    maxTokenLen = getInt(args, "maxTokenLen", CharTokenizer.DEFAULT_MAX_WORD_LEN);
-    if (maxTokenLen > MAX_TOKEN_LENGTH_LIMIT || maxTokenLen <= 0) {
-      throw new IllegalArgumentException(
-          "maxTokenLen must be greater than 0 and less than "
-              + MAX_TOKEN_LENGTH_LIMIT
-              + " passed: "
-              + maxTokenLen);
-    }
-    if (!args.isEmpty()) {
-      throw new IllegalArgumentException("Unknown parameters: " + args);
-    }
-  }
-
-  /** Default ctor for compatibility with SPI */
-  public LowerCaseTokenizerFactory() {
-    throw defaultCtorException();
-  }
-
-  @Override
-  public LowerCaseTokenizer create(AttributeFactory factory) {
-    return new LowerCaseTokenizer(factory, maxTokenLen);
-  }
-}
diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-deprecations.xml b/solr/core/src/test-files/solr/collection1/conf/schema-deprecations.xml
index bf726cbf72b..e50bfa8575c 100644
--- a/solr/core/src/test-files/solr/collection1/conf/schema-deprecations.xml
+++ b/solr/core/src/test-files/solr/collection1/conf/schema-deprecations.xml
@@ -18,11 +18,6 @@
 <schema name="schema-deprecations" version="1.7">
 
   <types>
-    <fieldType name="lowertok" class="solr.TextField">
-      <analyzer>
-        <tokenizer class="solr.LowerCaseTokenizerFactory"/>
-      </analyzer>
-    </fieldType>
     <fieldType name="string" class="solr.StrField"/>
     <fieldType name="long" class="${solr.tests.LongFieldType}"/>
   </types>
@@ -30,7 +25,6 @@
   <fields>
     <field name="id" type="string" indexed="true" stored="true" multiValued="false" required="true"/>
     <field name="_version_" type="long" indexed="false" stored="false"/>
-    <field name="lowertext" type="lowertok" indexed="true"/>
   </fields>
 
 </schema>
\ No newline at end of file
diff --git a/solr/core/src/test/org/apache/solr/analysis/TestDeprecatedFilters.java b/solr/core/src/test/org/apache/solr/analysis/TestDeprecatedFilters.java
deleted file mode 100644
index c1f54fb7f76..00000000000
--- a/solr/core/src/test/org/apache/solr/analysis/TestDeprecatedFilters.java
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.solr.analysis;
-
-import org.apache.solr.SolrTestCaseJ4;
-import org.junit.BeforeClass;
-
-public class TestDeprecatedFilters extends SolrTestCaseJ4 {
-
-  @BeforeClass
-  public static void beforeClass() throws Exception {
-    initCore("solrconfig-leader.xml", "schema-deprecations.xml");
-  }
-
-  public void testLowerCaseTokenizer() {
-    assertU(adoc("id", "1", "lowertext", "THIS IS A TEST"));
-    assertU(commit());
-    assertQ(req("lowertext:test"), "//result[@numFound=1]");
-  }
-}
diff --git a/solr/solr-ref-guide/modules/indexing-guide/pages/tokenizers.adoc b/solr/solr-ref-guide/modules/indexing-guide/pages/tokenizers.adoc
index 3c5aef1e523..d8be1c2b4f2 100644
--- a/solr/solr-ref-guide/modules/indexing-guide/pages/tokenizers.adoc
+++ b/solr/solr-ref-guide/modules/indexing-guide/pages/tokenizers.adoc
@@ -322,55 +322,6 @@ With class name (legacy)::
 
 *Out:* "I", "can", "t"
 
-== Lower Case Tokenizer
-
-Tokenizes the input stream by delimiting at non-letters and then converting all letters to lowercase.
-Whitespace and non-letters are discarded.
-
-*Factory class:* `solr.LowerCaseTokenizerFactory`
-
-*Arguments:*
-
-`maxTokenLen`::
-+
-[%autowidth,frame=none]
-|===
-s|Optional |Default: `255`
-|===
-+
-Maximum token length the tokenizer will emit.
-
-*Example:*
-
-[tabs#tokenizer-lowercase]
-======
-With name::
-+
-====
-[source,xml]
-----
-<analyzer>
-  <tokenizer name="lowercase"/>
-</analyzer>
-----
-====
-
-With class name (legacy)::
-+
-====
-[source,xml]
-----
-<analyzer>
-  <tokenizer class="solr.LowerCaseTokenizerFactory"/>
-</analyzer>
-----
-====
-======
-
-*In:* "I just \*LOVE* my iPhone!"
-
-*Out:* "i", "just", "love", "my", "iphone"
-
 == N-Gram Tokenizer
 
 Reads the field text and generates n-gram tokens of sizes in the given range.
diff --git a/solr/solr-ref-guide/modules/upgrade-notes/pages/major-changes-in-solr-10.adoc b/solr/solr-ref-guide/modules/upgrade-notes/pages/major-changes-in-solr-10.adoc
index c283d0c03b6..4bfe3cb67ea 100644
--- a/solr/solr-ref-guide/modules/upgrade-notes/pages/major-changes-in-solr-10.adoc
+++ b/solr/solr-ref-guide/modules/upgrade-notes/pages/major-changes-in-solr-10.adoc
@@ -37,7 +37,9 @@ This compatibility safeguard can be toggled with "SOLR_CLOUD_DOWNGRADE_ENABLED".
 
 Minimum Java version for Solr 10.x is Java 21.
 
-== Solr Jetty parameters
+== Solr 10.0
+
+=== Solr Jetty parameters
 
 The previous `SOLR_JETTY_HOST` environment variable and `-Dsolr.jetty.host` System Property are deprecated and will be removed in a future release. Please update your configuration to use `SOLR_HOST_BIND` and `-Dsolr.host.bind` instead.
 
@@ -45,7 +47,6 @@ The previous `SOLR_HOST` and 'host' are deprecated and now use `SOLR_HOST_ADVERT
 
 The previous `jetty.port` is deprecated and now use `solr.port.listen`.
 
-
 === Solr CLI and Scripts
 
 The Solr CLI has gone through some significant renovations to reduce technical debt, and now functions more consistently and predictably. Most notably, CLI commands now use unix-style options, e.g. `--help` instead of `-help`.
@@ -192,6 +193,8 @@ Nowadays, the HTTP request is available via internal APIs: `SolrQueryRequest.get
 
 * SolrInfoMBeanHandler and PluginInfoHandler have been removed
 
+* The deprecated `LowerCaseTokenizer` and `LowerCaseTokenizerFactory` have been removed. These classes were deprecated in Solr 8 and can be replaced by combining `LetterTokenizerFactory` with `LowerCaseFilterFactory`.
+
 === Security
 
 * There is no longer a distinction between trusted and untrusted configSets; all configSets are now considered trusted. To ensure security, Solr should be properly protected using authentication and authorization mechanisms, allowing only authorized users with administrative privileges to publish them.
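
For reference, the upgrade note in this commit points to combining LetterTokenizerFactory with LowerCaseFilterFactory as the replacement for the removed tokenizer. A minimal schema sketch of that replacement (the "text_lwrcase" field type name is only an illustrative placeholder, not part of this commit):

    <fieldType name="text_lwrcase" class="solr.TextField" positionIncrementGap="100">
      <analyzer>
        <!-- splits on non-letters, as the removed LowerCaseTokenizer did -->
        <tokenizer class="solr.LetterTokenizerFactory"/>
        <!-- lowercases each token, replacing the tokenizer's built-in lowercasing -->
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
    </fieldType>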
