This is an automated email from the ASF dual-hosted git repository.
xiangfu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new f1b2f461e5 Refactor CaseSensitiveAnalyzer and StandardAnalyzer to
CaseAwareStandardAnalyzer (#15830)
f1b2f461e5 is described below
commit f1b2f461e516c7846f554c28da2d349205e1ce88
Author: Xiang Fu <[email protected]>
AuthorDate: Tue May 20 21:29:51 2025 +0800
Refactor CaseSensitiveAnalyzer and StandardAnalyzer to
CaseAwareStandardAnalyzer (#15830)
---
.../pinot/queries/TextSearchQueriesTest.java | 25 ++++-----
.../impl/invertedindex/NativeMutableTextIndex.java | 4 +-
.../creator/impl/text/NativeTextIndexCreator.java | 4 +-
...nalyzer.java => CaseAwareStandardAnalyzer.java} | 59 ++++++++++++++++++----
.../local/segment/store/TextIndexUtils.java | 30 ++++++-----
5 files changed, 83 insertions(+), 39 deletions(-)
diff --git
a/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
b/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
index b0ad5f7e1c..4b4f8d8102 100644
---
a/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
+++
b/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
@@ -33,7 +33,7 @@ import java.util.Objects;
import java.util.Random;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
@@ -58,6 +58,7 @@ import
org.apache.pinot.segment.local.indexsegment.immutable.ImmutableSegmentLoa
import
org.apache.pinot.segment.local.realtime.impl.invertedindex.RealtimeLuceneTextIndex;
import
org.apache.pinot.segment.local.segment.creator.impl.SegmentIndexCreationDriverImpl;
import org.apache.pinot.segment.local.segment.index.loader.IndexLoadingConfig;
+import
org.apache.pinot.segment.local.segment.index.text.CaseAwareStandardAnalyzer;
import org.apache.pinot.segment.local.segment.readers.GenericRowRecordReader;
import org.apache.pinot.segment.spi.ImmutableSegment;
import org.apache.pinot.segment.spi.IndexSegment;
@@ -1372,15 +1373,15 @@ public class TextSearchQueriesTest extends
BaseQueriesTest {
// create and open an index writer
File indexFile = new File(INDEX_DIR.getPath() + "/realtime-test1.index");
Directory indexDirectory = FSDirectory.open(indexFile.toPath());
- StandardAnalyzer standardAnalyzer = new StandardAnalyzer();
- IndexWriterConfig indexWriterConfig = new
IndexWriterConfig(standardAnalyzer);
+ Analyzer analyzer = new CaseAwareStandardAnalyzer();
+ IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
indexWriterConfig.setRAMBufferSizeMB(500);
IndexWriter indexWriter = new IndexWriter(indexDirectory,
indexWriterConfig);
// create an NRT index reader
SearcherManager searcherManager = new SearcherManager(indexWriter, false,
false, null);
- QueryParser queryParser = new QueryParser("skill", standardAnalyzer);
+ QueryParser queryParser = new QueryParser("skill", analyzer);
Query query = queryParser.parse("\"machine learning\"");
// acquire a searcher
@@ -1542,8 +1543,8 @@ public class TextSearchQueriesTest extends
BaseQueriesTest {
// create and open an index writer
File indexFile = new File(INDEX_DIR.getPath() + "/realtime-test2.index");
Directory indexDirectory = FSDirectory.open(indexFile.toPath());
- StandardAnalyzer standardAnalyzer = new StandardAnalyzer();
- IndexWriterConfig indexWriterConfig = new
IndexWriterConfig(standardAnalyzer);
+ CaseAwareStandardAnalyzer analyzer = new CaseAwareStandardAnalyzer();
+ IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
indexWriterConfig.setRAMBufferSizeMB(50);
IndexWriter indexWriter = new IndexWriter(indexDirectory,
indexWriterConfig);
@@ -1553,7 +1554,7 @@ public class TextSearchQueriesTest extends
BaseQueriesTest {
indexWriter.addDocument(docToIndex);
// create an NRT index reader from the writer -- should see one
uncommitted document
- QueryParser queryParser = new QueryParser("skill", standardAnalyzer);
+ QueryParser queryParser = new QueryParser("skill", analyzer);
Query query = queryParser.parse("\"distributed systems\" AND (Java C++)");
IndexReader indexReader1 = DirectoryReader.open(indexWriter);
IndexSearcher searcher1 = new IndexSearcher(indexReader1);
@@ -1592,9 +1593,9 @@ public class TextSearchQueriesTest extends
BaseQueriesTest {
throws Exception {
File indexFile = new File(INDEX_DIR.getPath() + "/realtime-test3.index");
Directory indexDirectory = FSDirectory.open(indexFile.toPath());
- StandardAnalyzer standardAnalyzer = new StandardAnalyzer();
+ Analyzer analyzer = new CaseAwareStandardAnalyzer();
// create and open a writer
- IndexWriterConfig indexWriterConfig = new
IndexWriterConfig(standardAnalyzer);
+ IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
indexWriterConfig.setRAMBufferSizeMB(500);
IndexWriter indexWriter = new IndexWriter(indexDirectory,
indexWriterConfig);
@@ -1608,7 +1609,7 @@ public class TextSearchQueriesTest extends
BaseQueriesTest {
// start writer and reader
Thread writer = new Thread(new RealtimeWriter(indexWriter));
- Thread realtimeReader = new Thread(new RealtimeReader(searcherManager,
standardAnalyzer));
+ Thread realtimeReader = new Thread(new RealtimeReader(searcherManager,
analyzer));
writer.start();
realtimeReader.start();
@@ -1674,8 +1675,8 @@ public class TextSearchQueriesTest extends
BaseQueriesTest {
private final QueryParser _queryParser;
private final SearcherManager _searcherManager;
- RealtimeReader(SearcherManager searcherManager, StandardAnalyzer
standardAnalyzer) {
- _queryParser = new QueryParser("skill", standardAnalyzer);
+ RealtimeReader(SearcherManager searcherManager, Analyzer analyzer) {
+ _queryParser = new QueryParser("skill", analyzer);
_searcherManager = searcherManager;
}
diff --git
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/NativeMutableTextIndex.java
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/NativeMutableTextIndex.java
index 1e56c57c87..abeeb08cbc 100644
---
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/NativeMutableTextIndex.java
+++
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/NativeMutableTextIndex.java
@@ -25,9 +25,9 @@ import java.util.List;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import
org.apache.pinot.segment.local.segment.creator.impl.text.LuceneTextIndexCreator;
+import
org.apache.pinot.segment.local.segment.index.text.CaseAwareStandardAnalyzer;
import org.apache.pinot.segment.local.utils.nativefst.mutablefst.MutableFST;
import
org.apache.pinot.segment.local.utils.nativefst.mutablefst.MutableFSTImpl;
import
org.apache.pinot.segment.local.utils.nativefst.utils.RealTimeRegexpMatcher;
@@ -58,7 +58,7 @@ public class NativeMutableTextIndex implements
MutableTextIndex {
ReentrantReadWriteLock readWriteLock = new ReentrantReadWriteLock();
_readLock = readWriteLock.readLock();
_writeLock = readWriteLock.writeLock();
- _analyzer = new
StandardAnalyzer(LuceneTextIndexCreator.ENGLISH_STOP_WORDS_SET);
+ _analyzer = new
CaseAwareStandardAnalyzer(LuceneTextIndexCreator.ENGLISH_STOP_WORDS_SET);
}
@Override
diff --git
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/NativeTextIndexCreator.java
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/NativeTextIndexCreator.java
index 832801883d..7ef4d25214 100644
---
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/NativeTextIndexCreator.java
+++
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/NativeTextIndexCreator.java
@@ -31,10 +31,10 @@ import java.util.TreeMap;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import
org.apache.pinot.segment.local.segment.creator.impl.inv.BitmapInvertedIndexWriter;
import
org.apache.pinot.segment.local.segment.index.text.AbstractTextIndexCreator;
+import
org.apache.pinot.segment.local.segment.index.text.CaseAwareStandardAnalyzer;
import org.apache.pinot.segment.local.utils.nativefst.FST;
import org.apache.pinot.segment.local.utils.nativefst.FSTHeader;
import org.apache.pinot.segment.local.utils.nativefst.builder.FSTBuilder;
@@ -87,7 +87,7 @@ public class NativeTextIndexCreator extends
AbstractTextIndexCreator {
}
_fstIndexFile = new File(_tempDir, FST_FILE_NAME);
_invertedIndexFile = new File(_tempDir, INVERTED_INDEX_FILE_NAME);
- _analyzer = new
StandardAnalyzer(LuceneTextIndexCreator.ENGLISH_STOP_WORDS_SET);
+ _analyzer = new
CaseAwareStandardAnalyzer(LuceneTextIndexCreator.ENGLISH_STOP_WORDS_SET);
}
@Override
diff --git
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/CaseSensitiveAnalyzer.java
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/CaseAwareStandardAnalyzer.java
similarity index 60%
rename from
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/CaseSensitiveAnalyzer.java
rename to
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/CaseAwareStandardAnalyzer.java
index d8b003f7ae..25552b00ec 100644
---
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/CaseSensitiveAnalyzer.java
+++
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/CaseAwareStandardAnalyzer.java
@@ -19,6 +19,7 @@
package org.apache.pinot.segment.local.segment.index.text;
import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
@@ -26,36 +27,55 @@ import
org.apache.lucene.analysis.standard.StandardTokenizer;
/**
- * A {@link org.apache.lucene.analysis.Analyzer} for case-sensitive text.
+ * A {@link org.apache.lucene.analysis.Analyzer} for standard text that is
case-aware.
+ * This analyzer supports both case-sensitive and case-insensitive modes,
making it
+ * suitable for use cases where case sensitivity is configurable.
+ * <p>
* It's directly copied from {@link
org.apache.lucene.analysis.standard.StandardAnalyzer} but
- * removes the lowercase filter.
+ * allows case-sensitive tokenization.
+ * <p>
+ * The analyzer applies lowercasing to tokens only when the {@code
caseSensitive} flag is set to
+ * {@code false} (the default behavior, same as {@link
org.apache.lucene.analysis.standard.StandardAnalyzer}).
+ * When {@code caseSensitive} is {@code true}, tokens preserve their original
case.
*/
-public class CaseSensitiveAnalyzer extends StopwordAnalyzerBase {
+public class CaseAwareStandardAnalyzer extends StopwordAnalyzerBase {
/** Default maximum allowed token length */
public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
private int _maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
+ private final boolean _caseSensitive;
+
/**
* Builds an analyzer with the given stop words.
*
* @param stopWords stop words
*/
- public CaseSensitiveAnalyzer(CharArraySet stopWords) {
- super(stopWords);
+ public CaseAwareStandardAnalyzer(CharArraySet stopWords) {
+ this(stopWords, false);
}
/** Builds an analyzer with no stop words. */
- public CaseSensitiveAnalyzer() {
- this(CharArraySet.EMPTY_SET);
+ public CaseAwareStandardAnalyzer() {
+ this(CharArraySet.EMPTY_SET, false);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words that lowercases tokens unless {@code caseSensitive} is set.
+ *
+ * @param stopWords stop words
+ */
+ public CaseAwareStandardAnalyzer(CharArraySet stopWords, boolean
caseSensitive) {
+ super(stopWords);
+ _caseSensitive = caseSensitive;
}
/**
* Set the max allowed token length. Tokens larger than this will be chopped
up at this token
* length and emitted as multiple tokens. If you need to skip such large
tokens, you could
* increase this max length, and then use {@code LengthFilter} to remove
long tokens. The default
- * is {@link
org.apache.pinot.segment.local.segment.index.text.CaseSensitiveAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}.
+ * is {@link CaseAwareStandardAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}.
*/
public void setMaxTokenLength(int length) {
_maxTokenLength = length;
@@ -70,11 +90,24 @@ public class CaseSensitiveAnalyzer extends
StopwordAnalyzerBase {
return _maxTokenLength;
}
+ /**
+ * Returns true if the analyzer is case sensitive
+ */
+ public boolean isCaseSensitive() {
+ return _caseSensitive;
+ }
+
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
final StandardTokenizer tokenizer = new StandardTokenizer();
tokenizer.setMaxTokenLength(_maxTokenLength);
- TokenStream tok = new StopFilter(tokenizer, stopwords);
+ TokenStream tok;
+ if (_caseSensitive) {
+ tok = tokenizer;
+ } else {
+ tok = new LowerCaseFilter(tokenizer);
+ }
+ tok = new StopFilter(tok, stopwords);
return new TokenStreamComponents(
r -> {
tokenizer.setMaxTokenLength(_maxTokenLength);
@@ -82,4 +115,12 @@ public class CaseSensitiveAnalyzer extends
StopwordAnalyzerBase {
},
tok);
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ if (_caseSensitive) {
+ return in;
+ }
+ return new LowerCaseFilter(in);
+ }
}
diff --git
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java
index 8d982af4ee..d8d633213e 100644
---
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java
+++
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java
@@ -36,7 +36,7 @@ import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.QueryParserBase;
import
org.apache.pinot.segment.local.segment.creator.impl.text.LuceneTextIndexCreator;
-import org.apache.pinot.segment.local.segment.index.text.CaseSensitiveAnalyzer;
+import
org.apache.pinot.segment.local.segment.index.text.CaseAwareStandardAnalyzer;
import
org.apache.pinot.segment.local.segment.index.text.TextIndexConfigBuilder;
import org.apache.pinot.segment.spi.V1Constants;
import org.apache.pinot.segment.spi.V1Constants.Indexes;
@@ -51,6 +51,7 @@ import org.slf4j.LoggerFactory;
public class TextIndexUtils {
private static final Logger LOGGER =
LoggerFactory.getLogger(TextIndexUtils.class);
+
private TextIndexUtils() {
}
@@ -135,18 +136,20 @@ public class TextIndexUtils {
* @return Lucene Analyzer class instance
* @throws ReflectiveOperationException if instantiation via reflection fails
*/
- public static Analyzer getAnalyzer(TextIndexConfig config) throws
ReflectiveOperationException {
+ public static Analyzer getAnalyzer(TextIndexConfig config)
+ throws ReflectiveOperationException {
String luceneAnalyzerClassName = config.getLuceneAnalyzerClass();
List<String> luceneAnalyzerClassArgs = config.getLuceneAnalyzerClassArgs();
List<String> luceneAnalyzerClassArgTypes =
config.getLuceneAnalyzerClassArgTypes();
if (null == luceneAnalyzerClassName || luceneAnalyzerClassName.isEmpty()
- ||
(luceneAnalyzerClassName.equals(StandardAnalyzer.class.getName())
- && luceneAnalyzerClassArgs.isEmpty() &&
luceneAnalyzerClassArgTypes.isEmpty())) {
+ ||
((luceneAnalyzerClassName.equals(CaseAwareStandardAnalyzer.class.getName())
+ || luceneAnalyzerClassName.equals(StandardAnalyzer.class.getName()))
+ && luceneAnalyzerClassArgs.isEmpty() &&
luceneAnalyzerClassArgTypes.isEmpty())) {
// When there is no analyzer defined, or when the default analyzer
(StandardAnalyzer or CaseAwareStandardAnalyzer) is used without arguments,
// use existing logic to obtain a CaseAwareStandardAnalyzer with
customized stop words
return TextIndexUtils.getStandardAnalyzerWithCustomizedStopWords(
- config.getStopWordsInclude(), config.getStopWordsExclude(),
config.isCaseSensitive());
+ config.getStopWordsInclude(), config.getStopWordsExclude(),
config.isCaseSensitive());
}
// Custom analyzer + custom configs via reflection
@@ -177,7 +180,7 @@ public class TextIndexUtils {
// Return a new instance of custom lucene analyzer class
return (Analyzer)
luceneAnalyzerClass.getConstructor(argClasses.toArray(new Class<?>[0]))
- .newInstance(argValues.toArray(new Object[0]));
+ .newInstance(argValues.toArray(new Object[0]));
}
/**
@@ -186,7 +189,8 @@ public class TextIndexUtils {
* @return Class object of the value type
* @throws ClassNotFoundException when the value type is not supported
*/
- public static Class<?> parseSupportedTypes(String valueTypeString) throws
ClassNotFoundException {
+ public static Class<?> parseSupportedTypes(String valueTypeString)
+ throws ClassNotFoundException {
try {
// Support both primitive types + class
switch (valueTypeString) {
@@ -223,7 +227,7 @@ public class TextIndexUtils {
* @throws ReflectiveOperationException if value cannot be coerced without
ambiguity or encountered unsupported type
*/
public static Object parseSupportedTypeValues(String stringValue, Class<?>
clazz)
- throws ReflectiveOperationException {
+ throws ReflectiveOperationException {
try {
if (clazz.equals(String.class)) {
return stringValue;
@@ -260,7 +264,7 @@ public class TextIndexUtils {
}
} catch (NumberFormatException | ReflectiveOperationException ex) {
String exceptionMessage = "Custom analyzer argument cannot be coerced
from "
- + stringValue + " to " + clazz.getName() + " type";
+ + stringValue + " to " + clazz.getName() + " type";
LOGGER.error(exceptionMessage);
throw new ReflectiveOperationException(exceptionMessage);
} catch (UnsupportedOperationException ex) {
@@ -280,14 +284,12 @@ public class TextIndexUtils {
if (stopWordsExclude != null) {
stopWordsExclude.forEach(stopWordSet::remove);
}
- if (isCaseSensitive) {
- return new CaseSensitiveAnalyzer(new CharArraySet(stopWordSet, false));
- }
- return new StandardAnalyzer(new CharArraySet(stopWordSet, true));
+ return new CaseAwareStandardAnalyzer(new CharArraySet(stopWordSet,
!isCaseSensitive), isCaseSensitive);
}
public static Constructor<QueryParserBase>
getQueryParserWithStringAndAnalyzerTypeConstructor(
- String queryParserClassName) throws ReflectiveOperationException {
+ String queryParserClassName)
+ throws ReflectiveOperationException {
// Fail-fast if the query parser is specified class is not QueryParseBase
class
final Class<?> queryParserClass = Class.forName(queryParserClassName);
if (!QueryParserBase.class.isAssignableFrom(queryParserClass)) {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]