(pinot) branch master updated: Add CaseSensitiveAnalyzer and support for case-sensitive text indexing (#15803)

xiangfu Sat, 17 May 2025 00:12:58 -0700

This is an automated email from the ASF dual-hosted git repository.

xiangfu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git



The following commit(s) were added to refs/heads/master by this push:
     new 736f70f059 Add CaseSensitiveAnalyzer and support for case-sensitive text indexing (#15803)
736f70f059 is described below

commit 736f70f059b53eb5be5a8aedf61c4974b711aa3e
Author: Xiang Fu <[email protected]>
AuthorDate: Sat May 17 15:12:48 2025 +0800

    Add CaseSensitiveAnalyzer and support for case-sensitive text indexing (#15803)
---
 .../integration/tests/custom/TextIndicesTest.java  | 88 +++++++++++++++++++---
 .../segment/index/text/CaseSensitiveAnalyzer.java  | 85 +++++++++++++++++++++
 .../local/segment/store/TextIndexUtils.java        | 10 ++-
 .../pinot/segment/spi/index/TextIndexConfig.java   | 46 +++++++++--
 .../fineFoodReviews_offline_table_config.json      | 15 ++--
 .../fineFoodReviews_realtime_table_config.json     | 12 +--
 6 files changed, 223 insertions(+), 33 deletions(-)

diff --git a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/custom/TextIndicesTest.java b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/custom/TextIndicesTest.java
index 353cd00396..bf520e28cf 100644
--- a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/custom/TextIndicesTest.java
+++ b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/custom/TextIndicesTest.java
@@ -18,6 +18,9 @@
  */
 package org.apache.pinot.integration.tests.custom;
 
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.node.ObjectNode;
 import java.io.BufferedReader;
 import java.io.File;
 import java.io.InputStream;
@@ -25,9 +28,7 @@ import java.io.InputStreamReader;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
-import java.util.HashMap;
 import java.util.List;
-import java.util.Map;
 import javax.annotation.Nullable;
 import org.apache.avro.file.DataFileWriter;
 import org.apache.avro.generic.GenericData;
@@ -49,10 +50,11 @@ import static org.testng.AssertJUnit.fail;
 
 @Test(suiteName = "CustomClusterIntegrationTest")
 public class TextIndicesTest extends CustomDataQueryClusterIntegrationTest {
-
+  private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
   private static final String DEFAULT_TABLE_NAME = "TextIndicesTest";
 
   private static final String TEXT_COLUMN_NAME = "skills";
+  private static final String TEXT_COLUMN_NAME_CASE_SENSITIVE = 
"skills_case_sensitive";
   private static final String TEXT_COLUMN_NAME_NATIVE = "skills_native";
   private static final String TIME_COLUMN_NAME = "millisSinceEpoch";
   private static final int NUM_SKILLS = 28;
@@ -87,7 +89,7 @@ public class TextIndicesTest extends 
CustomDataQueryClusterIntegrationTest {
 
   @Override
   protected List<String> getNoDictionaryColumns() {
-    return Collections.singletonList(TEXT_COLUMN_NAME);
+    return List.of(TEXT_COLUMN_NAME, TEXT_COLUMN_NAME_CASE_SENSITIVE);
   }
 
   @Nullable
@@ -104,13 +106,50 @@ public class TextIndicesTest extends 
CustomDataQueryClusterIntegrationTest {
 
   @Override
   protected List<FieldConfig> getFieldConfigs() {
-    Map<String, String> propertiesMap = new HashMap<>();
-    propertiesMap.put(FieldConfig.TEXT_FST_TYPE, 
FieldConfig.TEXT_NATIVE_FST_LITERAL);
+    ObjectNode textColumnIndexes;
+    try {
+      textColumnIndexes = (ObjectNode) OBJECT_MAPPER.readTree("{\"text\": 
{}}");
+    } catch (JsonProcessingException e) {
+      throw new RuntimeException(e);
+    }
+    FieldConfig textColumnFieldConfig =
+        new FieldConfig(TEXT_COLUMN_NAME, FieldConfig.EncodingType.RAW, null, 
null, null, null, textColumnIndexes, null,
+            null);
 
-    return Arrays.asList(
-        new FieldConfig(TEXT_COLUMN_NAME, FieldConfig.EncodingType.RAW, 
FieldConfig.IndexType.TEXT, null, null),
-        new FieldConfig(TEXT_COLUMN_NAME_NATIVE, FieldConfig.EncodingType.RAW, 
FieldConfig.IndexType.TEXT, null,
-            propertiesMap));
+    ObjectNode textColumnCaseSensitiveIndexes;
+    try {
+      textColumnCaseSensitiveIndexes = (ObjectNode) OBJECT_MAPPER.readTree(
+          "{"
+              + "  \"text\": "
+              + "  {"
+              + "    \"caseSensitive\": \"true\""
+              + "  }"
+              + "}"
+      );
+    } catch (JsonProcessingException e) {
+      throw new RuntimeException(e);
+    }
+    FieldConfig textColumnCaseSensitiveFieldConfig =
+        new FieldConfig(TEXT_COLUMN_NAME_CASE_SENSITIVE, 
FieldConfig.EncodingType.RAW, null, null, null, null,
+            textColumnCaseSensitiveIndexes, null, null);
+
+    ObjectNode textColumnNativeIndexes;
+    try {
+      textColumnNativeIndexes = (ObjectNode) OBJECT_MAPPER.readTree(
+          "{"
+              + "  \"text\": "
+              + "  {"
+              + "    \"fst\": \"NATIVE\""
+              + "  }"
+              + "}"
+      );
+    } catch (JsonProcessingException e) {
+      throw new RuntimeException(e);
+    }
+    FieldConfig textColumnNativeFieldConfig =
+        new FieldConfig(TEXT_COLUMN_NAME_NATIVE, FieldConfig.EncodingType.RAW, 
null, null, null, null,
+            textColumnNativeIndexes, null, null);
+    return Arrays.asList(textColumnFieldConfig, 
textColumnCaseSensitiveFieldConfig, textColumnNativeFieldConfig);
   }
 
   @Override
@@ -122,6 +161,7 @@ public class TextIndicesTest extends 
CustomDataQueryClusterIntegrationTest {
   public Schema createSchema() {
     return new Schema.SchemaBuilder().setSchemaName(getTableName())
         .addSingleValueDimension(TEXT_COLUMN_NAME, FieldSpec.DataType.STRING)
+        .addSingleValueDimension(TEXT_COLUMN_NAME_CASE_SENSITIVE, 
FieldSpec.DataType.STRING)
         .addSingleValueDimension(TEXT_COLUMN_NAME_NATIVE, 
FieldSpec.DataType.STRING)
         .addDateTime(TIME_COLUMN_NAME, FieldSpec.DataType.LONG, 
"1:MILLISECONDS:EPOCH", "1:MILLISECONDS").build();
   }
@@ -150,6 +190,8 @@ public class TextIndicesTest extends 
CustomDataQueryClusterIntegrationTest {
     org.apache.avro.Schema avroSchema = 
org.apache.avro.Schema.createRecord("myRecord", null, null, false);
     avroSchema.setFields(Arrays.asList(new 
org.apache.avro.Schema.Field(TEXT_COLUMN_NAME,
             org.apache.avro.Schema.create(org.apache.avro.Schema.Type.STRING), 
null, null),
+        new org.apache.avro.Schema.Field(TEXT_COLUMN_NAME_CASE_SENSITIVE,
+            org.apache.avro.Schema.create(org.apache.avro.Schema.Type.STRING), 
null, null),
         new org.apache.avro.Schema.Field(TEXT_COLUMN_NAME_NATIVE,
             org.apache.avro.Schema.create(org.apache.avro.Schema.Type.STRING), 
null, null),
         new org.apache.avro.Schema.Field(TIME_COLUMN_NAME,
@@ -159,6 +201,7 @@ public class TextIndicesTest extends 
CustomDataQueryClusterIntegrationTest {
       for (int i = 0; i < NUM_RECORDS; i++) {
         GenericData.Record record = new GenericData.Record(avroSchema);
         record.put(TEXT_COLUMN_NAME, skills.get(i % NUM_SKILLS));
+        record.put(TEXT_COLUMN_NAME_CASE_SENSITIVE, skills.get(i % 
NUM_SKILLS));
         record.put(TEXT_COLUMN_NAME_NATIVE, skills.get(i % NUM_SKILLS));
         record.put(TIME_COLUMN_NAME, System.currentTimeMillis());
         fileWriter.append(record);
@@ -215,8 +258,29 @@ public class TextIndicesTest extends 
CustomDataQueryClusterIntegrationTest {
       Thread.sleep(100);
     }
 
-    
assertTrue(getTextColumnQueryResult(String.format(TEST_TEXT_COLUMN_QUERY_NATIVE,
 getTableName()))
-        == NUM_MATCHING_RECORDS_NATIVE);
+    
assertEquals(getTextColumnQueryResult(String.format(TEST_TEXT_COLUMN_QUERY_NATIVE,
 getTableName())),
+        NUM_MATCHING_RECORDS_NATIVE);
+  }
+
+  @Test(dataProvider = "useBothQueryEngines")
+  public void testTextSearchCountQueryCaseSensitive(boolean 
useMultiStageQueryEngine)
+      throws Exception {
+    setUseMultiStageQueryEngine(useMultiStageQueryEngine);
+    // Keep posting queries until all records are consumed
+    long previousResult = 0;
+
+    String queryWithMatch = "SELECT COUNT(*) FROM %s WHERE 
TEXT_MATCH(skills_case_sensitive, 'Java')";
+    String queryWithoutMatch = "SELECT COUNT(*) FROM %s WHERE 
TEXT_MATCH(skills_case_sensitive, 'java')";
+    while (getCurrentCountStarResult() < NUM_RECORDS) {
+      long result = getTextColumnQueryResult(String.format(queryWithMatch, 
getTableName()));
+      assertTrue(result >= previousResult);
+      previousResult = result;
+      Thread.sleep(100);
+    }
+
+    assertEquals(getTextColumnQueryResult(String.format(queryWithMatch, 
getTableName())), 12000);
+    // Test case sensitive match, all skills are 'Java' not 'java'
+    assertEquals(getTextColumnQueryResult(String.format(queryWithoutMatch, 
getTableName())), 0);
   }
 
   private long getTextColumnQueryResult(String query)
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/CaseSensitiveAnalyzer.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/CaseSensitiveAnalyzer.java
new file mode 100644
index 0000000000..d8b003f7ae
--- /dev/null
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/CaseSensitiveAnalyzer.java
@@ -0,0 +1,85 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.segment.local.segment.index.text;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+
+
+/**
+ * A {@link org.apache.lucene.analysis.Analyzer} for case-sensitive text.
+ * It's directly copied from {@link org.apache.lucene.analysis.standard.StandardAnalyzer} but
+ * removes the lowercase filter.
+ */
+public class CaseSensitiveAnalyzer extends StopwordAnalyzerBase {
+
+  /** Default maximum allowed token length */
+  public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
+
+  private int _maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
+
+  /**
+   * Builds an analyzer with the given stop words.
+   *
+   * @param stopWords stop words
+   */
+  public CaseSensitiveAnalyzer(CharArraySet stopWords) {
+    super(stopWords);
+  }
+
+  /** Builds an analyzer with no stop words. */
+  public CaseSensitiveAnalyzer() {
+    this(CharArraySet.EMPTY_SET);
+  }
+
+  /**
+   * Set the max allowed token length. Tokens larger than this will be chopped up at this token
+   * length and emitted as multiple tokens. If you need to skip such large tokens, you could
+   * increase this max length, and then use {@code LengthFilter} to remove long tokens. The default
+   * is {@link org.apache.pinot.segment.local.segment.index.text.CaseSensitiveAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}.
+   */
+  public void setMaxTokenLength(int length) {
+    _maxTokenLength = length;
+  }
+
+  /**
+   * Returns the current maximum token length
+   *
+   * @see #setMaxTokenLength
+   */
+  public int getMaxTokenLength() {
+    return _maxTokenLength;
+  }
+
+  @Override
+  protected TokenStreamComponents createComponents(final String fieldName) {
+    final StandardTokenizer tokenizer = new StandardTokenizer();
+    tokenizer.setMaxTokenLength(_maxTokenLength);
+    TokenStream tok = new StopFilter(tokenizer, stopwords);
+    return new TokenStreamComponents(
+        r -> {
+          tokenizer.setMaxTokenLength(_maxTokenLength);
+          tokenizer.setReader(r);
+        },
+        tok);
+  }
+}
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java
index 63383aebbb..8d982af4ee 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java
@@ -36,6 +36,7 @@ import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.queryparser.classic.QueryParserBase;
 import 
org.apache.pinot.segment.local.segment.creator.impl.text.LuceneTextIndexCreator;
+import org.apache.pinot.segment.local.segment.index.text.CaseSensitiveAnalyzer;
 import 
org.apache.pinot.segment.local.segment.index.text.TextIndexConfigBuilder;
 import org.apache.pinot.segment.spi.V1Constants;
 import org.apache.pinot.segment.spi.V1Constants.Indexes;
@@ -145,7 +146,7 @@ public class TextIndexUtils {
       // When there is no analyzer defined, or when StandardAnalyzer (default) 
is used without arguments,
       // use existing logic to obtain an instance of StandardAnalyzer with 
customized stop words
       return TextIndexUtils.getStandardAnalyzerWithCustomizedStopWords(
-              config.getStopWordsInclude(), config.getStopWordsExclude());
+              config.getStopWordsInclude(), config.getStopWordsExclude(), 
config.isCaseSensitive());
     }
 
     // Custom analyzer + custom configs via reflection
@@ -270,8 +271,8 @@ public class TextIndexUtils {
     }
   }
 
-  public static StandardAnalyzer 
getStandardAnalyzerWithCustomizedStopWords(@Nullable List<String> 
stopWordsInclude,
-      @Nullable List<String> stopWordsExclude) {
+  public static Analyzer getStandardAnalyzerWithCustomizedStopWords(@Nullable 
List<String> stopWordsInclude,
+      @Nullable List<String> stopWordsExclude, boolean isCaseSensitive) {
     HashSet<String> stopWordSet = 
LuceneTextIndexCreator.getDefaultEnglishStopWordsSet();
     if (stopWordsInclude != null) {
       stopWordSet.addAll(stopWordsInclude);
@@ -279,6 +280,9 @@ public class TextIndexUtils {
     if (stopWordsExclude != null) {
       stopWordsExclude.forEach(stopWordSet::remove);
     }
+    if (isCaseSensitive) {
+      return new CaseSensitiveAnalyzer(new CharArraySet(stopWordSet, false));
+    }
     return new StandardAnalyzer(new CharArraySet(stopWordSet, true));
   }
 
diff --git a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/TextIndexConfig.java b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/TextIndexConfig.java
index b32704c179..fc1de28337 100644
--- a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/TextIndexConfig.java
+++ b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/TextIndexConfig.java
@@ -42,6 +42,7 @@ public class TextIndexConfig extends IndexConfig {
   private static final boolean 
LUCENE_INDEX_DEFAULT_USE_AND_FOR_MULTI_TERM_QUERIES = false;
   private static final boolean LUCENE_USE_LOG_BYTE_SIZE_MERGE_POLICY = false;
   private static final DocIdTranslatorMode LUCENE_TRANSLATOR_MODE = null;
+  private static final boolean LUCENE_INDEX_DEFAULT_CASE_SENSITIVE_INDEX = 
false;
 
   // keep in sync with constructor!
   private static final List<String> PROPERTY_NAMES = List.of(
@@ -49,13 +50,13 @@ public class TextIndexConfig extends IndexConfig {
       "luceneUseCompoundFile", "luceneMaxBufferSizeMB", "luceneAnalyzerClass", 
"luceneAnalyzerClassArgs",
       "luceneAnalyzerClassArgTypes", "luceneQueryParserClass", 
"enablePrefixSuffixMatchingInPhraseQueries",
       "reuseMutableIndex", "luceneNRTCachingDirectoryMaxBufferSizeMB", 
"useLogByteSizeMergePolicy",
-      "docIdTranslatorMode"
+      "docIdTranslatorMode", "caseSensitive"
   );
 
   public static final TextIndexConfig DISABLED =
       new TextIndexConfig(true, null, null, false, false, 
Collections.emptyList(), Collections.emptyList(), false,
           LUCENE_INDEX_DEFAULT_MAX_BUFFER_SIZE_MB, null, null, null, null, 
false, false, 0, false,
-          null);
+          null, LUCENE_INDEX_DEFAULT_CASE_SENSITIVE_INDEX);
 
   private final FSTType _fstType;
   @Nullable
@@ -75,6 +76,7 @@ public class TextIndexConfig extends IndexConfig {
   private final int _luceneNRTCachingDirectoryMaxBufferSizeMB;
   private final boolean _useLogByteSizeMergePolicy;
   private final DocIdTranslatorMode _docIdTranslatorMode;
+  private final boolean _caseSensitive;
 
   public enum DocIdTranslatorMode {
     // build and keep mapping
@@ -98,6 +100,21 @@ public class TextIndexConfig extends IndexConfig {
     }
   }
 
+  public TextIndexConfig(Boolean disabled, FSTType fstType, Object 
rawValueForTextIndex, boolean enableQueryCache,
+      boolean useANDForMultiTermQueries, List<String> stopWordsInclude, 
List<String> stopWordsExclude,
+      Boolean luceneUseCompoundFile, Integer luceneMaxBufferSizeMB, String 
luceneAnalyzerClass,
+      String luceneAnalyzerClassArgs, String luceneAnalyzerClassArgTypes, 
String luceneQueryParserClass,
+      Boolean enablePrefixSuffixMatchingInPhraseQueries, Boolean 
reuseMutableIndex,
+      Integer luceneNRTCachingDirectoryMaxBufferSizeMB, Boolean 
useLogByteSizeMergePolicy,
+      DocIdTranslatorMode docIdTranslatorMode) {
+    this(disabled, fstType, rawValueForTextIndex, enableQueryCache, 
useANDForMultiTermQueries,
+        stopWordsInclude, stopWordsExclude, luceneUseCompoundFile, 
luceneMaxBufferSizeMB, luceneAnalyzerClass,
+        luceneAnalyzerClassArgs, luceneAnalyzerClassArgTypes, 
luceneQueryParserClass,
+        enablePrefixSuffixMatchingInPhraseQueries, reuseMutableIndex,
+        luceneNRTCachingDirectoryMaxBufferSizeMB, useLogByteSizeMergePolicy, 
docIdTranslatorMode,
+        LUCENE_INDEX_DEFAULT_CASE_SENSITIVE_INDEX);
+  }
+
   @JsonCreator
   public TextIndexConfig(@JsonProperty("disabled") Boolean disabled,
       @JsonProperty("fst") FSTType fstType,
@@ -116,7 +133,8 @@ public class TextIndexConfig extends IndexConfig {
       @JsonProperty("reuseMutableIndex") Boolean reuseMutableIndex,
       @JsonProperty("luceneNRTCachingDirectoryMaxBufferSizeMB") Integer 
luceneNRTCachingDirectoryMaxBufferSizeMB,
       @JsonProperty("useLogByteSizeMergePolicy") Boolean 
useLogByteSizeMergePolicy,
-      @JsonProperty("docIdTranslatorMode") DocIdTranslatorMode 
docIdTranslatorMode) {
+      @JsonProperty("docIdTranslatorMode") DocIdTranslatorMode 
docIdTranslatorMode,
+      @JsonProperty("caseSensitive") Boolean caseSensitive) {
     super(disabled);
     _fstType = fstType;
     _rawValueForTextIndex = rawValueForTextIndex;
@@ -137,7 +155,7 @@ public class TextIndexConfig extends IndexConfig {
     _luceneAnalyzerClassArgs = CsvParser.parse(luceneAnalyzerClassArgs, true, 
false);
     _luceneAnalyzerClassArgTypes = 
CsvParser.parse(luceneAnalyzerClassArgTypes, false, true);
     _luceneQueryParserClass = luceneQueryParserClass == null
-            ? FieldConfig.TEXT_INDEX_DEFAULT_LUCENE_QUERY_PARSER_CLASS : 
luceneQueryParserClass;
+        ? FieldConfig.TEXT_INDEX_DEFAULT_LUCENE_QUERY_PARSER_CLASS : 
luceneQueryParserClass;
     _enablePrefixSuffixMatchingInPhraseQueries =
         enablePrefixSuffixMatchingInPhraseQueries == null ? 
LUCENE_INDEX_ENABLE_PREFIX_SUFFIX_MATCH_IN_PHRASE_SEARCH
             : enablePrefixSuffixMatchingInPhraseQueries;
@@ -148,6 +166,7 @@ public class TextIndexConfig extends IndexConfig {
     _useLogByteSizeMergePolicy = useLogByteSizeMergePolicy == null ? 
LUCENE_USE_LOG_BYTE_SIZE_MERGE_POLICY
         : useLogByteSizeMergePolicy;
     _docIdTranslatorMode = docIdTranslatorMode == null ? 
LUCENE_TRANSLATOR_MODE : docIdTranslatorMode;
+    _caseSensitive = caseSensitive == null ? 
LUCENE_INDEX_DEFAULT_CASE_SENSITIVE_INDEX : caseSensitive;
   }
 
   public FSTType getFstType() {
@@ -250,6 +269,10 @@ public class TextIndexConfig extends IndexConfig {
     return _luceneNRTCachingDirectoryMaxBufferSizeMB;
   }
 
+  public boolean isCaseSensitive() {
+    return _caseSensitive;
+  }
+
   public static abstract class AbstractBuilder {
     @Nullable
     protected FSTType _fstType;
@@ -272,6 +295,7 @@ public class TextIndexConfig extends IndexConfig {
     protected boolean _useLogByteSizeMergePolicy = 
LUCENE_USE_LOG_BYTE_SIZE_MERGE_POLICY;
     @Nullable
     protected DocIdTranslatorMode _docIdTranslatorMode = 
LUCENE_TRANSLATOR_MODE;
+    protected boolean _caseSensitive = 
LUCENE_INDEX_DEFAULT_CASE_SENSITIVE_INDEX;
 
     public AbstractBuilder(@Nullable FSTType fstType) {
       _fstType = fstType;
@@ -296,6 +320,7 @@ public class TextIndexConfig extends IndexConfig {
       _luceneNRTCachingDirectoryMaxBufferSizeMB = 
other._luceneNRTCachingDirectoryMaxBufferSizeMB;
       _useLogByteSizeMergePolicy = other._useLogByteSizeMergePolicy;
       _docIdTranslatorMode = other._docIdTranslatorMode;
+      _caseSensitive = other._caseSensitive;
     }
 
     public TextIndexConfig build() {
@@ -305,7 +330,7 @@ public class TextIndexConfig extends IndexConfig {
           CsvParser.serialize(_luceneAnalyzerClassArgTypes, true, false),
           _luceneQueryParserClass, _enablePrefixSuffixMatchingInPhraseQueries, 
_reuseMutableIndex,
           _luceneNRTCachingDirectoryMaxBufferSizeMB, 
_useLogByteSizeMergePolicy,
-          _docIdTranslatorMode);
+          _docIdTranslatorMode, _caseSensitive);
     }
 
     public abstract AbstractBuilder withProperties(@Nullable Map<String, 
String> textIndexProperties);
@@ -395,6 +420,11 @@ public class TextIndexConfig extends IndexConfig {
       _docIdTranslatorMode = DocIdTranslatorMode.of(mode);
       return this;
     }
+
+    public AbstractBuilder withCaseSensitive(boolean caseSensitive) {
+      _caseSensitive = caseSensitive;
+      return this;
+    }
   }
 
   @Override
@@ -425,7 +455,8 @@ public class TextIndexConfig extends IndexConfig {
         && Objects.equals(_luceneAnalyzerClass, that._luceneAnalyzerClass)
         && Objects.equals(_luceneAnalyzerClassArgs, 
that._luceneAnalyzerClassArgs)
         && Objects.equals(_luceneAnalyzerClassArgTypes, 
that._luceneAnalyzerClassArgTypes)
-        && Objects.equals(_luceneQueryParserClass, 
that._luceneQueryParserClass);
+        && Objects.equals(_luceneQueryParserClass, 
that._luceneQueryParserClass)
+        && _caseSensitive == that._caseSensitive;
   }
 
   @Override
@@ -434,7 +465,8 @@ public class TextIndexConfig extends IndexConfig {
         _useANDForMultiTermQueries, _stopWordsInclude, _stopWordsExclude, 
_luceneUseCompoundFile,
         _luceneMaxBufferSizeMB, _luceneAnalyzerClass, 
_luceneAnalyzerClassArgs, _luceneAnalyzerClassArgTypes,
         _luceneQueryParserClass, _enablePrefixSuffixMatchingInPhraseQueries, 
_reuseMutableIndex,
-        _luceneNRTCachingDirectoryMaxBufferSizeMB, _useLogByteSizeMergePolicy, 
_docIdTranslatorMode);
+        _luceneNRTCachingDirectoryMaxBufferSizeMB, _useLogByteSizeMergePolicy, 
_docIdTranslatorMode,
+        _caseSensitive);
   }
 
   public static boolean isProperty(String prop) {
diff --git a/pinot-tools/src/main/resources/examples/batch/fineFoodReviews/fineFoodReviews_offline_table_config.json b/pinot-tools/src/main/resources/examples/batch/fineFoodReviews/fineFoodReviews_offline_table_config.json
index ad082e3612..750d18a3d3 100644
--- a/pinot-tools/src/main/resources/examples/batch/fineFoodReviews/fineFoodReviews_offline_table_config.json
+++ b/pinot-tools/src/main/resources/examples/batch/fineFoodReviews/fineFoodReviews_offline_table_config.json
@@ -28,16 +28,19 @@
         "vectorIndexType": "HNSW",
         "vectorDimension": 1536,
         "vectorDistanceFunction": "COSINE",
-        "version": 1
+        "version": 1,
+        "commitDocs": "1"
       }
     },
     {
-      "encodingType": "RAW",
-      "indexType": "TEXT",
       "name": "Text",
-      "properties": {
-        "deriveNumDocsPerChunkForRawIndex": "true",
-        "rawIndexWriterVersion": "3"
+      "encodingType": "RAW",
+      "indexes": {
+        "text": {
+          "deriveNumDocsPerChunkForRawIndex": "true",
+          "rawIndexWriterVersion": "3",
+          "caseSensitive": "true"
+        }
       }
     }
   ]
diff --git a/pinot-tools/src/main/resources/examples/stream/fineFoodReviews/fineFoodReviews_realtime_table_config.json b/pinot-tools/src/main/resources/examples/stream/fineFoodReviews/fineFoodReviews_realtime_table_config.json
index dd9d551e19..665f762569 100644
--- a/pinot-tools/src/main/resources/examples/stream/fineFoodReviews/fineFoodReviews_realtime_table_config.json
+++ b/pinot-tools/src/main/resources/examples/stream/fineFoodReviews/fineFoodReviews_realtime_table_config.json
@@ -61,12 +61,14 @@
       }
     },
     {
-      "encodingType": "RAW",
-      "indexType": "TEXT",
       "name": "Text",
-      "properties": {
-        "deriveNumDocsPerChunkForRawIndex": "true",
-        "rawIndexWriterVersion": "3"
+      "encodingType": "RAW",
+      "indexes": {
+        "text": {
+          "deriveNumDocsPerChunkForRawIndex": "true",
+          "rawIndexWriterVersion": "3",
+          "caseSensitive": "false"
+        }
       }
     }
   ]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(pinot) branch master updated: Add CaseSensitiveAnalyzer and support for case-sensitive text indexing (#15803)

Reply via email to