This is an automated email from the ASF dual-hosted git repository.
xiangfu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new 736f70f059 Add CaseSensitiveAnalyzer and support for case-sensitive text indexing (#15803)
736f70f059 is described below
commit 736f70f059b53eb5be5a8aedf61c4974b711aa3e
Author: Xiang Fu <[email protected]>
AuthorDate: Sat May 17 15:12:48 2025 +0800
Add CaseSensitiveAnalyzer and support for case-sensitive text indexing (#15803)
---
.../integration/tests/custom/TextIndicesTest.java | 88 +++++++++++++++++++---
.../segment/index/text/CaseSensitiveAnalyzer.java | 85 +++++++++++++++++++++
.../local/segment/store/TextIndexUtils.java | 10 ++-
.../pinot/segment/spi/index/TextIndexConfig.java | 46 +++++++++--
.../fineFoodReviews_offline_table_config.json | 15 ++--
.../fineFoodReviews_realtime_table_config.json | 12 +--
6 files changed, 223 insertions(+), 33 deletions(-)
diff --git a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/custom/TextIndicesTest.java b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/custom/TextIndicesTest.java
index 353cd00396..bf520e28cf 100644
--- a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/custom/TextIndicesTest.java
+++ b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/custom/TextIndicesTest.java
@@ -18,6 +18,9 @@
*/
package org.apache.pinot.integration.tests.custom;
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.node.ObjectNode;
import java.io.BufferedReader;
import java.io.File;
import java.io.InputStream;
@@ -25,9 +28,7 @@ import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
-import java.util.HashMap;
import java.util.List;
-import java.util.Map;
import javax.annotation.Nullable;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
@@ -49,10 +50,11 @@ import static org.testng.AssertJUnit.fail;
@Test(suiteName = "CustomClusterIntegrationTest")
public class TextIndicesTest extends CustomDataQueryClusterIntegrationTest {
-
+ private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final String DEFAULT_TABLE_NAME = "TextIndicesTest";
private static final String TEXT_COLUMN_NAME = "skills";
+ private static final String TEXT_COLUMN_NAME_CASE_SENSITIVE =
"skills_case_sensitive";
private static final String TEXT_COLUMN_NAME_NATIVE = "skills_native";
private static final String TIME_COLUMN_NAME = "millisSinceEpoch";
private static final int NUM_SKILLS = 28;
@@ -87,7 +89,7 @@ public class TextIndicesTest extends
CustomDataQueryClusterIntegrationTest {
@Override
protected List<String> getNoDictionaryColumns() {
- return Collections.singletonList(TEXT_COLUMN_NAME);
+ return List.of(TEXT_COLUMN_NAME, TEXT_COLUMN_NAME_CASE_SENSITIVE);
}
@Nullable
@@ -104,13 +106,50 @@ public class TextIndicesTest extends
CustomDataQueryClusterIntegrationTest {
@Override
protected List<FieldConfig> getFieldConfigs() {
- Map<String, String> propertiesMap = new HashMap<>();
- propertiesMap.put(FieldConfig.TEXT_FST_TYPE,
FieldConfig.TEXT_NATIVE_FST_LITERAL);
+ ObjectNode textColumnIndexes;
+ try {
+ textColumnIndexes = (ObjectNode) OBJECT_MAPPER.readTree("{\"text\":
{}}");
+ } catch (JsonProcessingException e) {
+ throw new RuntimeException(e);
+ }
+ FieldConfig textColumnFieldConfig =
+ new FieldConfig(TEXT_COLUMN_NAME, FieldConfig.EncodingType.RAW, null,
null, null, null, textColumnIndexes, null,
+ null);
- return Arrays.asList(
- new FieldConfig(TEXT_COLUMN_NAME, FieldConfig.EncodingType.RAW,
FieldConfig.IndexType.TEXT, null, null),
- new FieldConfig(TEXT_COLUMN_NAME_NATIVE, FieldConfig.EncodingType.RAW,
FieldConfig.IndexType.TEXT, null,
- propertiesMap));
+ ObjectNode textColumnCaseSensitiveIndexes;
+ try {
+ textColumnCaseSensitiveIndexes = (ObjectNode) OBJECT_MAPPER.readTree(
+ "{"
+ + " \"text\": "
+ + " {"
+ + " \"caseSensitive\": \"true\""
+ + " }"
+ + "}"
+ );
+ } catch (JsonProcessingException e) {
+ throw new RuntimeException(e);
+ }
+ FieldConfig textColumnCaseSensitiveFieldConfig =
+ new FieldConfig(TEXT_COLUMN_NAME_CASE_SENSITIVE,
FieldConfig.EncodingType.RAW, null, null, null, null,
+ textColumnCaseSensitiveIndexes, null, null);
+
+ ObjectNode textColumnNativeIndexes;
+ try {
+ textColumnNativeIndexes = (ObjectNode) OBJECT_MAPPER.readTree(
+ "{"
+ + " \"text\": "
+ + " {"
+ + " \"fst\": \"NATIVE\""
+ + " }"
+ + "}"
+ );
+ } catch (JsonProcessingException e) {
+ throw new RuntimeException(e);
+ }
+ FieldConfig textColumnNativeFieldConfig =
+ new FieldConfig(TEXT_COLUMN_NAME_NATIVE, FieldConfig.EncodingType.RAW,
null, null, null, null,
+ textColumnNativeIndexes, null, null);
+ return Arrays.asList(textColumnFieldConfig,
textColumnCaseSensitiveFieldConfig, textColumnNativeFieldConfig);
}
@Override
@@ -122,6 +161,7 @@ public class TextIndicesTest extends
CustomDataQueryClusterIntegrationTest {
public Schema createSchema() {
return new Schema.SchemaBuilder().setSchemaName(getTableName())
.addSingleValueDimension(TEXT_COLUMN_NAME, FieldSpec.DataType.STRING)
+ .addSingleValueDimension(TEXT_COLUMN_NAME_CASE_SENSITIVE,
FieldSpec.DataType.STRING)
.addSingleValueDimension(TEXT_COLUMN_NAME_NATIVE,
FieldSpec.DataType.STRING)
.addDateTime(TIME_COLUMN_NAME, FieldSpec.DataType.LONG,
"1:MILLISECONDS:EPOCH", "1:MILLISECONDS").build();
}
@@ -150,6 +190,8 @@ public class TextIndicesTest extends
CustomDataQueryClusterIntegrationTest {
org.apache.avro.Schema avroSchema =
org.apache.avro.Schema.createRecord("myRecord", null, null, false);
avroSchema.setFields(Arrays.asList(new
org.apache.avro.Schema.Field(TEXT_COLUMN_NAME,
org.apache.avro.Schema.create(org.apache.avro.Schema.Type.STRING),
null, null),
+ new org.apache.avro.Schema.Field(TEXT_COLUMN_NAME_CASE_SENSITIVE,
+ org.apache.avro.Schema.create(org.apache.avro.Schema.Type.STRING),
null, null),
new org.apache.avro.Schema.Field(TEXT_COLUMN_NAME_NATIVE,
org.apache.avro.Schema.create(org.apache.avro.Schema.Type.STRING),
null, null),
new org.apache.avro.Schema.Field(TIME_COLUMN_NAME,
@@ -159,6 +201,7 @@ public class TextIndicesTest extends
CustomDataQueryClusterIntegrationTest {
for (int i = 0; i < NUM_RECORDS; i++) {
GenericData.Record record = new GenericData.Record(avroSchema);
record.put(TEXT_COLUMN_NAME, skills.get(i % NUM_SKILLS));
+ record.put(TEXT_COLUMN_NAME_CASE_SENSITIVE, skills.get(i %
NUM_SKILLS));
record.put(TEXT_COLUMN_NAME_NATIVE, skills.get(i % NUM_SKILLS));
record.put(TIME_COLUMN_NAME, System.currentTimeMillis());
fileWriter.append(record);
@@ -215,8 +258,29 @@ public class TextIndicesTest extends
CustomDataQueryClusterIntegrationTest {
Thread.sleep(100);
}
-
assertTrue(getTextColumnQueryResult(String.format(TEST_TEXT_COLUMN_QUERY_NATIVE,
getTableName()))
- == NUM_MATCHING_RECORDS_NATIVE);
+
assertEquals(getTextColumnQueryResult(String.format(TEST_TEXT_COLUMN_QUERY_NATIVE,
getTableName())),
+ NUM_MATCHING_RECORDS_NATIVE);
+ }
+
+ @Test(dataProvider = "useBothQueryEngines")
+ public void testTextSearchCountQueryCaseSensitive(boolean
useMultiStageQueryEngine)
+ throws Exception {
+ setUseMultiStageQueryEngine(useMultiStageQueryEngine);
+ // While records are still being consumed, keep polling; the matching count must never decrease between polls
+ long previousResult = 0;
+
+ String queryWithMatch = "SELECT COUNT(*) FROM %s WHERE
TEXT_MATCH(skills_case_sensitive, 'Java')";
+ String queryWithoutMatch = "SELECT COUNT(*) FROM %s WHERE
TEXT_MATCH(skills_case_sensitive, 'java')";
+ while (getCurrentCountStarResult() < NUM_RECORDS) {
+ long result = getTextColumnQueryResult(String.format(queryWithMatch,
getTableName()));
+ assertTrue(result >= previousResult);
+ previousResult = result;
+ Thread.sleep(100);
+ }
+
+ assertEquals(getTextColumnQueryResult(String.format(queryWithMatch,
getTableName())), 12000); // NOTE(review): 12000 is a hard-coded expected count - presumably the rows containing 'Java'; confirm
+ // Case-sensitive index: the test data spells the skill 'Java', so the lowercase term 'java' must match nothing
+ assertEquals(getTextColumnQueryResult(String.format(queryWithoutMatch,
getTableName())), 0);
}
private long getTextColumnQueryResult(String query)
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/CaseSensitiveAnalyzer.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/CaseSensitiveAnalyzer.java
new file mode 100644
index 0000000000..d8b003f7ae
--- /dev/null
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/CaseSensitiveAnalyzer.java
@@ -0,0 +1,85 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.segment.local.segment.index.text;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+
+
+/**
+ * A {@link org.apache.lucene.analysis.Analyzer} for case-sensitive text.
+ * It is copied from {@link org.apache.lucene.analysis.standard.StandardAnalyzer} but leaves out the
+ * lowercase filter: {@link #createComponents} chains only a {@code StandardTokenizer} and a
+ * {@code StopFilter}.
+ */
+public class CaseSensitiveAnalyzer extends StopwordAnalyzerBase {
+
+ /** Default maximum allowed token length. */
+ public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
+
+ private int _maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH; // mutable; changed through setMaxTokenLength
+
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param stopWords stop words handed to the {@code StopFilter} built in {@link #createComponents}
+ */
+ public CaseSensitiveAnalyzer(CharArraySet stopWords) {
+ super(stopWords);
+ }
+
+ /** Builds an analyzer with no stop words (delegates with {@code CharArraySet.EMPTY_SET}). */
+ public CaseSensitiveAnalyzer() {
+ this(CharArraySet.EMPTY_SET);
+ }
+
+ /**
+ * Sets the maximum allowed token length. Tokens longer than this are chopped up at this length
+ * and emitted as multiple tokens. To skip such long tokens instead, increase this limit and then
+ * use {@code LengthFilter} to remove them. The default is {@link #DEFAULT_MAX_TOKEN_LENGTH}.
+ *
+ * @param length the new maximum token length
+ */
+ public void setMaxTokenLength(int length) {
+ _maxTokenLength = length;
+ }
+
+ /**
+ * Returns the current maximum token length.
+ *
+ * @return the value last passed to {@link #setMaxTokenLength}, or
+ *         {@link #DEFAULT_MAX_TOKEN_LENGTH} if it was never called
+ * @see #setMaxTokenLength
+ */
+ public int getMaxTokenLength() {
+ return _maxTokenLength;
+ }
+
+ @Override
+ protected TokenStreamComponents createComponents(final String fieldName) { // fieldName is not used
+ final StandardTokenizer tokenizer = new StandardTokenizer();
+ tokenizer.setMaxTokenLength(_maxTokenLength);
+ TokenStream tok = new StopFilter(tokenizer, stopwords); // stop-word removal only; no lowercase filter is added
+ return new TokenStreamComponents(
+ r -> {
+ tokenizer.setMaxTokenLength(_maxTokenLength); // re-read the limit each time a reader is supplied
+ tokenizer.setReader(r);
+ },
+ tok);
+ }
+}
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java
index 63383aebbb..8d982af4ee 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java
@@ -36,6 +36,7 @@ import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.QueryParserBase;
import
org.apache.pinot.segment.local.segment.creator.impl.text.LuceneTextIndexCreator;
+import org.apache.pinot.segment.local.segment.index.text.CaseSensitiveAnalyzer;
import
org.apache.pinot.segment.local.segment.index.text.TextIndexConfigBuilder;
import org.apache.pinot.segment.spi.V1Constants;
import org.apache.pinot.segment.spi.V1Constants.Indexes;
@@ -145,7 +146,7 @@ public class TextIndexUtils {
// When there is no analyzer defined, or when StandardAnalyzer (default)
is used without arguments,
// use existing logic to obtain an instance of StandardAnalyzer with
customized stop words
return TextIndexUtils.getStandardAnalyzerWithCustomizedStopWords(
- config.getStopWordsInclude(), config.getStopWordsExclude());
+ config.getStopWordsInclude(), config.getStopWordsExclude(),
config.isCaseSensitive());
}
// Custom analyzer + custom configs via reflection
@@ -270,8 +271,8 @@ public class TextIndexUtils {
}
}
- public static StandardAnalyzer
getStandardAnalyzerWithCustomizedStopWords(@Nullable List<String>
stopWordsInclude,
- @Nullable List<String> stopWordsExclude) {
+ public static Analyzer getStandardAnalyzerWithCustomizedStopWords(@Nullable
List<String> stopWordsInclude,
+ @Nullable List<String> stopWordsExclude, boolean isCaseSensitive) {
HashSet<String> stopWordSet =
LuceneTextIndexCreator.getDefaultEnglishStopWordsSet();
if (stopWordsInclude != null) {
stopWordSet.addAll(stopWordsInclude);
@@ -279,6 +280,9 @@ public class TextIndexUtils {
if (stopWordsExclude != null) {
stopWordsExclude.forEach(stopWordSet::remove);
}
+ if (isCaseSensitive) {
+ return new CaseSensitiveAnalyzer(new CharArraySet(stopWordSet, false));
+ }
return new StandardAnalyzer(new CharArraySet(stopWordSet, true));
}
diff --git a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/TextIndexConfig.java b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/TextIndexConfig.java
index b32704c179..fc1de28337 100644
--- a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/TextIndexConfig.java
+++ b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/TextIndexConfig.java
@@ -42,6 +42,7 @@ public class TextIndexConfig extends IndexConfig {
private static final boolean
LUCENE_INDEX_DEFAULT_USE_AND_FOR_MULTI_TERM_QUERIES = false;
private static final boolean LUCENE_USE_LOG_BYTE_SIZE_MERGE_POLICY = false;
private static final DocIdTranslatorMode LUCENE_TRANSLATOR_MODE = null;
+ private static final boolean LUCENE_INDEX_DEFAULT_CASE_SENSITIVE_INDEX =
false;
// keep in sync with constructor!
private static final List<String> PROPERTY_NAMES = List.of(
@@ -49,13 +50,13 @@ public class TextIndexConfig extends IndexConfig {
"luceneUseCompoundFile", "luceneMaxBufferSizeMB", "luceneAnalyzerClass",
"luceneAnalyzerClassArgs",
"luceneAnalyzerClassArgTypes", "luceneQueryParserClass",
"enablePrefixSuffixMatchingInPhraseQueries",
"reuseMutableIndex", "luceneNRTCachingDirectoryMaxBufferSizeMB",
"useLogByteSizeMergePolicy",
- "docIdTranslatorMode"
+ "docIdTranslatorMode", "caseSensitive"
);
public static final TextIndexConfig DISABLED =
new TextIndexConfig(true, null, null, false, false,
Collections.emptyList(), Collections.emptyList(), false,
LUCENE_INDEX_DEFAULT_MAX_BUFFER_SIZE_MB, null, null, null, null,
false, false, 0, false,
- null);
+ null, LUCENE_INDEX_DEFAULT_CASE_SENSITIVE_INDEX);
private final FSTType _fstType;
@Nullable
@@ -75,6 +76,7 @@ public class TextIndexConfig extends IndexConfig {
private final int _luceneNRTCachingDirectoryMaxBufferSizeMB;
private final boolean _useLogByteSizeMergePolicy;
private final DocIdTranslatorMode _docIdTranslatorMode;
+ private final boolean _caseSensitive;
public enum DocIdTranslatorMode {
// build and keep mapping
@@ -98,6 +100,21 @@ public class TextIndexConfig extends IndexConfig {
}
}
+ public TextIndexConfig(Boolean disabled, FSTType fstType, Object
rawValueForTextIndex, boolean enableQueryCache,
+ boolean useANDForMultiTermQueries, List<String> stopWordsInclude,
List<String> stopWordsExclude,
+ Boolean luceneUseCompoundFile, Integer luceneMaxBufferSizeMB, String
luceneAnalyzerClass,
+ String luceneAnalyzerClassArgs, String luceneAnalyzerClassArgTypes,
String luceneQueryParserClass,
+ Boolean enablePrefixSuffixMatchingInPhraseQueries, Boolean
reuseMutableIndex,
+ Integer luceneNRTCachingDirectoryMaxBufferSizeMB, Boolean
useLogByteSizeMergePolicy,
+ DocIdTranslatorMode docIdTranslatorMode) { // overload without the caseSensitive argument
+ this(disabled, fstType, rawValueForTextIndex, enableQueryCache,
useANDForMultiTermQueries,
+ stopWordsInclude, stopWordsExclude, luceneUseCompoundFile,
luceneMaxBufferSizeMB, luceneAnalyzerClass,
+ luceneAnalyzerClassArgs, luceneAnalyzerClassArgTypes,
luceneQueryParserClass,
+ enablePrefixSuffixMatchingInPhraseQueries, reuseMutableIndex,
+ luceneNRTCachingDirectoryMaxBufferSizeMB, useLogByteSizeMergePolicy,
docIdTranslatorMode,
+ LUCENE_INDEX_DEFAULT_CASE_SENSITIVE_INDEX); // delegate to the full constructor with the default (false)
+ }
+
@JsonCreator
public TextIndexConfig(@JsonProperty("disabled") Boolean disabled,
@JsonProperty("fst") FSTType fstType,
@@ -116,7 +133,8 @@ public class TextIndexConfig extends IndexConfig {
@JsonProperty("reuseMutableIndex") Boolean reuseMutableIndex,
@JsonProperty("luceneNRTCachingDirectoryMaxBufferSizeMB") Integer
luceneNRTCachingDirectoryMaxBufferSizeMB,
@JsonProperty("useLogByteSizeMergePolicy") Boolean
useLogByteSizeMergePolicy,
- @JsonProperty("docIdTranslatorMode") DocIdTranslatorMode
docIdTranslatorMode) {
+ @JsonProperty("docIdTranslatorMode") DocIdTranslatorMode
docIdTranslatorMode,
+ @JsonProperty("caseSensitive") Boolean caseSensitive) {
super(disabled);
_fstType = fstType;
_rawValueForTextIndex = rawValueForTextIndex;
@@ -137,7 +155,7 @@ public class TextIndexConfig extends IndexConfig {
_luceneAnalyzerClassArgs = CsvParser.parse(luceneAnalyzerClassArgs, true,
false);
_luceneAnalyzerClassArgTypes =
CsvParser.parse(luceneAnalyzerClassArgTypes, false, true);
_luceneQueryParserClass = luceneQueryParserClass == null
- ? FieldConfig.TEXT_INDEX_DEFAULT_LUCENE_QUERY_PARSER_CLASS :
luceneQueryParserClass;
+ ? FieldConfig.TEXT_INDEX_DEFAULT_LUCENE_QUERY_PARSER_CLASS :
luceneQueryParserClass;
_enablePrefixSuffixMatchingInPhraseQueries =
enablePrefixSuffixMatchingInPhraseQueries == null ?
LUCENE_INDEX_ENABLE_PREFIX_SUFFIX_MATCH_IN_PHRASE_SEARCH
: enablePrefixSuffixMatchingInPhraseQueries;
@@ -148,6 +166,7 @@ public class TextIndexConfig extends IndexConfig {
_useLogByteSizeMergePolicy = useLogByteSizeMergePolicy == null ?
LUCENE_USE_LOG_BYTE_SIZE_MERGE_POLICY
: useLogByteSizeMergePolicy;
_docIdTranslatorMode = docIdTranslatorMode == null ?
LUCENE_TRANSLATOR_MODE : docIdTranslatorMode;
+ _caseSensitive = caseSensitive == null ?
LUCENE_INDEX_DEFAULT_CASE_SENSITIVE_INDEX : caseSensitive;
}
public FSTType getFstType() {
@@ -250,6 +269,10 @@ public class TextIndexConfig extends IndexConfig {
return _luceneNRTCachingDirectoryMaxBufferSizeMB;
}
+ public boolean isCaseSensitive() {
+ return _caseSensitive; // set in the constructor; falls back to the default when "caseSensitive" is null
+ }
+
public static abstract class AbstractBuilder {
@Nullable
protected FSTType _fstType;
@@ -272,6 +295,7 @@ public class TextIndexConfig extends IndexConfig {
protected boolean _useLogByteSizeMergePolicy =
LUCENE_USE_LOG_BYTE_SIZE_MERGE_POLICY;
@Nullable
protected DocIdTranslatorMode _docIdTranslatorMode =
LUCENE_TRANSLATOR_MODE;
+ protected boolean _caseSensitive =
LUCENE_INDEX_DEFAULT_CASE_SENSITIVE_INDEX;
public AbstractBuilder(@Nullable FSTType fstType) {
_fstType = fstType;
@@ -296,6 +320,7 @@ public class TextIndexConfig extends IndexConfig {
_luceneNRTCachingDirectoryMaxBufferSizeMB =
other._luceneNRTCachingDirectoryMaxBufferSizeMB;
_useLogByteSizeMergePolicy = other._useLogByteSizeMergePolicy;
_docIdTranslatorMode = other._docIdTranslatorMode;
+ _caseSensitive = other._caseSensitive;
}
public TextIndexConfig build() {
@@ -305,7 +330,7 @@ public class TextIndexConfig extends IndexConfig {
CsvParser.serialize(_luceneAnalyzerClassArgTypes, true, false),
_luceneQueryParserClass, _enablePrefixSuffixMatchingInPhraseQueries,
_reuseMutableIndex,
_luceneNRTCachingDirectoryMaxBufferSizeMB,
_useLogByteSizeMergePolicy,
- _docIdTranslatorMode);
+ _docIdTranslatorMode, _caseSensitive);
}
public abstract AbstractBuilder withProperties(@Nullable Map<String,
String> textIndexProperties);
@@ -395,6 +420,11 @@ public class TextIndexConfig extends IndexConfig {
_docIdTranslatorMode = DocIdTranslatorMode.of(mode);
return this;
}
+
+ public AbstractBuilder withCaseSensitive(boolean caseSensitive) {
+ _caseSensitive = caseSensitive; // passed to the TextIndexConfig constructor by build()
+ return this; // returns the builder so calls can be chained
+ }
}
@Override
@@ -425,7 +455,8 @@ public class TextIndexConfig extends IndexConfig {
&& Objects.equals(_luceneAnalyzerClass, that._luceneAnalyzerClass)
&& Objects.equals(_luceneAnalyzerClassArgs,
that._luceneAnalyzerClassArgs)
&& Objects.equals(_luceneAnalyzerClassArgTypes,
that._luceneAnalyzerClassArgTypes)
- && Objects.equals(_luceneQueryParserClass,
that._luceneQueryParserClass);
+ && Objects.equals(_luceneQueryParserClass,
that._luceneQueryParserClass)
+ && _caseSensitive == that._caseSensitive;
}
@Override
@@ -434,7 +465,8 @@ public class TextIndexConfig extends IndexConfig {
_useANDForMultiTermQueries, _stopWordsInclude, _stopWordsExclude,
_luceneUseCompoundFile,
_luceneMaxBufferSizeMB, _luceneAnalyzerClass,
_luceneAnalyzerClassArgs, _luceneAnalyzerClassArgTypes,
_luceneQueryParserClass, _enablePrefixSuffixMatchingInPhraseQueries,
_reuseMutableIndex,
- _luceneNRTCachingDirectoryMaxBufferSizeMB, _useLogByteSizeMergePolicy,
_docIdTranslatorMode);
+ _luceneNRTCachingDirectoryMaxBufferSizeMB, _useLogByteSizeMergePolicy,
_docIdTranslatorMode,
+ _caseSensitive);
}
public static boolean isProperty(String prop) {
diff --git a/pinot-tools/src/main/resources/examples/batch/fineFoodReviews/fineFoodReviews_offline_table_config.json b/pinot-tools/src/main/resources/examples/batch/fineFoodReviews/fineFoodReviews_offline_table_config.json
index ad082e3612..750d18a3d3 100644
--- a/pinot-tools/src/main/resources/examples/batch/fineFoodReviews/fineFoodReviews_offline_table_config.json
+++ b/pinot-tools/src/main/resources/examples/batch/fineFoodReviews/fineFoodReviews_offline_table_config.json
@@ -28,16 +28,19 @@
"vectorIndexType": "HNSW",
"vectorDimension": 1536,
"vectorDistanceFunction": "COSINE",
- "version": 1
+ "version": 1,
+ "commitDocs": "1"
}
},
{
- "encodingType": "RAW",
- "indexType": "TEXT",
"name": "Text",
- "properties": {
- "deriveNumDocsPerChunkForRawIndex": "true",
- "rawIndexWriterVersion": "3"
+ "encodingType": "RAW",
+ "indexes": {
+ "text": {
+ "deriveNumDocsPerChunkForRawIndex": "true",
+ "rawIndexWriterVersion": "3",
+ "caseSensitive": "true"
+ }
}
}
]
diff --git a/pinot-tools/src/main/resources/examples/stream/fineFoodReviews/fineFoodReviews_realtime_table_config.json b/pinot-tools/src/main/resources/examples/stream/fineFoodReviews/fineFoodReviews_realtime_table_config.json
index dd9d551e19..665f762569 100644
--- a/pinot-tools/src/main/resources/examples/stream/fineFoodReviews/fineFoodReviews_realtime_table_config.json
+++ b/pinot-tools/src/main/resources/examples/stream/fineFoodReviews/fineFoodReviews_realtime_table_config.json
@@ -61,12 +61,14 @@
}
},
{
- "encodingType": "RAW",
- "indexType": "TEXT",
"name": "Text",
- "properties": {
- "deriveNumDocsPerChunkForRawIndex": "true",
- "rawIndexWriterVersion": "3"
+ "encodingType": "RAW",
+ "indexes": {
+ "text": {
+ "deriveNumDocsPerChunkForRawIndex": "true",
+ "rawIndexWriterVersion": "3",
+ "caseSensitive": "false"
+ }
}
}
]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]