This is an automated email from the ASF dual-hosted git repository.
gortiz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new 30b5490db15 [spi] Fix Asymmetric Serialization in TextIndexConfig
(#17628)
30b5490db15 is described below
commit 30b5490db1538f39ab60a375060272bbbe233d8f
Author: Anshul Singh <[email protected]>
AuthorDate: Wed Feb 4 16:39:29 2026 +0530
[spi] Fix Asymmetric Serialization in TextIndexConfig (#17628)
---
.../pinot/segment/spi/index/TextIndexConfig.java | 45 +++++++-
.../segment/spi/index/TextIndexConfigTest.java | 115 +++++++++++++++++++++
2 files changed, 155 insertions(+), 5 deletions(-)
diff --git
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/TextIndexConfig.java
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/TextIndexConfig.java
index b3d17f3a0a9..471fd2c9894 100644
---
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/TextIndexConfig.java
+++
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/TextIndexConfig.java
@@ -153,8 +153,8 @@ public class TextIndexConfig extends IndexConfig {
@JsonProperty("luceneUseCompoundFile") Boolean luceneUseCompoundFile,
@JsonProperty("luceneMaxBufferSizeMB") Integer luceneMaxBufferSizeMB,
@JsonProperty("luceneAnalyzerClass") String luceneAnalyzerClass,
- @JsonProperty("luceneAnalyzerClassArgs") String luceneAnalyzerClassArgs,
- @JsonProperty("luceneAnalyzerClassArgTypes") String
luceneAnalyzerClassArgTypes,
+ @JsonProperty("luceneAnalyzerClassArgs") Object luceneAnalyzerClassArgs,
+ @JsonProperty("luceneAnalyzerClassArgTypes") Object
luceneAnalyzerClassArgTypes,
@JsonProperty("luceneQueryParserClass") String luceneQueryParserClass,
@JsonProperty("enablePrefixSuffixMatchingInPhraseQueries") Boolean
enablePrefixSuffixMatchingInPhraseQueries,
@JsonProperty("reuseMutableIndex") Boolean reuseMutableIndex,
@@ -179,9 +179,11 @@ public class TextIndexConfig extends IndexConfig {
// Note that we cannot depend on jackson's default behavior to
automatically coerce the comma delimited args to
// List<String>. This is because the args may contain comma and other
special characters such as space. Therefore,
- // we use our own csv parser to parse the values directly.
- _luceneAnalyzerClassArgs = CsvParser.parse(luceneAnalyzerClassArgs, true,
false);
- _luceneAnalyzerClassArgTypes =
CsvParser.parse(luceneAnalyzerClassArgTypes, false, true);
+ // we use our own csv parser to parse the values directly when the input
is a String.
+ // However, when round-tripping (serializing then deserializing), Jackson
may produce a List<String> directly,
+ // so we also handle that case.
+ _luceneAnalyzerClassArgs = parseToList(luceneAnalyzerClassArgs, true,
false);
+ _luceneAnalyzerClassArgTypes = parseToList(luceneAnalyzerClassArgTypes,
false, true);
_luceneQueryParserClass = luceneQueryParserClass == null
? FieldConfig.TEXT_INDEX_DEFAULT_LUCENE_QUERY_PARSER_CLASS :
luceneQueryParserClass;
_enablePrefixSuffixMatchingInPhraseQueries =
@@ -198,6 +200,39 @@ public class TextIndexConfig extends IndexConfig {
_storeInSegmentFile = storeInSegmentFile == null ?
LUCENE_INDEX_DEFAULT_STORE_IN_SEGMENT_FILE : storeInSegmentFile;
}
+ /**
+ * Parses the input value to a List of Strings.
+ * Handles both String (CSV format) and List<String> (from JSON array)
inputs.
+ * This enables proper round-trip serialization/deserialization since the
getter returns List<String>
+ * which Jackson serializes as a JSON array, but the original input format
was CSV string.
+ *
+ * @param value the input value (can be String, List, or null)
+ * @param escapeComma if true, don't split on escaped commas when parsing
String
+ * @param trim whether to trim each value
+ * @return parsed list of strings, empty list if input is null or empty
+ */
+ @SuppressWarnings("unchecked")
+ private static List<String> parseToList(final @Nullable Object value, final
boolean escapeComma, final boolean trim) {
+ if (value == null) {
+ return Collections.emptyList();
+ }
+ if (value instanceof List) {
+ final List<?> list = (List<?>) value;
+ if (list.isEmpty()) {
+ return Collections.emptyList();
+ }
+ // Convert each element to String and optionally trim
+ final List<String> result = new ArrayList<>();
+ for (final Object item : list) {
+ final String strItem = item == null ? "" : item.toString();
+ result.add(trim ? strItem.trim() : strItem);
+ }
+ return result;
+ }
+ // String or other types - use CSV parser
+ return CsvParser.parse(value.toString(), escapeComma, trim);
+ }
+
public FSTType getFstType() {
return _fstType;
}
diff --git
a/pinot-segment-spi/src/test/java/org/apache/pinot/segment/spi/index/TextIndexConfigTest.java
b/pinot-segment-spi/src/test/java/org/apache/pinot/segment/spi/index/TextIndexConfigTest.java
index 96f57dcccf9..0bc0b28a46e 100644
---
a/pinot-segment-spi/src/test/java/org/apache/pinot/segment/spi/index/TextIndexConfigTest.java
+++
b/pinot-segment-spi/src/test/java/org/apache/pinot/segment/spi/index/TextIndexConfigTest.java
@@ -121,4 +121,119 @@ public class TextIndexConfigTest {
assertFalse(config.isLuceneUseCompoundFile(), "Unexpected
luceneUseCompoundFile");
assertEquals(config.getLuceneMaxBufferSizeMB(), 1024, "Unexpected
luceneMaxBufferSize");
}
+
+ @Test
+ public void testRoundTripSerializationWithArrayValues()
+ throws JsonProcessingException {
+ // This test verifies that TextIndexConfig can be round-tripped through
JSON serialization.
+ // When serialized, luceneAnalyzerClassArgs/ArgTypes are output as arrays
(from List<String> getters).
+ // When deserialized, the constructor must accept both String (CSV) and
Array (List) formats.
+ final String confStrWithArrays = "{\n"
+ + " \"disabled\": false,\n"
+ + " \"luceneUseCompoundFile\": true,\n"
+ + " \"luceneMaxBufferSizeMB\": 500,\n"
+ + " \"luceneAnalyzerClass\":
\"org.apache.lucene.analysis.standard.StandardAnalyzer\",\n"
+ + " \"luceneAnalyzerClassArgs\": [],\n"
+ + " \"luceneAnalyzerClassArgTypes\": []\n"
+ + "}";
+ final TextIndexConfig config = JsonUtils.stringToObject(confStrWithArrays,
TextIndexConfig.class);
+
+ assertFalse(config.isDisabled(), "Unexpected disabled");
+ assertTrue(config.isLuceneUseCompoundFile(), "Unexpected
luceneUseCompoundFile");
+ assertEquals(config.getLuceneMaxBufferSizeMB(), 500, "Unexpected
luceneMaxBufferSizeMB");
+ assertEquals(config.getLuceneAnalyzerClass(),
+ "org.apache.lucene.analysis.standard.StandardAnalyzer", "Unexpected
luceneAnalyzerClass");
+ assertTrue(config.getLuceneAnalyzerClassArgs().isEmpty(), "Unexpected
luceneAnalyzerClassArgs");
+ assertTrue(config.getLuceneAnalyzerClassArgTypes().isEmpty(), "Unexpected
luceneAnalyzerClassArgTypes");
+
+ // Now test with non-empty arrays
+ final String confStrWithNonEmptyArrays = "{\n"
+ + " \"luceneAnalyzerClassArgs\": [\"arg1\", \"arg2\"],\n"
+ + " \"luceneAnalyzerClassArgTypes\": [\"java.lang.String\",
\"java.lang.Integer\"]\n"
+ + "}";
+ final TextIndexConfig configWithArgs =
JsonUtils.stringToObject(confStrWithNonEmptyArrays, TextIndexConfig.class);
+
+ assertEquals(configWithArgs.getLuceneAnalyzerClassArgs(),
Lists.newArrayList("arg1", "arg2"),
+ "Unexpected luceneAnalyzerClassArgs");
+ assertEquals(configWithArgs.getLuceneAnalyzerClassArgTypes(),
+ Lists.newArrayList("java.lang.String", "java.lang.Integer"),
+ "Unexpected luceneAnalyzerClassArgTypes");
+ }
+
+ @Test
+ public void testRoundTripSerializationWithStringValues()
+ throws JsonProcessingException {
+ // Test backward compatibility - original CSV string format should still
work
+ final String confStrWithStrings = "{\n"
+ + " \"luceneAnalyzerClassArgs\": \"arg1,arg2\",\n"
+ + " \"luceneAnalyzerClassArgTypes\":
\"java.lang.String,java.lang.Integer\"\n"
+ + "}";
+ final TextIndexConfig config =
JsonUtils.stringToObject(confStrWithStrings, TextIndexConfig.class);
+
+ assertEquals(config.getLuceneAnalyzerClassArgs(),
Lists.newArrayList("arg1", "arg2"),
+ "Unexpected luceneAnalyzerClassArgs");
+ assertEquals(config.getLuceneAnalyzerClassArgTypes(),
+ Lists.newArrayList("java.lang.String", "java.lang.Integer"),
+ "Unexpected luceneAnalyzerClassArgTypes");
+ }
+
+ @Test
+ public void testFullRoundTrip()
+ throws JsonProcessingException {
+ // Create config, serialize to JSON, deserialize back - should work
+ final String originalConfStr = "{\n"
+ + " \"luceneAnalyzerClassArgs\": \"arg1,arg2\",\n"
+ + " \"luceneAnalyzerClassArgTypes\":
\"java.lang.String,java.lang.Integer\"\n"
+ + "}";
+ final TextIndexConfig originalConfig =
JsonUtils.stringToObject(originalConfStr, TextIndexConfig.class);
+
+ // Serialize to JSON (getters return List<String>, so it becomes JSON
arrays)
+ final String serialized = JsonUtils.objectToString(originalConfig);
+
+ // Deserialize back - this was the failing case before the fix
+ final TextIndexConfig deserializedConfig =
JsonUtils.stringToObject(serialized, TextIndexConfig.class);
+
+ assertEquals(deserializedConfig.getLuceneAnalyzerClassArgs(),
originalConfig.getLuceneAnalyzerClassArgs(),
+ "Round-trip failed for luceneAnalyzerClassArgs");
+ assertEquals(deserializedConfig.getLuceneAnalyzerClassArgTypes(),
originalConfig.getLuceneAnalyzerClassArgTypes(),
+ "Round-trip failed for luceneAnalyzerClassArgTypes");
+ }
+
+ @Test
+ public void testEmptyTextConfigRoundTrip()
+ throws JsonProcessingException {
+ // When an empty text config is created from "{}", it gets default values.
+ // When serialized, luceneAnalyzerClassArgs becomes [] (empty array).
+ // When deserialized, this was failing with:
+ // "Cannot deserialize value of type `java.lang.String` from Array value"
+ final String emptyConfig = "{}";
+ final TextIndexConfig originalConfig =
JsonUtils.stringToObject(emptyConfig, TextIndexConfig.class);
+
+ // Verify defaults are applied
+ assertFalse(originalConfig.isDisabled());
+ assertEquals(originalConfig.getLuceneAnalyzerClass(),
+ "org.apache.lucene.analysis.standard.StandardAnalyzer");
+ assertTrue(originalConfig.getLuceneAnalyzerClassArgs().isEmpty());
+ assertTrue(originalConfig.getLuceneAnalyzerClassArgTypes().isEmpty());
+
+ // Serialize to JSON - this produces the problematic format with empty
arrays
+ final String serialized = JsonUtils.objectToString(originalConfig);
+
+ // Verify serialized JSON contains the array format that was causing the
bug
+ assertTrue(serialized.contains("\"luceneAnalyzerClassArgs\":[]"),
+ "Serialized JSON should contain luceneAnalyzerClassArgs as empty
array");
+ assertTrue(serialized.contains("\"luceneAnalyzerClassArgTypes\":[]"),
+ "Serialized JSON should contain luceneAnalyzerClassArgTypes as empty
array");
+
+ // Deserialize back
+ final TextIndexConfig deserializedConfig =
JsonUtils.stringToObject(serialized, TextIndexConfig.class);
+
+ // Verify round-trip preserves the config
+ assertEquals(deserializedConfig.isDisabled(), originalConfig.isDisabled());
+ assertEquals(deserializedConfig.getLuceneAnalyzerClass(),
originalConfig.getLuceneAnalyzerClass());
+ assertEquals(deserializedConfig.getLuceneAnalyzerClassArgs(),
originalConfig.getLuceneAnalyzerClassArgs());
+ assertEquals(deserializedConfig.getLuceneAnalyzerClassArgTypes(),
originalConfig.getLuceneAnalyzerClassArgTypes());
+ assertEquals(deserializedConfig.isLuceneUseCompoundFile(),
originalConfig.isLuceneUseCompoundFile());
+ assertEquals(deserializedConfig.getLuceneMaxBufferSizeMB(),
originalConfig.getLuceneMaxBufferSizeMB());
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]