This is an automated email from the ASF dual-hosted git repository.
Jackie-Jiang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new 8609e4d7469 Persist maxRowLengthInBytes in ColumnMetadata (#18494)
8609e4d7469 is described below
commit 8609e4d7469dc1872bee8a0a225fe0b3232a21e8
Author: Xiaotian (Jackie) Jiang <[email protected]>
AuthorDate: Wed May 13 21:42:32 2026 -0700
Persist maxRowLengthInBytes in ColumnMetadata (#18494)
---
.../org/apache/pinot/core/util/CrcUtilsTest.java | 8 +--
.../segment/creator/impl/BaseSegmentCreator.java | 12 ++--
.../converter/RealtimeSegmentConverterTest.java | 4 +-
.../local/segment/index/ColumnMetadataTest.java | 66 ++++++++++++++++++++++
.../apache/pinot/segment/spi/ColumnMetadata.java | 4 +-
.../org/apache/pinot/segment/spi/V1Constants.java | 8 ++-
.../spi/index/metadata/ColumnMetadataImpl.java | 39 ++++++-------
7 files changed, 105 insertions(+), 36 deletions(-)
diff --git
a/pinot-core/src/test/java/org/apache/pinot/core/util/CrcUtilsTest.java
b/pinot-core/src/test/java/org/apache/pinot/core/util/CrcUtilsTest.java
index 94019398200..b504303770e 100644
--- a/pinot-core/src/test/java/org/apache/pinot/core/util/CrcUtilsTest.java
+++ b/pinot-core/src/test/java/org/apache/pinot/core/util/CrcUtilsTest.java
@@ -105,10 +105,10 @@ public class CrcUtilsTest {
driver.build();
File indexDir = driver.getOutputDirectory();
- assertEquals(CrcUtils.computeCrc(indexDir), 2716945490L);
+ assertEquals(CrcUtils.computeCrc(indexDir), 1541360722L);
new SegmentV1V2ToV3FormatConverter().convert(indexDir);
- assertEquals(CrcUtils.computeCrc(indexDir), 1116143598L);
+ assertEquals(CrcUtils.computeCrc(indexDir), 4234543086L);
}
@Test
@@ -135,10 +135,10 @@ public class CrcUtilsTest {
driver.build();
File indexDir = driver.getOutputDirectory();
- assertEquals(CrcUtils.computeCrc(indexDir), 1019509989L);
+ assertEquals(CrcUtils.computeCrc(indexDir), 4137909477L);
new SegmentV1V2ToV3FormatConverter().convert(indexDir);
- assertEquals(CrcUtils.computeCrc(indexDir), 237260274L);
+ assertEquals(CrcUtils.computeCrc(indexDir), 3355659762L);
}
@Test
diff --git
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/BaseSegmentCreator.java
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/BaseSegmentCreator.java
index ca98d274d08..12d9cf88bdc 100644
---
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/BaseSegmentCreator.java
+++
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/BaseSegmentCreator.java
@@ -563,9 +563,7 @@ public abstract class BaseSegmentCreator implements
SegmentCreator {
CommonsConfigurationUtils.saveToFile(properties, metadataFile);
}
- /// Adds column metadata information to the properties configuration. The
`forwardIndexEncoding` is persisted under
- /// [V1Constants.MetadataKeys.Column#FORWARD_INDEX_ENCODING]; readers
loading older segments that lack this key fall
- /// back to inferring the encoding from `HAS_DICTIONARY`.
+ /// Adds column metadata information to the properties configuration.
public static void addColumnMetadataInfo(PropertiesConfiguration properties,
String column,
ColumnStatistics columnStatistics, int totalDocs, FieldSpec fieldSpec,
boolean hasDictionary,
int dictionaryElementSize, FieldConfig.EncodingType
forwardIndexEncoding, boolean autoGenerated) {
@@ -591,6 +589,10 @@ public abstract class BaseSegmentCreator implements
SegmentCreator {
if (storedType == DataType.STRING) {
properties.setProperty(getKeyFor(column, IS_ASCII),
String.valueOf(columnStatistics.isAscii()));
}
+ if (!fieldSpec.isSingleValueField()) {
+ properties.setProperty(getKeyFor(column, MAX_ROW_LENGTH_IN_BYTES),
+ String.valueOf(columnStatistics.getMaxRowLengthInBytes()));
+ }
}
properties.setProperty(getKeyFor(column, DICTIONARY_ELEMENT_SIZE),
String.valueOf(dictionaryElementSize));
// TODO: When the column is raw (no dictionary), we should set
BITS_PER_ELEMENT to -1 (invalid). Currently we set
@@ -600,10 +602,10 @@ public abstract class BaseSegmentCreator implements
SegmentCreator {
// See https://github.com/apache/pinot/pull/16921 for details
properties.setProperty(getKeyFor(column, BITS_PER_ELEMENT),
String.valueOf(PinotDataBitSet.getNumBitsPerValue(cardinality - 1)));
- properties.setProperty(getKeyFor(column, TOTAL_NUMBER_OF_ENTRIES),
- String.valueOf(columnStatistics.getTotalNumberOfEntries()));
properties.setProperty(getKeyFor(column, MAX_MULTI_VALUE_ELEMENTS),
String.valueOf(columnStatistics.getMaxNumberOfMultiValues()));
+ properties.setProperty(getKeyFor(column, TOTAL_NUMBER_OF_ENTRIES),
+ String.valueOf(columnStatistics.getTotalNumberOfEntries()));
if (autoGenerated) {
properties.setProperty(getKeyFor(column, IS_AUTO_GENERATED), "true");
}
diff --git
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/converter/RealtimeSegmentConverterTest.java
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/converter/RealtimeSegmentConverterTest.java
index b5b2d400a28..ca466549b47 100644
---
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/converter/RealtimeSegmentConverterTest.java
+++
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/converter/RealtimeSegmentConverterTest.java
@@ -506,8 +506,8 @@ public class RealtimeSegmentConverterTest implements
PinotBuffersAfterMethodChec
public static Object[][] optimizeDictionaryTypeParams() {
// Format: {optimizeDictionaryType, expectedCRC}, crc is used here to
check the correct dictionary type was used.
return new Object[][]{
- {true, "94495458"},
- {false, "2028789000"},
+ {true, "145089250"},
+ {false, "2079382792"},
};
}
diff --git
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/ColumnMetadataTest.java
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/ColumnMetadataTest.java
index 66c87da4d6e..9ad96deb5f0 100644
---
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/ColumnMetadataTest.java
+++
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/ColumnMetadataTest.java
@@ -64,6 +64,7 @@ import org.testng.annotations.BeforeMethod;
import org.testng.annotations.Test;
import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
import static org.testng.Assert.*;
@@ -127,6 +128,7 @@ public class ColumnMetadataTest {
assertTrue(col3Meta.isAscii());
assertEquals(col3Meta.getTotalNumberOfEntries(), 100000);
assertEquals(col3Meta.getMaxNumberOfMultiValues(), 0);
+ assertEquals(col3Meta.getMaxRowLengthInBytes(), 4);
assertEquals(col3Meta.getBitsPerElement(), 3);
assertFalse(col3Meta.isAutoGenerated());
@@ -145,6 +147,7 @@ public class ColumnMetadataTest {
assertTrue(col4Meta.isAscii());
assertEquals(col4Meta.getTotalNumberOfEntries(), 100000);
assertEquals(col4Meta.getMaxNumberOfMultiValues(), 0);
+ assertEquals(col4Meta.getMaxRowLengthInBytes(), 4);
assertEquals(col4Meta.getBitsPerElement(), -1);
assertFalse(col4Meta.isAutoGenerated());
@@ -163,6 +166,7 @@ public class ColumnMetadataTest {
assertFalse(col6Meta.isAscii());
assertEquals(col6Meta.getTotalNumberOfEntries(), 106688);
assertEquals(col6Meta.getMaxNumberOfMultiValues(), 13);
+ assertEquals(col6Meta.getMaxRowLengthInBytes(), 52);
assertEquals(col6Meta.getBitsPerElement(), 15);
assertFalse(col6Meta.isAutoGenerated());
@@ -181,6 +185,7 @@ public class ColumnMetadataTest {
assertFalse(col7Meta.isAscii());
assertEquals(col7Meta.getTotalNumberOfEntries(), 134090);
assertEquals(col7Meta.getMaxNumberOfMultiValues(), 24);
+ assertEquals(col7Meta.getMaxRowLengthInBytes(), 96);
assertEquals(col7Meta.getBitsPerElement(), -1);
assertFalse(col7Meta.isAutoGenerated());
@@ -200,6 +205,7 @@ public class ColumnMetadataTest {
assertFalse(timeColumn.isAscii());
assertEquals(timeColumn.getTotalNumberOfEntries(), 100000);
assertEquals(timeColumn.getMaxNumberOfMultiValues(), 0);
+ assertEquals(timeColumn.getMaxRowLengthInBytes(), 4);
assertEquals(timeColumn.getBitsPerElement(), 1);
assertFalse(timeColumn.isAutoGenerated());
}
@@ -297,6 +303,66 @@ public class ColumnMetadataTest {
"\r\n\r\n utils em::C:\\dir\\utils\r\nPSParentPath :
Mi");
}
+ @Test
+ public void testMaxRowLengthInBytesPersistedForMvVarLength() {
+ DimensionFieldSpec fieldSpec = new DimensionFieldSpec("mvString",
DataType.STRING, false);
+ ColumnStatistics stats = mock(ColumnStatistics.class);
+ when(stats.getLengthOfShortestElement()).thenReturn(2);
+ when(stats.getLengthOfLongestElement()).thenReturn(10);
+ when(stats.getMaxNumberOfMultiValues()).thenReturn(5);
+ // Real per-row max (42) deliberately differs from the naive maxMV *
longest (50) so the assertion
+ // fails if the reader silently falls back to the derived formula.
+ when(stats.getMaxRowLengthInBytes()).thenReturn(42);
+ when(stats.getTotalNumberOfEntries()).thenReturn(20);
+
+ PropertiesConfiguration config = new PropertiesConfiguration();
+ BaseSegmentCreator.addColumnMetadataInfo(config, "mvString", stats, 10,
fieldSpec, false, -1,
+ FieldConfig.EncodingType.RAW, false);
+ ColumnMetadataImpl meta =
ColumnMetadataImpl.fromPropertiesConfiguration(config, 10, "mvString");
+ assertEquals(meta.getMaxRowLengthInBytes(), 42);
+ }
+
+ @Test
+ public void
testMaxRowLengthInBytesAbsentForMvVarLengthFallsBackToUnavailable() {
+ // Simulate a pre-1.6.0 segment: MV var-length column whose metadata
predates MAX_ROW_LENGTH_IN_BYTES.
+ DimensionFieldSpec fieldSpec = new DimensionFieldSpec("mvString",
DataType.STRING, false);
+ ColumnStatistics stats = mock(ColumnStatistics.class);
+ when(stats.getLengthOfShortestElement()).thenReturn(2);
+ when(stats.getLengthOfLongestElement()).thenReturn(10);
+ when(stats.getMaxNumberOfMultiValues()).thenReturn(5);
+ when(stats.getMaxRowLengthInBytes()).thenReturn(42);
+ when(stats.getTotalNumberOfEntries()).thenReturn(20);
+
+ PropertiesConfiguration config = new PropertiesConfiguration();
+ BaseSegmentCreator.addColumnMetadataInfo(config, "mvString", stats, 10,
fieldSpec, false, -1,
+ FieldConfig.EncodingType.RAW, false);
+ config.clearProperty("column.mvString.maxRowLengthInBytes");
+ ColumnMetadataImpl meta =
ColumnMetadataImpl.fromPropertiesConfiguration(config, 10, "mvString");
+ assertEquals(meta.getMaxRowLengthInBytes(), ColumnMetadata.UNAVAILABLE);
+ }
+
+ @Test
+ public void testMaxRowLengthInBytesForSvAndFixedMv() {
+ // SV columns derive maxRowLengthInBytes from the longest element
regardless of what was persisted.
+ ColumnMetadataImpl svFixed = new ColumnMetadataImpl.Builder()
+ .setFieldSpec(new DimensionFieldSpec("svInt", DataType.INT, true))
+ .build();
+ assertEquals(svFixed.getMaxRowLengthInBytes(), 4);
+
+ ColumnMetadataImpl svVarLength = new ColumnMetadataImpl.Builder()
+ .setFieldSpec(new DimensionFieldSpec("svString", DataType.STRING,
true))
+ .setLengthOfLongestElement(7)
+ .build();
+ assertEquals(svVarLength.getMaxRowLengthInBytes(), 7);
+
+ // Fixed-width MV columns derive maxRowLengthInBytes from
maxNumberOfMultiValues * storedType.size().
+ ColumnMetadataImpl mvFixed = new ColumnMetadataImpl.Builder()
+ .setFieldSpec(new DimensionFieldSpec("mvInt", DataType.INT, false))
+ .setMaxNumberOfMultiValues(13)
+ .build();
+ assertEquals(mvFixed.getMaxRowLengthInBytes(), 52);
+ }
+
@Test
public void testComplexFieldSpec() {
ComplexFieldSpec intMapFieldSpec = new ComplexFieldSpec("intMap",
DataType.MAP, true,
diff --git
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/ColumnMetadata.java
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/ColumnMetadata.java
index 940a6bcc80e..f493db66dc5 100644
---
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/ColumnMetadata.java
+++
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/ColumnMetadata.java
@@ -34,9 +34,7 @@ public interface ColumnMetadata extends ColumnShape {
@JsonProperty("hasDictionary")
boolean hasDictionary();
- /// Returns the forward-index encoding for this column. The value is
persisted under
- /// [V1Constants.MetadataKeys.Column#FORWARD_INDEX_ENCODING]; for old
segments that lack the key, implementations
- /// derive it from [#hasDictionary()].
+ /// Returns the forward-index encoding for this column.
EncodingType getForwardIndexEncoding();
/// Returns `true` when both min and max value are invalid, and there is no
need to regenerate them.
diff --git
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/V1Constants.java
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/V1Constants.java
index 7605c98c554..86699135256 100644
---
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/V1Constants.java
+++
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/V1Constants.java
@@ -138,9 +138,8 @@ public class V1Constants {
public static final String CARDINALITY = "cardinality";
// Mandatory, treated as `true` when missing for backward compatibility
public static final String HAS_DICTIONARY = "hasDictionary";
- // Mandatory for new segments. Records the forward-index encoding (RAW
or DICTIONARY). Old segments written
- // before this key was introduced do not have it; readers fall back to
inferring the encoding from
- // HAS_DICTIONARY in that case.
+ // Mandatory, treated as `DICTIONARY` when `hasDictionary=true`, `RAW`
when `hasDictionary=false`
+ // NOTE: Added in release 1.6.0. Only exist in segment created after
1.6.0 release.
public static final String FORWARD_INDEX_ENCODING =
"forwardIndexEncoding";
// Mandatory, treated as `false` when missing for backward compatibility
public static final String IS_SORTED = "isSorted";
@@ -168,6 +167,9 @@ public class V1Constants {
// Optional, exist for MV columns
// TODO: Changed to optional on reader side in release 1.6.0. Change
writer side after 1.6.0 release.
public static final String TOTAL_NUMBER_OF_ENTRIES =
"totalNumberOfEntries";
+ // Optional, exist for MV variable-length types
+ // NOTE: Added in release 1.6.0. Only exist in segment created after
1.6.0 release.
+ public static final String MAX_ROW_LENGTH_IN_BYTES =
"maxRowLengthInBytes";
// Optional, default false
public static final String IS_AUTO_GENERATED = "isAutoGenerated";
diff --git
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/metadata/ColumnMetadataImpl.java
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/metadata/ColumnMetadataImpl.java
index ef00a4563ea..bd2bb43ee6e 100644
---
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/metadata/ColumnMetadataImpl.java
+++
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/metadata/ColumnMetadataImpl.java
@@ -321,16 +321,13 @@ public class ColumnMetadataImpl implements ColumnMetadata
{
public static ColumnMetadataImpl
fromPropertiesConfiguration(PropertiesConfiguration config, int totalDocs,
String column) {
FieldSpec fieldSpec = extractFieldSpec(column, config);
- boolean hasDictionary = config.getBoolean(Column.getKeyFor(column,
Column.HAS_DICTIONARY), true);
- // Old segments may not have FORWARD_INDEX_ENCODING — derive from
HAS_DICTIONARY in that case.
- EncodingType forwardIndexEncoding =
config.getEnum(Column.getKeyFor(column, Column.FORWARD_INDEX_ENCODING),
- EncodingType.class, hasDictionary ? EncodingType.DICTIONARY :
EncodingType.RAW);
Builder builder = new Builder()
.setFieldSpec(fieldSpec)
.setTotalDocs(totalDocs)
.setCardinality(config.getInt(Column.getKeyFor(column,
Column.CARDINALITY)))
- .setHasDictionary(hasDictionary)
- .setForwardIndexEncoding(forwardIndexEncoding)
+ .setHasDictionary(config.getBoolean(Column.getKeyFor(column,
Column.HAS_DICTIONARY), true))
+ .setForwardIndexEncoding(
+ config.getEnum(Column.getKeyFor(column,
Column.FORWARD_INDEX_ENCODING), EncodingType.class, null))
.setSorted(config.getBoolean(Column.getKeyFor(column,
Column.IS_SORTED), false))
.setLengthOfShortestElement(
config.getInt(Column.getKeyFor(column,
Column.LENGTH_OF_SHORTEST_ELEMENT), UNAVAILABLE))
@@ -341,6 +338,8 @@ public class ColumnMetadataImpl implements ColumnMetadata {
.setTotalNumberOfEntries(config.getInt(Column.getKeyFor(column,
Column.TOTAL_NUMBER_OF_ENTRIES), UNAVAILABLE))
.setMaxNumberOfMultiValues(
config.getInt(Column.getKeyFor(column,
Column.MAX_MULTI_VALUE_ELEMENTS), UNAVAILABLE))
+ .setMaxRowLengthInBytes(
+ config.getInt(Column.getKeyFor(column,
Column.MAX_ROW_LENGTH_IN_BYTES), UNAVAILABLE))
.setBitsPerElement(config.getInt(Column.getKeyFor(column,
Column.BITS_PER_ELEMENT), UNAVAILABLE))
.setAutoGenerated(config.getBoolean(Column.getKeyFor(column,
Column.IS_AUTO_GENERATED), false));
@@ -503,6 +502,7 @@ public class ColumnMetadataImpl implements ColumnMetadata {
private boolean _isAscii;
private int _totalNumberOfEntries;
private int _maxNumberOfMultiValues;
+ private int _maxRowLengthInBytes;
private int _bitsPerElement;
private PartitionFunction _partitionFunction;
private Set<Integer> _partitions;
@@ -583,6 +583,11 @@ public class ColumnMetadataImpl implements ColumnMetadata {
return this;
}
+ public Builder setMaxRowLengthInBytes(int maxRowLengthInBytes) {
+ _maxRowLengthInBytes = maxRowLengthInBytes;
+ return this;
+ }
+
public Builder setBitsPerElement(int bitsPerElement) {
_bitsPerElement = bitsPerElement;
return this;
@@ -604,6 +609,11 @@ public class ColumnMetadataImpl implements ColumnMetadata {
}
public ColumnMetadataImpl build() {
+ // Canonicalize forward index encoding
+ if (_forwardIndexEncoding == null) {
+ _forwardIndexEncoding = _hasDictionary ? EncodingType.DICTIONARY :
EncodingType.RAW;
+ }
+
// Canonicalize length of shortest/longest element
DataType storedType = _fieldSpec.getDataType().getStoredType();
if (storedType.isFixedWidth()) {
@@ -621,17 +631,12 @@ public class ColumnMetadataImpl implements ColumnMetadata
{
}
// Canonicalize MV related fields
- int maxRowLengthInBytes;
if (_fieldSpec.isSingleValueField()) {
_totalNumberOfEntries = _totalDocs;
_maxNumberOfMultiValues = 0;
- maxRowLengthInBytes = _lengthOfLongestElement;
- } else {
- if (_lengthOfShortestElement >= 0 && _lengthOfShortestElement ==
_lengthOfLongestElement) {
- maxRowLengthInBytes = _maxNumberOfMultiValues *
_lengthOfLongestElement;
- } else {
- maxRowLengthInBytes = UNAVAILABLE;
- }
+ _maxRowLengthInBytes = _lengthOfLongestElement;
+ } else if (storedType.isFixedWidth()) {
+ _maxRowLengthInBytes = _maxNumberOfMultiValues * storedType.size();
}
// Canonicalize bits per element
@@ -639,13 +644,9 @@ public class ColumnMetadataImpl implements ColumnMetadata {
_bitsPerElement = UNAVAILABLE;
}
- if (_forwardIndexEncoding == null) {
- _forwardIndexEncoding = _hasDictionary ? EncodingType.DICTIONARY :
EncodingType.RAW;
- }
-
return new ColumnMetadataImpl(_fieldSpec, _totalDocs, _cardinality,
_hasDictionary, _forwardIndexEncoding,
_sorted, _minValue, _maxValue, _minMaxValueInvalid,
_lengthOfShortestElement, _lengthOfLongestElement,
- _isAscii, _totalNumberOfEntries, _maxNumberOfMultiValues,
maxRowLengthInBytes, _bitsPerElement,
+ _isAscii, _totalNumberOfEntries, _maxNumberOfMultiValues,
_maxRowLengthInBytes, _bitsPerElement,
_partitionFunction, _partitions, _autoGenerated);
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]