This is an automated email from the ASF dual-hosted git repository.

Jackie-Jiang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new 8609e4d7469 Persist maxRowLengthInBytes in ColumnMetadata (#18494)
8609e4d7469 is described below

commit 8609e4d7469dc1872bee8a0a225fe0b3232a21e8
Author: Xiaotian (Jackie) Jiang <[email protected]>
AuthorDate: Wed May 13 21:42:32 2026 -0700

    Persist maxRowLengthInBytes in ColumnMetadata (#18494)
---
 .../org/apache/pinot/core/util/CrcUtilsTest.java   |  8 +--
 .../segment/creator/impl/BaseSegmentCreator.java   | 12 ++--
 .../converter/RealtimeSegmentConverterTest.java    |  4 +-
 .../local/segment/index/ColumnMetadataTest.java    | 66 ++++++++++++++++++++++
 .../apache/pinot/segment/spi/ColumnMetadata.java   |  4 +-
 .../org/apache/pinot/segment/spi/V1Constants.java  |  8 ++-
 .../spi/index/metadata/ColumnMetadataImpl.java     | 39 ++++++-------
 7 files changed, 105 insertions(+), 36 deletions(-)

diff --git 
a/pinot-core/src/test/java/org/apache/pinot/core/util/CrcUtilsTest.java 
b/pinot-core/src/test/java/org/apache/pinot/core/util/CrcUtilsTest.java
index 94019398200..b504303770e 100644
--- a/pinot-core/src/test/java/org/apache/pinot/core/util/CrcUtilsTest.java
+++ b/pinot-core/src/test/java/org/apache/pinot/core/util/CrcUtilsTest.java
@@ -105,10 +105,10 @@ public class CrcUtilsTest {
     driver.build();
 
     File indexDir = driver.getOutputDirectory();
-    assertEquals(CrcUtils.computeCrc(indexDir), 2716945490L);
+    assertEquals(CrcUtils.computeCrc(indexDir), 1541360722L);
 
     new SegmentV1V2ToV3FormatConverter().convert(indexDir);
-    assertEquals(CrcUtils.computeCrc(indexDir), 1116143598L);
+    assertEquals(CrcUtils.computeCrc(indexDir), 4234543086L);
   }
 
   @Test
@@ -135,10 +135,10 @@ public class CrcUtilsTest {
     driver.build();
 
     File indexDir = driver.getOutputDirectory();
-    assertEquals(CrcUtils.computeCrc(indexDir), 1019509989L);
+    assertEquals(CrcUtils.computeCrc(indexDir), 4137909477L);
 
     new SegmentV1V2ToV3FormatConverter().convert(indexDir);
-    assertEquals(CrcUtils.computeCrc(indexDir), 237260274L);
+    assertEquals(CrcUtils.computeCrc(indexDir), 3355659762L);
   }
 
   @Test
diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/BaseSegmentCreator.java
 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/BaseSegmentCreator.java
index ca98d274d08..12d9cf88bdc 100644
--- 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/BaseSegmentCreator.java
+++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/BaseSegmentCreator.java
@@ -563,9 +563,7 @@ public abstract class BaseSegmentCreator implements 
SegmentCreator {
     CommonsConfigurationUtils.saveToFile(properties, metadataFile);
   }
 
-  /// Adds column metadata information to the properties configuration. The 
`forwardIndexEncoding` is persisted under
-  /// [V1Constants.MetadataKeys.Column#FORWARD_INDEX_ENCODING]; readers 
loading older segments that lack this key fall
-  /// back to inferring the encoding from `HAS_DICTIONARY`.
+  /// Adds column metadata information to the properties configuration.
   public static void addColumnMetadataInfo(PropertiesConfiguration properties, 
String column,
       ColumnStatistics columnStatistics, int totalDocs, FieldSpec fieldSpec, 
boolean hasDictionary,
       int dictionaryElementSize, FieldConfig.EncodingType 
forwardIndexEncoding, boolean autoGenerated) {
@@ -591,6 +589,10 @@ public abstract class BaseSegmentCreator implements 
SegmentCreator {
       if (storedType == DataType.STRING) {
         properties.setProperty(getKeyFor(column, IS_ASCII), 
String.valueOf(columnStatistics.isAscii()));
       }
+      if (!fieldSpec.isSingleValueField()) {
+        properties.setProperty(getKeyFor(column, MAX_ROW_LENGTH_IN_BYTES),
+            String.valueOf(columnStatistics.getMaxRowLengthInBytes()));
+      }
     }
     properties.setProperty(getKeyFor(column, DICTIONARY_ELEMENT_SIZE), 
String.valueOf(dictionaryElementSize));
     // TODO: When the column is raw (no dictionary), we should set 
BITS_PER_ELEMENT to -1 (invalid). Currently we set
@@ -600,10 +602,10 @@ public abstract class BaseSegmentCreator implements 
SegmentCreator {
     //       See https://github.com/apache/pinot/pull/16921 for details
     properties.setProperty(getKeyFor(column, BITS_PER_ELEMENT),
         String.valueOf(PinotDataBitSet.getNumBitsPerValue(cardinality - 1)));
-    properties.setProperty(getKeyFor(column, TOTAL_NUMBER_OF_ENTRIES),
-        String.valueOf(columnStatistics.getTotalNumberOfEntries()));
     properties.setProperty(getKeyFor(column, MAX_MULTI_VALUE_ELEMENTS),
         String.valueOf(columnStatistics.getMaxNumberOfMultiValues()));
+    properties.setProperty(getKeyFor(column, TOTAL_NUMBER_OF_ENTRIES),
+        String.valueOf(columnStatistics.getTotalNumberOfEntries()));
     if (autoGenerated) {
       properties.setProperty(getKeyFor(column, IS_AUTO_GENERATED), "true");
     }
diff --git 
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/converter/RealtimeSegmentConverterTest.java
 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/converter/RealtimeSegmentConverterTest.java
index b5b2d400a28..ca466549b47 100644
--- 
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/converter/RealtimeSegmentConverterTest.java
+++ 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/converter/RealtimeSegmentConverterTest.java
@@ -506,8 +506,8 @@ public class RealtimeSegmentConverterTest implements 
PinotBuffersAfterMethodChec
   public static Object[][] optimizeDictionaryTypeParams() {
     // Format: {optimizeDictionaryType, expectedCRC}, crc is used here to 
check the correct dictionary type was used.
     return new Object[][]{
-        {true, "94495458"},
-        {false, "2028789000"},
+        {true, "145089250"},
+        {false, "2079382792"},
     };
   }
 
diff --git 
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/ColumnMetadataTest.java
 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/ColumnMetadataTest.java
index 66c87da4d6e..9ad96deb5f0 100644
--- 
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/ColumnMetadataTest.java
+++ 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/ColumnMetadataTest.java
@@ -64,6 +64,7 @@ import org.testng.annotations.BeforeMethod;
 import org.testng.annotations.Test;
 
 import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
 import static org.testng.Assert.*;
 
 
@@ -127,6 +128,7 @@ public class ColumnMetadataTest {
     assertTrue(col3Meta.isAscii());
     assertEquals(col3Meta.getTotalNumberOfEntries(), 100000);
     assertEquals(col3Meta.getMaxNumberOfMultiValues(), 0);
+    assertEquals(col3Meta.getMaxRowLengthInBytes(), 4);
     assertEquals(col3Meta.getBitsPerElement(), 3);
     assertFalse(col3Meta.isAutoGenerated());
 
@@ -145,6 +147,7 @@ public class ColumnMetadataTest {
     assertTrue(col4Meta.isAscii());
     assertEquals(col4Meta.getTotalNumberOfEntries(), 100000);
     assertEquals(col4Meta.getMaxNumberOfMultiValues(), 0);
+    assertEquals(col4Meta.getMaxRowLengthInBytes(), 4);
     assertEquals(col4Meta.getBitsPerElement(), -1);
     assertFalse(col4Meta.isAutoGenerated());
 
@@ -163,6 +166,7 @@ public class ColumnMetadataTest {
     assertFalse(col6Meta.isAscii());
     assertEquals(col6Meta.getTotalNumberOfEntries(), 106688);
     assertEquals(col6Meta.getMaxNumberOfMultiValues(), 13);
+    assertEquals(col6Meta.getMaxRowLengthInBytes(), 52);
     assertEquals(col6Meta.getBitsPerElement(), 15);
     assertFalse(col6Meta.isAutoGenerated());
 
@@ -181,6 +185,7 @@ public class ColumnMetadataTest {
     assertFalse(col7Meta.isAscii());
     assertEquals(col7Meta.getTotalNumberOfEntries(), 134090);
     assertEquals(col7Meta.getMaxNumberOfMultiValues(), 24);
+    assertEquals(col7Meta.getMaxRowLengthInBytes(), 96);
     assertEquals(col7Meta.getBitsPerElement(), -1);
     assertFalse(col7Meta.isAutoGenerated());
 
@@ -200,6 +205,7 @@ public class ColumnMetadataTest {
     assertFalse(timeColumn.isAscii());
     assertEquals(timeColumn.getTotalNumberOfEntries(), 100000);
     assertEquals(timeColumn.getMaxNumberOfMultiValues(), 0);
+    assertEquals(timeColumn.getMaxRowLengthInBytes(), 4);
     assertEquals(timeColumn.getBitsPerElement(), 1);
     assertFalse(timeColumn.isAutoGenerated());
   }
@@ -297,6 +303,66 @@ public class ColumnMetadataTest {
         "\r\n\r\n  utils   em::C:\\dir\\utils\r\nPSParentPath            : 
Mi");
   }
 
+  @Test
+  public void testMaxRowLengthInBytesPersistedForMvVarLength() {
+    DimensionFieldSpec fieldSpec = new DimensionFieldSpec("mvString", 
DataType.STRING, false);
+    ColumnStatistics stats = mock(ColumnStatistics.class);
+    when(stats.getLengthOfShortestElement()).thenReturn(2);
+    when(stats.getLengthOfLongestElement()).thenReturn(10);
+    when(stats.getMaxNumberOfMultiValues()).thenReturn(5);
+    // Real per-row max (42) deliberately differs from the naive maxMV * 
longest (50) so the assertion
+    // fails if the reader silently falls back to the derived formula.
+    when(stats.getMaxRowLengthInBytes()).thenReturn(42);
+    when(stats.getTotalNumberOfEntries()).thenReturn(20);
+
+    PropertiesConfiguration config = new PropertiesConfiguration();
+    BaseSegmentCreator.addColumnMetadataInfo(config, "mvString", stats, 10, 
fieldSpec, false, -1,
+        FieldConfig.EncodingType.RAW, false);
+    ColumnMetadataImpl meta = 
ColumnMetadataImpl.fromPropertiesConfiguration(config, 10, "mvString");
+    assertEquals(meta.getMaxRowLengthInBytes(), 42);
+  }
+
+  @Test
+  public void 
testMaxRowLengthInBytesAbsentForMvVarLengthFallsBackToUnavailable() {
+    // Simulate a pre-1.6.0 segment: MV var-length column whose metadata 
predates MAX_ROW_LENGTH_IN_BYTES.
+    DimensionFieldSpec fieldSpec = new DimensionFieldSpec("mvString", 
DataType.STRING, false);
+    ColumnStatistics stats = mock(ColumnStatistics.class);
+    when(stats.getLengthOfShortestElement()).thenReturn(2);
+    when(stats.getLengthOfLongestElement()).thenReturn(10);
+    when(stats.getMaxNumberOfMultiValues()).thenReturn(5);
+    when(stats.getMaxRowLengthInBytes()).thenReturn(42);
+    when(stats.getTotalNumberOfEntries()).thenReturn(20);
+
+    PropertiesConfiguration config = new PropertiesConfiguration();
+    BaseSegmentCreator.addColumnMetadataInfo(config, "mvString", stats, 10, 
fieldSpec, false, -1,
+        FieldConfig.EncodingType.RAW, false);
+    config.clearProperty("column.mvString.maxRowLengthInBytes");
+    ColumnMetadataImpl meta = 
ColumnMetadataImpl.fromPropertiesConfiguration(config, 10, "mvString");
+    assertEquals(meta.getMaxRowLengthInBytes(), ColumnMetadata.UNAVAILABLE);
+  }
+
+  @Test
+  public void testMaxRowLengthInBytesForSvAndFixedMv() {
+    // SV columns derive maxRowLengthInBytes from the longest element 
regardless of what was persisted.
+    ColumnMetadataImpl svFixed = new ColumnMetadataImpl.Builder()
+        .setFieldSpec(new DimensionFieldSpec("svInt", DataType.INT, true))
+        .build();
+    assertEquals(svFixed.getMaxRowLengthInBytes(), 4);
+
+    ColumnMetadataImpl svVarLength = new ColumnMetadataImpl.Builder()
+        .setFieldSpec(new DimensionFieldSpec("svString", DataType.STRING, 
true))
+        .setLengthOfLongestElement(7)
+        .build();
+    assertEquals(svVarLength.getMaxRowLengthInBytes(), 7);
+
+    // Fixed-width MV columns derive maxRowLengthInBytes from 
maxNumberOfMultiValues * storedType.size().
+    ColumnMetadataImpl mvFixed = new ColumnMetadataImpl.Builder()
+        .setFieldSpec(new DimensionFieldSpec("mvInt", DataType.INT, false))
+        .setMaxNumberOfMultiValues(13)
+        .build();
+    assertEquals(mvFixed.getMaxRowLengthInBytes(), 52);
+  }
+
   @Test
   public void testComplexFieldSpec() {
     ComplexFieldSpec intMapFieldSpec = new ComplexFieldSpec("intMap", 
DataType.MAP, true,
diff --git 
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/ColumnMetadata.java
 
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/ColumnMetadata.java
index 940a6bcc80e..f493db66dc5 100644
--- 
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/ColumnMetadata.java
+++ 
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/ColumnMetadata.java
@@ -34,9 +34,7 @@ public interface ColumnMetadata extends ColumnShape {
   @JsonProperty("hasDictionary")
   boolean hasDictionary();
 
-  /// Returns the forward-index encoding for this column. The value is 
persisted under
-  /// [V1Constants.MetadataKeys.Column#FORWARD_INDEX_ENCODING]; for old 
segments that lack the key, implementations
-  /// derive it from [#hasDictionary()].
+  /// Returns the forward-index encoding for this column.
   EncodingType getForwardIndexEncoding();
 
   /// Returns `true` when both min and max value are invalid, and there is no 
need to regenerate them.
diff --git 
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/V1Constants.java 
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/V1Constants.java
index 7605c98c554..86699135256 100644
--- 
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/V1Constants.java
+++ 
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/V1Constants.java
@@ -138,9 +138,8 @@ public class V1Constants {
       public static final String CARDINALITY = "cardinality";
       // Mandatory, treated as `true` when missing for backward compatibility
       public static final String HAS_DICTIONARY = "hasDictionary";
-      // Mandatory for new segments. Records the forward-index encoding (RAW 
or DICTIONARY). Old segments written
-      // before this key was introduced do not have it; readers fall back to 
inferring the encoding from
-      // HAS_DICTIONARY in that case.
+      // Mandatory, treated as `DICTIONARY` when `hasDictionary=true`, `RAW` 
when `hasDictionary=false`
+      // NOTE: Added in release 1.6.0. Only exist in segment created after 
1.6.0 release.
       public static final String FORWARD_INDEX_ENCODING = 
"forwardIndexEncoding";
       // Mandatory, treated as `false` when missing for backward compatibility
       public static final String IS_SORTED = "isSorted";
@@ -168,6 +167,9 @@ public class V1Constants {
       // Optional, exist for MV columns
       // TODO: Changed to optional on reader side in release 1.6.0. Change 
writer side after 1.6.0 release.
       public static final String TOTAL_NUMBER_OF_ENTRIES = 
"totalNumberOfEntries";
+      // Optional, exist for MV variable-length types
+      // NOTE: Added in release 1.6.0. Only exist in segment created after 
1.6.0 release.
+      public static final String MAX_ROW_LENGTH_IN_BYTES = 
"maxRowLengthInBytes";
       // Optional, default false
       public static final String IS_AUTO_GENERATED = "isAutoGenerated";
 
diff --git 
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/metadata/ColumnMetadataImpl.java
 
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/metadata/ColumnMetadataImpl.java
index ef00a4563ea..bd2bb43ee6e 100644
--- 
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/metadata/ColumnMetadataImpl.java
+++ 
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/metadata/ColumnMetadataImpl.java
@@ -321,16 +321,13 @@ public class ColumnMetadataImpl implements ColumnMetadata 
{
   public static ColumnMetadataImpl 
fromPropertiesConfiguration(PropertiesConfiguration config, int totalDocs,
       String column) {
     FieldSpec fieldSpec = extractFieldSpec(column, config);
-    boolean hasDictionary = config.getBoolean(Column.getKeyFor(column, 
Column.HAS_DICTIONARY), true);
-    // Old segments may not have FORWARD_INDEX_ENCODING — derive from 
HAS_DICTIONARY in that case.
-    EncodingType forwardIndexEncoding = 
config.getEnum(Column.getKeyFor(column, Column.FORWARD_INDEX_ENCODING),
-        EncodingType.class, hasDictionary ? EncodingType.DICTIONARY : 
EncodingType.RAW);
     Builder builder = new Builder()
         .setFieldSpec(fieldSpec)
         .setTotalDocs(totalDocs)
         .setCardinality(config.getInt(Column.getKeyFor(column, 
Column.CARDINALITY)))
-        .setHasDictionary(hasDictionary)
-        .setForwardIndexEncoding(forwardIndexEncoding)
+        .setHasDictionary(config.getBoolean(Column.getKeyFor(column, 
Column.HAS_DICTIONARY), true))
+        .setForwardIndexEncoding(
+            config.getEnum(Column.getKeyFor(column, 
Column.FORWARD_INDEX_ENCODING), EncodingType.class, null))
         .setSorted(config.getBoolean(Column.getKeyFor(column, 
Column.IS_SORTED), false))
         .setLengthOfShortestElement(
             config.getInt(Column.getKeyFor(column, 
Column.LENGTH_OF_SHORTEST_ELEMENT), UNAVAILABLE))
@@ -341,6 +338,8 @@ public class ColumnMetadataImpl implements ColumnMetadata {
         .setTotalNumberOfEntries(config.getInt(Column.getKeyFor(column, 
Column.TOTAL_NUMBER_OF_ENTRIES), UNAVAILABLE))
         .setMaxNumberOfMultiValues(
             config.getInt(Column.getKeyFor(column, 
Column.MAX_MULTI_VALUE_ELEMENTS), UNAVAILABLE))
+        .setMaxRowLengthInBytes(
+            config.getInt(Column.getKeyFor(column, 
Column.MAX_ROW_LENGTH_IN_BYTES), UNAVAILABLE))
         .setBitsPerElement(config.getInt(Column.getKeyFor(column, 
Column.BITS_PER_ELEMENT), UNAVAILABLE))
         .setAutoGenerated(config.getBoolean(Column.getKeyFor(column, 
Column.IS_AUTO_GENERATED), false));
 
@@ -503,6 +502,7 @@ public class ColumnMetadataImpl implements ColumnMetadata {
     private boolean _isAscii;
     private int _totalNumberOfEntries;
     private int _maxNumberOfMultiValues;
+    private int _maxRowLengthInBytes;
     private int _bitsPerElement;
     private PartitionFunction _partitionFunction;
     private Set<Integer> _partitions;
@@ -583,6 +583,11 @@ public class ColumnMetadataImpl implements ColumnMetadata {
       return this;
     }
 
+    public Builder setMaxRowLengthInBytes(int maxRowLengthInBytes) {
+      _maxRowLengthInBytes = maxRowLengthInBytes;
+      return this;
+    }
+
     public Builder setBitsPerElement(int bitsPerElement) {
       _bitsPerElement = bitsPerElement;
       return this;
@@ -604,6 +609,11 @@ public class ColumnMetadataImpl implements ColumnMetadata {
     }
 
     public ColumnMetadataImpl build() {
+      // Canonicalize forward index encoding
+      if (_forwardIndexEncoding == null) {
+        _forwardIndexEncoding = _hasDictionary ? EncodingType.DICTIONARY : 
EncodingType.RAW;
+      }
+
       // Canonicalize length of shortest/longest element
       DataType storedType = _fieldSpec.getDataType().getStoredType();
       if (storedType.isFixedWidth()) {
@@ -621,17 +631,12 @@ public class ColumnMetadataImpl implements ColumnMetadata 
{
       }
 
       // Canonicalize MV related fields
-      int maxRowLengthInBytes;
       if (_fieldSpec.isSingleValueField()) {
         _totalNumberOfEntries = _totalDocs;
         _maxNumberOfMultiValues = 0;
-        maxRowLengthInBytes = _lengthOfLongestElement;
-      } else {
-        if (_lengthOfShortestElement >= 0 && _lengthOfShortestElement == 
_lengthOfLongestElement) {
-          maxRowLengthInBytes = _maxNumberOfMultiValues * 
_lengthOfLongestElement;
-        } else {
-          maxRowLengthInBytes = UNAVAILABLE;
-        }
+        _maxRowLengthInBytes = _lengthOfLongestElement;
+      } else if (storedType.isFixedWidth()) {
+        _maxRowLengthInBytes = _maxNumberOfMultiValues * storedType.size();
       }
 
       // Canonicalize bits per element
@@ -639,13 +644,9 @@ public class ColumnMetadataImpl implements ColumnMetadata {
         _bitsPerElement = UNAVAILABLE;
       }
 
-      if (_forwardIndexEncoding == null) {
-        _forwardIndexEncoding = _hasDictionary ? EncodingType.DICTIONARY : 
EncodingType.RAW;
-      }
-
       return new ColumnMetadataImpl(_fieldSpec, _totalDocs, _cardinality, 
_hasDictionary, _forwardIndexEncoding,
           _sorted, _minValue, _maxValue, _minMaxValueInvalid, 
_lengthOfShortestElement, _lengthOfLongestElement,
-          _isAscii, _totalNumberOfEntries, _maxNumberOfMultiValues, 
maxRowLengthInBytes, _bitsPerElement,
+          _isAscii, _totalNumberOfEntries, _maxNumberOfMultiValues, 
_maxRowLengthInBytes, _bitsPerElement,
           _partitionFunction, _partitions, _autoGenerated);
     }
   }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to