Jackie-Jiang commented on code in PR #18496:
URL: https://github.com/apache/pinot/pull/18496#discussion_r3243169076


##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/ForwardIndexHandler.java:
##########
@@ -1194,56 +1167,67 @@ private void 
rewriteDictToRawForwardIndex(ColumnMetadata columnMetadata, Segment
       File indexDir)
       throws Exception {
     String column = columnMetadata.getColumnName();
-    // Get the forward index reader factory and create a reader
-    IndexReaderFactory<ForwardIndexReader> readerFactory = 
StandardIndexes.forward().getReaderFactory();
-    try (ForwardIndexReader<?> reader = 
readerFactory.createIndexReader(segmentWriter, _fieldIndexConfigs.get(column),
-        columnMetadata)) {
-      Dictionary dictionary = DictionaryIndexType.read(segmentWriter, 
columnMetadata);
-      IndexCreationContext.Builder builder =
-          new IndexCreationContext.Builder(indexDir, _tableConfig, 
columnMetadata).withDictionary(false);
-      // Encoding flows through ForwardIndexConfig set below. The row length 
is derived from persisted metadata via
-      // the Builder constructor; only fall back to scanning when metadata 
returns UNAVAILABLE (var-width MV columns
-      // with varying element lengths).
-      if (columnMetadata.getMaxRowLengthInBytes() == 
ColumnMetadata.UNAVAILABLE) {
-        builder.withMaxRowLengthInBytes(getMaxRowLength(columnMetadata, 
reader, dictionary));
-      }
-      ForwardIndexConfig config = 
_fieldIndexConfigs.get(column).getConfig(StandardIndexes.forward());
-      try (ForwardIndexCreator creator = 
StandardIndexes.forward().createIndexCreator(builder.build(), config)) {
-        forwardIndexRewriteHelper(column, columnMetadata, reader, creator, 
columnMetadata.getTotalDocs(), null,
+    FieldIndexConfigs indexConfigs = _fieldIndexConfigs.get(column);
+    try (ForwardIndexReader<?> forwardIndex = 
StandardIndexes.forward().getReaderFactory()
+        .createIndexReader(segmentWriter, indexConfigs, columnMetadata);
+        Dictionary dictionary = DictionaryIndexType.read(segmentWriter, 
columnMetadata)) {
+      IndexCreationContext context = new 
IndexCreationContext.Builder(indexDir, _tableConfig, columnMetadata).build();
+      ForwardIndexConfig config = 
indexConfigs.getConfig(StandardIndexes.forward());
+      try (ForwardIndexCreator creator = 
StandardIndexes.forward().createIndexCreator(context, config)) {
+        forwardIndexRewriteHelper(column, columnMetadata, forwardIndex, 
creator, columnMetadata.getTotalDocs(), null,
             dictionary);
       }
     }
   }
 
-  /**
-   * Returns the max row length for a column.
-   * - For SV column, this is the length of the longest value.
-   * - For MV column, this is the length of the longest MV entry (sum of 
lengths of all elements).
-   */
-  private int getMaxRowLength(ColumnMetadata columnMetadata, 
ForwardIndexReader<?> forwardIndex,
-      @Nullable Dictionary dictionary)
-      throws IOException {
-    String column = columnMetadata.getColumnName();
+  /// Scans the column and persists the 1.6.0-era length stats when they are 
missing from segment metadata. After
+  /// this returns, per-op handlers can read those stats off [ColumnMetadata] 
when they construct their forward-index
+  /// creator. No-op when:
+  /// - the column is fixed-width (length stats derive from 
`storedType.size()`);
+  /// - the metadata already carries the stats (`getLengthOfShortestElement() 
>= 0` — proxy for "1.6.0 stats
+  ///   present" since the four keys were introduced together);
+  /// - the column has no forward index on disk (those are rebuilt via
+  ///   [InvertedIndexAndDictionaryBasedForwardIndexCreator], which handles 
its own backfill).
+  private void backfillMissingLengthStatsForColumn(String column, 
SegmentDirectory.Writer segmentWriter)
+      throws Exception {
+    ColumnMetadata columnMetadata = 
_segmentDirectory.getSegmentMetadata().getColumnMetadataFor(column);
     DataType storedType = columnMetadata.getDataType().getStoredType();
-    assert !storedType.isFixedWidth();
-
-    try (PinotSegmentColumnReader columnReader = new 
PinotSegmentColumnReader(forwardIndex, dictionary, null,
-        columnMetadata.getMaxNumberOfMultiValues())) {
-      AbstractColumnStatisticsCollector statsCollector = 
getStatsCollector(column, storedType);
-      int numDocs = columnMetadata.getTotalDocs();
-      for (int i = 0; i < numDocs; i++) {
-        statsCollector.collect(columnReader.getValue(i));
+    if (storedType.isFixedWidth() || 
columnMetadata.getLengthOfShortestElement() >= 0) {
+      return;
+    }
+    try (ForwardIndexReader<?> forwardIndex = 
StandardIndexes.forward().getReaderFactory()
+        .createIndexReader(segmentWriter, _fieldIndexConfigs.get(column), 
columnMetadata)) {
+      if (forwardIndex == null) {
+        return;
+      }
+      boolean dictionaryEncoded = forwardIndex.isDictionaryEncoded();
+      try (Dictionary dictionary = dictionaryEncoded ? 
DictionaryIndexType.read(segmentWriter, columnMetadata) : null;
+          PinotSegmentColumnReader columnReader = new 
PinotSegmentColumnReader(forwardIndex, dictionary, null,
+              columnMetadata.getMaxNumberOfMultiValues())) {
+        AbstractColumnStatisticsCollector statsCollector = 
getStatsCollector(column, storedType, false);
+        int numDocs = columnMetadata.getTotalDocs();
+        for (int i = 0; i < numDocs; i++) {
+          statsCollector.collect(columnReader.getValue(i));
+        }
+        Map<String, String> metadataProperties = new HashMap<>();
+        metadataProperties.put(getKeyFor(column, FORWARD_INDEX_ENCODING),
+            dictionaryEncoded ? EncodingType.DICTIONARY.name() : 
EncodingType.RAW.name());
+        metadataProperties.put(getKeyFor(column, LENGTH_OF_SHORTEST_ELEMENT),
+            String.valueOf(statsCollector.getLengthOfShortestElement()));

Review Comment:
   No code change needed. The latest commit adds 
`Preconditions.checkState(segmentMetadata.getTotalDocs() > 0)` in 
`BaseIndexHandler`'s constructor, with matching `assert numDocs > 0` inside 
`backfillMissingStats` and `assert _numDocs > 0 && _cardinality > 0` inside 
`InvertedIndexAndDictionaryBasedForwardIndexCreator`. 
`SegmentPreProcessor.process()` already short-circuits empty segments at the 
top, so the empty-column case can no longer reach the backfill — the 
`Integer.MAX_VALUE` scenario is impossible by construction.



##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/ForwardIndexHandler.java:
##########
@@ -1194,56 +1167,67 @@ private void 
rewriteDictToRawForwardIndex(ColumnMetadata columnMetadata, Segment
       File indexDir)
       throws Exception {
     String column = columnMetadata.getColumnName();
-    // Get the forward index reader factory and create a reader
-    IndexReaderFactory<ForwardIndexReader> readerFactory = 
StandardIndexes.forward().getReaderFactory();
-    try (ForwardIndexReader<?> reader = 
readerFactory.createIndexReader(segmentWriter, _fieldIndexConfigs.get(column),
-        columnMetadata)) {
-      Dictionary dictionary = DictionaryIndexType.read(segmentWriter, 
columnMetadata);
-      IndexCreationContext.Builder builder =
-          new IndexCreationContext.Builder(indexDir, _tableConfig, 
columnMetadata).withDictionary(false);
-      // Encoding flows through ForwardIndexConfig set below. The row length 
is derived from persisted metadata via
-      // the Builder constructor; only fall back to scanning when metadata 
returns UNAVAILABLE (var-width MV columns
-      // with varying element lengths).
-      if (columnMetadata.getMaxRowLengthInBytes() == 
ColumnMetadata.UNAVAILABLE) {
-        builder.withMaxRowLengthInBytes(getMaxRowLength(columnMetadata, 
reader, dictionary));
-      }
-      ForwardIndexConfig config = 
_fieldIndexConfigs.get(column).getConfig(StandardIndexes.forward());
-      try (ForwardIndexCreator creator = 
StandardIndexes.forward().createIndexCreator(builder.build(), config)) {
-        forwardIndexRewriteHelper(column, columnMetadata, reader, creator, 
columnMetadata.getTotalDocs(), null,
+    FieldIndexConfigs indexConfigs = _fieldIndexConfigs.get(column);
+    try (ForwardIndexReader<?> forwardIndex = 
StandardIndexes.forward().getReaderFactory()
+        .createIndexReader(segmentWriter, indexConfigs, columnMetadata);
+        Dictionary dictionary = DictionaryIndexType.read(segmentWriter, 
columnMetadata)) {
+      IndexCreationContext context = new 
IndexCreationContext.Builder(indexDir, _tableConfig, columnMetadata).build();
+      ForwardIndexConfig config = 
indexConfigs.getConfig(StandardIndexes.forward());
+      try (ForwardIndexCreator creator = 
StandardIndexes.forward().createIndexCreator(context, config)) {
+        forwardIndexRewriteHelper(column, columnMetadata, forwardIndex, 
creator, columnMetadata.getTotalDocs(), null,
             dictionary);
       }
     }
   }
 
-  /**
-   * Returns the max row length for a column.
-   * - For SV column, this is the length of the longest value.
-   * - For MV column, this is the length of the longest MV entry (sum of 
lengths of all elements).
-   */
-  private int getMaxRowLength(ColumnMetadata columnMetadata, 
ForwardIndexReader<?> forwardIndex,
-      @Nullable Dictionary dictionary)
-      throws IOException {
-    String column = columnMetadata.getColumnName();
+  /// Scans the column and persists the 1.6.0-era length stats when they are 
missing from segment metadata. After
+  /// this returns, per-op handlers can read those stats off [ColumnMetadata] 
when they construct their forward-index
+  /// creator. No-op when:
+  /// - the column is fixed-width (length stats derive from 
`storedType.size()`);
+  /// - the metadata already carries the stats (`getLengthOfShortestElement() 
>= 0` — proxy for "1.6.0 stats
+  ///   present" since the four keys were introduced together);
+  /// - the column has no forward index on disk (those are rebuilt via
+  ///   [InvertedIndexAndDictionaryBasedForwardIndexCreator], which handles 
its own backfill).
+  private void backfillMissingLengthStatsForColumn(String column, 
SegmentDirectory.Writer segmentWriter)
+      throws Exception {
+    ColumnMetadata columnMetadata = 
_segmentDirectory.getSegmentMetadata().getColumnMetadataFor(column);
     DataType storedType = columnMetadata.getDataType().getStoredType();
-    assert !storedType.isFixedWidth();
-
-    try (PinotSegmentColumnReader columnReader = new 
PinotSegmentColumnReader(forwardIndex, dictionary, null,
-        columnMetadata.getMaxNumberOfMultiValues())) {
-      AbstractColumnStatisticsCollector statsCollector = 
getStatsCollector(column, storedType);
-      int numDocs = columnMetadata.getTotalDocs();
-      for (int i = 0; i < numDocs; i++) {
-        statsCollector.collect(columnReader.getValue(i));
+    if (storedType.isFixedWidth() || 
columnMetadata.getLengthOfShortestElement() >= 0) {
+      return;
+    }
+    try (ForwardIndexReader<?> forwardIndex = 
StandardIndexes.forward().getReaderFactory()

Review Comment:
   Added `testBackfillMissingStats` and 
`testBackfillMissingStatsForMvViaMaxRowLengthTrigger` in 
`ForwardIndexHandlerTest`. The first strips `LENGTH_OF_SHORTEST_ELEMENT` / 
`LENGTH_OF_LONGEST_ELEMENT` / `IS_ASCII` (+ `MAX_ROW_LENGTH_IN_BYTES` for MV) 
from `DIM_SNAPPY_STRING` (SV) and `DIM_MV_PASS_THROUGH_STRING` (MV) to simulate 
pre-1.6.0 metadata, triggers a compression change, and verifies the stats are 
backfilled to their original values. The second exercises the new MV-specific 
trigger (`getMaxRowLengthInBytes() >= 0`) by stripping only 
`MAX_ROW_LENGTH_IN_BYTES` and confirming the pre-pass still fires.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to