Jackie-Jiang commented on code in PR #18496:
URL: https://github.com/apache/pinot/pull/18496#discussion_r3243169076
##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/ForwardIndexHandler.java:
##########
@@ -1194,56 +1167,67 @@ private void
rewriteDictToRawForwardIndex(ColumnMetadata columnMetadata, Segment
File indexDir)
throws Exception {
String column = columnMetadata.getColumnName();
- // Get the forward index reader factory and create a reader
- IndexReaderFactory<ForwardIndexReader> readerFactory =
StandardIndexes.forward().getReaderFactory();
- try (ForwardIndexReader<?> reader =
readerFactory.createIndexReader(segmentWriter, _fieldIndexConfigs.get(column),
- columnMetadata)) {
- Dictionary dictionary = DictionaryIndexType.read(segmentWriter,
columnMetadata);
- IndexCreationContext.Builder builder =
- new IndexCreationContext.Builder(indexDir, _tableConfig,
columnMetadata).withDictionary(false);
- // Encoding flows through ForwardIndexConfig set below. The row length
is derived from persisted metadata via
- // the Builder constructor; only fall back to scanning when metadata
returns UNAVAILABLE (var-width MV columns
- // with varying element lengths).
- if (columnMetadata.getMaxRowLengthInBytes() ==
ColumnMetadata.UNAVAILABLE) {
- builder.withMaxRowLengthInBytes(getMaxRowLength(columnMetadata,
reader, dictionary));
- }
- ForwardIndexConfig config =
_fieldIndexConfigs.get(column).getConfig(StandardIndexes.forward());
- try (ForwardIndexCreator creator =
StandardIndexes.forward().createIndexCreator(builder.build(), config)) {
- forwardIndexRewriteHelper(column, columnMetadata, reader, creator,
columnMetadata.getTotalDocs(), null,
+ FieldIndexConfigs indexConfigs = _fieldIndexConfigs.get(column);
+ try (ForwardIndexReader<?> forwardIndex =
StandardIndexes.forward().getReaderFactory()
+ .createIndexReader(segmentWriter, indexConfigs, columnMetadata);
+ Dictionary dictionary = DictionaryIndexType.read(segmentWriter,
columnMetadata)) {
+ IndexCreationContext context = new
IndexCreationContext.Builder(indexDir, _tableConfig, columnMetadata).build();
+ ForwardIndexConfig config =
indexConfigs.getConfig(StandardIndexes.forward());
+ try (ForwardIndexCreator creator =
StandardIndexes.forward().createIndexCreator(context, config)) {
+ forwardIndexRewriteHelper(column, columnMetadata, forwardIndex,
creator, columnMetadata.getTotalDocs(), null,
dictionary);
}
}
}
- /**
- * Returns the max row length for a column.
- * - For SV column, this is the length of the longest value.
- * - For MV column, this is the length of the longest MV entry (sum of
lengths of all elements).
- */
- private int getMaxRowLength(ColumnMetadata columnMetadata,
ForwardIndexReader<?> forwardIndex,
- @Nullable Dictionary dictionary)
- throws IOException {
- String column = columnMetadata.getColumnName();
+ /// Scans the column and persists the 1.6.0-era length stats when they are
missing from segment metadata. After
+ /// this returns, per-op handlers can read those stats off [ColumnMetadata]
when they construct their forward-index
+ /// creator. No-op when:
+ /// - the column is fixed-width (length stats derive from
`storedType.size()`);
+ /// - the metadata already carries the stats (`getLengthOfShortestElement()
>= 0` — proxy for "1.6.0 stats
+ /// present" since the four keys were introduced together);
+ /// - the column has no forward index on disk (those are rebuilt via
+ /// [InvertedIndexAndDictionaryBasedForwardIndexCreator], which handles
its own backfill).
+ private void backfillMissingLengthStatsForColumn(String column,
SegmentDirectory.Writer segmentWriter)
+ throws Exception {
+ ColumnMetadata columnMetadata =
_segmentDirectory.getSegmentMetadata().getColumnMetadataFor(column);
DataType storedType = columnMetadata.getDataType().getStoredType();
- assert !storedType.isFixedWidth();
-
- try (PinotSegmentColumnReader columnReader = new
PinotSegmentColumnReader(forwardIndex, dictionary, null,
- columnMetadata.getMaxNumberOfMultiValues())) {
- AbstractColumnStatisticsCollector statsCollector =
getStatsCollector(column, storedType);
- int numDocs = columnMetadata.getTotalDocs();
- for (int i = 0; i < numDocs; i++) {
- statsCollector.collect(columnReader.getValue(i));
+ if (storedType.isFixedWidth() ||
columnMetadata.getLengthOfShortestElement() >= 0) {
+ return;
+ }
+ try (ForwardIndexReader<?> forwardIndex =
StandardIndexes.forward().getReaderFactory()
+ .createIndexReader(segmentWriter, _fieldIndexConfigs.get(column),
columnMetadata)) {
+ if (forwardIndex == null) {
+ return;
+ }
+ boolean dictionaryEncoded = forwardIndex.isDictionaryEncoded();
+ try (Dictionary dictionary = dictionaryEncoded ?
DictionaryIndexType.read(segmentWriter, columnMetadata) : null;
+ PinotSegmentColumnReader columnReader = new
PinotSegmentColumnReader(forwardIndex, dictionary, null,
+ columnMetadata.getMaxNumberOfMultiValues())) {
+ AbstractColumnStatisticsCollector statsCollector =
getStatsCollector(column, storedType, false);
+ int numDocs = columnMetadata.getTotalDocs();
+ for (int i = 0; i < numDocs; i++) {
+ statsCollector.collect(columnReader.getValue(i));
+ }
+ Map<String, String> metadataProperties = new HashMap<>();
+ metadataProperties.put(getKeyFor(column, FORWARD_INDEX_ENCODING),
+ dictionaryEncoded ? EncodingType.DICTIONARY.name() :
EncodingType.RAW.name());
+ metadataProperties.put(getKeyFor(column, LENGTH_OF_SHORTEST_ELEMENT),
+ String.valueOf(statsCollector.getLengthOfShortestElement()));
Review Comment:
No code change needed. The latest commit adds
`Preconditions.checkState(segmentMetadata.getTotalDocs() > 0)` in
`BaseIndexHandler`'s constructor, with matching `assert numDocs > 0` inside
`backfillMissingStats` and `assert _numDocs > 0 && _cardinality > 0` inside
`InvertedIndexAndDictionaryBasedForwardIndexCreator`.
`SegmentPreProcessor.process()` already short-circuits empty segments at the
top, so the empty-column case can no longer reach the backfill — the
`Integer.MAX_VALUE` scenario is impossible by construction.
##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/ForwardIndexHandler.java:
##########
@@ -1194,56 +1167,67 @@ private void
rewriteDictToRawForwardIndex(ColumnMetadata columnMetadata, Segment
File indexDir)
throws Exception {
String column = columnMetadata.getColumnName();
- // Get the forward index reader factory and create a reader
- IndexReaderFactory<ForwardIndexReader> readerFactory =
StandardIndexes.forward().getReaderFactory();
- try (ForwardIndexReader<?> reader =
readerFactory.createIndexReader(segmentWriter, _fieldIndexConfigs.get(column),
- columnMetadata)) {
- Dictionary dictionary = DictionaryIndexType.read(segmentWriter,
columnMetadata);
- IndexCreationContext.Builder builder =
- new IndexCreationContext.Builder(indexDir, _tableConfig,
columnMetadata).withDictionary(false);
- // Encoding flows through ForwardIndexConfig set below. The row length
is derived from persisted metadata via
- // the Builder constructor; only fall back to scanning when metadata
returns UNAVAILABLE (var-width MV columns
- // with varying element lengths).
- if (columnMetadata.getMaxRowLengthInBytes() ==
ColumnMetadata.UNAVAILABLE) {
- builder.withMaxRowLengthInBytes(getMaxRowLength(columnMetadata,
reader, dictionary));
- }
- ForwardIndexConfig config =
_fieldIndexConfigs.get(column).getConfig(StandardIndexes.forward());
- try (ForwardIndexCreator creator =
StandardIndexes.forward().createIndexCreator(builder.build(), config)) {
- forwardIndexRewriteHelper(column, columnMetadata, reader, creator,
columnMetadata.getTotalDocs(), null,
+ FieldIndexConfigs indexConfigs = _fieldIndexConfigs.get(column);
+ try (ForwardIndexReader<?> forwardIndex =
StandardIndexes.forward().getReaderFactory()
+ .createIndexReader(segmentWriter, indexConfigs, columnMetadata);
+ Dictionary dictionary = DictionaryIndexType.read(segmentWriter,
columnMetadata)) {
+ IndexCreationContext context = new
IndexCreationContext.Builder(indexDir, _tableConfig, columnMetadata).build();
+ ForwardIndexConfig config =
indexConfigs.getConfig(StandardIndexes.forward());
+ try (ForwardIndexCreator creator =
StandardIndexes.forward().createIndexCreator(context, config)) {
+ forwardIndexRewriteHelper(column, columnMetadata, forwardIndex,
creator, columnMetadata.getTotalDocs(), null,
dictionary);
}
}
}
- /**
- * Returns the max row length for a column.
- * - For SV column, this is the length of the longest value.
- * - For MV column, this is the length of the longest MV entry (sum of
lengths of all elements).
- */
- private int getMaxRowLength(ColumnMetadata columnMetadata,
ForwardIndexReader<?> forwardIndex,
- @Nullable Dictionary dictionary)
- throws IOException {
- String column = columnMetadata.getColumnName();
+ /// Scans the column and persists the 1.6.0-era length stats when they are
missing from segment metadata. After
+ /// this returns, per-op handlers can read those stats off [ColumnMetadata]
when they construct their forward-index
+ /// creator. No-op when:
+ /// - the column is fixed-width (length stats derive from
`storedType.size()`);
+ /// - the metadata already carries the stats (`getLengthOfShortestElement()
>= 0` — proxy for "1.6.0 stats
+ /// present" since the four keys were introduced together);
+ /// - the column has no forward index on disk (those are rebuilt via
+ /// [InvertedIndexAndDictionaryBasedForwardIndexCreator], which handles
its own backfill).
+ private void backfillMissingLengthStatsForColumn(String column,
SegmentDirectory.Writer segmentWriter)
+ throws Exception {
+ ColumnMetadata columnMetadata =
_segmentDirectory.getSegmentMetadata().getColumnMetadataFor(column);
DataType storedType = columnMetadata.getDataType().getStoredType();
- assert !storedType.isFixedWidth();
-
- try (PinotSegmentColumnReader columnReader = new
PinotSegmentColumnReader(forwardIndex, dictionary, null,
- columnMetadata.getMaxNumberOfMultiValues())) {
- AbstractColumnStatisticsCollector statsCollector =
getStatsCollector(column, storedType);
- int numDocs = columnMetadata.getTotalDocs();
- for (int i = 0; i < numDocs; i++) {
- statsCollector.collect(columnReader.getValue(i));
+ if (storedType.isFixedWidth() ||
columnMetadata.getLengthOfShortestElement() >= 0) {
+ return;
+ }
+ try (ForwardIndexReader<?> forwardIndex =
StandardIndexes.forward().getReaderFactory()
Review Comment:
Added `testBackfillMissingStats` and
`testBackfillMissingStatsForMvViaMaxRowLengthTrigger` in
`ForwardIndexHandlerTest`. The first strips `LENGTH_OF_SHORTEST_ELEMENT` /
`LENGTH_OF_LONGEST_ELEMENT` / `IS_ASCII` (+ `MAX_ROW_LENGTH_IN_BYTES` for MV)
from `DIM_SNAPPY_STRING` (SV) and `DIM_MV_PASS_THROUGH_STRING` (MV) to simulate
pre-1.6.0 metadata, triggers a compression change, and verifies the stats are
backfilled to their original values. The second exercises the new MV-specific
trigger (`getMaxRowLengthInBytes() >= 0`) by stripping only
`MAX_ROW_LENGTH_IN_BYTES` and confirming the pre-pass still fires.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]