ankitsultana commented on code in PR #16344: URL: https://github.com/apache/pinot/pull/16344#discussion_r2308330686
########## pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/converter/stats/RealtimeSegmentStatsContainer.java: ########## @@ -38,30 +42,99 @@ public class RealtimeSegmentStatsContainer implements SegmentPreIndexStatsContainer { private final MutableSegment _mutableSegment; private final Map<String, ColumnStatistics> _columnStatisticsMap = new HashMap<>(); + private final int _totalDocCount; public RealtimeSegmentStatsContainer(MutableSegment mutableSegment, @Nullable int[] sortedDocIds, StatsCollectorConfig statsCollectorConfig) { + this(mutableSegment, sortedDocIds, statsCollectorConfig, null); + } + + public RealtimeSegmentStatsContainer(MutableSegment mutableSegment, @Nullable int[] sortedDocIds, + StatsCollectorConfig statsCollectorConfig, @Nullable RecordReader recordReader) { _mutableSegment = mutableSegment; + // Determine if we're using compacted reader + boolean isUsingCompactedReader = recordReader instanceof CompactedPinotSegmentRecordReader; + + // Determine the correct total document count based on whether compaction is being used + if (isUsingCompactedReader && mutableSegment.getValidDocIds() != null) { + _totalDocCount = mutableSegment.getValidDocIds().getMutableRoaringBitmap().getCardinality(); + } else { + _totalDocCount = mutableSegment.getNumDocsIndexed(); + } + // Create all column statistics + // Determine compaction mode once for all columns + boolean useCompactedStatistics = isUsingCompactedReader && mutableSegment.getValidDocIds() != null; + ThreadSafeMutableRoaringBitmap validDocIds = useCompactedStatistics ? mutableSegment.getValidDocIds() : null; Review Comment: You are using the Mutable Segment's validDocIds at two places: once here to pass to the column statistics, and the other time in RealtimeSegmentConverter to create the record reader. We need to make sure that the validDocIds don't change across these two call sites during a segment commit, otherwise the dictionary ids can change in that time? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org