Re: [PR] Add codec pipeline framework for raw forward index encoding [pinot]

via GitHub Sun, 19 Apr 2026 20:50:24 -0700


Copilot commented on code in PR #18229:
URL: https://github.com/apache/pinot/pull/18229#discussion_r3108140393



##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/CodecRegistry.java:
##########
@@ -0,0 +1,119 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.segment.local.io.codec;
+
+import java.util.Collections;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import javax.annotation.Nullable;
+import org.apache.pinot.segment.spi.codec.ChunkCodecHandler;
+import org.apache.pinot.segment.spi.codec.CodecSpecParser;
+
+
+/**
+ * Registry of known {@link ChunkCodecHandler} instances, looked up by
+ * {@link ChunkCodecHandler#name()}.
+ *
+ * <p>The singleton {@link #DEFAULT} instance is pre-populated with all 
built-in codecs and is
+ * immutable after class-loading; calling {@link #register} on it will throw.
+ *
+ * <p>Custom registries (e.g. in tests) should be created with {@link 
#CodecRegistry()} and
+ * populated via {@link #register} before use.  The custom registry instance 
itself is not
+ * thread-safe for concurrent writes; callers must complete all registrations 
before sharing
+ * the registry across threads.
+ *
+ * <p>Lookup is case-insensitive.
+ */
+public final class CodecRegistry {
+
+  /**
+   * Default, immutable registry containing all built-in codecs.
+   * Safe for concurrent reads; does not allow {@link #register}.
+   */
+  public static final CodecRegistry DEFAULT;
+
+  static {
+    Map<String, ChunkCodecHandler<?>> m = new LinkedHashMap<>();
+    m.put(DeltaCodecDefinition.INSTANCE.name().toUpperCase(), 
DeltaCodecDefinition.INSTANCE);
+    m.put(DeltaDeltaCodecDefinition.INSTANCE.name().toUpperCase(), 
DeltaDeltaCodecDefinition.INSTANCE);
+    m.put(ZstdCodecDefinition.INSTANCE.name().toUpperCase(), 
ZstdCodecDefinition.INSTANCE);
+    m.put(Lz4CodecDefinition.INSTANCE.name().toUpperCase(), 
Lz4CodecDefinition.INSTANCE);
+    m.put(SnappyCodecDefinition.INSTANCE.name().toUpperCase(), 
SnappyCodecDefinition.INSTANCE);
+    m.put(GzipCodecDefinition.INSTANCE.name().toUpperCase(), 
GzipCodecDefinition.INSTANCE);
+    DEFAULT = new CodecRegistry(Collections.unmodifiableMap(m));
+  }

Review Comment:
   The PR description says the built-in codec registry contains DELTA/ZSTD/LZ4, 
but the default registry here also registers DELTADELTA, SNAPPY, and GZIP. 
Please either update the PR description to match the actual shipped built-ins, 
or drop these codecs from DEFAULT if they are not intended to be part of the 
initial contract.



##########
pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/forward/CodecPipelineForwardIndexTest.java:
##########
@@ -0,0 +1,284 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.segment.local.segment.index.forward;
+
+import java.io.File;
+import java.util.Random;
+import org.apache.commons.io.FileUtils;
+import org.apache.pinot.segment.local.PinotBuffersAfterMethodCheckRule;
+import org.apache.pinot.segment.local.io.codec.CodecPipelineExecutor;
+import org.apache.pinot.segment.local.io.codec.CodecRegistry;
+import 
org.apache.pinot.segment.local.io.writer.impl.FixedByteChunkForwardIndexWriterV7;
+import 
org.apache.pinot.segment.local.segment.index.readers.forward.ChunkReaderContext;
+import 
org.apache.pinot.segment.local.segment.index.readers.forward.FixedByteChunkSVForwardIndexReaderV7;
+import org.apache.pinot.segment.spi.codec.CodecContext;
+import org.apache.pinot.segment.spi.index.ForwardIndexConfig;
+import org.apache.pinot.segment.spi.memory.PinotDataBuffer;
+import org.apache.pinot.spi.data.FieldSpec.DataType;
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+
+import static org.testng.Assert.*;
+
+
+/**
+ * Roundtrip encode/decode tests for the codec-pipeline forward index (version 
7).
+ *
+ * <p>Covers:
+ * <ul>
+ *   <li>INT and LONG single-value columns</li>
+ *   <li>DELTA only, ZSTD only, CODEC(DELTA,ZSTD) pipelines</li>
+ *   <li>Version check on the written file</li>
+ *   <li>Canonical spec round-trips correctly through the header</li>
+ * </ul>
+ */
+public class CodecPipelineForwardIndexTest implements 
PinotBuffersAfterMethodCheckRule {
+
+  private static final int NUM_VALUES = 10_007;
+  private static final int DOCS_PER_CHUNK = 1024;
+  private static final String TEST_FILE_PREFIX =
+      System.getProperty("java.io.tmpdir") + File.separator + 
"CodecPipelineFwdTest";
+  // Fixed seed for deterministic test data — this test verifies correctness, 
not random coverage
+  private static final Random RANDOM = new Random(42L);
+

Review Comment:
   The test uses a single static shared Random instance. Because it is stateful 
(and not thread-safe), the generated expected values depend on test execution 
order and can become flaky under parallel TestNG execution. Use a 
per-test/per-invocation Random seeded deterministically (e.g., derive a seed 
from spec+type) or precompute deterministic inputs without shared mutable RNG 
state.
   ```suggestion
     // Base seed for deterministic per-invocation test data generation.
     private static final long RANDOM_SEED = 42L;
   
     private static Random createRandom(String spec, DataType dataType) {
       long seed = RANDOM_SEED;
       seed = 31 * seed + spec.hashCode();
       seed = 31 * seed + dataType.hashCode();
       return new Random(seed);
     }
   ```



##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/FixedByteChunkForwardIndexWriterV7.java:
##########
@@ -0,0 +1,246 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.segment.local.io.writer.impl;
+
+import java.io.Closeable;
+import java.io.File;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.nio.ByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.charset.StandardCharsets;
+import javax.annotation.concurrent.NotThreadSafe;
+import org.apache.pinot.segment.local.io.codec.CodecPipelineExecutor;
+import org.apache.pinot.segment.spi.index.ForwardIndexConfig;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+/**
+ * Chunk-based raw (non-dictionary-encoded) forward index writer for 
single-value fixed-width
+ * columns (INT, LONG) that uses a {@link CodecPipelineExecutor} for encoding.
+ *
+ * <p>This writer introduces <strong>version 7</strong> of the fixed-byte 
chunk raw forward index
+ * format.  The on-disk layout is:
+ *
+ * <pre>
+ * File header:
+ *   version              (int, value = 7)
+ *   numChunks            (int)
+ *   numDocsPerChunk      (int, normalised to power-of-2)
+ *   sizeOfEntry          (int, bytes per logical value, e.g. 4 for INT)
+ *   totalDocs            (int)
+ *   codecSpecLength      (int, byte length of the UTF-8 encoded canonical 
codec spec)
+ *   dataHeaderStart      (int, byte offset from file start where chunk-offset 
table begins)
+ *   codecSpec            (byte[], UTF-8 encoded canonical spec, length = 
codecSpecLength)
+ *   chunkOffsets         (long[numChunks], absolute byte offset of each 
chunk's per-chunk header)
+ * Data (per chunk):
+ *   compressedSize       (int, byte length of the encoded payload that 
follows)
+ *   uncompressedSize     (int, byte length of the original decoded chunk data)
+ *   payload              (byte[], encoded chunk data, length = compressedSize)
+ * </pre>
+ *
+ * <p>Each chunk contains {@code numDocsPerChunk} values encoded by the 
pipeline.  Chunk offsets
+ * are 8-byte longs to support files larger than 2 GB.  The per-chunk size 
header allows readers
+ * to verify decompression output and to skip/read chunks without scanning 
adjacent offsets.
+ *
+ * <p>This class is <em>not</em> thread-safe.
+ */
+@NotThreadSafe
+public class FixedByteChunkForwardIndexWriterV7 implements Closeable {
+  private static final Logger LOGGER = 
LoggerFactory.getLogger(FixedByteChunkForwardIndexWriterV7.class);
+
+  public static final int VERSION = 
ForwardIndexConfig.CODEC_PIPELINE_WRITER_VERSION;
+
+  /** Bytes written before each chunk payload: compressedSize (int) + 
uncompressedSize (int). */
+  public static final int CHUNK_HEADER_BYTES = 2 * Integer.BYTES;
+
+  // Number of fixed int fields before the codec spec: version, numChunks, 
numDocsPerChunk,
+  // sizeOfEntry, totalDocs, codecSpecLength, dataHeaderStart
+  private static final int FIXED_HEADER_INT_COUNT = 7;
+
+  private final FileChannel _dataFile;
+  private final CodecPipelineExecutor _executor;
+  private final int _numDocsPerChunk;
+  private final int _sizeOfEntry;
+  private final ByteBuffer _header;
+  private final ByteBuffer _chunkBuffer;
+  private final int _numChunks;
+  private final int _totalDocs;
+
+  private long _dataOffset;
+  private int _docsWritten;
+  private int _chunksWritten;
+
+  /**
+   * Creates a new writer.
+   *
+   * @param file            output file
+   * @param executor        pre-validated pipeline executor
+   * @param totalDocs       total number of documents to write
+   * @param numDocsPerChunk target documents per chunk (will be rounded up to 
power-of-2)
+   * @param sizeOfEntry     bytes per value (e.g. 4 for INT, 8 for LONG)
+   */
+  public FixedByteChunkForwardIndexWriterV7(File file, CodecPipelineExecutor 
executor, int totalDocs,
+      int numDocsPerChunk, int sizeOfEntry)
+      throws IOException {
+    _executor = executor;
+    _sizeOfEntry = sizeOfEntry;
+    _totalDocs = totalDocs;
+    _numDocsPerChunk = normalizePower2(numDocsPerChunk);
+    _numChunks = (totalDocs + _numDocsPerChunk - 1) / _numDocsPerChunk;
+    _docsWritten = 0;
+    _chunksWritten = 0;
+
+    byte[] specBytes = 
executor.getCanonicalSpec().getBytes(StandardCharsets.UTF_8);
+
+    // Header layout:
+    //   7 ints of fixed fields
+    //   specBytes.length bytes of codec spec
+    //   numChunks longs of chunk offsets
+    long fixedHeaderBytesLong = (long) FIXED_HEADER_INT_COUNT * Integer.BYTES;
+    long dataHeaderStartLong = fixedHeaderBytesLong + specBytes.length;
+    long chunkOffsetTableBytesLong = (long) _numChunks * Long.BYTES;
+    long totalHeaderBytesLong = dataHeaderStartLong + 
chunkOffsetTableBytesLong;
+    if (totalHeaderBytesLong > Integer.MAX_VALUE) {
+      throw new IllegalArgumentException(
+          "Header size " + totalHeaderBytesLong + " bytes exceeds 
Integer.MAX_VALUE. Reduce totalDocs or"
+              + " increase numDocsPerChunk.");
+    }
+    int dataHeaderStart = (int) dataHeaderStartLong;
+    int totalHeaderBytes = (int) totalHeaderBytesLong;
+
+    _header = ByteBuffer.allocateDirect(totalHeaderBytes);
+    _header.putInt(VERSION);
+    _header.putInt(_numChunks);
+    _header.putInt(_numDocsPerChunk);
+    _header.putInt(sizeOfEntry);
+    _header.putInt(totalDocs);
+    _header.putInt(specBytes.length);
+    _header.putInt(dataHeaderStart);
+    _header.put(specBytes);
+    // chunk offsets will be filled in during writeChunk() calls
+
+    _dataOffset = totalHeaderBytes;
+
+    long chunkSizeLong = (long) sizeOfEntry * _numDocsPerChunk;
+    if (chunkSizeLong > Integer.MAX_VALUE) {
+      throw new IllegalArgumentException(
+          "Chunk size " + chunkSizeLong + " bytes overflows int. Reduce 
numDocsPerChunk or sizeOfEntry.");
+    }
+    _chunkBuffer = ByteBuffer.allocateDirect((int) chunkSizeLong);
+
+    _dataFile = new RandomAccessFile(file, "rw").getChannel();
+  }
+
+  /** Writes a 4-byte integer value. */
+  public void putInt(int value) {
+    _chunkBuffer.putInt(value);
+    _docsWritten++;
+    flushIfNeeded();
+  }
+
+  /** Writes an 8-byte long value. */
+  public void putLong(long value) {
+    _chunkBuffer.putLong(value);
+    _docsWritten++;
+    flushIfNeeded();
+  }
+
+  private void flushIfNeeded() {
+    if (_chunkBuffer.position() == _numDocsPerChunk * _sizeOfEntry) {
+      writeChunk();
+    }
+  }
+
+  private void writeChunk() {
+    _chunkBuffer.flip();
+    int uncompressedSize = _chunkBuffer.remaining();
+    try {
+      ByteBuffer encoded = _executor.compress(_chunkBuffer);
+      int compressedSize = encoded.remaining();
+
+      // Per-chunk header: compressedSize (int) + uncompressedSize (int)
+      ByteBuffer chunkHeader = ByteBuffer.allocateDirect(CHUNK_HEADER_BYTES);

Review Comment:
   writeChunk() allocates a new direct ByteBuffer for the per-chunk header on 
every chunk. For large segments this can create substantial direct-memory 
allocation/cleanup overhead. Consider reusing a single (heap or direct) 8-byte 
buffer for the chunk header, or writing the two ints via a reusable ByteBuffer 
field.
   ```suggestion
         ByteBuffer chunkHeader = ByteBuffer.allocate(CHUNK_HEADER_BYTES);
   ```



##########
pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/codec/CodecSpecParser.java:
##########
@@ -0,0 +1,227 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.segment.spi.codec;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+
+/**
+ * Phase-1 (structural) parser for the Codec DSL.
+ *
+ * <p>Accepts the following grammar:
+ * <pre>
+ *   spec        ::= pipeline | invocation
+ *   pipeline    ::= "CODEC" "(" invocation ("," invocation)* ")"
+ *   invocation  ::= NAME
+ *                 | NAME "(" args ")"
+ *   args        ::= ε | arg ("," arg)*
+ *   arg         ::= [+-]? [0-9]+    // numeric literals; leading sign 
accepted but not used in v1
+ *   NAME        ::= [A-Za-z_][A-Za-z0-9_]*
+ * </pre>
+ *
+ * <p>Examples:
+ * <ul>
+ *   <li>{@code DELTA} → single-stage pipeline with no args</li>
+ *   <li>{@code ZSTD(3)} → single-stage pipeline, arg "3"</li>
+ *   <li>{@code CODEC(DELTA,ZSTD(8))} → two-stage pipeline</li>
+ * </ul>
+ *
+ * <p>This class performs only structural parsing; semantic validation (valid 
codec names,
+ * argument ranges, type compatibility) is done in a second phase by the 
registry and
+ * pipeline validator.
+ *
+ * <p>This class is stateless and thread-safe.
+ */
+public final class CodecSpecParser {
+
+  /**
+   * The keyword used for multi-stage pipeline notation (e.g. {@code 
CODEC(DELTA,ZSTD(3))}).
+   *
+   * <p>This name is permanently reserved — it cannot be registered as a codec 
name.
+   * See {@code CodecRegistry.register}.
+   */
+  public static final String PIPELINE_KEYWORD = "CODEC";
+
+  private CodecSpecParser() {
+  }
+
+  /**
+   * Parses {@code spec} into a {@link CodecPipeline} AST.
+   *
+   * @param spec the codec DSL string; must not be {@code null} or blank
+   * @return the parsed pipeline
+   * @throws IllegalArgumentException if the spec is syntactically invalid
+   */
+  public static CodecPipeline parse(String spec) {
+    if (spec == null || spec.isBlank()) {
+      throw new IllegalArgumentException("codecSpec must not be null or 
blank");
+    }
+    String trimmed = spec.trim();
+    Parser p = new Parser(trimmed);
+    CodecPipeline pipeline = p.parsePipelineOrInvocation();
+    p.expectEof();
+    return pipeline;
+  }
+
+  // -------------------------------------------------------------------------
+  // Internal recursive-descent parser
+  // -------------------------------------------------------------------------
+
+  private static final class Parser {
+    private final String _input;
+    private int _pos;
+
+    Parser(String input) {
+      _input = input;
+      _pos = 0;
+    }
+
+    CodecPipeline parsePipelineOrInvocation() {
+      String name = parseName();
+      if (PIPELINE_KEYWORD.equalsIgnoreCase(name) && peek() == '(') {
+        // Multi-stage pipeline: CODEC(stage,stage,...)
+        consume('(');
+        if (peek() == ')') {
+          throw new IllegalArgumentException("CODEC pipeline must have at 
least one stage in: " + _input);
+        }
+        List<CodecInvocation> stages = new ArrayList<>();
+        stages.add(parseInvocation());
+        while (peek() == ',') {
+          consume(',');
+          stages.add(parseInvocation());
+        }
+        consume(')');
+        return new CodecPipeline(stages);
+      }
+      // Single invocation; name already consumed, check for optional args
+      List<String> args = Collections.emptyList();
+      if (peek() == '(') {
+        consume('(');
+        args = parseArgs();
+        consume(')');
+      }
+      return new CodecPipeline(Collections.singletonList(new 
CodecInvocation(name.toUpperCase(), args)));
+    }
+
+    CodecInvocation parseInvocation() {
+      String name = parseName().toUpperCase();
+      List<String> args = Collections.emptyList();
+      if (peek() == '(') {
+        consume('(');
+        args = parseArgs();
+        consume(')');
+      }
+      return new CodecInvocation(name, args);
+    }
+
+    /** Parse NAME = [A-Za-z_][A-Za-z0-9_]* */
+    String parseName() {
+      skipWhitespace();
+      if (_pos >= _input.length()) {
+        throw new IllegalArgumentException("Expected codec name but reached 
end of input: " + _input);
+      }
+      char first = _input.charAt(_pos);
+      if (!Character.isLetter(first) && first != '_') {
+        throw new IllegalArgumentException(
+            "Expected codec name starting with letter at position " + _pos + " 
in: " + _input);
+      }

Review Comment:
   In CodecSpecParser.parseName(), the error message says the name must start 
with a letter, but the implementation also allows '_' as the first character. 
This makes diagnostics misleading for specs like "_FOO". Please update the 
exception message to match the accepted grammar (letter or underscore).



##########
pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/loader/ForwardIndexHandlerTest.java:
##########
@@ -2354,6 +2358,191 @@ private void validateMetadataProperties(String column, 
boolean hasDictionary, in
     assertEquals(columnMetadata.getMaxValue(), maxValue);
   }
 
+  /**
+   * Rollback test: when a V7 codec-pipeline forward index exists on disk but 
the table config
+   * is reverted to a legacy {@code compressionCodec} (no {@code codecSpec}),
+   * {@code shouldChangeRawCompressionType()} must schedule
+   * {@link ForwardIndexHandler.Operation#CHANGE_INDEX_COMPRESSION_TYPE} so 
the segment is
+   * rewritten back to the legacy format — enabling rollback to pre-V7 servers.
+   */
+  @Test
+  public void testComputeOperationNoChangeCompressionForV7CodecPipelineColumn()

Review Comment:
   This test method name is misleading: the Javadoc and assertions verify that 
a rewrite *is* scheduled for a V7 codec-pipeline segment when the config is 
reverted to legacy compressionCodec, but the name says "NoChangeCompression". 
Please rename the test to reflect the rollback/rewrite behavior being asserted.
   ```suggestion
     public void 
testComputeOperationChangeCompressionForV7CodecPipelineRollbackToLegacyCompressionCodec()
   ```



##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkSVForwardIndexReaderV7.java:
##########
@@ -0,0 +1,247 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.segment.local.segment.index.readers.forward;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+import org.apache.pinot.segment.local.io.codec.CodecPipelineExecutor;
+import org.apache.pinot.segment.local.io.codec.CodecRegistry;
+import 
org.apache.pinot.segment.local.io.writer.impl.FixedByteChunkForwardIndexWriterV7;
+import org.apache.pinot.segment.spi.codec.CodecContext;
+import org.apache.pinot.segment.spi.index.ForwardIndexConfig;
+import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader;
+import org.apache.pinot.segment.spi.memory.PinotDataBuffer;
+import org.apache.pinot.spi.data.FieldSpec.DataType;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+/**
+ * Chunk-based single-value raw forward index reader for version-7 files 
written by
+ * {@link FixedByteChunkForwardIndexWriterV7}.
+ *
+ * <p>Reads the canonical codec spec from the file header, instantiates a
+ * {@link CodecPipelineExecutor}, and uses it to decode each chunk on demand.
+ *
+ * <p>Supported data types: INT, LONG.
+ */
+public final class FixedByteChunkSVForwardIndexReaderV7 implements 
ForwardIndexReader<ChunkReaderContext> {
+  private static final Logger LOGGER = 
LoggerFactory.getLogger(FixedByteChunkSVForwardIndexReaderV7.class);
+
+  public static final int VERSION = 
ForwardIndexConfig.CODEC_PIPELINE_WRITER_VERSION;
+
+  private final PinotDataBuffer _dataBuffer;
+  private final DataType _storedType;
+  private final int _numChunks;
+  private final int _numDocsPerChunk;
+  private final int _shift; // log2(numDocsPerChunk) for fast chunk id calc
+  private final int _totalDocs;
+  private final int _dataHeaderStart;
+  private final CodecPipelineExecutor _executor;
+  private final String _canonicalSpec;
+
+  public FixedByteChunkSVForwardIndexReaderV7(PinotDataBuffer dataBuffer, 
DataType storedType) {
+    _dataBuffer = dataBuffer;
+    _storedType = storedType;
+
+    if (storedType != DataType.INT && storedType != DataType.LONG) {
+      throw new IllegalArgumentException(
+          "FixedByteChunkSVForwardIndexReaderV7 only supports INT and LONG, 
got: " + storedType);
+    }
+
+    int offset = 0;
+    int version = dataBuffer.getInt(offset);
+    if (version != VERSION) {
+      throw new IllegalArgumentException("Expected version " + VERSION + " but 
got " + version);
+    }
+    offset += Integer.BYTES;
+
+    _numChunks = dataBuffer.getInt(offset);
+    offset += Integer.BYTES;
+
+    _numDocsPerChunk = dataBuffer.getInt(offset);
+    offset += Integer.BYTES;
+    if (_numDocsPerChunk <= 0 || (_numDocsPerChunk & (_numDocsPerChunk - 1)) 
!= 0) {
+      throw new IllegalArgumentException(
+          "Invalid numDocsPerChunk in forward index header: " + 
_numDocsPerChunk
+              + ". Expected a positive power of two.");
+    }
+    _shift = Integer.numberOfTrailingZeros(_numDocsPerChunk);
+

Review Comment:
   FixedByteChunkSVForwardIndexReaderV7 reads `numChunks` from the header but 
never uses it to validate header consistency. If `totalDocs`, 
`numDocsPerChunk`, and `numChunks` disagree (corrupt or mismatched file), 
`getChunkOffset()` can read past the chunk-offset table. Consider validating 
`numChunks == ceil(totalDocs/numDocsPerChunk)` during construction and using 
`numChunks` to bounds-check `chunkId` before reading offsets.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Add codec pipeline framework for raw forward index encoding [pinot]

Reply via email to