jpountz commented on code in PR #13585:
URL: https://github.com/apache/lucene/pull/13585#discussion_r1689502270


##########
lucene/core/src/java/org/apache/lucene/codecs/lucene912/Lucene912PostingsReader.java:
##########
@@ -0,0 +1,1998 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene912;
+
+import static org.apache.lucene.codecs.lucene912.ForUtil.BLOCK_SIZE;
+import static 
org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.DOC_CODEC;
+import static 
org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.PAY_CODEC;
+import static 
org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.POS_CODEC;
+import static 
org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.SKIP_TOTAL_SIZE;
+import static 
org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.TERMS_CODEC;
+import static 
org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.VERSION_CURRENT;
+import static 
org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.VERSION_START;
+
+import java.io.IOException;
+import java.util.AbstractList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.RandomAccess;
+import org.apache.lucene.codecs.BlockTermState;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.PostingsReaderBase;
+import 
org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.IntBlockTermState;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.Impact;
+import org.apache.lucene.index.Impacts;
+import org.apache.lucene.index.ImpactsEnum;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SlowImpactsEnum;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.ReadAdvice;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BitUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.IOUtils;
+
+/**
+ * Concrete class that reads docID lists (and, when indexed, frequencies, positions, offsets and
+ * payloads) written with the {@link Lucene912PostingsFormat} postings format.
+ *
+ * @lucene.experimental
+ */
+public final class Lucene912PostingsReader extends PostingsReaderBase {
+
+  private final IndexInput docIn;
+  private final IndexInput posIn;
+  private final IndexInput payIn;
+
+  private final int version;
+
+  /**
+   * Sole constructor. Opens the doc file and, if the segment has positions, the pos file and
+   * (when payloads or offsets are present) the pay file, checking each file's index header and
+   * checksum footer.
+   *
+   * @param state supplies the directory, IO context, segment name, suffix and id, and field infos
+   * @throws IOException propagated from opening or validating the files; any input opened so far
+   *     is closed first
+   */
+  public Lucene912PostingsReader(SegmentReadState state) throws IOException {
+    boolean success = false;
+    IndexInput docIn = null;
+    IndexInput posIn = null;
+    IndexInput payIn = null;
+
+    // NOTE: these data files are too costly to verify checksum against all the bytes on open,
+    // but for now we at least verify proper structure of the checksum footer: which looks
+    // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
+    // such as file truncation.
+
+    String docName =
+        IndexFileNames.segmentFileName(
+            state.segmentInfo.name, state.segmentSuffix, Lucene912PostingsFormat.DOC_EXTENSION);
+    try {
+      // Postings have a forward-only access pattern, so pass ReadAdvice.NORMAL to perform
+      // readahead.
+      docIn = state.directory.openInput(docName, state.context.withReadAdvice(ReadAdvice.NORMAL));
+      // The doc file may carry any supported version; the version found is recorded.
+      version =
+          CodecUtil.checkIndexHeader(
+              docIn,
+              DOC_CODEC,
+              VERSION_START,
+              VERSION_CURRENT,
+              state.segmentInfo.getId(),
+              state.segmentSuffix);
+      CodecUtil.retrieveChecksum(docIn);
+
+      if (state.fieldInfos.hasProx()) {
+        String proxName =
+            IndexFileNames.segmentFileName(
+                state.segmentInfo.name, state.segmentSuffix, Lucene912PostingsFormat.POS_EXTENSION);
+        posIn = state.directory.openInput(proxName, state.context);
+        // min == max == version: the pos file must have exactly the doc file's version.
+        CodecUtil.checkIndexHeader(
+            posIn, POS_CODEC, version, version, state.segmentInfo.getId(), state.segmentSuffix);
+        CodecUtil.retrieveChecksum(posIn);
+
+        if (state.fieldInfos.hasPayloads() || state.fieldInfos.hasOffsets()) {
+          String payName =
+              IndexFileNames.segmentFileName(
+                  state.segmentInfo.name,
+                  state.segmentSuffix,
+                  Lucene912PostingsFormat.PAY_EXTENSION);
+          payIn = state.directory.openInput(payName, state.context);
+          // Same exact-version requirement as for the pos file.
+          CodecUtil.checkIndexHeader(
+              payIn, PAY_CODEC, version, version, state.segmentInfo.getId(), state.segmentSuffix);
+          CodecUtil.retrieveChecksum(payIn);
+        }
+      }
+
+      // Publish to the fields only once every file has been opened and validated.
+      this.docIn = docIn;
+      this.posIn = posIn;
+      this.payIn = payIn;
+      success = true;
+    } finally {
+      if (!success) {
+        // Release whatever was opened without masking the exception that is propagating.
+        IOUtils.closeWhileHandlingException(docIn, posIn, payIn);
+      }
+    }
+  }
+
+  /**
+   * Validates the header of {@code termsIn} and confirms that the block size recorded at index
+   * time equals the {@code BLOCK_SIZE} this reader uses.
+   *
+   * @param termsIn input from which the header and the block size are read
+   * @param state supplies the segment id and suffix expected in the header
+   * @throws IOException if reading or validating {@code termsIn} fails
+   * @throws IllegalStateException if the recorded block size differs from {@code BLOCK_SIZE}
+   */
+  @Override
+  public void init(IndexInput termsIn, SegmentReadState state) throws IOException {
+    // Make sure we are talking to the matching postings writer
+    CodecUtil.checkIndexHeader(
+        termsIn,
+        TERMS_CODEC,
+        VERSION_START,
+        VERSION_CURRENT,
+        state.segmentInfo.getId(),
+        state.segmentSuffix);
+
+    final int writtenBlockSize = termsIn.readVInt();
+    if (writtenBlockSize == BLOCK_SIZE) {
+      return;
+    }
+
+    final String message =
+        "index-time BLOCK_SIZE ("
+            + writtenBlockSize
+            + ") != read-time BLOCK_SIZE ("
+            + BLOCK_SIZE
+            + ")";
+    throw new IllegalStateException(message);
+  }
+
+  /**
+   * Turns the first {@code count} deltas of {@code buffer} into running totals, in place,
+   * starting from {@code base}. {@code buffer[0]} is always incremented by {@code base}, even
+   * when {@code count} is 0.
+   *
+   * @param buffer deltas on input, prefix sums on output
+   * @param count number of leading entries to accumulate
+   * @param base value added to the first entry
+   */
+  static void prefixSum(long[] buffer, int count, long base) {
+    long running = buffer[0] + base;
+    buffer[0] = running;
+    for (int idx = 1; idx < count; idx++) {
+      running += buffer[idx];
+      buffer[idx] = running;
+    }
+  }
+
+  /**
+   * Scans {@code buffer} from index {@code from} up to {@code BLOCK_SIZE} and returns the index
+   * of the first entry that is greater than or equal to {@code target}. Note that, despite the
+   * method name, the comparison is inclusive.
+   *
+   * @param buffer values to scan
+   * @param target value to look for
+   * @param from first index to inspect
+   * @return the index of the first matching entry, or {@code BLOCK_SIZE} if there is none
+   */
+  static int findFirstGreater(long[] buffer, int target, int from) {
+    int idx = from;
+    while (idx < BLOCK_SIZE) {
+      if (buffer[idx] >= target) {
+        return idx;
+      }
+      idx++;
+    }
+    return BLOCK_SIZE;
+  }
+
+  /** Creates a fresh, empty term state of the concrete type this reader decodes into. */
+  @Override
+  public BlockTermState newTermState() {
+    final IntBlockTermState freshState = new IntBlockTermState();
+    return freshState;
+  }
+
+  /**
+   * Closes the doc, pos and pay inputs held by this reader.
+   *
+   * @throws IOException propagated from {@code IOUtils.close}
+   */
+  @Override
+  public void close() throws IOException {
+    // posIn and payIn are left null by the constructor when the segment has no such file.
+    IOUtils.close(docIn, posIn, payIn);
+  }
+
+  /**
+   * Decodes the postings metadata of one term from {@code in} into {@code _termState}.
+   *
+   * <p>File pointers are stored as deltas and added to the values already held by the term
+   * state; when {@code absolute} is true they are first reset to 0 so the decoded values are
+   * absolute.
+   *
+   * @param in source of the encoded term metadata
+   * @param fieldInfo field the term belongs to; its index options and payload flag determine
+   *     which values are read
+   * @param _termState state to populate; cast to {@link IntBlockTermState}. Its {@code docFreq}
+   *     and {@code totalTermFreq} are read here, so they must already be set
+   * @param absolute whether file pointers are decoded from 0 rather than from the values
+   *     currently in the term state
+   * @throws IOException if reading from {@code in} fails
+   */
+  @Override
+  public void decodeTerm(
+      DataInput in, FieldInfo fieldInfo, BlockTermState _termState, boolean absolute)
+      throws IOException {
+    final IntBlockTermState termState = (IntBlockTermState) _termState;
+    final boolean fieldHasPositions =
+        fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
+    final boolean fieldHasOffsets =
+        fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
+            >= 0;
+    final boolean fieldHasPayloads = fieldInfo.hasPayloads();
+
+    if (absolute) {
+      termState.docStartFP = 0;
+      termState.posStartFP = 0;
+      termState.payStartFP = 0;
+    }
+
+    // The low bit of the first vLong selects between two encodings.
+    final long l = in.readVLong();
+    if ((l & 0x01) == 0) {
+      // Low bit clear: the remaining bits are the delta of the doc file pointer.
+      termState.docStartFP += l >>> 1;
+      if (termState.docFreq == 1) {
+        // A term with a single document stores that doc ID directly in the term metadata.
+        termState.singletonDocID = in.readVInt();
+      } else {
+        termState.singletonDocID = -1;
+      }
+    } else {
+      // Low bit set: the remaining bits are a zig-zag encoded delta applied to the singleton
+      // doc ID already in the term state; docStartFP is left unchanged.
+      assert absolute == false;
+      assert termState.singletonDocID != -1;
+      termState.singletonDocID += BitUtil.zigZagDecode(l >>> 1);
+    }
+
+    if (fieldHasPositions) {
+      termState.posStartFP += in.readVLong();
+      if (fieldHasOffsets || fieldHasPayloads) {
+        termState.payStartFP += in.readVLong();
+      }
+      // lastPosBlockOffset is only present in the stream when totalTermFreq exceeds BLOCK_SIZE.
+      if (termState.totalTermFreq > BLOCK_SIZE) {
+        termState.lastPosBlockOffset = in.readVLong();
+      } else {
+        termState.lastPosBlockOffset = -1;
+      }
+    }
+  }
+
+  /**
+   * Returns a postings enum positioned on the postings of {@code termState}.
+   *
+   * <p>An {@code EverythingEnum} is used when the field indexes positions and the caller asked
+   * for them; otherwise a {@code BlockDocsEnum} is used. {@code reuse} is recycled when it is of
+   * the required type and reports that it can be reused for this reader and field; otherwise a
+   * new enum is created.
+   *
+   * @param fieldInfo field the term belongs to
+   * @param termState term state to iterate; cast to {@link IntBlockTermState}
+   * @param reuse enum to recycle if possible, may be {@code null}
+   * @param flags requested {@link PostingsEnum} features
+   * @throws IOException propagated from creating or resetting the enum
+   */
+  @Override
+  public PostingsEnum postings(
+      FieldInfo fieldInfo, BlockTermState termState, PostingsEnum reuse, int flags)
+      throws IOException {
+    final boolean fieldHasPositions =
+        fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
+    final boolean positionsWanted =
+        fieldHasPositions && PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS);
+
+    if (positionsWanted) {
+      final EverythingEnum positionsEnum;
+      if (reuse instanceof EverythingEnum
+          && ((EverythingEnum) reuse).canReuse(docIn, fieldInfo)) {
+        positionsEnum = (EverythingEnum) reuse;
+      } else {
+        positionsEnum = new EverythingEnum(fieldInfo);
+      }
+      return positionsEnum.reset((IntBlockTermState) termState, flags);
+    }
+
+    final BlockDocsEnum docsOnlyEnum;
+    if (reuse instanceof BlockDocsEnum && ((BlockDocsEnum) reuse).canReuse(docIn, fieldInfo)) {
+      docsOnlyEnum = (BlockDocsEnum) reuse;
+    } else {
+      docsOnlyEnum = new BlockDocsEnum(fieldInfo);
+    }
+    return docsOnlyEnum.reset((IntBlockTermState) termState, flags);
+  }
+
+  /**
+   * Returns an impacts enum for the postings of {@code state}.
+   *
+   * <p>The specialized enums are only used when the term has at least {@code BLOCK_SIZE}
+   * documents: {@code BlockImpactsDocsEnum} when the field has frequencies and positions are
+   * either not indexed or not requested, and {@code BlockImpactsPostingsEnum} when the field has
+   * positions and neither indexed offsets nor indexed payloads are requested. Every other case
+   * wraps the regular postings enum in a {@link SlowImpactsEnum}.
+   *
+   * @param fieldInfo field the term belongs to
+   * @param state term state to iterate; cast to {@link IntBlockTermState} by the specialized
+   *     enums
+   * @param flags requested {@link PostingsEnum} features
+   * @throws IOException propagated from creating the enum
+   */
+  @Override
+  public ImpactsEnum impacts(FieldInfo fieldInfo, BlockTermState state, int flags)
+      throws IOException {
+    final IndexOptions options = fieldInfo.getIndexOptions();
+    final boolean hasFreqs = options.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
+    final boolean hasPositions =
+        options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
+
+    if (state.docFreq >= BLOCK_SIZE) {
+      final boolean positionsUnneeded =
+          hasPositions == false
+              || PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS) == false;
+      if (hasFreqs && positionsUnneeded) {
+        return new BlockImpactsDocsEnum(fieldInfo, (IntBlockTermState) state);
+      }
+
+      final boolean hasOffsets =
+          options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
+      final boolean hasPayloads = fieldInfo.hasPayloads();
+      final boolean offsetsUnneeded =
+          hasOffsets == false
+              || PostingsEnum.featureRequested(flags, PostingsEnum.OFFSETS) == false;
+      final boolean payloadsUnneeded =
+          hasPayloads == false
+              || PostingsEnum.featureRequested(flags, PostingsEnum.PAYLOADS) == false;
+      if (hasPositions && offsetsUnneeded && payloadsUnneeded) {
+        return new BlockImpactsPostingsEnum(fieldInfo, (IntBlockTermState) state);
+      }
+    }
+
+    return new SlowImpactsEnum(postings(fieldInfo, state, null, flags));
+  }
+
+  final class BlockDocsEnum extends PostingsEnum {
+
+    final ForUtil forUtil = new ForUtil();
+    final ForDeltaUtil forDeltaUtil = new ForDeltaUtil(forUtil);
+    final PForUtil pforUtil = new PForUtil(forUtil);
+
+    private final long[] docBuffer = new long[BLOCK_SIZE + 1];
+    private final long[] freqBuffer = new long[BLOCK_SIZE];
+
+    private int docBufferUpto;
+
+    final IndexInput startDocIn;
+
+    IndexInput docIn;
+    final boolean indexHasFreq;
+    final boolean indexHasPos;
+    final boolean indexHasOffsetsOrPayloads;
+
+    private int docFreq; // number of docs in this posting list
+    // sum of freqBuffer in this posting list (or docFreq when omitted)
+    private long totalTermFreq;
+    private int blockUpto; // number of docs in or before the current block
+    private int doc; // doc we last read
+    private long accum; // accumulator for doc deltas
+
+    // level 0 skip data
+    private int lastDocInBlock;
+    // level 1 skip data
+    private int nextSkipDoc;
+    private long nextSkipOffset;
+    private int nextSkipBlockUpto;
+
+    private boolean needsFreq; // true if the caller actually needs frequencies
+    // docid when there is a single pulsed posting, otherwise -1
+    private int singletonDocID;
+    private long freqFP;
+
+    public BlockDocsEnum(FieldInfo fieldInfo) throws IOException {
+      this.startDocIn = Lucene912PostingsReader.this.docIn;
+      this.docIn = null;
+      indexHasFreq = 
fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
+      indexHasPos =
+          
fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
 >= 0;
+      indexHasOffsetsOrPayloads =
+          fieldInfo
+                      .getIndexOptions()
+                      
.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
+                  >= 0
+              || fieldInfo.hasPayloads();
+      // We set the last element of docBuffer to NO_MORE_DOCS, it helps save 
conditionals in
+      // advance()
+      docBuffer[BLOCK_SIZE] = NO_MORE_DOCS;
+    }
+
+    /**
+     * Tells whether this enum can be recycled for the given doc input and field.
+     *
+     * <p>Reuse requires that the enum was created from the same doc input and that the field has
+     * the same frequency, position and offsets-or-payloads characteristics that were captured at
+     * construction time.
+     *
+     * <p>The offsets-or-payloads value of the field is computed as a whole before it is compared.
+     * Without that grouping, {@code ||} binds more loosely than {@code &&}, so the method would
+     * return {@code true} for every field with payloads, even for an enum created from a
+     * different doc input.
+     *
+     * @param docIn doc input of the reader that wants to reuse this enum
+     * @param fieldInfo field the enum would be reused for
+     * @return {@code true} if this enum may be reset and reused
+     */
+    public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) {
+      final IndexOptions options = fieldInfo.getIndexOptions();
+      final boolean fieldHasFreq = options.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
+      final boolean fieldHasPos =
+          options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
+      final boolean fieldHasOffsetsOrPayloads =
+          options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0
+              || fieldInfo.hasPayloads();
+
+      return docIn == startDocIn
+          && indexHasFreq == fieldHasFreq
+          && indexHasPos == fieldHasPos
+          && indexHasOffsetsOrPayloads == fieldHasOffsetsOrPayloads;
+    }
+
+    /**
+     * Points this enum at the postings described by {@code termState} and rewinds all iteration
+     * state so that it is positioned before the first document.
+     *
+     * @param termState term whose postings should be iterated
+     * @param flags requested {@link PostingsEnum} features; only {@code FREQS} is consulted here
+     * @return this enum
+     * @throws IOException propagated from cloning the doc input or prefetching
+     */
+    public PostingsEnum reset(IntBlockTermState termState, int flags) throws IOException {
+      docFreq = termState.docFreq;
+      // Without indexed frequencies every document counts once.
+      totalTermFreq = indexHasFreq ? termState.totalTermFreq : docFreq;
+      singletonDocID = termState.singletonDocID;
+      if (docIn == null) {
+        // lazy init
+        docIn = startDocIn.clone();
+      }
+      prefetchPostings(docIn, termState);
+
+      doc = -1;
+      this.needsFreq = PostingsEnum.featureRequested(flags, PostingsEnum.FREQS);
+      if (indexHasFreq == false || needsFreq == false) {
+        // Filling this buffer may not be cheap when doing primary key lookups, so we make sure to
+        // not fill more than `docFreq` entries.
+        Arrays.fill(freqBuffer, 0, Math.min(ForUtil.BLOCK_SIZE, docFreq), 1);
+      }
+      accum = -1;
+      blockUpto = 0;
+      lastDocInBlock = -1;
+      nextSkipDoc = -1;
+      nextSkipOffset = termState.docStartFP;
+      nextSkipBlockUpto = 0;
+      // Start with the buffer index at BLOCK_SIZE, the slot holding the NO_MORE_DOCS sentinel.
+      docBufferUpto = BLOCK_SIZE;
+      // -1 means there is no frequency block waiting to be decoded by freq().
+      freqFP = -1;
+      return this;
+    }
+
+    /**
+     * Returns the term frequency of the current document.
+     *
+     * <p>Frequencies of a full block are decoded lazily: {@code freqFP} holds the file pointer of
+     * a frequency block that has not been decoded yet, or -1 when {@code freqBuffer} can be read
+     * directly.
+     *
+     * @throws IOException if seeking or decoding the frequency block fails
+     */
+    @Override
+    public int freq() throws IOException {
+      if (freqFP != -1) {
+        docIn.seek(freqFP);
+        pforUtil.decode(docIn, freqBuffer);
+        // Mark the block as decoded so that later calls for documents of the same block read
+        // freqBuffer directly instead of seeking and decoding the whole block again.
+        freqFP = -1;
+      }
+
+      return (int) freqBuffer[docBufferUpto - 1];
+    }
+
+    /** This enum does not expose positions; always returns -1. */
+    @Override
+    public int nextPosition() throws IOException {
+      return -1;
+    }
+
+    /** This enum does not expose offsets; always returns -1. */
+    @Override
+    public int startOffset() throws IOException {
+      return -1;
+    }
+
+    /** This enum does not expose offsets; always returns -1. */
+    @Override
+    public int endOffset() throws IOException {
+      return -1;
+    }
+
+    /** This enum does not expose payloads; always returns {@code null}. */
+    @Override
+    public BytesRef getPayload() throws IOException {
+      return null;
+    }
+
+    /** Returns the doc last read, which is -1 right after {@code reset}. */
+    @Override
+    public int docID() {
+      return doc;
+    }
+
+    /**
+     * Decodes the next full block of {@code BLOCK_SIZE} doc IDs into {@code docBuffer}. The
+     * frequency block that follows is skipped rather than decoded; when frequencies are needed,
+     * its file pointer is remembered in {@code freqFP} so that {@code freq()} can decode it on
+     * demand.
+     */
+    private void refillFullBlock() throws IOException {
+      assert docFreq - blockUpto >= BLOCK_SIZE;
+
+      // accum is passed as the base of the prefix sum over the decoded deltas.
+      forDeltaUtil.decodeAndPrefixSum(docIn, accum, docBuffer);
+
+      if (indexHasFreq) {
+        if (needsFreq) {
+          freqFP = docIn.getFilePointer();
+        }
+        pforUtil.skip(docIn);
+      }
+      blockUpto += BLOCK_SIZE;
+      // The last doc of this block becomes the base for the next block.
+      accum = docBuffer[BLOCK_SIZE - 1];
+      docBufferUpto = 0;
+      assert docBuffer[BLOCK_SIZE] == NO_MORE_DOCS;
+    }
+
+    /**
+     * Loads the trailing documents of the posting list, of which there are fewer than {@code
+     * BLOCK_SIZE}, into {@code docBuffer} and {@code freqBuffer}, and writes a {@code
+     * NO_MORE_DOCS} sentinel after the last one.
+     */
+    private void refillRemainder() throws IOException {
+      final int left = docFreq - blockUpto;
+      assert left >= 0;
+      assert left < BLOCK_SIZE;
+
+      if (docFreq == 1) {
+        // Single-document term: the doc ID comes from the term state, nothing is read from docIn.
+        docBuffer[0] = singletonDocID;
+        freqBuffer[0] = totalTermFreq;
+        docBuffer[1] = NO_MORE_DOCS;
+        blockUpto++;
+      } else {
+        // Read vInts:
+        PostingsUtil.readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq, needsFreq);
+        prefixSum(docBuffer, left, accum);
+        docBuffer[left] = NO_MORE_DOCS;
+        blockUpto += left;
+      }
+      docBufferUpto = 0;
+      // Frequencies are already in freqBuffer, so freq() has nothing left to decode.
+      freqFP = -1;
+    }
+
+    private void skipLevel1To(int target) throws IOException {
+      while (true) {
+        accum = nextSkipDoc;
+        lastDocInBlock = nextSkipDoc;
+        docIn.seek(nextSkipOffset);
+        blockUpto = nextSkipBlockUpto;
+        nextSkipBlockUpto += SKIP_TOTAL_SIZE;
+
+        if (docFreq - blockUpto < SKIP_TOTAL_SIZE) {
+          nextSkipDoc = DocIdSetIterator.NO_MORE_DOCS;
+          break;
+        }
+
+        nextSkipDoc += docIn.readVInt();
+        long impactLength;
+        if (indexHasFreq) {
+          impactLength = docIn.readVLong();
+        } else {
+          impactLength = 0L;
+        }
+        long skipLength = docIn.readVLong();

Review Comment:
   This is already what BlockDocsEnum does when `nextDoc()` is used (unless I 
misunderstood your recommendation); see line 531 in 
`BlockDocsEnum#moveToNextBlock`. I cannot do it for level 1 because, at the 
time of decoding skip data, I do not know whether the caller will later call 
`advance()`. In any case, the overhead should be low since we only decode 
level 1 skip data every 32 blocks.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

Reply via email to