[ https://issues.apache.org/jira/browse/LUCENE-7526?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15616618#comment-15616618 ]
ASF GitHub Bot commented on LUCENE-7526: ---------------------------------------- Github user Timothy055 commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/105#discussion_r85611978 --- Diff: lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/CompositePostingsEnum.java --- @@ -0,0 +1,165 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.uhighlight; + +import java.io.IOException; +import java.util.List; + +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.PriorityQueue; + + +final class CompositePostingsEnum extends PostingsEnum { + + private static final int NO_MORE_POSITIONS = -2; + private final BytesRef term; + private final int freq; + private final PriorityQueue<BoundsCheckingPostingsEnum> queue; + + + /** + * This class is used to ensure we don't over iterate the underlying + * postings enum by keeping track of the position relative to the + * frequency. + * Ideally this would've been an implementation of a PostingsEnum + * but it would have to delegate most methods and it seemed easier + * to just wrap the tweaked method. 
+ */ + private static final class BoundsCheckingPostingsEnum { + + + private final PostingsEnum postingsEnum; + private final int freq; + private int position; + private int nextPosition; + private int positionInc = 1; + + private int startOffset; + private int endOffset; + + BoundsCheckingPostingsEnum(PostingsEnum postingsEnum) throws IOException { + this.postingsEnum = postingsEnum; + this.freq = postingsEnum.freq(); + nextPosition = postingsEnum.nextPosition(); + position = nextPosition; + startOffset = postingsEnum.startOffset(); + endOffset = postingsEnum.endOffset(); + } + + private boolean hasMorePositions() throws IOException { + return positionInc < freq; + } + + /** + * Returns the next position of the underlying postings enum unless + * it cannot iterate further and returns NO_MORE_POSITIONS; + * @return + * @throws IOException + */ + private int nextPosition() throws IOException { + position = nextPosition; + startOffset = postingsEnum.startOffset(); + endOffset = postingsEnum.endOffset(); + if (hasMorePositions()) { + positionInc++; + nextPosition = postingsEnum.nextPosition(); + } else { + nextPosition = NO_MORE_POSITIONS; + } + return position; + } + + } + + CompositePostingsEnum(BytesRef term, List<PostingsEnum> postingsEnums) throws IOException { + this.term = term; + queue = new PriorityQueue<BoundsCheckingPostingsEnum>(postingsEnums.size()) { + @Override + protected boolean lessThan(BoundsCheckingPostingsEnum a, BoundsCheckingPostingsEnum b) { + return a.position < b.position; + } + }; + + int freqAdd = 0; + for (PostingsEnum postingsEnum : postingsEnums) { + queue.add(new BoundsCheckingPostingsEnum(postingsEnum)); + freqAdd += postingsEnum.freq(); + } + freq = freqAdd; + } + + @Override + public int freq() throws IOException { + return freq; + } + + @Override + public int nextPosition() throws IOException { + int position = NO_MORE_POSITIONS; + while (queue.size() >= 1) { + queue.top().nextPosition(); + queue.updateTop(); //the new position may 
be behind another postingsEnum in the queue + position = queue.top().position; + + if (position == NO_MORE_POSITIONS) { + queue.pop(); //this postingsEnum is consumed, let's get rid of it + } else { + break; //we got a new position + } + + } + return position; + } + + @Override + public int startOffset() throws IOException { + return queue.top().startOffset; + } + + @Override + public int endOffset() throws IOException { + return queue.top().endOffset; + } + + @Override + public BytesRef getPayload() throws IOException { + //The UnifiedHighlighter depends on the payload for a wildcard + //being the term representing it + return term; + } + + @Override + public int docID() { + return queue.top().postingsEnum.docID(); --- End diff -- K > Improvements to UnifiedHighlighter OffsetStrategies > --------------------------------------------------- > > Key: LUCENE-7526 > URL: https://issues.apache.org/jira/browse/LUCENE-7526 > Project: Lucene - Core > Issue Type: Improvement > Components: modules/highlighter > Reporter: Timothy M. Rodriguez > Assignee: David Smiley > Priority: Minor > Fix For: 6.4 > > > This ticket improves several of the UnifiedHighlighter FieldOffsetStrategies > by reducing reliance on creating or re-creating TokenStreams. > The primary changes are as follows: > * AnalysisOffsetStrategy - split into two offset strategies > ** MemoryIndexOffsetStrategy - the primary analysis mode that utilizes a > MemoryIndex for producing Offsets > ** TokenStreamOffsetStrategy - an offset strategy that avoids creating a > MemoryIndex. Can only be used if the query distills down to terms and > automata. > * TokenStream removal > ** MemoryIndexOffsetStrategy - previously a TokenStream was created to fill > the memory index and then once consumed a new one was generated by > uninverting the MemoryIndex back into a TokenStream if there were automata > (wildcard/mtq queries) involved. Now this is avoided, which should save > memory and avoid a second pass over the data. 
> ** TermVectorOffsetStrategy - this was refactored in a similar way to avoid > generating a TokenStream if automata are involved. > ** PostingsWithTermVectorsOffsetStrategy - similar refactoring > * CompositePostingsEnum - aggregates several underlying PostingsEnums for > wildcard/mtq queries. This should improve relevancy by providing unified > metrics for a wildcard across all its term matches > * Added a HighlightFlag for enabling the newly separated > TokenStreamOffsetStrategy since it can adversely affect passage relevancy -- This message was sent by Atlassian JIRA (v6.3.4#6332) --------------------------------------------------------------------- To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org