[GitHub] [lucene] apanimesh061 commented on a change in pull request #412: LUCENE-10197: UnifiedHighlighter should use builders for thread-safety

GitBox Sat, 04 Dec 2021 21:50:48 -0800


apanimesh061 commented on a change in pull request #412:
URL: https://github.com/apache/lucene/pull/412#discussion_r762514373




##########
File path: 
lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java
##########
@@ -113,118 +112,239 @@
   protected static final LabelledCharArrayMatcher[] ZERO_LEN_AUTOMATA_ARRAY =
       new LabelledCharArrayMatcher[0];
 
-  protected final IndexSearcher searcher; // if null, can only use 
highlightWithoutSearcher
+  protected final IndexSearcher searcher;
 
   protected final Analyzer indexAnalyzer;
 
-  private boolean defaultHandleMtq = true; // e.g. wildcards
+  private final int maxLength;
 
-  private boolean defaultHighlightPhrasesStrictly = true; // AKA "accuracy" or 
"query debugging"
+  private final Supplier<BreakIterator> defaultBreakIterator;
 
-  // For analysis, prefer MemoryIndexOffsetStrategy
-  private boolean defaultPassageRelevancyOverSpeed = true;
+  private final Predicate<String> defaultFieldMatcher;
 
-  private int maxLength = DEFAULT_MAX_LENGTH;
+  private final PassageScorer defaultScorer;
 
-  // BreakIterator is stateful so we use a Supplier factory method
-  private Supplier<BreakIterator> defaultBreakIterator =
-      () -> BreakIterator.getSentenceInstance(Locale.ROOT);
+  private final PassageFormatter defaultFormatter;
 
-  private Predicate<String> defaultFieldMatcher;
+  private final int defaultMaxNoHighlightPassages;
 
-  private PassageScorer defaultScorer = new PassageScorer();
+  // lazy initialized with double-check locking; protected so subclass can init
+  protected volatile FieldInfos fieldInfos;
 
-  private PassageFormatter defaultFormatter = new DefaultPassageFormatter();
+  private final int cacheFieldValCharsThreshold;
 
-  private int defaultMaxNoHighlightPassages = -1;
+  private final Set<HighlightFlag> flags;
 
-  // lazy initialized with double-check locking; protected so subclass can init
-  protected volatile FieldInfos fieldInfos;
+  /** Builder for UnifiedHighlighter. */
+  public static class Builder {
+    /** If null, can only use highlightWithoutSearcher. */
+    private IndexSearcher searcher;
 
-  private int cacheFieldValCharsThreshold = DEFAULT_CACHE_CHARS_THRESHOLD;
+    private Analyzer indexAnalyzer;
+    private boolean handleMultiTermQuery = true;
+    private boolean highlightPhrasesStrictly = true;
+    private boolean passageRelevancyOverSpeed = true;
+    private boolean weightMatches = true;
+    private int maxLength = DEFAULT_MAX_LENGTH;
 
-  /** Extracts matching terms after rewriting against an empty index */
-  protected static Set<Term> extractTerms(Query query) throws IOException {
-    Set<Term> queryTerms = new HashSet<>();
-    
EMPTY_INDEXSEARCHER.rewrite(query).visit(QueryVisitor.termCollector(queryTerms));
-    return queryTerms;
-  }
+    /** BreakIterator is stateful so we use a Supplier factory method. */
+    private Supplier<BreakIterator> breakIterator =
+        () -> BreakIterator.getSentenceInstance(Locale.ROOT);
 
-  /**
-   * Constructs the highlighter with the given index searcher and analyzer.
-   *
-   * @param indexSearcher Usually required, unless {@link 
#highlightWithoutSearcher(String, Query,
-   *     String, int)} is used, in which case this needs to be null.
-   * @param indexAnalyzer Required, even if in some circumstances it isn't 
used.
-   */
-  public UnifiedHighlighter(IndexSearcher indexSearcher, Analyzer 
indexAnalyzer) {
-    this.searcher = indexSearcher; // TODO: make non nullable
-    this.indexAnalyzer =
-        Objects.requireNonNull(
-            indexAnalyzer,
-            "indexAnalyzer is required" + " (even if in some circumstances it 
isn't used)");
-  }
+    private Predicate<String> fieldMatcher;
+    private PassageScorer scorer = new PassageScorer();
+    private PassageFormatter formatter = new DefaultPassageFormatter();
+    private int maxNoHighlightPassages = -1;
+    private int cacheFieldValCharsThreshold = DEFAULT_CACHE_CHARS_THRESHOLD;
+    private Set<HighlightFlag> flags;
 
-  public void setHandleMultiTermQuery(boolean handleMtq) {
-    this.defaultHandleMtq = handleMtq;
-  }
+    /**
+     * Usually required, unless {@link #highlightWithoutSearcher(String, 
Query, String, int)} is
+     * used, in which case this needs to be null.
+     */
+    public Builder withSearcher(IndexSearcher value) {
+      this.searcher = value;
+      return self();
+    }
 
-  public void setHighlightPhrasesStrictly(boolean highlightPhrasesStrictly) {
-    this.defaultHighlightPhrasesStrictly = highlightPhrasesStrictly;
-  }
+    /**
+     * This method sets the analyzer for the UH object. Required, even if in 
some circumstances it
+     * isn't used. The null check is performed in the constructor.
+     */
+    public Builder withIndexAnalyzer(Analyzer value) {
+      this.indexAnalyzer = value;
+      return self();
+    }
 
-  public void setMaxLength(int maxLength) {
-    if (maxLength < 0 || maxLength == Integer.MAX_VALUE) {
-      // two reasons: no overflow problems in 
BreakIterator.preceding(offset+1),
-      // our sentinel in the offsets queue uses this value to terminate.
-      throw new IllegalArgumentException("maxLength must be < 
Integer.MAX_VALUE");
+    /**
+     * User-defined set of {@link HighlightFlag} values which will override 
the flags set by {@link
+     * #withHandleMultiTermQuery(boolean)}, {@link 
#withHighlightPhrasesStrictly(boolean)}, {@link
+     * #withPassageRelevancyOverSpeed(boolean)} and {@link 
#withWeightMatches(boolean)}.
+     *
+     * @param values - set of {@link HighlightFlag} values.
+     */
+    public Builder withFlags(Set<HighlightFlag> values) {
+      this.flags = values;
+      return self();
     }
-    this.maxLength = maxLength;
-  }
 
-  public void setBreakIterator(Supplier<BreakIterator> breakIterator) {
-    this.defaultBreakIterator = breakIterator;
-  }
+    /**
+     * Here position sensitive queries (e.g. phrases and {@link SpanQuery}ies) 
are highlighted
+     * strictly based on query matches (slower) versus any/all occurrences of 
the underlying terms.
+     * By default it's enabled, but there's no overhead if such queries aren't 
used.
+     */
+    public Builder withHighlightPhrasesStrictly(boolean value) {
+      this.highlightPhrasesStrictly = value;
+      return self();
+    }
 
-  public void setScorer(PassageScorer scorer) {
-    this.defaultScorer = scorer;
-  }
+    /**
+     * Here {@link org.apache.lucene.search.MultiTermQuery} derivatives will 
be highlighted. By
+     * default it's enabled. MTQ highlighting can be expensive, particularly 
when using offsets in
+     * postings.
+     */
+    public Builder withHandleMultiTermQuery(boolean value) {
+      this.handleMultiTermQuery = value;
+      return self();
+    }
 
-  public void setFormatter(PassageFormatter formatter) {
-    this.defaultFormatter = formatter;
-  }
+    /** Passage relevancy is more important than speed. True by default. */
+    public Builder withPassageRelevancyOverSpeed(boolean value) {
+      this.passageRelevancyOverSpeed = value;
+      return self();
+    }
 
-  public void setMaxNoHighlightPassages(int defaultMaxNoHighlightPassages) {
-    this.defaultMaxNoHighlightPassages = defaultMaxNoHighlightPassages;
-  }
+    /**
+     * Internally use the {@link Weight#matches(LeafReaderContext, int)} API 
for highlighting. It's
+     * more accurate to the query, and the snippets can be a little different 
for phrases because
+     * the whole phrase is marked up instead of each word. The passage 
relevancy calculation can be
+     * different (maybe worse?) and it's slower when highlighting many fields. 
Use of this flag
+     * requires {@link HighlightFlag#MULTI_TERM_QUERY} and {@link 
HighlightFlag#PHRASES} and {@link
+     * HighlightFlag#PASSAGE_RELEVANCY_OVER_SPEED}. True by default because 
those booleans are true
+     * by default.
+     */
+    public Builder withWeightMatches(boolean value) {
+      this.weightMatches = value;
+      return self();
+    }
+
+    public Builder withMaxLength(int value) {
+      if (value < 0 || value == Integer.MAX_VALUE) {
+        // two reasons: no overflow problems in 
BreakIterator.preceding(offset+1),
+        // our sentinel in the offsets queue uses this value to terminate.
+        throw new IllegalArgumentException("maxLength must be < 
Integer.MAX_VALUE");
+      }
+      this.maxLength = value;
+      return self();
+    }
+
+    public Builder withBreakIterator(Supplier<BreakIterator> value) {
+      this.breakIterator = value;
+      return self();
+    }
 
-  public void setCacheFieldValCharsThreshold(int cacheFieldValCharsThreshold) {
-    this.cacheFieldValCharsThreshold = cacheFieldValCharsThreshold;
+    public Builder withFieldMatcher(Predicate<String> value) {
+      this.fieldMatcher = value;
+      return self();
+    }
+
+    public Builder withScorer(PassageScorer value) {
+      this.scorer = value;
+      return self();
+    }
+
+    public Builder withFormatter(PassageFormatter value) {
+      this.formatter = value;
+      return self();
+    }
+
+    public Builder withMaxNoHighlightPassages(int value) {
+      this.maxNoHighlightPassages = value;
+      return self();
+    }
+
+    public Builder withCacheFieldValCharsThreshold(int value) {
+      this.cacheFieldValCharsThreshold = value;
+      return self();
+    }
+
+    /**
+     * This method returns the set of {@link HighlightFlag}s, which will be 
applied to the UH
+     * object. The output depends on the values provided to {@link
+     * #withHandleMultiTermQuery(boolean)}, {@link 
#withHighlightPhrasesStrictly(boolean)}, {@link
+     * #withPassageRelevancyOverSpeed(boolean)} and {@link 
#withWeightMatches(boolean)}.
+     *
+     * @return a set of {@link HighlightFlag}s.
+     */
+    protected Set<HighlightFlag> evaluateFlags() {
+      if (Objects.nonNull(flags) && !flags.isEmpty()) {

Review comment:
       Yeah that makes sense. Will fix.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

[GitHub] [lucene] apanimesh061 commented on a change in pull request #412: LUCENE-10197: UnifiedHighlighter should use builders for thread-safety

Reply via email to