dsmiley commented on a change in pull request #412:
URL: https://github.com/apache/lucene/pull/412#discussion_r739877887



##########
File path: 
lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java
##########
@@ -143,88 +135,168 @@
 
   private int cacheFieldValCharsThreshold = DEFAULT_CACHE_CHARS_THRESHOLD;
 
-  /** Extracts matching terms after rewriting against an empty index */
-  protected static Set<Term> extractTerms(Query query) throws IOException {
-    Set<Term> queryTerms = new HashSet<>();
-    
EMPTY_INDEXSEARCHER.rewrite(query).visit(QueryVisitor.termCollector(queryTerms));
-    return queryTerms;
-  }
+  private Set<HighlightFlag> flags;
+
+  /** Builder for UnifiedHighlighter. */
+  public abstract static class Builder<T extends Builder<T>> {

Review comment:
       Why abstract?

##########
File path: 
lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java
##########
@@ -96,129 +96,14 @@ protected OffsetsEnum 
createOffsetsEnumFromReader(LeafReader leafReader, int doc
    */
   @Test
   public void testUnifiedHighlighterExtensibility() {
-    final int maxLength = 1000;
-    UnifiedHighlighter uh =
-        new UnifiedHighlighter(null, new MockAnalyzer(random())) {
-
-          @Override
-          protected Map<String, Object[]> highlightFieldsAsObjects(
-              String[] fieldsIn, Query query, int[] docIdsIn, int[] 
maxPassagesIn)
-              throws IOException {
-            return super.highlightFieldsAsObjects(fieldsIn, query, docIdsIn, 
maxPassagesIn);
-          }
-
-          @Override
-          protected OffsetSource getOffsetSource(String field) {
-            return super.getOffsetSource(field);
-          }
-
-          @Override
-          protected BreakIterator getBreakIterator(String field) {
-            return super.getBreakIterator(field);
-          }
-
-          @Override
-          protected PassageScorer getScorer(String field) {
-            return super.getScorer(field);
-          }
-
-          @Override
-          protected PassageFormatter getFormatter(String field) {
-            return super.getFormatter(field);
-          }
-
-          @Override
-          public Analyzer getIndexAnalyzer() {
-            return super.getIndexAnalyzer();
-          }
-
-          @Override
-          public IndexSearcher getIndexSearcher() {
-            return super.getIndexSearcher();
-          }
-
-          @Override
-          protected int getMaxNoHighlightPassages(String field) {
-            return super.getMaxNoHighlightPassages(field);
-          }
-
-          @Override
-          protected Boolean requiresRewrite(SpanQuery spanQuery) {
-            return super.requiresRewrite(spanQuery);
-          }
-
-          @Override
-          protected LimitedStoredFieldVisitor 
newLimitedStoredFieldsVisitor(String[] fields) {
-            return super.newLimitedStoredFieldsVisitor(fields);
-          }
-
-          @Override
-          protected List<CharSequence[]> loadFieldValues(
-              String[] fields, DocIdSetIterator docIter, int 
cacheCharsThreshold)
-              throws IOException {
-            return super.loadFieldValues(fields, docIter, cacheCharsThreshold);
-          }
-
-          @Override
-          protected FieldHighlighter getFieldHighlighter(
-              String field, Query query, Set<Term> allTerms, int maxPassages) {
-            // THIS IS A COPY of the superclass impl; but use 
CustomFieldHighlighter
-            UHComponents components = getHighlightComponents(field, query, 
allTerms);
-            OffsetSource offsetSource = getOptimizedOffsetSource(components);
-
-            // test all is accessible
-            components.getField();
-            components.getFieldMatcher();
-            components.getQuery();
-            components.getTerms();
-            components.getPhraseHelper();
-            components.getAutomata();
-            components.hasUnrecognizedQueryPart();
-            components.getHighlightFlags();
-
-            return new CustomFieldHighlighter(
-                field,
-                getOffsetStrategy(offsetSource, components),
-                new SplittingBreakIterator(
-                    getBreakIterator(field), 
UnifiedHighlighter.MULTIVAL_SEP_CHAR),
-                getScorer(field),
-                maxPassages,
-                getMaxNoHighlightPassages(field),
-                getFormatter(field));
-          }
-
-          @Override
-          protected UHComponents getHighlightComponents(
-              String field, Query query, Set<Term> allTerms) {
-            Predicate<String> fieldMatcher = getFieldMatcher(field);
-            BytesRef[] terms = filterExtractedTerms(fieldMatcher, allTerms);
-            Set<HighlightFlag> highlightFlags = getFlags(field);
-            PhraseHelper phraseHelper = getPhraseHelper(field, query, 
highlightFlags);
-            LabelledCharArrayMatcher[] automata = getAutomata(field, query, 
highlightFlags);
-            boolean queryHasUnrecognizedPart = false;
-            return new UHComponents(
-                field,
-                fieldMatcher,
-                query,
-                terms,
-                phraseHelper,
-                automata,
-                queryHasUnrecognizedPart,
-                highlightFlags);
-          }
-
-          @Override
-          protected FieldOffsetStrategy getOffsetStrategy(
-              OffsetSource offsetSource, UHComponents components) {
-            return super.getOffsetStrategy(offsetSource, components);
-          }
-
-          @Override
-          public int getMaxLength() {
-            return maxLength;
-          }
-        };
-    assertEquals(uh.getMaxLength(), maxLength);
+    SubUnifiedHighlighter uh =
+        SubUnifiedHighlighter.builder()
+            .withSearcher(null)
+            .withIndexAnalyzer(new MockAnalyzer(random()))
+            .withNewField(false)
+            .build();

Review comment:
       Just to be clear, the point of `testUnifiedHighlighterExtensibility` is 
so that we're aware of potential backwards-compatibility breaks that we might 
more easily overlook.  It's not a great technique... I've heard of Java build 
plugins that can alert you to this as well.  Any way, your refactor, I think, 
obscures this since the subclass is no longer within the method definition.  I 
think you made this change because you thought we need some typed builder but I 
don't think we do.  Just have an anonymous implementation of the builder that 
overrides build() to return an anonymous subclass of the UnifiedHighlighter.

##########
File path: 
lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java
##########
@@ -319,4 +329,12 @@ public void withTopDocs(IndexSearcher searcher, Query q, 
TopDocs hits) throws Ex
       }
     }
   }
+
+  /** This ConcreteBuilder has been created for this class only. */
+  private static class ConcreteBuilder extends 
UnifiedHighlighter.Builder<ConcreteBuilder> {

Review comment:
       I doubt we need "ConcreteBuilder" unless we need to add new options.

##########
File path: 
lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java
##########
@@ -288,17 +283,32 @@ private void reset(IndexSearcher searcher) {
         return;
       }
       lastSearcher = searcher;
-      highlighter =
-          new UnifiedHighlighter(searcher, analyzer) {
+      ConcreteBuilder concreteBuilder =
+          new ConcreteBuilder()
+              .withSearcher(searcher)
+              .withIndexAnalyzer(analyzer)
+              .withBreakIterator(() -> 
BreakIterator.getSentenceInstance(Locale.ENGLISH))
+              .withMaxLength(maxDocCharsToAnalyze)
+              .withHighlightPhrasesStrictly(true)
+              .withHandleMultiTermQuery(true);
+      UnifiedHighlighter.Builder<?> builder =
+          new UnifiedHighlighter.Builder<ConcreteBuilder>() {
             @Override
-            protected OffsetSource getOffsetSource(String field) {
-              return offsetSource != null ? offsetSource : 
super.getOffsetSource(field);
+            protected ConcreteBuilder self() {

Review comment:
       Is overriding `self` actually necessary here?

##########
File path: 
lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java
##########
@@ -143,88 +135,168 @@
 
   private int cacheFieldValCharsThreshold = DEFAULT_CACHE_CHARS_THRESHOLD;
 
-  /** Extracts matching terms after rewriting against an empty index */
-  protected static Set<Term> extractTerms(Query query) throws IOException {
-    Set<Term> queryTerms = new HashSet<>();
-    
EMPTY_INDEXSEARCHER.rewrite(query).visit(QueryVisitor.termCollector(queryTerms));
-    return queryTerms;
-  }
+  private Set<HighlightFlag> flags;
+
+  /** Builder for UnifiedHighlighter. */
+  public abstract static class Builder<T extends Builder<T>> {
+    private IndexSearcher searcher;
+    private Analyzer indexAnalyzer;
+    private boolean handleMultiTermQuery = true;
+    private boolean highlightPhrasesStrictly = true;
+    private boolean passageRelevancyOverSpeed = true;
+    private int maxLength = DEFAULT_MAX_LENGTH;
+    private Supplier<BreakIterator> breakIterator =
+        () -> BreakIterator.getSentenceInstance(Locale.ROOT);
+    private Predicate<String> fieldMatcher;
+    private PassageScorer scorer = new PassageScorer();
+    private PassageFormatter formatter = new DefaultPassageFormatter();
+    private int maxNoHighlightPassages = -1;
+    private int cacheFieldValCharsThreshold = DEFAULT_CACHE_CHARS_THRESHOLD;
+    private Set<HighlightFlag> flags;
+
+    public T withSearcher(IndexSearcher value) {

Review comment:
       All these "with" methods need Javadocs because users will see these and 
want to know what they do at a glance.  They won't go looking at protected 
methods on UnifiedHighlighter for this; that's too much work.  You can simply 
move these javadocs from where they were and add a javadoc back-reference from 
those places.

##########
File path: 
lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java
##########
@@ -117,13 +116,6 @@
 
   protected final Analyzer indexAnalyzer;
 
-  private boolean defaultHandleMtq = true; // e.g. wildcards
-
-  private boolean defaultHighlightPhrasesStrictly = true; // AKA "accuracy" or 
"query debugging"
-
-  // For analysis, prefer MemoryIndexOffsetStrategy
-  private boolean defaultPassageRelevancyOverSpeed = true;
-
   private int maxLength = DEFAULT_MAX_LENGTH;

Review comment:
       All fields should be final.

##########
File path: 
lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java
##########
@@ -143,88 +135,168 @@
 
   private int cacheFieldValCharsThreshold = DEFAULT_CACHE_CHARS_THRESHOLD;
 
-  /** Extracts matching terms after rewriting against an empty index */
-  protected static Set<Term> extractTerms(Query query) throws IOException {
-    Set<Term> queryTerms = new HashSet<>();
-    
EMPTY_INDEXSEARCHER.rewrite(query).visit(QueryVisitor.termCollector(queryTerms));
-    return queryTerms;
-  }
+  private Set<HighlightFlag> flags;
+
+  /** Builder for UnifiedHighlighter. */
+  public abstract static class Builder<T extends Builder<T>> {
+    private IndexSearcher searcher;
+    private Analyzer indexAnalyzer;
+    private boolean handleMultiTermQuery = true;
+    private boolean highlightPhrasesStrictly = true;
+    private boolean passageRelevancyOverSpeed = true;
+    private int maxLength = DEFAULT_MAX_LENGTH;
+    private Supplier<BreakIterator> breakIterator =
+        () -> BreakIterator.getSentenceInstance(Locale.ROOT);
+    private Predicate<String> fieldMatcher;
+    private PassageScorer scorer = new PassageScorer();
+    private PassageFormatter formatter = new DefaultPassageFormatter();
+    private int maxNoHighlightPassages = -1;
+    private int cacheFieldValCharsThreshold = DEFAULT_CACHE_CHARS_THRESHOLD;
+    private Set<HighlightFlag> flags;
+
+    public T withSearcher(IndexSearcher value) {
+      this.searcher = value;
+      return self();
+    }
 
-  /**
-   * Constructs the highlighter with the given index searcher and analyzer.

Review comment:
       This good info has no vanished?

##########
File path: 
lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java
##########
@@ -143,88 +135,168 @@
 
   private int cacheFieldValCharsThreshold = DEFAULT_CACHE_CHARS_THRESHOLD;
 
-  /** Extracts matching terms after rewriting against an empty index */
-  protected static Set<Term> extractTerms(Query query) throws IOException {
-    Set<Term> queryTerms = new HashSet<>();
-    
EMPTY_INDEXSEARCHER.rewrite(query).visit(QueryVisitor.termCollector(queryTerms));
-    return queryTerms;
-  }
+  private Set<HighlightFlag> flags;
+
+  /** Builder for UnifiedHighlighter. */
+  public abstract static class Builder<T extends Builder<T>> {
+    private IndexSearcher searcher;
+    private Analyzer indexAnalyzer;
+    private boolean handleMultiTermQuery = true;
+    private boolean highlightPhrasesStrictly = true;
+    private boolean passageRelevancyOverSpeed = true;
+    private int maxLength = DEFAULT_MAX_LENGTH;
+    private Supplier<BreakIterator> breakIterator =
+        () -> BreakIterator.getSentenceInstance(Locale.ROOT);
+    private Predicate<String> fieldMatcher;
+    private PassageScorer scorer = new PassageScorer();
+    private PassageFormatter formatter = new DefaultPassageFormatter();
+    private int maxNoHighlightPassages = -1;
+    private int cacheFieldValCharsThreshold = DEFAULT_CACHE_CHARS_THRESHOLD;
+    private Set<HighlightFlag> flags;
+
+    public T withSearcher(IndexSearcher value) {
+      this.searcher = value;
+      return self();
+    }
 
-  /**
-   * Constructs the highlighter with the given index searcher and analyzer.
-   *
-   * @param indexSearcher Usually required, unless {@link 
#highlightWithoutSearcher(String, Query,
-   *     String, int)} is used, in which case this needs to be null.
-   * @param indexAnalyzer Required, even if in some circumstances it isn't 
used.
-   */
-  public UnifiedHighlighter(IndexSearcher indexSearcher, Analyzer 
indexAnalyzer) {
-    this.searcher = indexSearcher; // TODO: make non nullable
-    this.indexAnalyzer =
-        Objects.requireNonNull(
-            indexAnalyzer,
-            "indexAnalyzer is required" + " (even if in some circumstances it 
isn't used)");
-  }
+    public T withIndexAnalyzer(Analyzer value) {
+      this.indexAnalyzer = value;
+      return self();
+    }
 
-  public void setHandleMultiTermQuery(boolean handleMtq) {
-    this.defaultHandleMtq = handleMtq;
-  }
+    public T withFlags(Set<HighlightFlag> values) {
+      this.flags = values;
+      return self();
+    }
 
-  public void setHighlightPhrasesStrictly(boolean highlightPhrasesStrictly) {
-    this.defaultHighlightPhrasesStrictly = highlightPhrasesStrictly;
-  }
+    public T withHandleMultiTermQuery(boolean value) {
+      this.handleMultiTermQuery = value;
+      return self();
+    }
 
-  public void setMaxLength(int maxLength) {
-    if (maxLength < 0 || maxLength == Integer.MAX_VALUE) {
-      // two reasons: no overflow problems in 
BreakIterator.preceding(offset+1),
-      // our sentinel in the offsets queue uses this value to terminate.
-      throw new IllegalArgumentException("maxLength must be < 
Integer.MAX_VALUE");
+    public T withHighlightPhrasesStrictly(boolean value) {
+      this.highlightPhrasesStrictly = value;
+      return self();
     }
-    this.maxLength = maxLength;
-  }
 
-  public void setBreakIterator(Supplier<BreakIterator> breakIterator) {
-    this.defaultBreakIterator = breakIterator;
-  }
+    public T withPassageRelevancyOverSpeed(boolean value) {
+      this.passageRelevancyOverSpeed = value;
+      return self();
+    }
 
-  public void setScorer(PassageScorer scorer) {
-    this.defaultScorer = scorer;
-  }
+    public T withMaxLength(int value) {
+      if (value < 0 || value == Integer.MAX_VALUE) {
+        // two reasons: no overflow problems in 
BreakIterator.preceding(offset+1),
+        // our sentinel in the offsets queue uses this value to terminate.
+        throw new IllegalArgumentException("maxLength must be < 
Integer.MAX_VALUE");
+      }
+      this.maxLength = value;
+      return self();
+    }
 
-  public void setFormatter(PassageFormatter formatter) {
-    this.defaultFormatter = formatter;
-  }
+    public T withBreakIterator(Supplier<BreakIterator> value) {
+      this.breakIterator = value;
+      return self();
+    }
 
-  public void setMaxNoHighlightPassages(int defaultMaxNoHighlightPassages) {
-    this.defaultMaxNoHighlightPassages = defaultMaxNoHighlightPassages;
-  }
+    public T withFieldMatcher(Predicate<String> value) {
+      this.fieldMatcher = value;
+      return self();
+    }
 
-  public void setCacheFieldValCharsThreshold(int cacheFieldValCharsThreshold) {
-    this.cacheFieldValCharsThreshold = cacheFieldValCharsThreshold;
-  }
+    public T withScorer(PassageScorer value) {
+      this.scorer = value;
+      return self();
+    }
+
+    public T withFormatter(PassageFormatter value) {
+      this.formatter = value;
+      return self();
+    }
+
+    public T withMaxNoHighlightPassages(int value) {
+      this.maxNoHighlightPassages = value;
+      return self();
+    }
+
+    public T withCacheFieldValCharsThreshold(int value) {
+      this.cacheFieldValCharsThreshold = value;
+      return self();
+    }
+
+    protected abstract T self();
+
+    public UnifiedHighlighter build() {
+      return new UnifiedHighlighter(this);
+    }
+
+    protected Set<HighlightFlag> evaluateFlags() {
+      if (Objects.nonNull(flags) && !flags.isEmpty()) {
+        return flags;
+      }
+
+      Set<HighlightFlag> highlightFlags = EnumSet.noneOf(HighlightFlag.class);
+      if (handleMultiTermQuery) {
+        highlightFlags.add(HighlightFlag.MULTI_TERM_QUERY);
+      }
+      if (highlightPhrasesStrictly) {
+        highlightFlags.add(HighlightFlag.PHRASES);
+      }
+      if (passageRelevancyOverSpeed) {
+        highlightFlags.add(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED);
+      }
 
-  public void setFieldMatcher(Predicate<String> predicate) {
-    this.defaultFieldMatcher = predicate;
+      // Evaluate if WEIGHT_MATCHES can be added as a flag.
+      if (highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY)

Review comment:
       I think we should have a `withWeightMatches` as well so that a user can 
easily opt-out.

##########
File path: 
lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java
##########
@@ -726,54 +752,54 @@ public void testWhichMTQMatched() throws Exception {
     // Default formatter just bolds each hit:
     assertEquals("<b>Test</b> a <b>one</b> <b>sentence</b> document.", 
snippets[0]);
 
-    // Now use our own formatter, that also stuffs the
-    // matching term's text into the result:
-    highlighter =
-        new UnifiedHighlighter(searcher, indexAnalyzer) {
-
+    PassageFormatter passageFormatter =
+        new PassageFormatter() {
           @Override
-          protected PassageFormatter getFormatter(String field) {
-            return new PassageFormatter() {
-
-              @Override
-              public Object format(Passage[] passages, String content) {
-                // Copied from DefaultPassageFormatter, but
-                // tweaked to include the matched term:
-                StringBuilder sb = new StringBuilder();
-                int pos = 0;
-                for (Passage passage : passages) {
-                  // don't add ellipsis if its the first one, or if its 
connected.
-                  if (passage.getStartOffset() > pos && pos > 0) {
-                    sb.append("... ");
-                  }
-                  pos = passage.getStartOffset();
-                  for (int i = 0; i < passage.getNumMatches(); i++) {
-                    int start = passage.getMatchStarts()[i];
-                    int end = passage.getMatchEnds()[i];
-                    // its possible to have overlapping terms
-                    if (start > pos) {
-                      sb.append(content, pos, start);
-                    }
-                    if (end > pos) {
-                      sb.append("<b>");
-                      sb.append(content, Math.max(pos, start), end);
-                      sb.append('(');
-                      sb.append(passage.getMatchTerms()[i].utf8ToString());
-                      sb.append(')');
-                      sb.append("</b>");
-                      pos = end;
-                    }
-                  }
-                  // its possible a "term" from the analyzer could span a 
sentence boundary.
-                  sb.append(content, pos, Math.max(pos, 
passage.getEndOffset()));
-                  pos = passage.getEndOffset();
+          public Object format(Passage[] passages, String content) {
+            // Copied from DefaultPassageFormatter, but
+            // tweaked to include the matched term:
+            StringBuilder sb = new StringBuilder();
+            int pos = 0;
+            for (Passage passage : passages) {
+              // don't add ellipsis if its the first one, or if its connected.
+              if (passage.getStartOffset() > pos && pos > 0) {
+                sb.append("... ");
+              }
+              pos = passage.getStartOffset();
+              for (int i = 0; i < passage.getNumMatches(); i++) {
+                int start = passage.getMatchStarts()[i];
+                int end = passage.getMatchEnds()[i];
+                // its possible to have overlapping terms
+                if (start > pos) {
+                  sb.append(content, pos, start);
+                }
+                if (end > pos) {
+                  sb.append("<b>");
+                  sb.append(content, Math.max(pos, start), end);
+                  sb.append('(');
+                  sb.append(passage.getMatchTerms()[i].utf8ToString());
+                  sb.append(')');
+                  sb.append("</b>");
+                  pos = end;
                 }
-                return sb.toString();
               }
-            };
+              // its possible a "term" from the analyzer could span a sentence 
boundary.
+              sb.append(content, pos, Math.max(pos, passage.getEndOffset()));
+              pos = passage.getEndOffset();
+            }
+            return sb.toString();
           }
         };
 
+    // Now use our own formatter, that also stuffs the
+    // matching term's text into the result:

Review comment:
       This comment used to immediately precede the formatter definition 
because it was talking about it.  After your refactoring (which I like), the 
comment is not in the right place anymore.

##########
File path: 
lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java
##########
@@ -143,88 +135,168 @@
 
   private int cacheFieldValCharsThreshold = DEFAULT_CACHE_CHARS_THRESHOLD;
 
-  /** Extracts matching terms after rewriting against an empty index */
-  protected static Set<Term> extractTerms(Query query) throws IOException {
-    Set<Term> queryTerms = new HashSet<>();
-    
EMPTY_INDEXSEARCHER.rewrite(query).visit(QueryVisitor.termCollector(queryTerms));
-    return queryTerms;
-  }
+  private Set<HighlightFlag> flags;
+
+  /** Builder for UnifiedHighlighter. */
+  public abstract static class Builder<T extends Builder<T>> {

Review comment:
       I do appreciate the point of the generic type, but I suspect it's value 
isn't worth the verbosity for the, I believe, common case where a custom 
Builder type isn't necessary.  It's only necessary when new options are being 
added.

##########
File path: 
lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java
##########
@@ -117,7 +117,11 @@ public void testWildcards() throws Exception {
     iw.close();
 
     IndexSearcher searcher = newSearcher(ir);
-    UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, 
indexAnalyzer);
+    UnifiedHighlighter.ConcreteBuilder concreteBuilder =

Review comment:
       Nitpick: I wouldn't bother declaring an intermediate variable for the 
builder.  With "fluent" interfaces (what a builder is), you can chain method 
calls and get the final thing you want -- the UnifiedHighlighter instance 
itself.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

Reply via email to