Github user dsmiley commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/384#discussion_r191578308 --- Diff: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilter.java --- @@ -31,80 +33,106 @@ import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.automaton.Automaton; -import org.apache.lucene.util.automaton.FiniteStringsIterator; import org.apache.lucene.util.automaton.LimitedFiniteStringsIterator; import org.apache.lucene.util.automaton.Operations; +import org.apache.lucene.util.automaton.TooComplexToDeterminizeException; import org.apache.lucene.util.automaton.Transition; import org.apache.lucene.util.fst.Util; -import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.DEFAULT_MAX_GRAPH_EXPANSIONS; -import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.DEFAULT_PRESERVE_POSITION_INCREMENTS; -import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.DEFAULT_PRESERVE_SEP; -import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.SEP_LABEL; - /** - * Token stream which converts a provided token stream to an automaton. - * The accepted strings enumeration from the automaton are available through the - * {@link org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute} attribute - * The token stream uses a {@link org.apache.lucene.analysis.tokenattributes.PayloadAttribute} to store - * a completion's payload (see {@link CompletionTokenStream#setPayload(org.apache.lucene.util.BytesRef)}) + * Concatenates/Joins every incoming token with a separator into one output token for every path through the + * token stream (which is a graph). In simple cases this yields one token, but in the presence of any tokens with + * a zero positionIncrement (e.g. synonyms) it will be more. 
This filter uses the token bytes, position increment, + * and position length of the incoming stream. Other attributes are not used or manipulated. * * @lucene.experimental */ -public final class CompletionTokenStream extends TokenStream { +public final class ConcatenateGraphFilter extends TokenFilter { + + /* + * Token stream which converts a provided token stream to an automaton. + * The accepted strings enumeration from the automaton are available through the + * {@link org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute} attribute + * The token stream uses a {@link org.apache.lucene.analysis.tokenattributes.PayloadAttribute} to store + * a completion's payload (see {@link ConcatenateGraphFilter#setPayload(org.apache.lucene.util.BytesRef)}) + */ + + /** + * Represents the separation between tokens, if + * <code>preserveSep</code> is <code>true</code>. + */ + public final static char SEP_CHAR = '\u001F'; --- End diff -- One user is the SolrTextTagger, which needs to know the separator character between concatenated terms: https://github.com/OpenSextant/SolrTextTagger/blob/master/src/main/java/org/opensextant/solrtexttagger/TermPrefixCursor.java#L45 (being added to Solr in SOLR-12376). Hmmm, though it is a byte there, which supports your point. Maybe I should change SEP_CHAR to a byte and add a short comment explaining this?
--- --------------------------------------------------------------------- To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org