Gehel has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/383084 )
Change subject: Revert "Stop maintaining FuzzyLikeThis" ...................................................................... Revert "Stop maintaining FuzzyLikeThis" This reverts commit 3454e729ccd77d92666bc2347daceb760f83cf56. We still need it, the quick workaround did not work quite well. Suggested approach is now to add potential replacements behind an activation flag (url param) so that production usage is not affected by future experiments. Bug: T177727 Change-Id: I3f3051c87297d3194f52fff6f3ed77b0326c4d84 --- M src/main/java/org/wikimedia/search/extra/ExtraPlugin.java A src/main/java/org/wikimedia/search/extra/fuzzylike/FuzzyLikeThisQuery.java A src/main/java/org/wikimedia/search/extra/fuzzylike/FuzzyLikeThisQueryBuilder.java A src/main/java/org/wikimedia/search/extra/fuzzylike/package-info.java A src/test/java/org/wikimedia/search/extra/fuzzylike/FuzzyLikeThisIntegrationTest.java 5 files changed, 737 insertions(+), 0 deletions(-) Approvals: jenkins-bot: Verified Gehel: Looks good to me, approved diff --git a/src/main/java/org/wikimedia/search/extra/ExtraPlugin.java b/src/main/java/org/wikimedia/search/extra/ExtraPlugin.java index a570c8b..1f51284 100644 --- a/src/main/java/org/wikimedia/search/extra/ExtraPlugin.java +++ b/src/main/java/org/wikimedia/search/extra/ExtraPlugin.java @@ -41,6 +41,7 @@ import org.elasticsearch.threadpool.ThreadPool; import org.elasticsearch.watcher.ResourceWatcherService; import org.wikimedia.search.extra.analysis.filters.PreserveOriginalFilterFactory; +import org.wikimedia.search.extra.fuzzylike.FuzzyLikeThisQueryBuilder; import org.wikimedia.search.extra.latency.LatencyStatsAction; import org.wikimedia.search.extra.latency.RestGetLatencyStats; import org.wikimedia.search.extra.latency.SearchLatencyListener; @@ -101,6 +102,7 @@ public List<QuerySpec<?>> getQueries() { return asList( new QuerySpec<>(SourceRegexQueryBuilder.NAME, SourceRegexQueryBuilder::new, SourceRegexQueryBuilder::fromXContent), + new 
QuerySpec<>(FuzzyLikeThisQueryBuilder.NAME, FuzzyLikeThisQueryBuilder::new, FuzzyLikeThisQueryBuilder::fromXContent), new QuerySpec<>(TokenCountRouterQueryBuilder.NAME, TokenCountRouterQueryBuilder::new, TokenCountRouterQueryBuilder::fromXContent), new QuerySpec<>(DegradedRouterQueryBuilder.NAME, (in) -> new DegradedRouterQueryBuilder(in, loadStats), diff --git a/src/main/java/org/wikimedia/search/extra/fuzzylike/FuzzyLikeThisQuery.java b/src/main/java/org/wikimedia/search/extra/fuzzylike/FuzzyLikeThisQuery.java new file mode 100644 index 0000000..1b461d6 --- /dev/null +++ b/src/main/java/org/wikimedia/search/extra/fuzzylike/FuzzyLikeThisQuery.java @@ -0,0 +1,293 @@ +package org.wikimedia.search.extra.fuzzylike; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; + +import javax.annotation.Nullable; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.BoostAttribute; +import org.apache.lucene.search.BoostQuery; +import org.apache.lucene.search.ConstantScoreQuery; +import org.apache.lucene.search.FuzzyTermsEnum; +import org.apache.lucene.search.MatchNoDocsQuery; +import org.apache.lucene.search.MaxNonCompetitiveBoostAttribute; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.similarities.ClassicSimilarity; +import org.apache.lucene.search.similarities.TFIDFSimilarity; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.PriorityQueue; + +import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; +import lombok.EqualsAndHashCode; + +/** + * Fuzzifies ALL terms provided as strings and then picks the best n differentiating terms. + * In effect this mixes the behaviour of FuzzyQuery and MoreLikeThis but with special consideration + * of fuzzy scoring factors. + * This generally produces good results for queries where users may provide details in a number of + * fields and have no knowledge of boolean query syntax and also want a degree of fuzzy matching and + * a fast query. 
+ * <p> + * For each source term the fuzzy variants are held in a BooleanQuery with no coord factor (because + * we are not looking for matches on multiple variants in any one doc). Additionally, a specialized + * TermQuery is used for variants and does not use that variant term's IDF because this would favour rarer + * terms eg misspellings. Instead, all variants use the same IDF ranking (the one for the source query + * term) and this is factored into the variant's boost. If the source query term does not exist in the + * index the average IDF of the variants is used. + */ +@Deprecated +@EqualsAndHashCode(callSuper = false, of = {"analyzer", "fieldVals", "ignoreTF", "maxNumTerms"}) +public class FuzzyLikeThisQuery extends Query { + // TODO: generalize this query (at least it should not reuse this static sim! + // a better way might be to convert this into multitermquery rewrite methods. + // the rewrite method can 'average' the TermContext's term statistics (docfreq,totalTermFreq) + // provided to TermQuery, so that the general idea is agnostic to any scoring system... 
+ static final TFIDFSimilarity sim = new ClassicSimilarity(); + ArrayList<FieldVals> fieldVals = new ArrayList<>(); + Analyzer analyzer; + + final ScoreTermQueue q; + private static final int MAX_VARIANTS_PER_TERM = 50; + boolean ignoreTF; + private final int maxNumTerms; + + /** + * @param maxNumTerms The total number of terms clauses that will appear once rewritten as a BooleanQuery + */ + public FuzzyLikeThisQuery(int maxNumTerms, Analyzer analyzer) { + q = new ScoreTermQueue(maxNumTerms); + this.analyzer = analyzer; + this.maxNumTerms = maxNumTerms; + } + + @EqualsAndHashCode + private static class FieldVals { + @Nullable + final String queryString; + final String fieldName; + final float minSimilarity; + final int prefixLength; + + FieldVals(String name, float similarity, int length, String queryString) { + fieldName = name; + minSimilarity = similarity; + prefixLength = length; + this.queryString = queryString; + } + } + + /** + * Adds user input for "fuzzification". + * + * @param queryString The string which will be parsed by the analyzer and for which fuzzy variants will be parsed + * @param minSimilarity The minimum similarity of the term variants (see FuzzyTermsEnum) + * @param prefixLength Length of required common prefix on variant terms (see FuzzyTermsEnum) + */ + public void addTerms(String queryString, String fieldName, float minSimilarity, int prefixLength) { + fieldVals.add(new FieldVals(fieldName, minSimilarity, prefixLength, queryString)); + } + + @SuppressWarnings("CyclomaticComplexity") + private void addTerms(IndexReader reader, FieldVals f) throws IOException { + if (f.queryString == null) return; + final Terms terms = MultiFields.getTerms(reader, f.fieldName); + if (terms == null) { + return; + } + TokenStream ts = analyzer.tokenStream(f.fieldName, f.queryString); + try { + CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); + + int corpusNumDocs = reader.numDocs(); + HashSet<String> processedTerms = new HashSet<>(); + 
ts.reset(); + while (ts.incrementToken()) { + String term = termAtt.toString(); + if (!processedTerms.contains(term)) { + processedTerms.add(term); + ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term + float minScore = 0; + Term startTerm = new Term(f.fieldName, term); + AttributeSource atts = new AttributeSource(); + MaxNonCompetitiveBoostAttribute maxBoostAtt = + atts.addAttribute(MaxNonCompetitiveBoostAttribute.class); + FuzzyTermsEnum fe = new FuzzyTermsEnum(terms, atts, startTerm, f.minSimilarity, f.prefixLength, true); + //store the df so all variants use same idf + int df = reader.docFreq(startTerm); + int numVariants = 0; + int totalVariantDocFreqs = 0; + BytesRef possibleMatch; + BoostAttribute boostAtt = + fe.attributes().addAttribute(BoostAttribute.class); + while ((possibleMatch = fe.next()) != null) { + numVariants++; + totalVariantDocFreqs += fe.docFreq(); + float score = boostAtt.getBoost(); + if (variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore) { + ScoreTerm st = new ScoreTerm(new Term(startTerm.field(), BytesRef.deepCopyOf(possibleMatch)), score, startTerm); + variantsQ.insertWithOverflow(st); + minScore = variantsQ.top().score; // maintain minScore + } + maxBoostAtt.setMaxNonCompetitiveBoost(variantsQ.size() >= MAX_VARIANTS_PER_TERM ? 
minScore : Float.NEGATIVE_INFINITY); + } + + if (numVariants > 0) { + int avgDf = totalVariantDocFreqs / numVariants; + if (df == 0) { + //no direct match we can use as df for all variants + df = avgDf; //use avg df of all variants + } + + // take the top variants (scored by edit distance) and reset the score + // to include an IDF factor then add to the global queue for ranking + // overall top query terms + int size = variantsQ.size(); + for (int i = 0; i < size; i++) { + ScoreTerm st = variantsQ.pop(); + st.score = (st.score * st.score) * sim.idf(df, corpusNumDocs); + q.insertWithOverflow(st); + } + } + } + } + ts.end(); + } finally { + IOUtils.closeWhileHandlingException(ts); + } + } + + @Override + @SuppressFBWarnings( + value = "PCAIL_POSSIBLE_CONSTANT_ALLOCATION_IN_LOOP", + justification = "builder should not be reused") + public Query rewrite(IndexReader reader) throws IOException { + //load up the list of possible terms + for (FieldVals f : fieldVals) { + addTerms(reader, f); + } + + BooleanQuery.Builder bq = new BooleanQuery.Builder(); + + //create BooleanQueries to hold the variants for each token/field pair and ensure it + // has no coord factor + //Step 1: sort the termqueries by term/field + HashMap<Term, ArrayList<ScoreTerm>> variantQueries = new HashMap<>(); + int size = q.size(); + for (int i = 0; i < size; i++) { + ScoreTerm st = q.pop(); + ArrayList<ScoreTerm> l = variantQueries.computeIfAbsent(st.fuzziedSourceTerm, k -> new ArrayList<>()); + l.add(st); + } + //Step 2: Organize the sorted termqueries into zero-coord scoring boolean queries + for (ArrayList<ScoreTerm> variants : variantQueries.values()) { + if (variants.size() == 1) { + //optimize where only one selected variant + ScoreTerm st = variants.get(0); + Query tq = ignoreTF ? 
new ConstantScoreQuery(new TermQuery(st.term)) : new TermQuery(st.term); + tq = new BoostQuery(tq, st.score); // set the boost to a mix of IDF and score + bq.add(tq, BooleanClause.Occur.SHOULD); + } else { + BooleanQuery.Builder termVariants = new BooleanQuery.Builder(); //disable coord and IDF for these term variants + termVariants.setDisableCoord(true); + for (ScoreTerm st : variants) { + // found a match + Query tq = ignoreTF ? new ConstantScoreQuery(new TermQuery(st.term)) : new TermQuery(st.term); + tq = new BoostQuery(tq, st.score); // set the boost using the ScoreTerm's score + termVariants.add(tq, BooleanClause.Occur.SHOULD); // add to query + } + bq.add(termVariants.build(), BooleanClause.Occur.SHOULD); // add to query + } + } + //TODO possible alternative step 3 - organize above booleans into a new layer of field-based + // booleans with a minimum-should-match of NumFields-1? + BooleanQuery bool = bq.build(); + if (bool.clauses().isEmpty()) { + return new MatchNoDocsQuery(); + } + return bool; + } + + //Holds info for a fuzzy term variant - initially score is set to edit distance (for ranking best + // term variants) then is reset with IDF for use in ranking against all other + // terms/fields + private static class ScoreTerm { + public final Term term; + public float score; + final Term fuzziedSourceTerm; + + ScoreTerm(Term term, float score, Term fuzziedSourceTerm) { + this.term = term; + this.score = score; + this.fuzziedSourceTerm = fuzziedSourceTerm; + } + } + + private static class ScoreTermQueue extends PriorityQueue<ScoreTerm> { + ScoreTermQueue(int size) { + super(size); + } + + /* (non-Javadoc) + * @see org.apache.lucene.util.PriorityQueue#lessThan(java.lang.Object, java.lang.Object) + */ + @Override + protected boolean lessThan(ScoreTerm termA, ScoreTerm termB) { + if (termA.score == termB.score) + return termA.term.compareTo(termB.term) > 0; + else + return termA.score < termB.score; + } + + } + + /* (non-Javadoc) + * @see 
org.apache.lucene.search.Query#toString(java.lang.String) + */ + @Override + @Nullable + public String toString(String field) { + return null; + } + + + public boolean isIgnoreTF() { + return ignoreTF; + } + + + public void setIgnoreTF(boolean ignoreTF) { + this.ignoreTF = ignoreTF; + } + +} diff --git a/src/main/java/org/wikimedia/search/extra/fuzzylike/FuzzyLikeThisQueryBuilder.java b/src/main/java/org/wikimedia/search/extra/fuzzylike/FuzzyLikeThisQueryBuilder.java new file mode 100644 index 0000000..1e4ddb8 --- /dev/null +++ b/src/main/java/org/wikimedia/search/extra/fuzzylike/FuzzyLikeThisQueryBuilder.java @@ -0,0 +1,336 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.wikimedia.search.extra.fuzzylike; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Objects; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import javax.annotation.Nullable; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.search.Query; +import org.elasticsearch.ElasticsearchParseException; +import org.elasticsearch.common.ParseField; +import org.elasticsearch.common.ParsingException; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.unit.Fuzziness; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.common.xcontent.XContentParser; +import org.elasticsearch.index.mapper.StringFieldMapper; +import org.elasticsearch.index.mapper.TextFieldMapper; +import org.elasticsearch.index.query.AbstractQueryBuilder; +import org.elasticsearch.index.query.QueryParseContext; +import org.elasticsearch.index.query.QueryShardContext; +import org.elasticsearch.index.query.QueryShardException; + +import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; +import lombok.Getter; +import lombok.Setter; +import lombok.experimental.Accessors; + +/** + * @deprecated this query is costly; it is kept only because it is still needed (T177727) until a replacement is available + */ +@Deprecated +@Accessors(fluent = true, chain = true) +@Getter @Setter +@SuppressFBWarnings(value = "EI_EXPOSE_REP", justification = "We don't care about exposing representation in a builder") +public class FuzzyLikeThisQueryBuilder extends AbstractQueryBuilder<FuzzyLikeThisQueryBuilder> { + public static final ParseField NAME = new ParseField("fuzzy_like_this", "flt", "fuzzyLikeThis"); + + public static final ParseField FIELDS = new ParseField("fields"); + public static final ParseField LIKE_TEXT = 
new ParseField("like_text", "likeText"); + public static final ParseField PREFIX_LENGTH = new ParseField("prefix_length", "prefixLength"); // camelCase alias of prefix_length; "likeText" belongs to LIKE_TEXT only + public static final ParseField MAX_QUERY_TERMS = new ParseField("max_query_terms", "maxQueryTerms"); + public static final ParseField IGNORE_TF = new ParseField("ignore_tf", "ignoreTF"); + public static final ParseField ANALYZER = new ParseField("analyzer"); + public static final ParseField FAIL_ON_UNSUPPORTED_FIELD = new ParseField("fail_on_unsupported_field", "failOnUnsupportedField"); + + public static final ParseField FUZZINESS = Fuzziness.FIELD.withDeprecation("min_similarity"); + + private static final int DEFAULT_PREFIX_LENGTH = 0; + private static final Fuzziness DEFAULT_FUZZINESS = Fuzziness.TWO; + private static final boolean DEFAULT_IGNORETF = false; + private static final boolean DEFAULT_FAIL_ON_UNSUPPORTED_FIELD = false; + private static final int DEFAULT_MAX_QUERY_TERMS = 25; + + private static final Set<String> SUPPORTED_TYPES = new HashSet<>(Arrays.asList( + StringFieldMapper.CONTENT_TYPE, + TextFieldMapper.CONTENT_TYPE + )); + + @Nullable private final String[] fields; + private final String likeText; + private Fuzziness fuzziness = DEFAULT_FUZZINESS; + private int prefixLength = DEFAULT_PREFIX_LENGTH; + private int maxQueryTerms = DEFAULT_MAX_QUERY_TERMS; + private boolean ignoreTF = DEFAULT_IGNORETF; + @Nullable private String analyzer; + private boolean failOnUnsupportedField = DEFAULT_FAIL_ON_UNSUPPORTED_FIELD; + + public FuzzyLikeThisQueryBuilder(@Nullable String[] fields, String likeText) { + this.fields = fields; + this.likeText = Objects.requireNonNull(likeText); + } + + public FuzzyLikeThisQueryBuilder(String likeText) { + this(null, likeText); + } + + public FuzzyLikeThisQueryBuilder(StreamInput in) throws IOException { + super(in); + fields = in.readOptionalStringArray(); + likeText = in.readString(); + fuzziness = new Fuzziness(in); + prefixLength = in.readVInt(); + maxQueryTerms = in.readVInt(); + 
ignoreTF = in.readBoolean(); + analyzer = in.readOptionalString(); + failOnUnsupportedField = in.readBoolean(); + } + + @Override + protected void doWriteTo(StreamOutput out) throws IOException { + out.writeOptionalStringArray(fields); + out.writeString(likeText); + fuzziness.writeTo(out); + out.writeVInt(prefixLength); + out.writeVInt(maxQueryTerms); + out.writeBoolean(ignoreTF); + out.writeOptionalString(analyzer); + out.writeBoolean(failOnUnsupportedField); + } + + public FuzzyLikeThisQueryBuilder fuzziness(Fuzziness fuzziness) { + this.fuzziness = Objects.requireNonNull(fuzziness); + return this; + } + + public FuzzyLikeThisQueryBuilder prefixLength(int prefixLength) { + this.prefixLength = prefixLength; + return this; + } + + public FuzzyLikeThisQueryBuilder maxQueryTerms(int maxQueryTerms) { + this.maxQueryTerms = maxQueryTerms; + return this; + } + + public FuzzyLikeThisQueryBuilder ignoreTF(boolean ignoreTF) { + this.ignoreTF = ignoreTF; + return this; + } + + /** + * The analyzer that will be used to analyze the text. Defaults to the analyzer associated with the field. + */ + public FuzzyLikeThisQueryBuilder analyzer(String analyzer) { + this.analyzer = analyzer; + return this; + } + + /** + * Whether to fail or return no result when this query is run against a field which is not supported such as binary/numeric fields. 
+ */ + public FuzzyLikeThisQueryBuilder failOnUnsupportedField(boolean fail) { + failOnUnsupportedField = fail; + return this; + } + + @Override + protected void doXContent(XContentBuilder builder, Params params) throws IOException { + builder.startObject(NAME.getPreferredName()); + if (fields != null) { + builder.startArray(FIELDS.getPreferredName()); + for (String field : fields) { + builder.value(field); + } + builder.endArray(); + } + builder.field(LIKE_TEXT.getPreferredName(), likeText); + if (maxQueryTerms != DEFAULT_MAX_QUERY_TERMS) { + builder.field(MAX_QUERY_TERMS.getPreferredName(), maxQueryTerms); + } + if (!fuzziness.equals(DEFAULT_FUZZINESS)) { + fuzziness.toXContent(builder, params); + } + if (prefixLength != DEFAULT_PREFIX_LENGTH) { + builder.field(PREFIX_LENGTH.getPreferredName(), prefixLength); + } + if (ignoreTF != DEFAULT_IGNORETF) { + builder.field(IGNORE_TF.getPreferredName(), ignoreTF); + } + if (analyzer != null) { + builder.field(ANALYZER.getPreferredName(), analyzer); + } + if (failOnUnsupportedField != DEFAULT_FAIL_ON_UNSUPPORTED_FIELD) { + builder.field(FAIL_ON_UNSUPPORTED_FIELD.getPreferredName(), failOnUnsupportedField); + } + builder.endObject(); + } + + @Override + public String getWriteableName() { + return NAME.getPreferredName(); + } + + @SuppressWarnings("CyclomaticComplexity") + public static Optional<FuzzyLikeThisQueryBuilder> fromXContent(QueryParseContext parseContext) throws IOException { + XContentParser parser = parseContext.parser(); + + int maxNumTerms = DEFAULT_MAX_QUERY_TERMS; + List<String> fields = null; + String likeText = null; + Fuzziness fuzziness = DEFAULT_FUZZINESS; + int prefixLength = DEFAULT_PREFIX_LENGTH; + boolean ignoreTF = DEFAULT_IGNORETF; + String analyzer = null; + boolean failOnUnsupportedField = DEFAULT_FAIL_ON_UNSUPPORTED_FIELD; + + XContentParser.Token token; + String currentFieldName = null; + while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) { + if (token == 
XContentParser.Token.FIELD_NAME) { + currentFieldName = parser.currentName(); + } else if (token.isValue()) { + if (LIKE_TEXT.match(currentFieldName)) { + likeText = parser.text(); + } else if (MAX_QUERY_TERMS.match(currentFieldName)) { + maxNumTerms = parser.intValue(); + } else if (IGNORE_TF.match(currentFieldName)) { + ignoreTF = parser.booleanValue(); + } else if (FUZZINESS.match(currentFieldName)) { + fuzziness = Fuzziness.parse(parser); + } else if (PREFIX_LENGTH.match(currentFieldName)) { + prefixLength = parser.intValue(); + } else if (ANALYZER.match(currentFieldName)) { + analyzer = parser.text(); + } else if (FAIL_ON_UNSUPPORTED_FIELD.match(currentFieldName)) { + failOnUnsupportedField = parser.booleanValue(); + } else { + throw new ParsingException(parser.getTokenLocation(), "[flt] query does not support [" + currentFieldName + "]"); + } + } else if (token == XContentParser.Token.START_ARRAY) { + if (FIELDS.match(currentFieldName)) { + fields = new ArrayList<>(); + while (parser.nextToken() != XContentParser.Token.END_ARRAY) { + fields.add(parser.text()); + } + if (fields.isEmpty()) { + throw new ParsingException(parser.getTokenLocation(), "fuzzy_like_this requires 'fields' to be non-empty"); + } + } else { + throw new ParsingException(parser.getTokenLocation(), "[flt] query does not support [" + currentFieldName + "]"); + } + } + } + + if (likeText == null) { + throw new ParsingException(parser.getTokenLocation(), "fuzzy_like_this requires 'like_text' to be specified"); + } + + String[] fs = fields != null ? 
fields.stream().toArray(String[]::new) : null; + + FuzzyLikeThisQueryBuilder builder = new FuzzyLikeThisQueryBuilder(fs, likeText); + + builder.analyzer(analyzer) + .fuzziness(fuzziness) + .ignoreTF(ignoreTF) + .maxQueryTerms(maxNumTerms) + .prefixLength(prefixLength) + .failOnUnsupportedField(failOnUnsupportedField); + + return Optional.of(builder); + } + + @Override + protected Query doToQuery(QueryShardContext context) throws IOException { + final List<String> fields; + if (this.fields == null) { + fields = Collections.singletonList(context.defaultField()); + } else { + fields = Arrays.stream(this.fields) + .filter(x -> context.fieldMapper(x) != null) + .filter(x -> context.fieldMapper(x).tokenized()) + .filter(x -> SUPPORTED_TYPES.contains(context.fieldMapper(x).typeName())) + .map(x -> context.fieldMapper(x).name()) + .collect(Collectors.toList()); + if (fields.isEmpty()) { + throw new QueryShardException(context, "fuzzy_like_this all provided fields are unknown or not tokenized"); + } + + if (failOnUnsupportedField && fields.size() != this.fields.length) { + List<String> unsupportedFields = Stream.of(this.fields) + .filter(x -> !fields.contains(x)) + .collect(Collectors.toList()); + throw new QueryShardException(context, "fuzzy_like_this some fields are either unknown/untokenized/non-text: {}", unsupportedFields); + } + } + + final Analyzer analyzer; + if (this.analyzer == null) { + analyzer = context.getMapperService().searchAnalyzer(); + } else { + analyzer = context.getMapperService().getIndexAnalyzers().get(this.analyzer); + } + + FuzzyLikeThisQuery query = new FuzzyLikeThisQuery(maxQueryTerms, analyzer); + float minSimilarity = fuzziness.asFloat(); + if (minSimilarity >= 1.0f && minSimilarity != (int)minSimilarity) { + throw new ElasticsearchParseException("fractional edit distances are not allowed"); + } + if (minSimilarity < 0.0f) { + throw new ElasticsearchParseException("minimumSimilarity cannot be less than 0"); + } + for (String field : fields) { + 
query.addTerms(likeText, field, minSimilarity, prefixLength); + } + query.setIgnoreTF(ignoreTF); + return query; + } + + @Override + protected boolean doEquals(FuzzyLikeThisQueryBuilder other) { + return Objects.equals(this.analyzer, other.analyzer) + && Arrays.equals(this.fields, other.fields) + && Objects.equals(this.failOnUnsupportedField, other.failOnUnsupportedField) + && Objects.equals(this.fuzziness, other.fuzziness) + && Objects.equals(this.ignoreTF, other.ignoreTF) + && Objects.equals(this.likeText, other.likeText) + && Objects.equals(this.maxQueryTerms, other.maxQueryTerms) + && Objects.equals(this.prefixLength, other.prefixLength); + } + + @Override + protected int doHashCode() { + return Objects.hash(analyzer, Arrays.hashCode(fields), failOnUnsupportedField, fuzziness, // hash the array by content so equal builders (Arrays.equals in doEquals) get equal hash codes + ignoreTF, likeText, maxQueryTerms, prefixLength); + } +} diff --git a/src/main/java/org/wikimedia/search/extra/fuzzylike/package-info.java b/src/main/java/org/wikimedia/search/extra/fuzzylike/package-info.java new file mode 100644 index 0000000..e1dbd13 --- /dev/null +++ b/src/main/java/org/wikimedia/search/extra/fuzzylike/package-info.java @@ -0,0 +1,7 @@ +/** + * FuzzyLikeThis query. 
+ */ +@javax.annotation.ParametersAreNonnullByDefault +@org.wikimedia.search.extra.util.FieldsAreNonNullByDefault +@org.wikimedia.search.extra.util.ReturnTypesAreNonNullByDefault +package org.wikimedia.search.extra.fuzzylike; diff --git a/src/test/java/org/wikimedia/search/extra/fuzzylike/FuzzyLikeThisIntegrationTest.java b/src/test/java/org/wikimedia/search/extra/fuzzylike/FuzzyLikeThisIntegrationTest.java new file mode 100644 index 0000000..7e3fb68 --- /dev/null +++ b/src/test/java/org/wikimedia/search/extra/fuzzylike/FuzzyLikeThisIntegrationTest.java @@ -0,0 +1,99 @@ +package org.wikimedia.search.extra.fuzzylike; + +import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertFailures; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertFirstHit; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertNoSearchHits; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertSearchHits; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.hasId; +import static org.hamcrest.Matchers.containsString; + +import java.io.IOException; +import java.util.concurrent.ExecutionException; + +import org.elasticsearch.action.index.IndexRequestBuilder; +import org.elasticsearch.action.search.SearchResponse; +import org.elasticsearch.common.unit.Fuzziness; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.rest.RestStatus; +import org.junit.Before; +import org.junit.Test; +import org.wikimedia.search.extra.AbstractPluginIntegrationTest; + +@Deprecated +public class FuzzyLikeThisIntegrationTest extends AbstractPluginIntegrationTest { + + @Before + private void setup() throws IOException, InterruptedException, 
ExecutionException { + XContentBuilder mapping = jsonBuilder().startObject(); + mapping.startObject("test").startObject("properties"); + mapping.startObject("test"); + mapping.field("type", "text"); + mapping.endObject() + .endObject() + .endObject() + .endObject(); + + assertAcked(prepareCreate("test").addMapping("test", mapping)); + ensureGreen(); + + indexRandom(false, doc("image", "There is nothing worse than a sharp image of a fuzzy concept.")); + indexRandom(false, doc("humans", "From time to time, it is worth wandering around the fuzzy border " + + "regions of what you do, if only to remind yourself that no human activity is an island.")); + indexRandom(false, doc("engineers", "Unfortunately, I'm an engineer. " + + "I'm always thinking about, what's the task and how do I get it done? " + + "And some of my tasks are pretty broad, and pretty fuzzy, and pretty funky, but that's the way I think.")); + indexRandom(false, doc("science", "People assume that science is a very cold sort of profession, " + + "whereas writing novels is a warm and fuzzy intuitive thing. But in fact, they are not at all different.")); + indexRandom(false, doc("nostalgia", "Nostalgia is something we think of as fuzzy. " + + "But it's pain. 
Pain concerning the past.")); + refresh(); + } + + private IndexRequestBuilder doc(String id, String fieldValue) { + return client().prepareIndex("test", "test", id).setSource("test", fieldValue); + } + + @Test + public void testFuzzyLikeThis() { + FuzzyLikeThisQueryBuilder builder; + SearchResponse resp; + + builder = fuzzyLikeThisQuery("test", "sharp image fuzzy concpt"); + resp = client().prepareSearch("test").setTypes("test").setQuery(builder).get(); + assertHitCount(resp, 5); + assertFirstHit(resp, hasId("image")); + + builder = fuzzyLikeThisQuery("test", "sharp image concpt"); + resp = client().prepareSearch("test").setTypes("test").setQuery(builder).get(); + assertHitCount(resp, 1); + assertFirstHit(resp, hasId("image")); + + // Fuzziness to zero is unsupported and causes some confusion between FuzzyLikeQuery + // and FuzzyTermEnums. A lot of code still rely on a float instead of maxEdits as a int. + builder = fuzzyLikeThisQuery("test", "nostalagia").fuzziness(Fuzziness.ZERO); + assertFailures(client().prepareSearch("test").setTypes("test").setQuery(builder), + RestStatus.INTERNAL_SERVER_ERROR, + containsString("with transpositions enabled, distances > 2 are not supported")); + + builder = fuzzyLikeThisQuery("test", "nostalagio").fuzziness(Fuzziness.ONE); + resp = client().prepareSearch("test").setTypes("test").setQuery(builder).get(); + assertNoSearchHits(resp); + + // AUTO is like 1 (auto fuzziness is not really supported) + builder = fuzzyLikeThisQuery("test", "nostalagio").fuzziness(Fuzziness.AUTO); + resp = client().prepareSearch("test").setTypes("test").setQuery(builder).get(); + assertNoSearchHits(resp); + + builder = fuzzyLikeThisQuery("test", "nostalagio").fuzziness(Fuzziness.TWO); + resp = client().prepareSearch("test").setTypes("test").setQuery(builder).get(); + assertSearchHits(resp, "nostalgia"); + } + + @Deprecated + public FuzzyLikeThisQueryBuilder fuzzyLikeThisQuery(String field, String likeText) { + return new FuzzyLikeThisQueryBuilder(new 
String[]{field}, likeText); + } +} -- To view, visit https://gerrit.wikimedia.org/r/383084 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I3f3051c87297d3194f52fff6f3ed77b0326c4d84 Gerrit-PatchSet: 3 Gerrit-Project: search/extra Gerrit-Branch: master Gerrit-Owner: DCausse <dcau...@wikimedia.org> Gerrit-Reviewer: EBernhardson <ebernhard...@wikimedia.org> Gerrit-Reviewer: Gehel <guillaume.leder...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits