[MediaWiki-commits] [Gerrit] search/extra[master]: Revert "Stop maintaining FuzzyLikeThis"

Gehel (Code Review) Mon, 09 Oct 2017 02:56:06 -0700

Gehel has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/383084 )


Change subject: Revert "Stop maintaining FuzzyLikeThis"
......................................................................


Revert "Stop maintaining FuzzyLikeThis"

This reverts commit 3454e729ccd77d92666bc2347daceb760f83cf56.

We still need it; the quick workaround did not work very well. The suggested 
approach is now to add potential replacements behind an activation flag (URL 
parameter) so that production usage is not affected by future experiments.

Bug: T177727
Change-Id: I3f3051c87297d3194f52fff6f3ed77b0326c4d84
---
M src/main/java/org/wikimedia/search/extra/ExtraPlugin.java
A src/main/java/org/wikimedia/search/extra/fuzzylike/FuzzyLikeThisQuery.java
A 
src/main/java/org/wikimedia/search/extra/fuzzylike/FuzzyLikeThisQueryBuilder.java
A src/main/java/org/wikimedia/search/extra/fuzzylike/package-info.java
A 
src/test/java/org/wikimedia/search/extra/fuzzylike/FuzzyLikeThisIntegrationTest.java
5 files changed, 737 insertions(+), 0 deletions(-)

Approvals:
  jenkins-bot: Verified
  Gehel: Looks good to me, approved



diff --git a/src/main/java/org/wikimedia/search/extra/ExtraPlugin.java 
b/src/main/java/org/wikimedia/search/extra/ExtraPlugin.java
index a570c8b..1f51284 100644
--- a/src/main/java/org/wikimedia/search/extra/ExtraPlugin.java
+++ b/src/main/java/org/wikimedia/search/extra/ExtraPlugin.java
@@ -41,6 +41,7 @@
 import org.elasticsearch.threadpool.ThreadPool;
 import org.elasticsearch.watcher.ResourceWatcherService;
 import 
org.wikimedia.search.extra.analysis.filters.PreserveOriginalFilterFactory;
+import org.wikimedia.search.extra.fuzzylike.FuzzyLikeThisQueryBuilder;
 import org.wikimedia.search.extra.latency.LatencyStatsAction;
 import org.wikimedia.search.extra.latency.RestGetLatencyStats;
 import org.wikimedia.search.extra.latency.SearchLatencyListener;
@@ -101,6 +102,7 @@
     public List<QuerySpec<?>> getQueries() {
         return asList(
                 new QuerySpec<>(SourceRegexQueryBuilder.NAME, 
SourceRegexQueryBuilder::new, SourceRegexQueryBuilder::fromXContent),
+                new QuerySpec<>(FuzzyLikeThisQueryBuilder.NAME, 
FuzzyLikeThisQueryBuilder::new, FuzzyLikeThisQueryBuilder::fromXContent),
                 new QuerySpec<>(TokenCountRouterQueryBuilder.NAME, 
TokenCountRouterQueryBuilder::new, TokenCountRouterQueryBuilder::fromXContent),
                 new QuerySpec<>(DegradedRouterQueryBuilder.NAME,
                         (in) -> new DegradedRouterQueryBuilder(in, loadStats),
diff --git 
a/src/main/java/org/wikimedia/search/extra/fuzzylike/FuzzyLikeThisQuery.java 
b/src/main/java/org/wikimedia/search/extra/fuzzylike/FuzzyLikeThisQuery.java
new file mode 100644
index 0000000..1b461d6
--- /dev/null
+++ b/src/main/java/org/wikimedia/search/extra/fuzzylike/FuzzyLikeThisQuery.java
@@ -0,0 +1,293 @@
+package org.wikimedia.search.extra.fuzzylike;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+
+import javax.annotation.Nullable;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.BoostAttribute;
+import org.apache.lucene.search.BoostQuery;
+import org.apache.lucene.search.ConstantScoreQuery;
+import org.apache.lucene.search.FuzzyTermsEnum;
+import org.apache.lucene.search.MatchNoDocsQuery;
+import org.apache.lucene.search.MaxNonCompetitiveBoostAttribute;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.similarities.ClassicSimilarity;
+import org.apache.lucene.search.similarities.TFIDFSimilarity;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.PriorityQueue;
+
+import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
+import lombok.EqualsAndHashCode;
+
+/**
+ * Fuzzifies ALL terms provided as strings and then picks the best n 
differentiating terms.
+ * In effect this mixes the behaviour of FuzzyQuery and MoreLikeThis but with 
special consideration
+ * of fuzzy scoring factors.
+ * This generally produces good results for queries where users may provide 
details in a number of
+ * fields and have no knowledge of boolean query syntax and also want a degree 
of fuzzy matching and
+ * a fast query.
+ * <p>
+ * For each source term the fuzzy variants are held in a BooleanQuery with no 
coord factor (because
+ * we are not looking for matches on multiple variants in any one doc). 
Additionally, a specialized
+ * TermQuery is used for variants and does not use that variant term's IDF 
because this would favour rarer
+ * terms eg misspellings. Instead, all variants use the same IDF ranking (the 
one for the source query
+ * term) and this is factored into the variant's boost. If the source query 
term does not exist in the
+ * index the average IDF of the variants is used.
+ */
+@Deprecated
+@EqualsAndHashCode(callSuper = false, of = {"analyzer", "fieldVals", 
"ignoreTF", "maxNumTerms"})
+public class FuzzyLikeThisQuery extends Query {
+    // TODO: generalize this query (at least it should not reuse this static 
sim!
+    // a better way might be to convert this into multitermquery rewrite 
methods.
+    // the rewrite method can 'average' the TermContext's term statistics 
(docfreq,totalTermFreq)
+    // provided to TermQuery, so that the general idea is agnostic to any 
scoring system...
+    static final TFIDFSimilarity sim = new ClassicSimilarity();
+    ArrayList<FieldVals> fieldVals = new ArrayList<>();
+    Analyzer analyzer;
+
+    final ScoreTermQueue q;
+    private static final int MAX_VARIANTS_PER_TERM = 50;
+    boolean ignoreTF;
+    private final int maxNumTerms;
+
+    /**
+     * @param maxNumTerms The total number of terms clauses that will appear 
once rewritten as a BooleanQuery
+     */
+    public FuzzyLikeThisQuery(int maxNumTerms, Analyzer analyzer) {
+        q = new ScoreTermQueue(maxNumTerms);
+        this.analyzer = analyzer;
+        this.maxNumTerms = maxNumTerms;
+    }
+
+    @EqualsAndHashCode
+    private static class FieldVals {
+        @Nullable
+        final String queryString;
+        final String fieldName;
+        final float minSimilarity;
+        final int prefixLength;
+
+        FieldVals(String name, float similarity, int length, String 
queryString) {
+            fieldName = name;
+            minSimilarity = similarity;
+            prefixLength = length;
+            this.queryString = queryString;
+        }
+    }
+
+    /**
+     * Adds user input for "fuzzification". The input is only recorded here; it is expanded into
+     * fuzzy term variants when the query is rewritten.
+     *
+     * @param queryString   The string which will be parsed by the analyzer and for which fuzzy variants
+     *                      will be looked up in the terms of {@code fieldName}
+     * @param fieldName     The field whose terms are enumerated to find the fuzzy variants
+     * @param minSimilarity The minimum similarity of the term variants (see FuzzyTermsEnum)
+     * @param prefixLength  Length of required common prefix on variant terms (see FuzzyTermsEnum)
+     */
+    public void addTerms(String queryString, String fieldName, float minSimilarity, int prefixLength) {
+        fieldVals.add(new FieldVals(fieldName, minSimilarity, prefixLength, queryString));
+    }
+
+    /**
+     * Analyzes the query string of {@code f} and, for every distinct token, enumerates its fuzzy
+     * variants among the terms of the field. At most {@code MAX_VARIANTS_PER_TERM} variants are
+     * kept per token, ranked by the boost reported by the {@link FuzzyTermsEnum}. The retained
+     * variants are then re-scored as {@code boost * boost * idf} and offered to the global queue
+     * {@code q}, which ranks the variants of all tokens and fields against each other.
+     * <p>
+     * Does nothing when {@code f} has no query string or the field has no terms in {@code reader}.
+     *
+     * @param reader the reader used for term enumeration and document frequencies
+     * @param f      the field / query string pair to expand
+     * @throws IOException if reading the token stream or the index fails
+     */
+    @SuppressWarnings("CyclomaticComplexity")
+    private void addTerms(IndexReader reader, FieldVals f) throws IOException {
+        if (f.queryString == null) {
+            return;
+        }
+        final Terms terms = MultiFields.getTerms(reader, f.fieldName);
+        if (terms == null) {
+            return;
+        }
+        TokenStream ts = analyzer.tokenStream(f.fieldName, f.queryString);
+        try {
+            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
+
+            int corpusNumDocs = reader.numDocs();
+            HashSet<String> processedTerms = new HashSet<>();
+            ts.reset();
+            while (ts.incrementToken()) {
+                String term = termAtt.toString();
+                if (!processedTerms.add(term)) {
+                    // this token was already expanded for this field
+                    continue;
+                }
+                //maxNum variants considered for any one term
+                ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM);
+                float minScore = 0;
+                Term startTerm = new Term(f.fieldName, term);
+                AttributeSource atts = new AttributeSource();
+                MaxNonCompetitiveBoostAttribute maxBoostAtt =
+                        atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
+                FuzzyTermsEnum fe = new FuzzyTermsEnum(terms, atts, startTerm, f.minSimilarity, f.prefixLength, true);
+                //store the df so all variants use same idf
+                int df = reader.docFreq(startTerm);
+                int numVariants = 0;
+                // accumulated as a long: the sum over all enumerated variants may exceed Integer.MAX_VALUE
+                long totalVariantDocFreqs = 0;
+                BytesRef possibleMatch;
+                BoostAttribute boostAtt =
+                        fe.attributes().addAttribute(BoostAttribute.class);
+                while ((possibleMatch = fe.next()) != null) {
+                    numVariants++;
+                    totalVariantDocFreqs += fe.docFreq();
+                    float score = boostAtt.getBoost();
+                    if (variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore) {
+                        ScoreTerm st = new ScoreTerm(new Term(startTerm.field(), BytesRef.deepCopyOf(possibleMatch)), score, startTerm);
+                        variantsQ.insertWithOverflow(st);
+                        minScore = variantsQ.top().score; // maintain minScore
+                    }
+                    // once the queue is full, tell the enum which boosts can no longer compete
+                    maxBoostAtt.setMaxNonCompetitiveBoost(variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY);
+                }
+
+                if (numVariants > 0) {
+                    if (df == 0) {
+                        //no direct match we can use as df for all variants: use avg df of all variants
+                        df = (int) (totalVariantDocFreqs / numVariants);
+                    }
+
+                    // take the top variants (scored by edit distance) and reset the score
+                    // to include an IDF factor then add to the global queue for ranking
+                    // overall top query terms
+                    int size = variantsQ.size();
+                    for (int i = 0; i < size; i++) {
+                        ScoreTerm st = variantsQ.pop();
+                        st.score = (st.score * st.score) * sim.idf(df, corpusNumDocs);
+                        q.insertWithOverflow(st);
+                    }
+                }
+            }
+            ts.end();
+        } finally {
+            IOUtils.closeWhileHandlingException(ts);
+        }
+    }
+
+    /**
+     * Expands every registered field / text pair into fuzzy term variants and assembles them into
+     * a disjunction. Each source term contributes one SHOULD clause: either a single boosted term
+     * query, or a coord-disabled boolean query over all the variants selected for that term.
+     *
+     * @param reader the reader used to look up term variants and document frequencies
+     * @return the expanded query, or a {@link MatchNoDocsQuery} when no variant was selected
+     * @throws IOException if reading the token stream or the index fails
+     */
+    @Override
+    @SuppressFBWarnings(
+            value = "PCAIL_POSSIBLE_CONSTANT_ALLOCATION_IN_LOOP",
+            justification = "builder should not be reused")
+    public Query rewrite(IndexReader reader) throws IOException {
+        //load up the list of possible terms
+        for (FieldVals f : fieldVals) {
+            addTerms(reader, f);
+        }
+
+        //create BooleanQueries to hold the variants for each token/field pair and ensure it
+        // has no coord factor
+        //Step 1: group the selected variants by the source term they were derived from
+        HashMap<Term, ArrayList<ScoreTerm>> variantQueries = new HashMap<>();
+        int size = q.size();
+        for (int i = 0; i < size; i++) {
+            ScoreTerm st = q.pop();
+            variantQueries.computeIfAbsent(st.fuzziedSourceTerm, k -> new ArrayList<>()).add(st);
+        }
+
+        //Step 2: Organize the grouped termqueries into zero-coord scoring boolean queries
+        BooleanQuery.Builder bq = new BooleanQuery.Builder();
+        for (ArrayList<ScoreTerm> variants : variantQueries.values()) {
+            if (variants.size() == 1) {
+                //optimize where only one selected variant
+                bq.add(boostedTermQuery(variants.get(0)), BooleanClause.Occur.SHOULD);
+            } else {
+                //disable coord and IDF for these term variants
+                BooleanQuery.Builder termVariants = new BooleanQuery.Builder();
+                termVariants.setDisableCoord(true);
+                for (ScoreTerm st : variants) {
+                    termVariants.add(boostedTermQuery(st), BooleanClause.Occur.SHOULD);
+                }
+                bq.add(termVariants.build(), BooleanClause.Occur.SHOULD);
+            }
+        }
+        //TODO possible alternative step 3 - organize above booleans into a new layer of field-based
+        // booleans with a minimum-should-match of NumFields-1?
+        BooleanQuery bool = bq.build();
+        if (bool.clauses().isEmpty()) {
+            return new MatchNoDocsQuery();
+        }
+        return bool;
+    }
+
+    /**
+     * Builds the query for a single variant: a term query, wrapped in a constant score query when
+     * term frequencies are ignored, boosted by the variant's score.
+     */
+    private Query boostedTermQuery(ScoreTerm st) {
+        Query tq = new TermQuery(st.term);
+        if (ignoreTF) {
+            tq = new ConstantScoreQuery(tq);
+        }
+        return new BoostQuery(tq, st.score);
+    }
+
+    //Holds info for a fuzzy term variant - initially score is set to edit 
distance (for ranking best
+    // term variants) then is reset with IDF for use in ranking against all 
other
+    // terms/fields
+    private static class ScoreTerm {
+        public final Term term;
+        public float score;
+        final Term fuzziedSourceTerm;
+
+        ScoreTerm(Term term, float score, Term fuzziedSourceTerm) {
+            this.term = term;
+            this.score = score;
+            this.fuzziedSourceTerm = fuzziedSourceTerm;
+        }
+    }
+
+    /**
+     * Queue of {@link ScoreTerm}s ordered by score; when two scores are equal the element with
+     * the greater term is treated as the lesser one.
+     */
+    private static class ScoreTermQueue extends PriorityQueue<ScoreTerm> {
+        ScoreTermQueue(int size) {
+            super(size);
+        }
+
+        /**
+         * @see org.apache.lucene.util.PriorityQueue#lessThan(java.lang.Object, java.lang.Object)
+         */
+        @Override
+        protected boolean lessThan(ScoreTerm termA, ScoreTerm termB) {
+            if (termA.score != termB.score) {
+                return termA.score < termB.score;
+            }
+            // equal scores: fall back to term order
+            return termA.term.compareTo(termB.term) > 0;
+        }
+
+    }
+
+    /* (non-Javadoc)
+     * @see org.apache.lucene.search.Query#toString(java.lang.String)
+     */
+    @Override
+    @Nullable
+    public String toString(String field) {
+        return null;
+    }
+
+
+    public boolean isIgnoreTF() {
+        return ignoreTF;
+    }
+
+
+    public void setIgnoreTF(boolean ignoreTF) {
+        this.ignoreTF = ignoreTF;
+    }
+
+}
diff --git 
a/src/main/java/org/wikimedia/search/extra/fuzzylike/FuzzyLikeThisQueryBuilder.java
 
b/src/main/java/org/wikimedia/search/extra/fuzzylike/FuzzyLikeThisQueryBuilder.java
new file mode 100644
index 0000000..1e4ddb8
--- /dev/null
+++ 
b/src/main/java/org/wikimedia/search/extra/fuzzylike/FuzzyLikeThisQueryBuilder.java
@@ -0,0 +1,336 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.wikimedia.search.extra.fuzzylike;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.Set;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import javax.annotation.Nullable;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.search.Query;
+import org.elasticsearch.ElasticsearchParseException;
+import org.elasticsearch.common.ParseField;
+import org.elasticsearch.common.ParsingException;
+import org.elasticsearch.common.io.stream.StreamInput;
+import org.elasticsearch.common.io.stream.StreamOutput;
+import org.elasticsearch.common.unit.Fuzziness;
+import org.elasticsearch.common.xcontent.XContentBuilder;
+import org.elasticsearch.common.xcontent.XContentParser;
+import org.elasticsearch.index.mapper.StringFieldMapper;
+import org.elasticsearch.index.mapper.TextFieldMapper;
+import org.elasticsearch.index.query.AbstractQueryBuilder;
+import org.elasticsearch.index.query.QueryParseContext;
+import org.elasticsearch.index.query.QueryShardContext;
+import org.elasticsearch.index.query.QueryShardException;
+
+import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
+import lombok.Getter;
+import lombok.Setter;
+import lombok.experimental.Accessors;
+
+/**
+ * @deprecated this query was too costly and has been removed
+ */
+@Deprecated
+@Accessors(fluent = true, chain = true)
+@Getter @Setter
+@SuppressFBWarnings(value = "EI_EXPOSE_REP", justification = "We don't care 
about exposing representation in a builder")
+public class FuzzyLikeThisQueryBuilder extends 
AbstractQueryBuilder<FuzzyLikeThisQueryBuilder> {
+    public static final ParseField NAME = new ParseField("fuzzy_like_this", 
"flt", "fuzzyLikeThis");
+
+    public static final ParseField FIELDS = new ParseField("fields");
+    public static final ParseField LIKE_TEXT = new ParseField("like_text", 
"likeText");
+    // alternate camelCase name follows the sibling fields; it used to be "likeText", colliding with LIKE_TEXT
+    public static final ParseField PREFIX_LENGTH = new ParseField("prefix_length", "prefixLength");
+    public static final ParseField MAX_QUERY_TERMS = new 
ParseField("max_query_terms", "maxQueryTerms");
+    public static final ParseField IGNORE_TF = new ParseField("ignore_tf", 
"ignoreTF");
+    public static final ParseField ANALYZER = new ParseField("analyzer");
+    public static final ParseField FAIL_ON_UNSUPPORTED_FIELD = new 
ParseField("fail_on_unsupported_field", "failOnUnsupportedField");
+
+    public static final ParseField FUZZINESS = 
Fuzziness.FIELD.withDeprecation("min_similarity");
+
+    private static final int DEFAULT_PREFIX_LENGTH = 0;
+    private static final Fuzziness DEFAULT_FUZZINESS = Fuzziness.TWO;
+    private static final boolean DEFAULT_IGNORETF = false;
+    private static final boolean DEFAULT_FAIL_ON_UNSUPPORTED_FIELD = false;
+    private static final int DEFAULT_MAX_QUERY_TERMS = 25;
+
+    private static final Set<String> SUPPORTED_TYPES = new 
HashSet<>(Arrays.asList(
+            StringFieldMapper.CONTENT_TYPE,
+            TextFieldMapper.CONTENT_TYPE
+    ));
+
+    @Nullable private final String[] fields;
+    private final String likeText;
+    private Fuzziness fuzziness = DEFAULT_FUZZINESS;
+    private int prefixLength = DEFAULT_PREFIX_LENGTH;
+    private int maxQueryTerms = DEFAULT_MAX_QUERY_TERMS;
+    private boolean ignoreTF = DEFAULT_IGNORETF;
+    @Nullable private String analyzer;
+    private boolean failOnUnsupportedField = DEFAULT_FAIL_ON_UNSUPPORTED_FIELD;
+
+    public FuzzyLikeThisQueryBuilder(@Nullable String[] fields, String 
likeText) {
+        this.fields = fields;
+        this.likeText = Objects.requireNonNull(likeText);
+    }
+
+    public FuzzyLikeThisQueryBuilder(String likeText) {
+        this(null, likeText);
+    }
+
+    public FuzzyLikeThisQueryBuilder(StreamInput in) throws IOException {
+        super(in);
+        fields = in.readOptionalStringArray();
+        likeText = in.readString();
+        fuzziness = new Fuzziness(in);
+        prefixLength = in.readVInt();
+        maxQueryTerms = in.readVInt();
+        ignoreTF = in.readBoolean();
+        analyzer = in.readOptionalString();
+        failOnUnsupportedField = in.readBoolean();
+    }
+
+    @Override
+    protected void doWriteTo(StreamOutput out) throws IOException {
+        out.writeOptionalStringArray(fields);
+        out.writeString(likeText);
+        fuzziness.writeTo(out);
+        out.writeVInt(prefixLength);
+        out.writeVInt(maxQueryTerms);
+        out.writeBoolean(ignoreTF);
+        out.writeOptionalString(analyzer);
+        out.writeBoolean(failOnUnsupportedField);
+    }
+
+    public FuzzyLikeThisQueryBuilder fuzziness(Fuzziness fuzziness) {
+        this.fuzziness = Objects.requireNonNull(fuzziness);
+        return this;
+    }
+
+    public FuzzyLikeThisQueryBuilder prefixLength(int prefixLength) {
+        this.prefixLength = prefixLength;
+        return this;
+    }
+
+    public FuzzyLikeThisQueryBuilder maxQueryTerms(int maxQueryTerms) {
+        this.maxQueryTerms = maxQueryTerms;
+        return this;
+    }
+
+    public FuzzyLikeThisQueryBuilder ignoreTF(boolean ignoreTF) {
+        this.ignoreTF = ignoreTF;
+        return this;
+    }
+
+    /**
+     * The analyzer that will be used to analyze the text. Defaults to the analyzer associated with the field.
+     *
+     * @param analyzer name of the analyzer; it is looked up among the index analyzers when the query is built
+     * @return this builder
+     */
+    public FuzzyLikeThisQueryBuilder analyzer(String analyzer) {
+        this.analyzer = analyzer;
+        return this;
+    }
+
+    /**
+     * Whether to fail or return no result when this query is run against a 
field which is not supported such as binary/numeric fields.
+     */
+    public FuzzyLikeThisQueryBuilder failOnUnsupportedField(boolean fail) {
+        failOnUnsupportedField = fail;
+        return this;
+    }
+
+    @Override
+    protected void doXContent(XContentBuilder builder, Params params) throws 
IOException {
+        builder.startObject(NAME.getPreferredName());
+        if (fields != null) {
+            builder.startArray(FIELDS.getPreferredName());
+            for (String field : fields) {
+                builder.value(field);
+            }
+            builder.endArray();
+        }
+        builder.field(LIKE_TEXT.getPreferredName(), likeText);
+        if (maxQueryTerms != DEFAULT_MAX_QUERY_TERMS) {
+            builder.field(MAX_QUERY_TERMS.getPreferredName(), maxQueryTerms);
+        }
+        if (!fuzziness.equals(DEFAULT_FUZZINESS)) {
+            fuzziness.toXContent(builder, params);
+        }
+        if (prefixLength != DEFAULT_PREFIX_LENGTH) {
+            builder.field(PREFIX_LENGTH.getPreferredName(), prefixLength);
+        }
+        if (ignoreTF != DEFAULT_IGNORETF) {
+            builder.field(IGNORE_TF.getPreferredName(), ignoreTF);
+        }
+        if (analyzer != null) {
+            builder.field(ANALYZER.getPreferredName(), analyzer);
+        }
+        if (failOnUnsupportedField != DEFAULT_FAIL_ON_UNSUPPORTED_FIELD) {
+            builder.field(FAIL_ON_UNSUPPORTED_FIELD.getPreferredName(), 
failOnUnsupportedField);
+        }
+        builder.endObject();
+    }
+
+    @Override
+    public String getWriteableName() {
+        return NAME.getPreferredName();
+    }
+
+    @SuppressWarnings("CyclomaticComplexity")
+    public static Optional<FuzzyLikeThisQueryBuilder> 
fromXContent(QueryParseContext parseContext) throws IOException {
+        XContentParser parser = parseContext.parser();
+
+        int maxNumTerms = DEFAULT_MAX_QUERY_TERMS;
+        List<String> fields = null;
+        String likeText = null;
+        Fuzziness fuzziness = DEFAULT_FUZZINESS;
+        int prefixLength = DEFAULT_PREFIX_LENGTH;
+        boolean ignoreTF = DEFAULT_IGNORETF;
+        String analyzer = null;
+        boolean failOnUnsupportedField = DEFAULT_FAIL_ON_UNSUPPORTED_FIELD;
+
+        XContentParser.Token token;
+        String currentFieldName = null;
+        while ((token = parser.nextToken()) != 
XContentParser.Token.END_OBJECT) {
+            if (token == XContentParser.Token.FIELD_NAME) {
+                currentFieldName = parser.currentName();
+            } else if (token.isValue()) {
+                if (LIKE_TEXT.match(currentFieldName)) {
+                    likeText = parser.text();
+                } else if (MAX_QUERY_TERMS.match(currentFieldName)) {
+                    maxNumTerms = parser.intValue();
+                } else if (IGNORE_TF.match(currentFieldName)) {
+                    ignoreTF = parser.booleanValue();
+                } else if (FUZZINESS.match(currentFieldName)) {
+                    fuzziness = Fuzziness.parse(parser);
+                } else if (PREFIX_LENGTH.match(currentFieldName)) {
+                    prefixLength = parser.intValue();
+                } else if (ANALYZER.match(currentFieldName)) {
+                    analyzer = parser.text();
+                } else if (FAIL_ON_UNSUPPORTED_FIELD.match(currentFieldName)) {
+                    failOnUnsupportedField = parser.booleanValue();
+                } else {
+                    throw new ParsingException(parser.getTokenLocation(), 
"[flt] query does not support [" + currentFieldName + "]");
+                }
+            } else if (token == XContentParser.Token.START_ARRAY) {
+                if (FIELDS.match(currentFieldName)) {
+                    fields = new ArrayList<>();
+                    while (parser.nextToken() != 
XContentParser.Token.END_ARRAY) {
+                        fields.add(parser.text());
+                    }
+                    if (fields.isEmpty()) {
+                        throw new ParsingException(parser.getTokenLocation(), 
"fuzzy_like_this requires 'fields' to be non-empty");
+                    }
+                } else {
+                    throw new ParsingException(parser.getTokenLocation(), 
"[flt] query does not support [" + currentFieldName + "]");
+                }
+            }
+        }
+
+        if (likeText == null) {
+            throw new ParsingException(parser.getTokenLocation(), 
"fuzzy_like_this requires 'like_text' to be specified");
+        }
+
+        String[] fs = fields != null ? fields.stream().toArray(String[]::new) 
: null;
+
+        FuzzyLikeThisQueryBuilder builder = new FuzzyLikeThisQueryBuilder(fs, 
likeText);
+
+        builder.analyzer(analyzer)
+            .fuzziness(fuzziness)
+            .ignoreTF(ignoreTF)
+            .maxQueryTerms(maxNumTerms)
+            .prefixLength(prefixLength)
+            .failOnUnsupportedField(failOnUnsupportedField);
+
+        return Optional.of(builder);
+    }
+
+    /**
+     * Builds the {@link FuzzyLikeThisQuery} for this shard.
+     * <p>
+     * When no fields were given the default field of the context is used. Otherwise only fields
+     * that are mapped, tokenized and of a supported type are kept.
+     *
+     * @param context the shard context used to resolve field mappers and analyzers
+     * @return the query, with {@code likeText} registered for every retained field
+     * @throws QueryShardException if none of the provided fields can be used, if
+     *         {@code failOnUnsupportedField} is set and some field was dropped, or if the
+     *         requested analyzer does not exist
+     * @throws ElasticsearchParseException if the fuzziness is negative or a fractional edit distance
+     */
+    @Override
+    protected Query doToQuery(QueryShardContext context) throws IOException {
+        final List<String> fields;
+        if (this.fields == null) {
+            fields = Collections.singletonList(context.defaultField());
+        } else {
+            // resolve each mapper once rather than once per pipeline stage
+            fields = Arrays.stream(this.fields)
+                        .map(x -> context.fieldMapper(x))
+                        .filter(mapper -> mapper != null)
+                        .filter(mapper -> mapper.tokenized())
+                        .filter(mapper -> SUPPORTED_TYPES.contains(mapper.typeName()))
+                        .map(mapper -> mapper.name())
+                        .collect(Collectors.toList());
+            if (fields.isEmpty()) {
+                throw new QueryShardException(context, "fuzzy_like_this all provided fields are unknown or not tokenized");
+            }
+
+            if (failOnUnsupportedField && fields.size() != this.fields.length) {
+                List<String> unsupportedFields = Stream.of(this.fields)
+                        .filter(x -> !fields.contains(x))
+                        .collect(Collectors.toList());
+                throw new QueryShardException(context, "fuzzy_like_this some fields are either unknown/untokenized/non-text: {}", unsupportedFields);
+            }
+        }
+
+        final Analyzer analyzer;
+        if (this.analyzer == null) {
+            analyzer = context.getMapperService().searchAnalyzer();
+        } else {
+            analyzer = context.getMapperService().getIndexAnalyzers().get(this.analyzer);
+            if (analyzer == null) {
+                // fail here with a clear message instead of a NullPointerException at rewrite time
+                throw new QueryShardException(context, "fuzzy_like_this analyzer [{}] not found", this.analyzer);
+            }
+        }
+
+        FuzzyLikeThisQuery query = new FuzzyLikeThisQuery(maxQueryTerms, analyzer);
+        float minSimilarity = fuzziness.asFloat();
+        if (minSimilarity >= 1.0f && minSimilarity != (int)minSimilarity) {
+            throw new ElasticsearchParseException("fractional edit distances are not allowed");
+        }
+        if (minSimilarity < 0.0f)  {
+            throw new ElasticsearchParseException("minimumSimilarity cannot be less than 0");
+        }
+        for (String field : fields) {
+            query.addTerms(likeText, field, minSimilarity, prefixLength);
+        }
+        query.setIgnoreTF(ignoreTF);
+        return query;
+    }
+
+    /**
+     * Compares all the settings of this builder with those of {@code other}.
+     */
+    @Override
+    protected boolean doEquals(FuzzyLikeThisQueryBuilder other) {
+        // primitive settings are compared directly, object-valued ones null-safely
+        boolean samePrimitives = failOnUnsupportedField == other.failOnUnsupportedField
+            && ignoreTF == other.ignoreTF
+            && maxQueryTerms == other.maxQueryTerms
+            && prefixLength == other.prefixLength;
+        if (!samePrimitives) {
+            return false;
+        }
+        return Objects.equals(analyzer, other.analyzer)
+            && Arrays.equals(fields, other.fields)
+            && Objects.equals(fuzziness, other.fuzziness)
+            && Objects.equals(likeText, other.likeText);
+    }
+
+    /**
+     * Hashes the same settings that {@code doEquals} compares.
+     */
+    @Override
+    protected int doHashCode() {
+        // hash the array by content: passing it to Objects.hash directly would use its identity
+        // hash, so two builders considered equal by Arrays.equals could hash differently
+        return Objects.hash(analyzer, Arrays.hashCode(fields), failOnUnsupportedField, fuzziness,
+                ignoreTF, likeText, maxQueryTerms, prefixLength);
+    }
+}
diff --git 
a/src/main/java/org/wikimedia/search/extra/fuzzylike/package-info.java 
b/src/main/java/org/wikimedia/search/extra/fuzzylike/package-info.java
new file mode 100644
index 0000000..e1dbd13
--- /dev/null
+++ b/src/main/java/org/wikimedia/search/extra/fuzzylike/package-info.java
@@ -0,0 +1,7 @@
+/**
+ * FuzzyLikeThis query.
+ */
+@javax.annotation.ParametersAreNonnullByDefault
+@org.wikimedia.search.extra.util.FieldsAreNonNullByDefault
+@org.wikimedia.search.extra.util.ReturnTypesAreNonNullByDefault
+package org.wikimedia.search.extra.fuzzylike;
diff --git 
a/src/test/java/org/wikimedia/search/extra/fuzzylike/FuzzyLikeThisIntegrationTest.java
 
b/src/test/java/org/wikimedia/search/extra/fuzzylike/FuzzyLikeThisIntegrationTest.java
new file mode 100644
index 0000000..7e3fb68
--- /dev/null
+++ 
b/src/test/java/org/wikimedia/search/extra/fuzzylike/FuzzyLikeThisIntegrationTest.java
@@ -0,0 +1,99 @@
+package org.wikimedia.search.extra.fuzzylike;
+
+import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
+import static 
org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
+import static 
org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertFailures;
+import static 
org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertFirstHit;
+import static 
org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount;
+import static 
org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertNoSearchHits;
+import static 
org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertSearchHits;
+import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.hasId;
+import static org.hamcrest.Matchers.containsString;
+
+import java.io.IOException;
+import java.util.concurrent.ExecutionException;
+
+import org.elasticsearch.action.index.IndexRequestBuilder;
+import org.elasticsearch.action.search.SearchResponse;
+import org.elasticsearch.common.unit.Fuzziness;
+import org.elasticsearch.common.xcontent.XContentBuilder;
+import org.elasticsearch.rest.RestStatus;
+import org.junit.Before;
+import org.junit.Test;
+import org.wikimedia.search.extra.AbstractPluginIntegrationTest;
+
+@Deprecated
+public class FuzzyLikeThisIntegrationTest extends 
AbstractPluginIntegrationTest {
+
+    @Before
+    private void setup() throws IOException, InterruptedException, 
ExecutionException {
+        XContentBuilder mapping = jsonBuilder().startObject();
+        mapping.startObject("test").startObject("properties");
+        mapping.startObject("test");
+        mapping.field("type", "text");
+        mapping.endObject()
+            .endObject()
+            .endObject()
+            .endObject();
+
+        assertAcked(prepareCreate("test").addMapping("test", mapping));
+        ensureGreen();
+
+        indexRandom(false, doc("image", "There is nothing worse than a sharp 
image of a fuzzy concept."));
+        indexRandom(false, doc("humans", "From time to time, it is worth 
wandering around the fuzzy border " +
+                "regions of what you do, if only to remind yourself that no 
human activity is an island."));
+        indexRandom(false, doc("engineers", "Unfortunately, I'm an engineer. " 
+
+                "I'm always thinking about, what's the task and how do I get 
it done? " +
+                "And some of my tasks are pretty broad, and pretty fuzzy, and 
pretty funky, but that's the way I think."));
+        indexRandom(false, doc("science", "People assume that science is a 
very cold sort of profession, " +
+                "whereas writing novels is a warm and fuzzy intuitive thing. 
But in fact, they are not at all different."));
+        indexRandom(false, doc("nostalgia", "Nostalgia is something we think 
of as fuzzy. " +
+                "But it's pain. Pain concerning the past."));
+        refresh();
+    }
+
+    private IndexRequestBuilder doc(String id, String fieldValue) {
+        return client().prepareIndex("test", "test", id).setSource("test", 
fieldValue);
+    }
+
+    @Test
+    public void testFuzzyLikeThis() {
+        FuzzyLikeThisQueryBuilder builder;
+        SearchResponse resp;
+
+        builder = fuzzyLikeThisQuery("test", "sharp image fuzzy concpt");
+        resp = 
client().prepareSearch("test").setTypes("test").setQuery(builder).get();
+        assertHitCount(resp, 5);
+        assertFirstHit(resp, hasId("image"));
+
+        builder = fuzzyLikeThisQuery("test", "sharp image concpt");
+        resp = 
client().prepareSearch("test").setTypes("test").setQuery(builder).get();
+        assertHitCount(resp, 1);
+        assertFirstHit(resp, hasId("image"));
+
+        // Fuzziness to zero is unsupported and causes some confusion between 
FuzzyLikeQuery
+        // and FuzzyTermEnums. A lot of code still rely on a float instead of 
maxEdits as a int.
+        builder = fuzzyLikeThisQuery("test", 
"nostalagia").fuzziness(Fuzziness.ZERO);
+        
assertFailures(client().prepareSearch("test").setTypes("test").setQuery(builder),
+                RestStatus.INTERNAL_SERVER_ERROR,
+                containsString("with transpositions enabled, distances > 2 are 
not supported"));
+
+        builder = fuzzyLikeThisQuery("test", 
"nostalagio").fuzziness(Fuzziness.ONE);
+        resp = 
client().prepareSearch("test").setTypes("test").setQuery(builder).get();
+        assertNoSearchHits(resp);
+
+        // AUTO is like 1 (auto fuzziness is not really supported)
+        builder = fuzzyLikeThisQuery("test", 
"nostalagio").fuzziness(Fuzziness.AUTO);
+        resp = 
client().prepareSearch("test").setTypes("test").setQuery(builder).get();
+        assertNoSearchHits(resp);
+
+        builder = fuzzyLikeThisQuery("test", 
"nostalagio").fuzziness(Fuzziness.TWO);
+        resp = 
client().prepareSearch("test").setTypes("test").setQuery(builder).get();
+        assertSearchHits(resp, "nostalgia");
+    }
+
+    @Deprecated
+    public FuzzyLikeThisQueryBuilder fuzzyLikeThisQuery(String field, String 
likeText) {
+        return new FuzzyLikeThisQueryBuilder(new String[]{field}, likeText);
+    }
+}

-- 
To view, visit https://gerrit.wikimedia.org/r/383084
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I3f3051c87297d3194f52fff6f3ed77b0326c4d84
Gerrit-PatchSet: 3
Gerrit-Project: search/extra
Gerrit-Branch: master
Gerrit-Owner: DCausse <dcau...@wikimedia.org>
Gerrit-Reviewer: EBernhardson <ebernhard...@wikimedia.org>
Gerrit-Reviewer: Gehel <guillaume.leder...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] search/extra[master]: Revert "Stop maintaining FuzzyLikeThis"

Reply via email to