// org.apache.lucene.sandbox.queries.FuzzyLikeThisQuery Maven / Gradle / Ivy
//
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of lucene-sandbox Show documentation
// Apache Lucene (module: sandbox)
// There is a newer version: 10.0.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.sandbox.queries;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Objects;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostAttribute;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.FuzzyTermsEnum;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.automaton.LevenshteinAutomata;

/**
 * Fuzzifies ALL terms provided as strings and then picks the best n differentiating terms. In
 * effect this mixes the behaviour of FuzzyQuery and MoreLikeThis but with special consideration of
 * fuzzy scoring factors. This generally produces good results for queries where users may provide
 * details in a number of fields and have no knowledge of boolean query syntax and also want a
 * degree of fuzzy matching and a fast query.
 *
 * <p>For each source term the fuzzy variants are held in a BooleanQuery with no coord factor
 * (because we are not looking for matches on multiple variants in any one doc). Additionally, a
 * specialized TermQuery is used for variants and does not use that variant term's IDF because this
 * would favour rarer terms eg misspellings. Instead, all variants use the same IDF ranking (the one
 * for the source query term) and this is factored into the variant's boost. If the source query
 * term does not exist in the index the average IDF of the variants is used.
 */
public class FuzzyLikeThisQuery extends Query {
  // TODO: generalize this query (at least it should not reuse this static sim!)
  // a better way might be to convert this into multitermquery rewrite methods.
  // the rewrite method can 'average' the TermStates's term statistics (docfreq,totalTermFreq)
  // provided to TermQuery, so that the general idea is agnostic to any scoring system...
  static TFIDFSimilarity sim = new ClassicSimilarity();

  /** Input registered through the public addTerms method, one entry per call. */
  ArrayList<FieldVals> fieldVals = new ArrayList<>();

  /** Analyzer used to tokenize each registered query string when the query is rewritten. */
  Analyzer analyzer;

  /** Capacity of the per-term queue of fuzzy variants built while rewriting. */
  int MAX_VARIANTS_PER_TERM = 50;

  /** When true, newTermQuery wraps each variant TermQuery in a ConstantScoreQuery. */
  boolean ignoreTF = false;

  /** Capacity of the queue that ranks variants across all terms and fields during rewrite. */
  private int maxNumTerms;

  /**
   * Hashes the same state that equalsTo compares: analyzer, registered field values, the ignoreTF
   * flag and maxNumTerms, seeded with the class hash.
   */
  @Override
  public int hashCode() {
    int hash = classHash();
    final int[] parts = {
      Objects.hashCode(analyzer),
      Objects.hashCode(fieldVals),
      Boolean.hashCode(ignoreTF), // 1231 when true, 1237 when false
      maxNumTerms
    };
    for (int part : parts) {
      hash = 31 * hash + part;
    }
    return hash;
  }

  /** Two queries are equal when they have the same class and equalsTo accepts the other one. */
  @Override
  public boolean equals(Object other) {
    if (!sameClassAs(other)) {
      return false;
    }
    FuzzyLikeThisQuery that = getClass().cast(other);
    return equalsTo(that);
  }

  /** Field-by-field comparison used by equals once the class check has passed. */
  private boolean equalsTo(FuzzyLikeThisQuery other) {
    if (!Objects.equals(analyzer, other.analyzer)) {
      return false;
    }
    if (!Objects.equals(fieldVals, other.fieldVals)) {
      return false;
    }
    if (ignoreTF != other.ignoreTF) {
      return false;
    }
    return maxNumTerms == other.maxNumTerms;
  }

  /**
   * Creates a query with no input yet; terms are supplied afterwards through addTerms.
   *
   * @param maxNumTerms The total number of terms clauses that will appear once rewritten as a
   *     BooleanQuery (used as the capacity of the queue that ranks variants during rewrite)
   * @param analyzer analyzer used to tokenize the registered query strings during rewrite
   */
  public FuzzyLikeThisQuery(int maxNumTerms, Analyzer analyzer) {
    this.maxNumTerms = maxNumTerms;
    this.analyzer = analyzer;
  }

  /**
   * Holder for the arguments of one addTerms call: the target field, the raw query string, and the
   * fuzzy-matching parameters (maximum edits and required prefix length).
   */
  static class FieldVals {
    String queryString;
    String fieldName;
    int maxEdits;
    int prefixLength;

    public FieldVals(String name, int maxEdits, int length, String queryString) {
      this.fieldName = name;
      this.maxEdits = maxEdits;
      this.prefixLength = length;
      this.queryString = queryString;
    }

    @Override
    public int hashCode() {
      // Objects.hash performs the same 31-based fold, starting at 1 and mapping null to 0.
      return Objects.hash(fieldName, maxEdits, prefixLength, queryString);
    }

    @Override
    public boolean equals(Object obj) {
      if (this == obj) {
        return true;
      }
      if (obj == null || getClass() != obj.getClass()) {
        return false;
      }
      FieldVals that = (FieldVals) obj;
      return Objects.equals(fieldName, that.fieldName)
          && maxEdits == that.maxEdits
          && prefixLength == that.prefixLength
          && Objects.equals(queryString, that.queryString);
    }
  }

  /**
   * Adds user input for "fuzzification"
   *
   * @param queryString The string which will be parsed by the analyzer and for which fuzzy variants
   *     will be generated
   * @param fieldName The field whose terms are searched for variants of the analyzed tokens
   * @param minSimilarity Despite the name, this is used as the maximum number of edits for the term
   *     variants; it must be a whole number between 0 and
   *     LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE, inclusive (see FuzzyTermsEnum)
   * @param prefixLength Length of required common prefix on variant terms (see FuzzyTermsEnum)
   * @throws IllegalArgumentException if minSimilarity is not a whole number in the supported range
   */
  public void addTerms(
      String queryString, String fieldName, float minSimilarity, int prefixLength) {
    final int maxEdits = (int) minSimilarity;
    // The cast truncates, so a mismatch means the caller passed a fractional (or NaN) value.
    final boolean wholeNumber = maxEdits == minSimilarity;
    final boolean inRange =
        maxEdits >= 0 && maxEdits <= LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
    if (!(wholeNumber && inRange)) {
      throw new IllegalArgumentException(
          "minSimilarity must integer value between 0 and "
              + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE
              + ", inclusive; got "
              + minSimilarity);
    }
    FieldVals entry = new FieldVals(fieldName, maxEdits, prefixLength, queryString);
    fieldVals.add(entry);
  }

  /**
   * Tokenizes one registered query string and offers the best fuzzy variants of every distinct
   * token to the shared queue.
   *
   * <p>Does nothing when the entry has no query string or the reader has no terms for the field.
   *
   * @param reader index used for term enumeration, document frequencies and the document count
   * @param f the field/query-string entry to expand
   * @param q queue that ranks variants across all terms and fields; receives the scored variants
   * @throws IOException if reading from the index or the token stream fails
   */
  private void addTerms(IndexReader reader, FieldVals f, ScoreTermQueue q) throws IOException {
    if (f.queryString == null) {
      return;
    }
    final Terms terms = MultiTerms.getTerms(reader, f.fieldName);
    if (terms == null) {
      return;
    }
    try (TokenStream ts = analyzer.tokenStream(f.fieldName, f.queryString)) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);

      int corpusNumDocs = reader.numDocs();
      HashSet<String> processedTerms = new HashSet<>();
      ts.reset();
      while (ts.incrementToken()) {
        String term = termAtt.toString();
        if (!processedTerms.add(term)) {
          continue; // this token text has already been expanded for this entry
        }
        ScoreTermQueue variantsQ =
            new ScoreTermQueue(
                MAX_VARIANTS_PER_TERM); // maxNum variants considered for any one term
        float minScore = 0;
        Term startTerm = new Term(f.fieldName, term);
        FuzzyTermsEnum fe =
            new FuzzyTermsEnum(terms, startTerm, f.maxEdits, f.prefixLength, true);
        // store the df so all variants use same idf
        int df = reader.docFreq(startTerm);
        int numVariants = 0;
        // Accumulated in a long: the sum over many variants can exceed the int range even though
        // each individual docFreq (and therefore the average) fits in an int.
        long totalVariantDocFreqs = 0;
        BytesRef possibleMatch;
        BoostAttribute boostAtt = fe.attributes().addAttribute(BoostAttribute.class);
        while ((possibleMatch = fe.next()) != null) {
          numVariants++;
          totalVariantDocFreqs += fe.docFreq();
          float score = boostAtt.getBoost();
          if (variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore) {
            ScoreTerm st =
                new ScoreTerm(
                    new Term(startTerm.field(), BytesRef.deepCopyOf(possibleMatch)),
                    score,
                    startTerm);
            variantsQ.insertWithOverflow(st);
            minScore = variantsQ.top().score; // maintain minScore
          }
          fe.setMaxNonCompetitiveBoost(
              variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY);
        }

        if (numVariants > 0) {
          if (df == 0) {
            // no direct match we can use as df for all variants: use avg df of all variants
            df = (int) (totalVariantDocFreqs / numVariants);
          }

          // take the top variants (scored by edit distance) and reset the score
          // to include an IDF factor then add to the global queue for ranking
          // overall top query terms
          int size = variantsQ.size();
          for (int i = 0; i < size; i++) {
            ScoreTerm st = variantsQ.pop();
            st.score = (st.score * st.score) * sim.idf(df, corpusNumDocs);
            q.insertWithOverflow(st);
          }
        }
      }
      ts.end();
    }
  }

  /**
   * Builds the query used for a single variant term.
   *
   * <p>With ignoreTF set, the plain TermQuery is wrapped in a ConstantScoreQuery. Otherwise the
   * TermQuery is given an artificial TermStates whose overall df and ttf are equal to 1.
   *
   * @param reader reader whose leaves are probed for the term
   * @param term the variant term to query
   * @return the query for the variant
   * @throws IOException if reading a leaf's terms fails
   */
  private Query newTermQuery(IndexReader reader, Term term) throws IOException {
    if (ignoreTF) {
      return new ConstantScoreQuery(new TermQuery(term));
    }

    TermStates states = new TermStates(reader.getContext());
    for (LeafReaderContext leaf : reader.leaves()) {
      TermsEnum leafTerms = Terms.getTerms(leaf.reader(), term.field()).iterator();
      if (!leafTerms.seekExact(term.bytes())) {
        continue; // the term does not occur in this leaf
      }
      // we want the total df and ttf to be 1, so register whatever brings the running total to 1
      int delta = 1 - states.docFreq();
      states.register(leafTerms.termState(), leaf.ord, delta, delta);
    }
    return new TermQuery(term, states);
  }

  /**
   * Hands this query to the visitor as a leaf; the fuzzy variants only exist after rewrite, so no
   * sub-queries are visited here.
   */
  @Override
  public void visit(QueryVisitor visitor) {
    visitor.visitLeaf(this);
  }

  /**
   * Expands the registered input into a BooleanQuery of fuzzy term variants.
   *
   * <p>The variants of all registered entries are ranked in one queue of capacity maxNumTerms and
   * the survivors are grouped by the source term they were derived from. A group with a single
   * variant becomes one boosted SHOULD clause; a larger group becomes a nested BooleanQuery of
   * boosted SHOULD clauses.
   *
   * @param reader the index to enumerate terms from
   * @return the rewritten query; it has no clauses when no variants were found
   * @throws IOException if reading from the index fails
   */
  @Override
  public Query rewrite(IndexReader reader) throws IOException {
    ScoreTermQueue q = new ScoreTermQueue(maxNumTerms);
    // load up the list of possible terms
    for (FieldVals f : fieldVals) {
      addTerms(reader, f, q);
    }

    BooleanQuery.Builder bq = new BooleanQuery.Builder();

    // create BooleanQueries to hold the variants for each token/field pair and ensure it
    // has no coord factor
    // Step 1: sort the termqueries by term/field
    HashMap<Term, ArrayList<ScoreTerm>> variantQueries = new HashMap<>();
    int size = q.size();
    for (int i = 0; i < size; i++) {
      ScoreTerm st = q.pop();
      variantQueries.computeIfAbsent(st.fuzziedSourceTerm, k -> new ArrayList<>()).add(st);
    }
    // Step 2: Organize the sorted termqueries into zero-coord scoring boolean queries
    for (ArrayList<ScoreTerm> variants : variantQueries.values()) {
      if (variants.size() == 1) {
        // optimize where only one selected variant
        ScoreTerm st = variants.get(0);
        Query tq = newTermQuery(reader, st.term);
        // set the boost to a mix of IDF and score
        bq.add(new BoostQuery(tq, st.score), BooleanClause.Occur.SHOULD);
      } else {
        BooleanQuery.Builder termVariants = new BooleanQuery.Builder();
        for (ScoreTerm st : variants) {
          // found a match
          Query tq = newTermQuery(reader, st.term);
          // set the boost using the ScoreTerm's score
          termVariants.add(
              new BoostQuery(tq, st.score), BooleanClause.Occur.SHOULD); // add to query
        }
        bq.add(termVariants.build(), BooleanClause.Occur.SHOULD); // add to query
      }
    }
    // TODO possible alternative step 3 - organize above booleans into a new layer of field-based
    // booleans with a minimum-should-match of NumFields-1?
    return bq.build();
  }

  /**
   * Holds info for a fuzzy term variant. The score initially reflects edit distance (for ranking
   * the best variants of one term) and is later reset to include IDF so that it can be ranked
   * against all other terms/fields.
   */
  private static class ScoreTerm {
    /** The variant term found in the index. */
    public Term term;

    /** Current ranking score of this variant. */
    public float score;

    /** The analyzed source term this variant was derived from. */
    Term fuzziedSourceTerm;

    public ScoreTerm(Term term, float score, Term fuzziedSourceTerm) {
      this.fuzziedSourceTerm = fuzziedSourceTerm;
      this.score = score;
      this.term = term;
    }
  }

  /** Bounded priority queue of ScoreTerm ordered by score, with ties broken by term order. */
  private static class ScoreTermQueue extends PriorityQueue<ScoreTerm> {
    public ScoreTermQueue(int size) {
      super(size);
    }

    /**
     * Orders by ascending score; for equal scores the term that compares greater is the lesser
     * queue element.
     *
     * @see org.apache.lucene.util.PriorityQueue#lessThan(java.lang.Object, java.lang.Object)
     */
    @Override
    protected boolean lessThan(ScoreTerm termA, ScoreTerm termB) {
      if (termA.score == termB.score) {
        return termA.term.compareTo(termB.term) > 0;
      }
      return termA.score < termB.score;
    }
  }

  /**
   * Describes this query as its registered inputs followed by its settings, instead of returning
   * null (which made string concatenation and logging of this query print "null").
   *
   * <p>Each registered entry is printed as {@code field:"query string"~maxEdits}; the field prefix
   * is omitted when it equals the supplied default field.
   *
   * @param field the default field, whose name is left out of the output
   * @return a non-null description of this query
   * @see org.apache.lucene.search.Query#toString(java.lang.String)
   */
  @Override
  public String toString(String field) {
    StringBuilder sb = new StringBuilder("FuzzyLikeThisQuery(");
    for (FieldVals f : fieldVals) {
      if (!Objects.equals(f.fieldName, field)) {
        sb.append(f.fieldName).append(':');
      }
      sb.append('"').append(f.queryString).append("\"~").append(f.maxEdits).append(", ");
    }
    sb.append("maxNumTerms=").append(maxNumTerms);
    sb.append(", ignoreTF=").append(ignoreTF);
    return sb.append(')').toString();
  }

  /**
   * Returns the ignoreTF flag. When it is true, newTermQuery wraps each variant TermQuery in a
   * ConstantScoreQuery.
   */
  public boolean isIgnoreTF() {
    return ignoreTF;
  }

  /**
   * Sets the ignoreTF flag, which is read when the query is rewritten and also takes part in
   * equals and hashCode.
   *
   * @param ignoreTF true to have each variant TermQuery wrapped in a ConstantScoreQuery
   */
  public void setIgnoreTF(boolean ignoreTF) {
    this.ignoreTF = ignoreTF;
  }
}