// org.apache.lucene.sandbox.queries.SlowFuzzyQuery -- Maven / Gradle / Ivy
// (artifact page residue: "Show all versions of lucene-sandbox", "Show documentation")
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.sandbox.queries;
import java.io.IOException;
import org.apache.lucene.index.SingleTermsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanQuery; // javadocs
import org.apache.lucene.search.FuzzyQuery; // javadocs
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
/**
 * Implements the classic fuzzy search query. The similarity measurement
 * is based on the Levenshtein (edit distance) algorithm.
 *
 * <p>Note that, unlike {@link FuzzyQuery}, this query will silently allow
 * for a (possibly huge) number of edit distances in comparisons, and may
 * be extremely slow (comparing every term in the index).
 *
 * @deprecated Use {@link FuzzyQuery} instead.
 */
@Deprecated
public class SlowFuzzyQuery extends MultiTermQuery {

  /**
   * Default value for {@code minimumSimilarity}, taken from
   * {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}.
   */
  public static final float defaultMinSimilarity = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;

  /** Default length of the required common (non-fuzzy) prefix. */
  public static final int defaultPrefixLength = 0;

  /** Default maximum number of terms the query expands to. */
  public static final int defaultMaxExpansions = 50;

  private final float minimumSimilarity;
  private final int prefixLength;

  /**
   * {@code false} when the query term is too short for any inexact match to
   * reach the requested similarity; the query then degrades to an exact lookup.
   */
  private final boolean termLongEnough;

  /** The pattern term this query matches against. */
  protected Term term;

  /**
   * Create a new SlowFuzzyQuery that will match terms with a similarity
   * of at least {@code minimumSimilarity} to {@code term}.
   * If a {@code prefixLength} &gt; 0 is specified, a common prefix
   * of that length is also required.
   *
   * @param term the term to search for
   * @param minimumSimilarity a value between 0 and 1 to set the required similarity
   *  between the query term and the matching terms. For example, for a
   *  {@code minimumSimilarity} of {@code 0.5} a term of the same length
   *  as the query term is considered similar to the query term if the edit distance
   *  between both terms is less than {@code length(term)*0.5}.
   *  <p>
   *  Alternatively, if {@code minimumSimilarity} is &gt;= 1f, it is interpreted
   *  as a pure Levenshtein edit distance. For example, a value of {@code 2f}
   *  will match all terms within an edit distance of {@code 2} from the
   *  query term. Edit distances specified in this way may not be fractional.
   * @param prefixLength length of common (non-fuzzy) prefix
   * @param maxExpansions the maximum number of terms to match. If this number is
   *  greater than {@link BooleanQuery#getMaxClauseCount} when the query is rewritten,
   *  then the maxClauseCount will be used instead.
   * @throws IllegalArgumentException if {@code minimumSimilarity} is NaN, is &lt; 0,
   *  or is &gt;= 1 with a fractional part; if {@code prefixLength} &lt; 0;
   *  or if {@code maxExpansions} &lt; 0
   * @throws NullPointerException if {@code term} is null
   */
  public SlowFuzzyQuery(Term term, float minimumSimilarity, int prefixLength,
      int maxExpansions) {
    super(term.field());
    this.term = term;

    // NaN compares false against every bound below, so it must be rejected explicitly;
    // otherwise it would slip through validation and silently yield an exact-match query.
    if (Float.isNaN(minimumSimilarity)) {
      throw new IllegalArgumentException("minimumSimilarity is NaN");
    }
    // Values >= 1 are raw edit distances and therefore have to be whole numbers.
    if (minimumSimilarity >= 1.0f && minimumSimilarity != (int) minimumSimilarity) {
      throw new IllegalArgumentException("fractional edit distances are not allowed");
    }
    if (minimumSimilarity < 0.0f) {
      throw new IllegalArgumentException("minimumSimilarity < 0");
    }
    if (prefixLength < 0) {
      throw new IllegalArgumentException("prefixLength < 0");
    }
    if (maxExpansions < 0) {
      throw new IllegalArgumentException("maxExpansions < 0");
    }

    setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(maxExpansions));

    // Length is measured in Unicode code points, not UTF-16 chars.
    String text = term.text();
    int len = text.codePointCount(0, text.length());

    // A non-empty term can match inexactly when an explicit edit distance was given,
    // or when it is longer than 1 / (1 - minimumSimilarity); shorter terms can only
    // match exactly (see getTermsEnum).
    this.termLongEnough =
        len > 0 && (minimumSimilarity >= 1f || len > 1.0f / (1.0f - minimumSimilarity));

    this.minimumSimilarity = minimumSimilarity;
    this.prefixLength = prefixLength;
  }

  /**
   * Calls {@link #SlowFuzzyQuery(Term, float, int, int)
   * SlowFuzzyQuery(term, minimumSimilarity, prefixLength, defaultMaxExpansions)}.
   */
  public SlowFuzzyQuery(Term term, float minimumSimilarity, int prefixLength) {
    this(term, minimumSimilarity, prefixLength, defaultMaxExpansions);
  }

  /**
   * Calls {@link #SlowFuzzyQuery(Term, float, int, int)
   * SlowFuzzyQuery(term, minimumSimilarity, defaultPrefixLength, defaultMaxExpansions)}.
   */
  public SlowFuzzyQuery(Term term, float minimumSimilarity) {
    this(term, minimumSimilarity, defaultPrefixLength, defaultMaxExpansions);
  }

  /**
   * Calls {@link #SlowFuzzyQuery(Term, float, int, int)
   * SlowFuzzyQuery(term, defaultMinSimilarity, defaultPrefixLength, defaultMaxExpansions)}.
   */
  public SlowFuzzyQuery(Term term) {
    this(term, defaultMinSimilarity, defaultPrefixLength, defaultMaxExpansions);
  }

  /**
   * Returns the minimum similarity that is required for this query to match.
   *
   * @return the value given to the constructor: either a similarity below 1,
   *  or a whole-number edit distance &gt;= 1
   */
  public float getMinSimilarity() {
    return minimumSimilarity;
  }

  /**
   * Returns the non-fuzzy prefix length. This is the number of characters at the start
   * of a term that must be identical (not fuzzy) to the query term if the query
   * is to match that term.
   */
  public int getPrefixLength() {
    return prefixLength;
  }

  /**
   * Returns the enumeration of candidate terms: a single-term enum when the query
   * term is too short to match anything but itself, otherwise a
   * {@code SlowFuzzyTermsEnum} over the given terms.
   */
  @Override
  protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
    if (!termLongEnough) { // can only match if it's exact
      return new SingleTermsEnum(terms.iterator(), term.bytes());
    }
    return new SlowFuzzyTermsEnum(terms, atts, getTerm(), minimumSimilarity, prefixLength);
  }

  /**
   * Returns the pattern term.
   */
  public Term getTerm() {
    return term;
  }

  /**
   * Renders the query as {@code [field:]text~minimumSimilarity}; the field prefix
   * is omitted when the term's field equals {@code field}.
   */
  @Override
  public String toString(String field) {
    final StringBuilder buffer = new StringBuilder();
    if (!term.field().equals(field)) {
      buffer.append(term.field());
      buffer.append(":");
    }
    buffer.append(term.text());
    buffer.append('~');
    buffer.append(Float.toString(minimumSimilarity));
    return buffer.toString();
  }

  /** Combines the superclass hash with minimumSimilarity, prefixLength and term. */
  @Override
  public int hashCode() {
    final int prime = 31;
    int result = super.hashCode();
    result = prime * result + Float.floatToIntBits(minimumSimilarity);
    result = prime * result + prefixLength;
    result = prime * result + ((term == null) ? 0 : term.hashCode());
    return result;
  }

  /**
   * Two queries are equal when the superclass considers them equal, they are of the
   * same class, and minimumSimilarity (compared bitwise), prefixLength and term agree.
   */
  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (!super.equals(obj)) {
      return false;
    }
    if (getClass() != obj.getClass()) {
      return false;
    }
    SlowFuzzyQuery other = (SlowFuzzyQuery) obj;
    if (Float.floatToIntBits(minimumSimilarity)
        != Float.floatToIntBits(other.minimumSimilarity)) {
      return false;
    }
    if (prefixLength != other.prefixLength) {
      return false;
    }
    // term is protected and non-final, so a subclass may have nulled it.
    if (term == null) {
      return other.term == null;
    }
    return term.equals(other.term);
  }
}