All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.sindicetech.siren.search.node.NodeFuzzyQuery Maven / Gradle / Ivy

The newest version!
/**
 * Copyright (c) 2014, Sindice Limited. All Rights Reserved.
 *
 * This file is part of the SIREn project.
 *
 * SIREn is a free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * SIREn is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public
 * License along with this program. If not, see .
 */

package com.sindicetech.siren.search.node;

import org.apache.lucene.index.SingleTermsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.FuzzyTermsEnum;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.ToStringUtils;
import org.apache.lucene.util.automaton.LevenshteinAutomata;

import java.io.IOException;

/**
 * Implements the fuzzy search query. The similarity measurement
 * is based on the Levenshtein (edit distance) algorithm.
 *
 * 

This query uses {@link MultiTermQuery.TopTermsScoringBooleanQueryRewrite} * as default. So terms will be collected and scored according to their * edit distance. Only the top terms are used for building the {@link NodeBooleanQuery}. * It is not recommended to change the rewrite mode for fuzzy queries. * *

Code taken from {@link FuzzyQuery} and adapted for SIREn. */ public class NodeFuzzyQuery extends MultiNodeTermQuery { public final static int defaultMaxEdits = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE; public final static int defaultPrefixLength = 0; public final static int defaultMaxExpansions = 50; public final static boolean defaultTranspositions = true; private final int maxEdits; private final int maxExpansions; private final boolean transpositions; private final int prefixLength; private final Term term; /** * Create a new SirenFuzzyQuery that will match terms with a similarity * of at least minimumSimilarity to term. * If a prefixLength > 0 is specified, a common prefix * of that length is also required. * * @param term the term to search for * @param maxEdits must be >= 0 and <= {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}. * @param prefixLength length of common (non-fuzzy) prefix * @param maxExpansions the maximum number of terms to match. If this number is * greater than {@link NodeBooleanQuery#getMaxClauseCount} when the query is rewritten, * then the maxClauseCount will be used instead. * @param transpositions true if transpositions should be treated as a primitive * edit operation. If this is false, comparisons will implement the classic * Levenshtein algorithm. */ public NodeFuzzyQuery(final Term term, final int maxEdits, final int prefixLength, final int maxExpansions, final boolean transpositions) { super(term.field()); if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) { throw new IllegalArgumentException("maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE); } if (prefixLength < 0) { throw new IllegalArgumentException("prefixLength cannot be negative."); } if (maxExpansions < 0) { throw new IllegalArgumentException("maxExpansions cannot be negative."); } this.term = term; this.maxEdits = maxEdits; this.prefixLength = prefixLength; this.transpositions = transpositions; this.maxExpansions = maxExpansions; this.setRewriteMethod(new MultiNodeTermQuery.TopTermsScoringNodeBooleanQueryRewrite(maxExpansions)); } /** * Calls {@code #NodeFuzzyQuery(term, maxEdits, prefixLength, Integer.MAX_VALUE)}. */ public NodeFuzzyQuery(final Term term, final int maxEdits, final int prefixLength) { this(term, maxEdits, prefixLength, defaultMaxExpansions, defaultTranspositions); } /** * Calls {@code #NodeFuzzyQuery(term, maxEdits, 0, Integer.MAX_VALUE)}. */ public NodeFuzzyQuery(final Term term, final int maxEdits) { this(term, maxEdits, defaultPrefixLength); } /** * Calls {@code #NodeFuzzyQuery(term, LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE, 0, Integer.MAX_VALUE)}. */ public NodeFuzzyQuery(final Term term) { this(term, defaultMaxEdits); } /** * @return the maximum number of edit distances allowed for this query to match. */ public int getMaxEdits() { return maxEdits; } /** * Returns the non-fuzzy prefix length. This is the number of characters at the start * of a term that must be identical (not fuzzy) to the query term if the query * is to match that term. */ public int getPrefixLength() { return prefixLength; } @Override protected TermsEnum getTermsEnum(final Terms terms, final AttributeSource atts) throws IOException { if (maxEdits == 0 || prefixLength >= term.text().length()) { // can only match if it's exact return new SingleTermsEnum(terms.iterator(null), term.bytes()); } return new FuzzyTermsEnum(terms, atts, this.getTerm(), maxEdits, prefixLength, transpositions); } /** * Returns the pattern term. */ public Term getTerm() { return term; } @Override public String toString(final String field) { final StringBuilder buffer = new StringBuilder(); // Wrap the term in simple quotes to protect it buffer.append("'").append(term.text()).append("'~"); buffer.append(Integer.toString(maxEdits)); buffer.append(ToStringUtils.boost(this.getBoost())); return wrapToStringWithDatatype(buffer).toString(); } @Override public int hashCode() { final int prime = 31; int result = super.hashCode(); result = prime * result + maxEdits; result = prime * result + prefixLength; result = prime * result + maxExpansions; result = prime * result + (transpositions ? 0 : 1); result = prime * result + ((term == null) ? 0 : term.hashCode()); return result; } @Override public boolean equals(final Object obj) { if (this == obj) return true; if (!super.equals(obj)) return false; if (this.getClass() != obj.getClass()) return false; final NodeFuzzyQuery other = (NodeFuzzyQuery) obj; if (maxEdits != other.maxEdits) return false; if (prefixLength != other.prefixLength) return false; if (maxExpansions != other.maxExpansions) return false; if (transpositions != other.transpositions) return false; if (term == null) { if (other.term != null) return false; } else if (!term.equals(other.term)) return false; return true; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy