All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.search.spell.LuceneLevenshteinDistance Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.spell;

import org.apache.lucene.util.IntsRef;

/**
 * Damerau-Levenshtein (optimal string alignment) implemented in a consistent way as Lucene's
 * FuzzyTermsEnum with the transpositions option enabled.
 *
 * 

Notes: * *

    *
  • This metric treats full unicode codepoints as characters *
  • This metric scales raw edit distances into a floating point score based upon the shortest * of the two terms *
  • Transpositions of two adjacent codepoints are treated as primitive edits. *
  • Edits are applied in parallel: for example, "ab" and "bca" have distance 3. *
* * NOTE: this class is not particularly efficient. It is only intended for merging results from * multiple DirectSpellCheckers. */ public final class LuceneLevenshteinDistance implements StringDistance { /** Creates a new comparator, mimicing the behavior of Lucene's internal edit distance. */ public LuceneLevenshteinDistance() {} @Override public float getDistance(String target, String other) { IntsRef targetPoints; IntsRef otherPoints; int n; int[][] d; // cost array // NOTE: if we cared, we could 3*m space instead of m*n space, similar to // what LevenshteinDistance does, except cycling thru a ring of three // horizontal cost arrays... but this comparator is never actually used by // DirectSpellChecker, it's only used for merging results from multiple shards // in "distributed spellcheck", and it's inefficient in other ways too... // cheaper to do this up front once targetPoints = toIntsRef(target); otherPoints = toIntsRef(other); n = targetPoints.length; final int m = otherPoints.length; d = new int[n + 1][m + 1]; if (n == 0 || m == 0) { if (n == m) { return 0; } else { return Math.max(n, m); } } // indexes into strings s and t int i; // iterates through s int j; // iterates through t int t_j; // jth character of t int cost; // cost for (i = 0; i <= n; i++) { d[i][0] = i; } for (j = 0; j <= m; j++) { d[0][j] = j; } for (j = 1; j <= m; j++) { t_j = otherPoints.ints[j - 1]; for (i = 1; i <= n; i++) { cost = targetPoints.ints[i - 1] == t_j ? 0 : 1; // minimum of cell to the left+1, to the top+1, diagonally left and up +cost d[i][j] = Math.min(Math.min(d[i - 1][j] + 1, d[i][j - 1] + 1), d[i - 1][j - 1] + cost); // transposition if (i > 1 && j > 1 && targetPoints.ints[i - 1] == otherPoints.ints[j - 2] && targetPoints.ints[i - 2] == otherPoints.ints[j - 1]) { d[i][j] = Math.min(d[i][j], d[i - 2][j - 2] + cost); } } } return 1.0f - ((float) d[n][m] / Math.min(m, n)); } private static IntsRef toIntsRef(String s) { IntsRef ref = new IntsRef(s.length()); // worst case int utf16Len = s.length(); for (int i = 0, cp = 0; i < utf16Len; i += Character.charCount(cp)) { cp = ref.ints[ref.length++] = Character.codePointAt(s, i); } return ref; } @Override public boolean equals(Object obj) { if (this == obj) return true; if (null == obj) return false; return (getClass() == obj.getClass()); } @Override public int hashCode() { // constant hashCode since all instances of this class are equal() return 6; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy