fr.univnantes.termsuite.metrics.DiacriticInsensitiveLevenshtein Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of termsuite-core Show documentation
Show all versions of termsuite-core Show documentation
A Java UIMA-based toolbox for multilingual and efficient terminology extraction and multilingual term alignment
The newest version!
/*******************************************************************************
* Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique)
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*******************************************************************************/
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package fr.univnantes.termsuite.metrics;
import java.text.Collator;
import java.util.Locale;
import java.util.concurrent.atomic.AtomicInteger;
/**
* The {@link Levenshtein} {@link EditDistance} insensitive to diacritics, i.e.
* pairs of words such as <code>café</code> and <code>cafe</code>,
* <code>joão</code> and <code>joao</code> will be considered to have a
* <code>0</code> edit distance or <code>1</code> similarity.
*
* @author Sebastián Peña Saldarriaga
*/
public class DiacriticInsensitiveLevenshtein extends AbstractEditDistance {

    /** Number of {@link #compute(String, String, int)} calls aborted early by the fail-fast check. */
    public static AtomicInteger FastFailures = new AtomicInteger(0);

    /** Number of {@link #compute(String, String, int)} calls that filled the whole matrix. */
    public static AtomicInteger FullComputation = new AtomicInteger(0);

    /**
     * Similarity threshold under which the distance is not computed anymore.
     * The value <code>-1</code> means "no threshold set".
     */
    private double failThreshold = -1;

    /** Locale sensitive string comparator used to compare single characters. */
    private final Collator strCollator;

    /**
     * Creates a distance whose character comparison is delegated to a
     * {@link Collator} for the given locale, set to
     * {@link Collator#PRIMARY} strength.
     * <p>
     * NOTE: per the {@link Collator} documentation, PRIMARY strength usually
     * ignores case differences as well as accents; the exact behaviour is
     * locale dependent.
     *
     * @param locale
     *            The locale used to obtain the collator
     */
    public DiacriticInsensitiveLevenshtein(Locale locale) {
        super();
        // Might be modified depending on the language
        strCollator = Collator.getInstance(locale);
        strCollator.setStrength(Collator.PRIMARY);
    }

    /**
     * Normalizes the specified <code>distance</code> by
     * <code>max(|str|, |rst|)</code>. For historical reasons this method
     * actually returns 1 - normalized distance, making a similarity.
     *
     * @param distance
     *            The edit distance between <code>str</code> and
     *            <code>rst</code>.
     * @param str
     *            A string
     * @param rst
     *            Another string
     * @return A [0, 1] value determined by
     *         <code>1 - distance/max(|str|, |rst|)</code>, or <code>1</code>
     *         when both strings are empty.
     */
    @Override
    public double normalize(int distance, String str, String rst) {
        int longest = Math.max(str.length(), rst.length());
        if (longest == 0) {
            // Two empty strings are identical; dividing by zero here would
            // yield NaN instead of a similarity.
            return 1.0;
        }
        return 1.0 - ((double) distance / longest);
    }

    /**
     * Computes the diacritic-insensitive edit distance between two strings.
     * <p>
     * When a fail threshold has been set with
     * {@link #setFailThreshold(double)}, the computation is allowed to abort
     * as soon as the distance is known to exceed
     * <code>round((1 - threshold) * max(|str|, |rst|))</code>. When no
     * threshold is set, the cutoff is <code>max(|str|, |rst|)</code>, which
     * the distance can never exceed, so the exact distance is returned.
     *
     * @param str
     *            A string
     * @param rst
     *            Another string
     * @return The edit distance, or <code>max(|str|, |rst|)</code> if the
     *         computation was aborted.
     */
    @Override
    public int compute(String str, String rst) {
        int longest = Math.max(str.length(), rst.length());
        // Without a threshold the cutoff must not be reachable: a cutoff of
        // min(|str|, |rst|) made the result depend on argument order and
        // could return max(|str|, |rst|) instead of the true distance.
        int maxDistance = failThreshold == -1
                ? longest
                : (int) Math.round((1 - failThreshold) * longest);
        return compute(str, rst, maxDistance);
    }

    /**
     * Determines whether <code>char1</code> and <code>char2</code> are equals
     * independent of the presence of diacritic marks.
     *
     * @param char1
     *            The first char
     * @param char2
     *            The second char
     * @return <code>true</code> if <code>char1</code> and <code>char2</code>
     *         are equals, or <code>false</code> otherwise.
     */
    public boolean diacriticInsensitiveEquals(char char1, char char2) {
        if (char1 == char2) {
            // Identical chars always collate as equal; skip the String
            // allocations and the collator call on this very hot path.
            return true;
        }
        return strCollator.equals(
                toComparableStr(char1),
                toComparableStr(char2));
    }

    /** Wraps a single char into the String form expected by the collator. */
    private String toComparableStr(char char1) {
        return Character.toString(char1);
    }

    /**
     * @return always <code>true</code>: this implementation can abort a
     *         computation early.
     */
    @Override
    public boolean isFailFast() {
        return true;
    }

    /**
     * Sets the similarity threshold used by {@link #compute(String, String)}
     * to derive its maximum distance.
     *
     * @param threshold
     *            The similarity threshold; <code>-1</code> means no threshold
     */
    @Override
    public void setFailThreshold(double threshold) {
        failThreshold = threshold;
    }

    /**
     * Computes the diacritic-insensitive edit distance between two strings,
     * giving up once the distance is known to exceed
     * <code>maxDistance</code>.
     *
     * @param str
     *            A string
     * @param rst
     *            Another string
     * @param maxDistance
     *            The largest distance of interest to the caller
     * @return The edit distance, or <code>max(|str|, |rst|)</code> when the
     *         computation was aborted. In the aborted case the returned
     *         value is only an upper bound, not the exact distance.
     */
    @Override
    public int compute(String str, String rst, int maxDistance) {
        int longest = Math.max(str.length(), rst.length());
        // dp[i][j] = distance between the first i chars of str and the first
        // j chars of rst.
        int[][] dp = new int[str.length() + 1][rst.length() + 1];
        for (int i = 0; i < dp.length; i++) {
            // Smallest value seen in row i over columns j >= 1.
            int bestPossibleEditDistance = dp.length;
            for (int j = 0; j < dp[i].length; j++) {
                // Borders: transforming to/from the empty prefix costs its length.
                dp[i][j] = i == 0 ? j : j == 0 ? i : 0;
                if (i > 0 && j > 0) {
                    if (diacriticInsensitiveEquals(str.charAt(i - 1),
                            rst.charAt(j - 1))) {
                        dp[i][j] = dp[i - 1][j - 1];
                    } else {
                        // Cheapest of insertion, substitution and deletion.
                        dp[i][j] = Math.min(dp[i][j - 1] + 1, Math.min(
                                dp[i - 1][j - 1] + 1, dp[i - 1][j] + 1));
                    }
                    bestPossibleEditDistance = Math.min(
                            bestPossibleEditDistance, dp[i][j]);
                }
            }
            // After calculating row i, abort if maxDistance is strictly
            // exceeded everywhere in the row. Column 0 holds i and is not
            // part of the minimum, hence the explicit i > maxDistance test.
            if (i > maxDistance && bestPossibleEditDistance > maxDistance) {
                FastFailures.incrementAndGet();
                return longest;
            }
        }
        FullComputation.incrementAndGet();
        return dp[str.length()][rst.length()];
    }
}