// org.daisy.dotify.hyphenator.impl.CWDecompounder
package org.daisy.dotify.hyphenator.impl;
import org.daisy.dotify.common.text.TextFileReader;
import org.daisy.dotify.common.text.TextFileReader.LineData;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
/**
 * Splits compound words into two parts by looking the candidate parts up in a
 * dictionary of known stems.
 *
 * <p>The dictionary is populated with {@link #loadDictionary(String, int)} and maps
 * a word to a {@code CWHyphenationUnit} carrying a frequency value and, optionally,
 * a predefined hyphenation.</p>
 *
 * @author Joel Håkansson
 */
class CWDecompounder {
    static final String SOFT_HYPHEN = "\u00ad";
    private final Map<String, CWHyphenationUnit> stems;
    private final int decompoundLimit;

    /**
     * Creates a new decompounder with an empty dictionary.
     *
     * @param decompoundLimit the partition limit, 1 or more; words shorter than this
     *                        are returned unchanged by {@link #findCompounds}
     * @throws IllegalArgumentException if decompoundLimit is lower than one
     */
    CWDecompounder(int decompoundLimit) {
        // Validate before touching any state.
        if (decompoundLimit < 1) {
            throw new IllegalArgumentException("Decompound limit must not be lower than one.");
        }
        this.decompoundLimit = decompoundLimit;
        this.stems = new HashMap<>();
    }

    /**
     * Loads a dictionary resource and adds its entries to this decompounder.
     *
     * <p>Each line is expected to hold at least two fields: the word and an integer
     * count. An optional third field is passed on to the hyphenation unit. Lines with
     * fewer than two fields, or whose word is shorter than lowerLimit, are skipped.
     * The stored frequency is the count plus one.</p>
     *
     * @param url the dictionary url, resolved as a class path resource relative to this class
     * @param lowerLimit disregard words shorter than lowerLimit, 1 or more
     * @throws IOException if the resource cannot be found or read
     * @throws IllegalArgumentException if lowerLimit is lower than one
     * @throws NumberFormatException if the second field of an accepted line is not an integer
     */
    void loadDictionary(String url, int lowerLimit) throws IOException {
        if (lowerLimit < 1) {
            throw new IllegalArgumentException("Lower limit must not be lower than one.");
        }
        // try-with-resources guarantees the stream is closed even if parsing fails.
        try (InputStream is = this.getClass().getResourceAsStream(url)) {
            if (is == null) {
                // getResourceAsStream signals a missing resource with null, not an exception.
                throw new IOException("Dictionary resource not found: " + url);
            }
            TextFileReader tfr = new TextFileReader(is);
            try {
                LineData data;
                while ((data = tfr.nextLine()) != null) {
                    String[] fields = data.getFields();
                    // Check the field count first so that fields[0] is known to exist.
                    if (fields.length < 2 || fields[0].length() < lowerLimit) {
                        continue;
                    }
                    // Add one so that a listed word always scores above zero.
                    int freq = Integer.parseInt(fields[1]) + 1;
                    if (fields.length > 2) {
                        stems.put(fields[0], new CWHyphenationUnit(freq, fields[2]));
                    } else {
                        stems.put(fields[0], new CWHyphenationUnit(freq));
                    }
                }
            } finally {
                tfr.close();
            }
        }
    }

    /**
     * Loads a dictionary without filtering on word length.
     *
     * @param url the url to the dictionary.
     * @throws IOException if the url cannot be read
     */
    void loadDictionary(String url) throws IOException {
        loadDictionary(url, 1);
    }

    /**
     * Gets the dictionary.
     *
     * @return returns the dictionary; this is the live backing map, not a copy
     */
    Map<String, CWHyphenationUnit> getDictionary() {
        return stems;
    }

    /**
     * Finds word compounds. The input word is scanned for a two-part split, trying every
     * split position that leaves at least beginLimit characters in the first part and
     * at least endLimit characters in the last part.
     *
     * <p>The procedure is as follows:</p>
     * <ol>
     * <li>If the word is shorter than the decompound limit, it is returned unchanged.</li>
     * <li>If the whole word is in the dictionary with a predefined hyphenation, that
     * hyphenation is returned. If it is in the dictionary without one, its frequency
     * becomes the score to beat.</li>
     * <li>For each split where both parts are found in the dictionary, the split is
     * scored with the geometric mean of the two frequencies (truncated to an integer).
     * If the relative difference between this score and the best score so far is
     * less than threshold, the result is considered ambiguous and the word is
     * returned unchanged. Otherwise the split replaces the current best if it
     * scores higher.</li>
     * <li>If a split was selected, the two parts are hyphenated individually and joined
     * with a soft hyphen. Otherwise the word is returned unchanged.</li>
     * </ol>
     *
     * <p>For example, if the input is 'taxikväll' (beginLimit=2, endLimit=2) the following
     * string pairs are looked up in the dictionary: ta/xikväll, tax/ikväll, taxi/kväll,
     * taxik/väll, taxikv/äll, taxikvä/ll. With a typical Swedish dictionary, two of these
     * are possible: tax/ikväll and taxi/kväll. Which one is returned, or whether the word
     * is returned as is, depends on the frequencies of the parts and on the threshold.</p>
     *
     * @param word the input word
     * @param beginLimit the shortest substring at the beginning of the word to evaluate against the dictionary
     * @param endLimit the shortest substring at the end of the word to evaluate against the dictionary
     * @param threshold an ambiguity threshold in the range [0, 1]. If there are several similar solutions,
     *                  the input word is returned when the weighted difference between two candidates is less
     *                  than the specified value
     * @return returns the word, hyphenated at compound boundaries
     */
    String findCompounds(String word, int beginLimit, int endLimit, double threshold) {
        if (word.length() < decompoundLimit) {
            return word;
        }
        int breakPoint = -1;
        CWHyphenationUnit bp1 = null;
        CWHyphenationUnit bp2 = null;
        int points = -1; // a word in the dictionary is better than if it's not, even if it's never been observed
        CWHyphenationUnit ret = lookup(word);
        if (ret != null) {
            if (ret.hasHyphenation()) {
                // this word has a predefined hyphenation, return it
                return ret.hyphenate(word);
            }
            points = ret.getFreq();
        }
        for (int i = beginLimit; i <= word.length() - endLimit; i++) {
            CWHyphenationUnit val1 = lookup(word.substring(0, i));
            if (val1 != null) {
                CWHyphenationUnit val2 = lookup(word.substring(i, word.length()));
                if (val2 != null) {
                    // Score the combination with the geometric mean of the two frequencies.
                    // The product is formed in double precision: an int product of two large
                    // frequencies would overflow and corrupt the score.
                    int cval = (int) Math.sqrt((double) val1.getFreq() * val2.getFreq());
                    // The ratio is computed in double precision for the same reason.
                    double difference = Math.abs((double) cval - points);
                    double sum = (double) cval + points;
                    if (difference / sum < threshold) {
                        // if there are several possible close solutions, fail and return as is
                        return word;
                    }
                    //use this breakpoint if it is more frequent than the previous breakpoint (or the whole word)
                    if (points < cval) {
                        points = cval;
                        breakPoint = i;
                        bp1 = val1;
                        bp2 = val2;
                    }
                }
            }
        }
        if (breakPoint > -1) {
            StringBuilder r = new StringBuilder()
                .append(bp1.hyphenate(word.substring(0, breakPoint)))
                .append(SOFT_HYPHEN)
                .append(bp2.hyphenate(word.substring(breakPoint, word.length())));
            return r.toString();
        } else {
            return word;
        }
    }

    /**
     * Finds a matching hyphenation unit. The word is looked up as is first and,
     * if that fails, in lower case.
     *
     * @param word the word to find
     * @return returns the hyphenation unit, or null if none is found
     */
    private CWHyphenationUnit lookup(String word) {
        CWHyphenationUnit ret = stems.get(word);
        if (ret == null) {
            // Try lower case. Locale.ROOT keeps the result independent of the
            // default locale of the JVM (e.g. Turkish dotless i).
            ret = stems.get(word.toLowerCase(Locale.ROOT));
        }
        return ret;
    }
}