All downloads are free. Search and download functionality uses the official Maven repository.

org.daisy.dotify.hyphenator.impl.CWDecompounder Maven / Gradle / Ivy

package org.daisy.dotify.hyphenator.impl;

import org.daisy.dotify.common.text.TextFileReader;
import org.daisy.dotify.common.text.TextFileReader.LineData;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;


/**
 * Class to decompound words.
 *
 * @author Joel Håkansson
 */
class CWDecompounder {
    static final String SOFT_HYPHEN = "\u00ad";
    private final Map stems;
    private final int decompoundLimit;

    /**
     * Creates a new decompounder.
     *
     * @param decompoundLimit the partition limit, 1 or more
     */
    CWDecompounder(int decompoundLimit) {
        this.decompoundLimit = decompoundLimit;
        if (decompoundLimit < 1) {
            throw new IllegalArgumentException("Decompound limit must not be lower than one.");
        }
        stems = new HashMap<>();
    }

    /**
     * Load dictionary.
     *
     * @param url        the dictionary url
     * @param lowerLimit disregard words shorter than lowerLimit
     * @throws IOException if the url cannot be read
     */
    void loadDictionary(String url, int lowerLimit) throws IOException {
        if (lowerLimit < 1) {
            throw new IllegalArgumentException("Decompound limit must not be lower than one.");
        }
        InputStream is = this.getClass().getResourceAsStream(url);
        TextFileReader tfr = new TextFileReader(is);
        LineData data;
        while ((data = tfr.nextLine()) != null) {
            int len = data.getFields()[0].length();
            if (len < lowerLimit || data.getFields().length < 2) {
                continue;
            }
            if (data.getFields().length > 2) {
                stems.put(
                    data.getFields()[0],
                    new CWHyphenationUnit(Integer.parseInt(data.getFields()[1]) + 1, data.getFields()[2])
                );
            } else {
                stems.put(data.getFields()[0], new CWHyphenationUnit(Integer.parseInt(data.getFields()[1]) + 1));
            }
        }
        tfr.close();
        is.close();
    }

    /**
     * Loads a dictionary.
     *
     * @param url the url to the dictionary.
     * @throws IOException if the url cannot be read
     */
    void loadDictionary(String url) throws IOException {
        loadDictionary(url, 1);
    }

    /**
     * Gets the dictionary.
     *
     * @return returns the dictionary
     */
    Map getDictionary() {
        return stems;
    }

    /**
     * 

Finds word compounds. The input word is scanned for possible compounds starting * at beginLimit and ending at endLimit.

* *

If evaluate is true, the most likely solution is returned. If evaluate is false, * the input word is returned unless there is only one possible solution in the * dictionary.

* *

For example, if the input is 'taxikväll' (beginLimit=2, endLimit=2) the following * string pairs are looked up in the dictionary: ta/xikväll, tax/ikväll, taxi/kväll, * taxik/väll, taxikvä/ll. In this case, two solutions are possible (assuming a typical * Swedish dictionary): tax/ikväll, taxi/kväll. In this case, if evaluate is true, * 'taxi-kväll' is returned, if evaluate is false, 'taxikväll' is returned.

* * @param word the input word * @param beginLimit the shortest substring at the beginning of the word to evaluate against the dictionary * @param endLimit the shortest substring at the end of the word to evaluate against the dictionary * @param threshold an ambiguity threshold in the range [0, 1]. If there are several similar solutions, * the input word is returned when the weighted difference between two candidates is less * than the specified value * @return returns the word, hyphenated at compound boundaries */ String findCompounds(String word, int beginLimit, int endLimit, double threshold) { if (word.length() < decompoundLimit) { return word; } int breakPoint = -1; CWHyphenationUnit bp1 = null; CWHyphenationUnit bp2 = null; int points = -1; // a word in the dictionary is better than if it's not, even if it's never been observed CWHyphenationUnit ret = lookup(word); if (ret != null) { if (ret.hasHyphenation()) { // this word has a predefined hyphenation, return it return ret.hyphenate(word); } points = ret.getFreq(); } for (int i = beginLimit; i <= word.length() - endLimit; i++) { CWHyphenationUnit val1 = lookup(word.substring(0, i)); if (val1 != null) { CWHyphenationUnit val2 = lookup(word.substring(i, word.length())); if (val2 != null) { // Let the least likely part define the likelihood of the combination int cval = (int) Math.sqrt(val1.getFreq() * val2.getFreq()); if (Math.abs(cval - points) / (double) (cval + points) < threshold) { // if there are several possible close solutions, fail and return as is return word; } //use this breakpoint if it is more frequent than the previous breakpoint (or the whole word) if (points < cval) { points = cval; breakPoint = i; bp1 = val1; bp2 = val2; } } } } if (breakPoint > -1) { StringBuilder r = new StringBuilder() .append(bp1.hyphenate(word.substring(0, breakPoint))) .append(SOFT_HYPHEN) .append(bp2.hyphenate(word.substring(breakPoint, word.length()))); return r.toString(); } else { return word; } } /** * Finds a 
matching hyphenation unit. * * @param word the word to find * @return returns the hyphenation unit, or null if none is found */ private CWHyphenationUnit lookup(String word) { CWHyphenationUnit ret = stems.get(word); if (ret == null) { // Try lower case ret = stems.get(word.toLowerCase()); } return ret; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy