// de.abelssoft.wordtools.jwordsplitter.AbstractWordSplitter Maven / Gradle / Ivy
//
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of jwordsplitter Show documentation
// A Java library to split German compound words
// The newest version!
/**
 * Copyright 2004-2007 Sven Abels
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.abelssoft.wordtools.jwordsplitter;

import de.abelssoft.tools.FileTools;

import java.io.IOException;
import java.io.InputStream;
import java.util.*;

/**
 * This class can split compound words into their smallest parts (atoms). For example "Erhebungsfehler"
 * will be split into "erhebung" and "fehler", if "erhebung" and "fehler" are in the dictionary
 * and "erhebungsfehler" is not. Thus how words are split only depends on the contents of
 * the dictionary. A dictionary for German is included.
 *
 * This is especially useful for German words but it will work with all languages.
 * The order of the words in the collection will be identical to their appearance in the
 * connected word. It's good to provide a large dictionary.
 *
 * Please note: We don't expect to have any special chars here (!":;,.-_, etc.). Only a set of
 * characters and only one word.
 *
 * @author Sven Abels (Abelssoft), [email protected]
 * @author Daniel Naber
 */
public abstract class AbstractWordSplitter {

    private static final String COMMENT_CHAR = "#";
    private static final String DELIMITER_CHAR = "|";

    private final Map> exceptionMap = new HashMap>();

    private Set words = null;
    private boolean hideConnectingCharacters = true;
    private boolean strictMode = false;
    private boolean reverseMode = false;

    protected String plainTextDictFile = null;
    protected InputStream plainTextDict = null;

    /**
     * Supplies the dictionary. Called from the constructors of this class, i.e. before the
     * constructor body of the implementing subclass has run.
     *
     * @return the set of known words
     * @throws IOException if the dictionary cannot be loaded
     */
    protected abstract Set<String> getWordList() throws IOException;

    /**
     * @return the minimum length of a word part; presumably shorter strings are never treated as words
     */
    protected abstract int getMinimumWordLength();

    /** Interfix elements in lowercase, e.g. at least "s" for German. */
    protected abstract Collection<String> getConnectingCharacters();

    /**
     * Creates a splitter without a custom dictionary file or stream.
     *
     * @param hideConnectingCharacters if true, a connecting character (a.k.a. interfix) at the end of a part
     *  is removed from the parts returned by {@link #splitWord(String)}; if false it is kept
     * @throws IOException if {@link #getWordList()} fails
     */
    public AbstractWordSplitter(boolean hideConnectingCharacters) throws IOException {
        // the cast selects the String overload; a bare null would be ambiguous with the InputStream overload
        this(hideConnectingCharacters, (String)null);
    }

    /**
     * Creates a splitter and remembers the name of a plain-text dictionary file.
     *
     * @param hideConnectingCharacters if true, a connecting character (a.k.a. interfix) at the end of a part
     *  is removed from the parts returned by {@link #splitWord(String)}; if false it is kept
     * @param plainTextDictFile a text file with one word per line, to be used instead of the embedded
     *                          dictionary, must be in UTF-8 format; may be null
     * @throws IOException if {@link #getWordList()} fails
     */
    public AbstractWordSplitter(boolean hideConnectingCharacters, String plainTextDictFile) throws IOException {
        // the file name must be stored before getWordList() runs so that the implementation can see it
        this.plainTextDictFile = plainTextDictFile;
        this.hideConnectingCharacters = hideConnectingCharacters;
        this.words = getWordList();
    }

    /**
     * Creates a splitter and remembers a stream that provides a plain-text dictionary.
     *
     * @param hideConnectingCharacters if true, a connecting character (a.k.a. interfix) at the end of a part
     *  is removed from the parts returned by {@link #splitWord(String)}; if false it is kept
     * @param plainTextDict a stream of a text file with one word per line, to be used instead of the
     *                      embedded dictionary, must be in UTF-8 format
     * @throws IOException if {@link #getWordList()} fails
     */
    public AbstractWordSplitter(boolean hideConnectingCharacters, InputStream plainTextDict) throws IOException {
        // the stream must be stored before getWordList() runs so that the implementation can see it
        this.plainTextDict = plainTextDict;
        this.hideConnectingCharacters = hideConnectingCharacters;
        this.words = getWordList();
    }

    /**
     * Creates a splitter that hides connecting characters and uses no custom dictionary file or stream.
     *
     * @throws IOException if {@link #getWordList()} fails
     */
    public AbstractWordSplitter() throws IOException {
        this(true);
    }

    /**
     * Loads splitting exceptions from a classpath resource and adds them to the exception map.
     *
     * <p>The resource is read as UTF-8. Blank lines and lines starting with {@code #} are skipped.
     * Every other line is a word whose parts are separated by {@code |}; the parts are stored under
     * the lowercased word with the delimiters removed, replacing any earlier entry for that word.
     *
     * @param filename name of the resource, resolved via {@code AbstractWordSplitter.class.getResourceAsStream}
     * @throws IOException if the resource cannot be located, or if loading or closing it fails
     */
    public void setExceptionFile(String filename) throws IOException {
        final InputStream is = AbstractWordSplitter.class.getResourceAsStream(filename);
        if (is == null) {
            throw new IOException("Cannot locate exception list in JAR: " + filename);
        }
        try {
            final String exceptions = FileTools.loadFile(is, "UTF-8");
            final Scanner scanner = new Scanner(exceptions);
            try {
                while (scanner.hasNextLine()) {
                    final String line = scanner.nextLine().trim();
                    if (line.isEmpty() || line.startsWith(COMMENT_CHAR)) {
                        continue;
                    }
                    final String[] parts = line.split("\\|");
                    final String completeWord = line.replace(DELIMITER_CHAR, "");
                    // Arrays.asList is a fixed-size view, so copy it into a growable list
                    final List<String> list = new ArrayList<String>(Arrays.asList(parts));
                    exceptionMap.put(completeWord.toLowerCase(), list);
                }
            } finally {
                scanner.close();
            }
        } finally {
            is.close();
        }
    }

    /**
     * Registers an explicit split for a word, replacing any earlier exception for the same word.
     * The given list is stored as is, not copied.
     *
     * @param completeWord the word for which an exception is to be defined (will be considered case-insensitive)
     * @param wordParts the parts in which the word is to be split (use a list with a single element if the word should not be split)
     */
    public void addException(String completeWord, List<String> wordParts) {
        exceptionMap.put(completeWord.toLowerCase(), wordParts);
    }

    /**
     * When set to true, words will only be split if all parts are words.
     * Otherwise the splitting result might contain parts that are not words, because
     * {@link #splitWord(String)} then falls back to cutting off a leading or trailing remainder.
     * The minimum length of word parts is correctly taken into account only if this is set to true.
     *
     * @param strictMode true to suppress the fallback splits
     */
    public void setStrictMode(boolean strictMode) {
        this.strictMode = strictMode;
    }

    /**
     * If set to true, words will be split from the end, not from the start. Useful only
     * to compare both ways of splitting to detect ambiguities.
     *
     * @param reverseMode true to try split positions from the end of the word towards its start
     */
    public void setReverseMode(boolean reverseMode) {
        this.reverseMode = reverseMode;
    }

    /**
     * Detect if a word exists in the dictionary. Words that are too short are ignored 
     * in order to avoid a fragmentation, which is too strong.
     */
    private boolean isWord(String s) {
        if (s==null)
            return false;
        if (s.trim().length()Attention: We don't expect to have any special chars here (!":;,.-_, etc.).
     *
     * @param str a single compound word
     */
    public Collection<String> splitWord(String str) {
        final Collection<String> result = new ArrayList<String>();
        if (str == null) {
            return result;
        }
        final String s = str.trim();
        if (s.length() < 2) {
            // too short to consist of more than one part
            result.add(s);
            return result;
        }

        // first try to split the whole word into dictionary words only
        Collection<String> tuple = findTuple(s);
        if (tuple == null && !strictMode) {
            // fallbacks: accept an unknown remainder at the start, then at the end of the word
            tuple = truncateSplit(s);
            if (tuple == null) {
                tuple = truncateSplitReverse(s);
            }
        }
        if (tuple == null) {
            // not splittable: the input is returned as given, i.e. untrimmed
            result.add(str);
        } else {
            result.addAll(tuple);
        }
        return result;
    }


    /**
     * Fallback for words that could not be split as a whole: looks for the longest suffix that
     * can be split and returns the remaining prefix as an additional first part.
     *
     * <p>The prefix is checked against the dictionary only in strict mode. At offset 0 the prefix
     * is the empty string and the suffix is the whole word.
     *
     * @param s the word to split
     * @return the prefix followed by the parts of the suffix, or null if no suffix can be split
     */
    private Collection<String> truncateSplit(String s) {
        for (int i = 0; i < s.length() - 2; i++) {
            final Collection<String> tail = findTuple(s.substring(i));
            if (tail == null) {
                continue;
            }
            final String head = s.substring(0, i);
            if (strictMode && !isWord(head)) {
                continue;
            }
            final Collection<String> parts = new ArrayList<String>();
            parts.add(head);
            parts.addAll(tail);
            return parts;
        }
        return null;
    }


    /**
     * Fallback for words that could not be split as a whole: looks for the longest prefix that
     * can be split and returns the remaining suffix as an additional last part.
     *
     * <p>The suffix is checked against the dictionary only in strict mode.
     *
     * @param s the word to split
     * @return the parts of the prefix followed by the suffix, or null if no prefix can be split
     */
    private Collection<String> truncateSplitReverse(String s) {
        for (int i = s.length() - 1; i > 1; i--) {
            final Collection<String> head = findTuple(s.substring(0, i));
            if (head == null) {
                continue;
            }
            final String tail = s.substring(i);
            if (strictMode && !isWord(tail)) {
                continue;
            }
            // Copy before appending: findTuple may hand out the list stored in exceptionMap, and
            // adding to it would alter that exception (or fail if the list is unmodifiable).
            final Collection<String> parts = new ArrayList<String>(head);
            parts.add(tail);
            return parts;
        }
        return null;
    }

    /**
     * Removes e.g. 's' at the end of a string: the first connecting character (in the iteration
     * order of {@link #getConnectingCharacters()}) that the lowercased string ends with is cut off.
     *
     * @param str the string to clean, in its original case
     * @return the string without the trailing connecting character, or the string itself if none matches
     */
    private String removeTailingCharacters(String str) {
        final String lowercaseStr = str.toLowerCase();
        final Collection<String> connChars = getConnectingCharacters();
        for (String connChar : connChars) {
            if (lowercaseStr.endsWith(connChar)) {
                return str.substring(0, str.length() - connChar.length());
            }
        }
        return str;
    }

    /**
     * Tries to split a string completely into dictionary words.
     *
     * <p>An entry in the exception map for the lowercased string takes precedence. Otherwise each
     * split position is tried in turn (ascending, or descending in reverse mode): if the left
     * part is a word, with or without its trailing connecting character, the right part is split
     * recursively. If no position works, the string itself is returned as the only part, provided
     * it is a word with or without its trailing connecting character.
     *
     * @param s the string to split
     * @return the parts in order of appearance, or null if the string cannot be split into words
     */
    private Collection<String> findTuple(String s) {

        final List<String> exceptionSplit = exceptionMap.get(s.toLowerCase());
        if (exceptionSplit != null) {
            // hand out a copy so that callers cannot modify the stored exception
            return new ArrayList<String>(exceptionSplit);
        }

        if (s.length() < 2) {
            return null;
        }
        Collection<String> result = new ArrayList<String>();

        final int fromPos;
        if (reverseMode) {
            fromPos = s.length() - 1;
        } else {
            fromPos = 0;
        }
        int i = fromPos;
        while (true) {
            // the position is advanced before it is used: forward mode tries 1..length,
            // reverse mode tries length-2..0
            if (reverseMode) {
                if (i < 1) {
                    break;
                }
                i--;
            } else {
                if (i >= s.length()) {
                    break;
                }
                i++;
            }
            final String left = s.substring(0, i);
            final String right = s.substring(left.length());
            final String leftCleaned = removeTailingCharacters(left);
            boolean leftIsWord = false;
            if (isWord(leftCleaned)) {
                if (hideConnectingCharacters) {
                    result.add(leftCleaned);
                } else {
                    result.add(left);
                }
                leftIsWord = true;
            } else if (isWord(left)) {
                result.add(left);
                leftIsWord = true;
            }
            if (leftIsWord) {
                // look if we can split the right part, too:
                final Collection<String> rightCol = findTuple(right);
                if (rightCol != null) {
                    result.addAll(rightCol);
                } else {
                    // we cannot split the rest of the word => left was not ok.
                    result = new ArrayList<String>();
                    continue;
                }
                return result;
            }
        }

        // no split position worked: accept the string as a single part if it is a word itself
        final boolean stringIsWord = isWord(s);
        final boolean cleanedStringIsWord = isWord(removeTailingCharacters(s));
        if (!stringIsWord && !cleanedStringIsWord) {
            return null;
        }
        if (hideConnectingCharacters && !stringIsWord) {
            result.add(removeTailingCharacters(s));
        } else {
            result.add(s);
        }
        return result;
    }

}