All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.atilika.kuromoji.viterbi.ViterbiBuilder Maven / Gradle / Ivy

There is a newer version: 1.0.0-beta7
Show newest version
/*-*
 * Copyright © 2010-2015 Atilika Inc. and contributors (see CONTRIBUTORS.md)
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.  A copy of the
 * License is distributed with this work in the LICENSE.md file.  You may
 * also obtain a copy of the License from
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.atilika.kuromoji.viterbi;

import com.atilika.kuromoji.TokenizerBase.Mode;
import com.atilika.kuromoji.dict.CharacterDefinitions;
import com.atilika.kuromoji.dict.TokenInfoDictionary;
import com.atilika.kuromoji.dict.UnknownDictionary;
import com.atilika.kuromoji.dict.UserDictionary;
import com.atilika.kuromoji.trie.DoubleArrayTrie;

import java.util.ArrayList;
import java.util.List;

/**
 * Builds a {@link ViterbiLattice} for an input text by adding nodes for system dictionary matches,
 * unknown words and, when a user dictionary is supplied, user dictionary matches.
 */
public class ViterbiBuilder {

    private final DoubleArrayTrie trie;
    private final TokenInfoDictionary dictionary;
    private final UnknownDictionary unknownDictionary;
    private final UserDictionary userDictionary;
    private final CharacterDefinitions characterDefinitions;
    private final boolean useUserDictionary;
    private final boolean searchMode;

    /**
     * Constructor
     *
     * @param trie  trie with surface forms
     * @param dictionary  token info dictionary
     * @param unknownDictionary  unknown word dictionary; its character definitions are used for unknown words
     * @param userDictionary  user dictionary, or null to disable user dictionary processing
     * @param mode  tokenization {@link Mode mode}
     */
    public ViterbiBuilder(DoubleArrayTrie trie, TokenInfoDictionary dictionary, UnknownDictionary unknownDictionary,
                    UserDictionary userDictionary, Mode mode) {
        this.trie = trie;
        this.dictionary = dictionary;
        this.unknownDictionary = unknownDictionary;
        this.userDictionary = userDictionary;

        this.useUserDictionary = (userDictionary != null);
        this.searchMode = (mode == Mode.SEARCH || mode == Mode.EXTENDED);
        this.characterDefinitions = unknownDictionary.getCharacterDefinition();
    }


    /**
     * Build lattice from input text
     *
     * @param text  source text for the lattice
     * @return built lattice, not null
     */
    public ViterbiLattice build(String text) {
        int textLength = text.length();
        ViterbiLattice lattice = new ViterbiLattice(textLength + 2);

        lattice.addBos();

        int unknownWordEndIndex = -1; // index of the last character of unknown word

        for (int startIndex = 0; startIndex < textLength; startIndex++) {
            // If no token ends where current token starts, skip this index
            if (lattice.tokenEndsWhereCurrentTokenStarts(startIndex)) {

                String suffix = text.substring(startIndex);
                boolean found = processIndex(lattice, startIndex, suffix);

                // In the case of normal mode, it doesn't process unknown word greedily.
                if (searchMode || unknownWordEndIndex <= startIndex) {

                    int[] categories = characterDefinitions.lookupCategories(suffix.charAt(0));

                    // The position of the category within the array is passed along as well, because
                    // processUnknownWord compares following characters' categories at that same position.
                    for (int i = 0; i < categories.length; i++) {
                        int category = categories[i];
                        unknownWordEndIndex = processUnknownWord(category, i, lattice, unknownWordEndIndex, startIndex,
                                        suffix, found);
                    }
                }
            }
        }

        if (useUserDictionary) {
            processUserDictionary(text, lattice);
        }

        lattice.addEos();

        return lattice;
    }

    /**
     * Looks up every prefix of the suffix in the trie and adds a KNOWN node for each word id found.
     *
     * @param lattice  lattice to add nodes to
     * @param startIndex  index in the source text where the suffix starts
     * @param suffix  source text from startIndex to its end
     * @return true if at least one prefix was found in the trie
     */
    private boolean processIndex(ViterbiLattice lattice, int startIndex, String suffix) {
        boolean found = false;
        for (int endIndex = 1; endIndex < suffix.length() + 1; endIndex++) {
            String prefix = suffix.substring(0, endIndex);
            int result = trie.lookup(prefix, 0, 0);

            if (result > 0) { // Found match in double array trie
                found = true; // Don't produce unknown word starting from this index
                for (int wordId : dictionary.lookupWordIds(result)) {
                    ViterbiNode node = new ViterbiNode(wordId, prefix, dictionary, startIndex, ViterbiNode.Type.KNOWN);
                    // Lattice positions are the text indices shifted by one
                    lattice.addNode(node, startIndex + 1, startIndex + 1 + endIndex);
                }
            } else if (result < 0) { // If result is less than zero, continue to next position
                break;
            }
            // A result of zero adds nothing and the scan simply continues with the next longer prefix
        }
        return found;
    }

    /**
     * Adds UNKNOWN nodes for the given character category at the start of the suffix, if the category's
     * definition asks for it (INVOKE set) or no known word was found at this index.
     *
     * @param category  character category of the first character of the suffix
     * @param i  position of the category within the first character's category array
     * @param lattice  lattice to add nodes to
     * @param unknownWordEndIndex  end index of the previously added unknown word
     * @param startIndex  index in the source text where the suffix starts
     * @param suffix  source text from startIndex to its end
     * @param found  whether a known word was found starting at startIndex
     * @return startIndex plus the unknown word length if nodes were added, else the unchanged unknownWordEndIndex
     */
    private int processUnknownWord(int category, int i, ViterbiLattice lattice, int unknownWordEndIndex, int startIndex,
                    String suffix, boolean found) {
        int unknownWordLength = 0;
        int[] definition = characterDefinitions.lookupDefinition(category);

        if (definition[CharacterDefinitions.INVOKE] == 1 || !found) {
            unknownWordLength = 1;

            if (definition[CharacterDefinitions.GROUP] != 0) {
                // Grouping: extend the unknown word while the following characters carry the same
                // category at the same position of their category array
                for (int j = 1; j < suffix.length(); j++) {
                    int[] categories = characterDefinitions.lookupCategories(suffix.charAt(j));

                    if (categories == null || i >= categories.length || category != categories[i]) {
                        break;
                    }
                    unknownWordLength++;
                }
            }
        }

        if (unknownWordLength > 0) {
            String unkWord = suffix.substring(0, unknownWordLength);
            int[] wordIds = unknownDictionary.lookupWordIds(category); // characters in input text are supposed to be the same

            for (int wordId : wordIds) {
                ViterbiNode node = new ViterbiNode(wordId, unkWord, unknownDictionary, startIndex,
                                ViterbiNode.Type.UNKNOWN);
                lattice.addNode(node, startIndex + 1, startIndex + 1 + unknownWordLength);
            }
            unknownWordEndIndex = startIndex + unknownWordLength;
        }

        return unknownWordEndIndex;
    }

    /**
     * Find token(s) in input text and set found token(s) in arrays as normal tokens
     *
     * @param text  source text to search for user dictionary entries
     * @param lattice  lattice to add the USER nodes (and any glue nodes) to
     */
    private void processUserDictionary(final String text, ViterbiLattice lattice) {
        List<UserDictionary.UserDictionaryMatch> matches = userDictionary.findUserDictionaryMatches(text);

        for (UserDictionary.UserDictionaryMatch match : matches) {
            int wordId = match.getWordId();
            int index = match.getMatchStartIndex();
            int length = match.getMatchLength();

            String word = text.substring(index, index + length);

            ViterbiNode node = new ViterbiNode(wordId, word, userDictionary, index, ViterbiNode.Type.USER);
            int nodeStartIndex = index + 1;
            int nodeEndIndex = nodeStartIndex + length;

            lattice.addNode(node, nodeStartIndex, nodeEndIndex);

            // The user entry may start or end at a position no other node touches; add glue nodes there
            if (isLatticeBrokenBefore(nodeStartIndex, lattice)) {
                repairBrokenLatticeBefore(lattice, index);
            }

            if (isLatticeBrokenAfter(nodeEndIndex, lattice)) {
                repairBrokenLatticeAfter(lattice, nodeEndIndex);
            }
        }
    }

    /**
     * Checks whether the lattice lacks any node that connects to the newly inserted entry on the left side
     * (before the new entry).
     *
     * @param nodeIndex  lattice position where the new entry starts
     * @param lattice  lattice to inspect
     * @return true if the lattice has no nodes ending at nodeIndex, i.e. it is broken before the entry
     */
    private boolean isLatticeBrokenBefore(int nodeIndex, ViterbiLattice lattice) {
        ViterbiNode[][] nodeEndIndices = lattice.getEndIndexArr();

        return nodeEndIndices[nodeIndex] == null;
    }

    /**
     * Checks whether the lattice lacks any node that connects to the newly inserted entry on the right side
     * (after the new entry).
     *
     * @param endIndex  lattice position where the new entry ends
     * @param lattice  lattice to inspect
     * @return true if the lattice has no nodes starting at endIndex, i.e. it is broken after the entry
     */
    private boolean isLatticeBrokenAfter(int endIndex, ViterbiLattice lattice) {
        ViterbiNode[][] nodeStartIndices = lattice.getStartIndexArr();

        return nodeStartIndices[endIndex] == null;
    }

    /**
     * Tries to repair the lattice by creating and adding an additional Viterbi node to the LEFT of the newly
     * inserted user dictionary entry by using the substring of the node in the lattice that overlaps the least
     *
     * @param lattice  lattice to repair
     * @param index  start index of the user dictionary match in the source text
     */
    private void repairBrokenLatticeBefore(ViterbiLattice lattice, int index) {
        ViterbiNode[][] nodeStartIndices = lattice.getStartIndexArr();

        // Walk backwards and stop at the first position that offers a usable glue base
        for (int startIndex = index; startIndex > 0; startIndex--) {
            if (nodeStartIndices[startIndex] != null) {
                ViterbiNode glueBase = findGlueNodeCandidate(index, nodeStartIndices[startIndex], startIndex);
                if (glueBase != null) {
                    int length = index + 1 - startIndex;
                    String surface = glueBase.getSurface().substring(0, length);
                    ViterbiNode glueNode = createGlueNode(startIndex, glueBase, surface);
                    lattice.addNode(glueNode, startIndex, startIndex + glueNode.getSurface().length());
                    return;
                }
            }
        }
    }

    /**
     * Tries to repair the lattice by creating and adding an additional Viterbi node to the RIGHT of the newly
     * inserted user dictionary entry by using the substring of the node in the lattice that overlaps the least
     *
     * @param lattice  lattice to repair
     * @param nodeEndIndex  lattice position where the user dictionary entry ends
     */
    private void repairBrokenLatticeAfter(ViterbiLattice lattice, int nodeEndIndex) {
        ViterbiNode[][] nodeEndIndices = lattice.getEndIndexArr();

        // Walk forwards and stop at the first position that offers a usable glue base
        for (int endIndex = nodeEndIndex + 1; endIndex < nodeEndIndices.length; endIndex++) {
            if (nodeEndIndices[endIndex] != null) {
                // NOTE(review): the minimum length derived inside findGlueNodeCandidate is
                // nodeEndIndex + 1 - endIndex, which is never positive in this loop, so the length
                // requirement filters nothing here and the shortest non-null node is chosen.
                ViterbiNode glueBase = findGlueNodeCandidate(nodeEndIndex, nodeEndIndices[endIndex], endIndex);
                if (glueBase != null) {
                    int delta = endIndex - nodeEndIndex;
                    String glueBaseSurface = glueBase.getSurface();
                    // Keep only the trailing delta characters, i.e. the part after the user entry
                    String surface = glueBaseSurface.substring(glueBaseSurface.length() - delta);
                    ViterbiNode glueNode = createGlueNode(nodeEndIndex, glueBase, surface);
                    lattice.addNode(glueNode, nodeEndIndex, nodeEndIndex + glueNode.getSurface().length());
                    return;
                }
            }
        }
    }

    /**
     * Tries to locate a candidate for a "glue" node that repairs the broken lattice by looking at all nodes at the
     * current index.
     *
     * @param index  index the glue node has to reach
     * @param latticeNodes  nodes at the current lattice position; may contain null entries, which are skipped
     * @param startIndex  lattice position the nodes were taken from
     * @return existing ViterbiNode to base the glue node on if such a node exists, else null
     */
    private ViterbiNode findGlueNodeCandidate(int index, ViterbiNode[] latticeNodes, int startIndex) {
        int length = index + 1 - startIndex;
        ViterbiNode glueBase = null;

        for (ViterbiNode candidate : latticeNodes) {
            if (candidate != null && isAcceptableCandidate(length, glueBase, candidate)) {
                glueBase = candidate;
            }
        }
        return glueBase;
    }

    /**
     * Check whether a candidate for a glue node is acceptable.
     * The candidate should be as short as possible, but long enough to overlap with the inserted user entry
     *
     * @param targetLength  minimum surface length the candidate must have
     * @param glueBase  best candidate found so far, or null if there is none yet
     * @param candidate  node to evaluate
     * @return whether candidate is acceptable
     */
    private boolean isAcceptableCandidate(int targetLength, ViterbiNode glueBase, ViterbiNode candidate) {
        return (glueBase == null || candidate.getSurface().length() < glueBase.getSurface().length())
                        && candidate.getSurface().length() >= targetLength;
    }

    /**
     * Create a glue node to be inserted based on ViterbiNode already in the lattice.
     * The new node takes the same parameters as the node it is based on, but the word is truncated to match the
     * hole in the lattice caused by the new user entry
     *
     * @param startIndex  start index given to the new node
     * @param glueBase  existing node whose word id, left/right ids and word cost are copied
     * @param surface  truncated surface form for the new node
     * @return new ViterbiNode to be inserted as glue into the lattice
     */
    private ViterbiNode createGlueNode(int startIndex, ViterbiNode glueBase, String surface) {
        return new ViterbiNode(glueBase.getWordId(), surface, glueBase.getLeftId(), glueBase.getRightId(),
                        glueBase.getWordCost(), startIndex, ViterbiNode.Type.INSERTED);
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy