// com.atilika.kuromoji.TokenizerBase (Maven / Gradle / Ivy)
//
// Go to download
/*-*
 * Copyright © 2010-2015 Atilika Inc. and contributors (see CONTRIBUTORS.md)
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.  A copy of the
 * License is distributed with this work in the LICENSE.md file.  You may
 * also obtain a copy of the License from
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.atilika.kuromoji;

import com.atilika.kuromoji.dict.*;
import com.atilika.kuromoji.trie.DoubleArrayTrie;
import com.atilika.kuromoji.util.ResourceResolver;
import com.atilika.kuromoji.viterbi.*;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.EnumMap;
import java.util.List;

/**
 * Base class shared by the tokenizers.
 * <p>
 * It keeps the dictionaries and Viterbi components handed over by a {@link Builder} and uses
 * them to turn input text into lists of tokens.
 */
public abstract class TokenizerBase {

    /**
     * Tokenization mode. The builder's mode is passed on to the Viterbi builder and searcher.
     */
    public enum Mode {
        NORMAL, SEARCH, EXTENDED
    }

    /** The punctuation mark 。 (one of the 句読点) at which input text is split. */
    private static final char MARU = '。';

    /** The punctuation mark 、 (one of the 句読点) at which input text is split. */
    private static final char TEN = '、';

    private ViterbiBuilder viterbiBuilder;

    private ViterbiSearcher viterbiSearcher;

    private ViterbiFormatter viterbiFormatter;

    private boolean split;

    private TokenInfoDictionary tokenInfoDictionary;

    private UnknownDictionary unknownDictionary;

    private UserDictionary userDictionary;

    private InsertedDictionary insertedDictionary;

    // NOTE(review): tokenFactory and dictionaryMap are declared as raw types here; their type
    // arguments appear to have been lost. Confirm against the declarations of TokenFactory and
    // the dictionary classes before parameterizing them.
    protected TokenFactory tokenFactory;

    protected EnumMap dictionaryMap = new EnumMap<>(ViterbiNode.Type.class);

    /**
     * Initializes this tokenizer from the given builder.
     * <p>
     * The builder is asked to load its dictionaries first; the dictionaries, token factory and
     * split flag are then copied over, and the Viterbi builder, searcher and formatter are created.
     *
     * @param builder  builder holding the configuration to apply
     */
    protected void configure(Builder builder) {

        builder.loadDictionaries();

        this.tokenFactory = builder.tokenFactory;

        this.tokenInfoDictionary = builder.tokenInfoDictionary;
        this.unknownDictionary = builder.unknownDictionary;
        this.userDictionary = builder.userDictionary;
        this.insertedDictionary = builder.insertedDictionary;

        this.viterbiBuilder = new ViterbiBuilder(builder.doubleArrayTrie, tokenInfoDictionary, unknownDictionary,
                        userDictionary, builder.mode);

        this.viterbiSearcher = new ViterbiSearcher(builder.mode, builder.connectionCosts, unknownDictionary,
                        builder.penalties);

        this.viterbiFormatter = new ViterbiFormatter(builder.connectionCosts);
        this.split = builder.split;

        initDictionaryMap();
    }

    /**
     * Registers the dictionary that backs each Viterbi node type, so that a node's type can be
     * mapped to its dictionary when tokens are created.
     */
    private void initDictionaryMap() {
        dictionaryMap.put(ViterbiNode.Type.KNOWN, tokenInfoDictionary);
        dictionaryMap.put(ViterbiNode.Type.UNKNOWN, unknownDictionary);
        dictionaryMap.put(ViterbiNode.Type.USER, userDictionary);
        dictionaryMap.put(ViterbiNode.Type.INSERTED, insertedDictionary);
    }

    /**
     * Tokenizes the provided text by delegating to {@link #createTokenList(String)}.
     *
     * @param text  text to tokenize
     * @return list of tokens
     */
    // NOTE(review): the return type is a raw List here; its element type argument appears to
    // have been lost. Left unchanged so existing callers are not affected.
    public List tokenize(String text) {
        return createTokenList(text);
    }


    /**
     * Tokenizes the provided text and returns a list of tokens with various feature information
     * <p>
     * This method is thread safe
     *
     * @param text  text to tokenize
     * @param <T>  token type
     * @return list of Token, not null
     */
    protected <T> List<T> createTokenList(String text) {

        if (!split) {
            return createTokenList(0, text);
        }

        List<Integer> splitPositions = getSplitPositions(text);

        if (splitPositions.isEmpty()) {
            return createTokenList(0, text);
        }

        List<T> result = new ArrayList<>();

        int offset = 0;

        for (int position : splitPositions) {
            // Each chunk ends with (and includes) the punctuation mark found at 'position'
            result.addAll(this.<T>createTokenList(offset, text.substring(offset, position + 1)));
            offset = position + 1;
        }

        // Remaining text after the last punctuation mark, if any
        if (offset < text.length()) {
            result.addAll(this.<T>createTokenList(offset, text.substring(offset)));
        }

        return result;
    }

    /**
     * Tokenizes the provided text and outputs the corresponding Viterbi lattice and the Viterbi path to the provided output stream
     * <p>
     * The output is written in DOT format.
     * <p>
     * This method is not thread safe
     *
     * @param outputStream  output stream to write to
     * @param text  text to tokenize
     * @throws IOException if an error occurs when writing the lattice and path
     */
    public void debugTokenize(OutputStream outputStream, String text) throws IOException {
        ViterbiLattice lattice = viterbiBuilder.build(text);
        List<ViterbiNode> bestPath = viterbiSearcher.search(lattice);

        outputStream.write(viterbiFormatter.format(lattice, bestPath).getBytes(StandardCharsets.UTF_8));
        outputStream.flush();
    }

    /**
     * Writes the Viterbi lattice for the provided text to an output stream
     * <p>
     * The output is written in DOT format.
     * <p>
     * This method is not thread safe
     *
     * @param outputStream  output stream to write to
     * @param text  text to create lattice for
     * @throws IOException if an error occurs when writing the lattice
     */
    public void debugLattice(OutputStream outputStream, String text) throws IOException {
        ViterbiLattice lattice = viterbiBuilder.build(text);

        outputStream.write(viterbiFormatter.format(lattice).getBytes(StandardCharsets.UTF_8));
        outputStream.flush();
    }

    /**
     * Split input text at 句読点, which is 。 and 、
     * <p>
     * The text is scanned once from start to end, so positions are returned in ascending order.
     *
     * @param text  text to scan for split positions
     * @return list of split position (index of each 。 or 、 in the text), empty if there are none
     */
    private List<Integer> getSplitPositions(String text) {
        List<Integer> splitPositions = new ArrayList<>();

        for (int index = 0; index < text.length(); index++) {
            char current = text.charAt(index);

            if (current == MARU || current == TEN) {
                splitPositions.add(index);
            }
        }

        return splitPositions;
    }

    /**
     * Tokenize input sentence.
     * <p>
     * Builds the Viterbi lattice for the sentence, searches the best path through it and creates
     * one token per node on that path, skipping the BOS/EOS nodes.
     *
     * @param offset   offset of sentence in original input text
     * @param text sentence to tokenize
     * @param <T>  token type
     * @return list of Token
     */
    private <T> List<T> createTokenList(int offset, String text) {
        List<T> result = new ArrayList<>();

        ViterbiLattice lattice = viterbiBuilder.build(text);
        List<ViterbiNode> bestPath = viterbiSearcher.search(lattice);

        for (ViterbiNode node : bestPath) {
            int wordId = node.getWordId();
            if (node.getType() == ViterbiNode.Type.KNOWN && wordId == -1) { // Do not include BOS/EOS
                continue;
            }
            // Token positions are reported relative to the original input, hence the added offset
            @SuppressWarnings("unchecked")
            T token = (T) tokenFactory.createToken(wordId, node.getSurface(), node.getType(),
                            offset + node.getStartIndex(), dictionaryMap.get(node.getType()));
            result.add(token);
        }

        return result;
    }

    /**
     * Abstract Builder shared by all tokenizers
     */
    public abstract static class Builder {
        protected DoubleArrayTrie doubleArrayTrie;
        protected ConnectionCosts connectionCosts;
        protected TokenInfoDictionary tokenInfoDictionary;
        protected UnknownDictionary unknownDictionary;
        protected CharacterDefinitions characterDefinitions;
        protected InsertedDictionary insertedDictionary;
        protected UserDictionary userDictionary = null;

        protected Mode mode = Mode.NORMAL;
        protected boolean split = true;
        // NOTE(review): raw List; the element type argument appears to have been lost. Confirm
        // against the ViterbiSearcher constructor before parameterizing.
        protected List penalties = Collections.emptyList();

        protected int totalFeatures = -1;
        protected int readingFeature = -1;
        protected int partOfSpeechFeature = -1;

        protected ResourceResolver resolver;

        protected TokenFactory tokenFactory;

        /**
         * Loads the double-array trie, connection costs, token info dictionary, character
         * definitions and unknown dictionary through the resolver, and creates the inserted
         * dictionary.
         * <p>
         * The user dictionary is not touched here; it is only set through the
         * {@code userDictionary(...)} methods.
         *
         * @throws RuntimeException if any of the dictionaries cannot be loaded; the original
         *         exception is kept as the cause
         */
        protected void loadDictionaries() {
            try {
                doubleArrayTrie = DoubleArrayTrie.newInstance(resolver);
                connectionCosts = ConnectionCosts.newInstance(resolver);
                tokenInfoDictionary = TokenInfoDictionary.newInstance(resolver);
                characterDefinitions = CharacterDefinitions.newInstance(resolver);
                unknownDictionary = UnknownDictionary.newInstance(resolver, characterDefinitions, totalFeatures);
                insertedDictionary = new InsertedDictionary(totalFeatures);
            } catch (Exception ouch) {
                throw new RuntimeException("Could not load dictionaries.", ouch);
            }
        }

        /**
         * Creates a Tokenizer instance defined by this Builder
         *
         * @param <T>  tokenizer type
         * @return Tokenizer instance
         */
        public abstract <T extends TokenizerBase> T build();

        /**
         * Sets an optional user dictionary as an input stream
         * <p>
         * The input stream provided is not closed by this method
         *
         * @param input  user dictionary as an input stream
         * @return this builder
         * @throws IOException if an error occurs when reading the user dictionary
         */
        public Builder userDictionary(InputStream input) throws IOException {
            this.userDictionary = new UserDictionary(input, totalFeatures, readingFeature, partOfSpeechFeature);
            return this;
        }

        /**
         * Sets an optional user dictionary filename
         * <p>
         * The file is opened, read through {@link #userDictionary(InputStream)} and closed again
         * before this method returns, including when reading fails.
         *
         * @param filename  user dictionary filename
         * @return this builder
         * @throws IOException if an error occurs when reading the user dictionary
         */
        public Builder userDictionary(String filename) throws IOException {
            // try-with-resources closes the stream even if reading the dictionary throws
            try (InputStream input = new BufferedInputStream(new FileInputStream(filename))) {
                this.userDictionary(input);
            }
            return this;
        }
    }
}