com.atilika.kuromoji.ipadic.Tokenizer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of deeplearning4j-nlp-japanese Show documentation
There is a newer version: 1.0.0-beta7
Show newest version
/**
 * Copyright © 2010-2015 Atilika Inc. and contributors (see CONTRIBUTORS.md)
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.  A copy of the
 * License is distributed with this work in the LICENSE.md file.  You may
 * also obtain a copy of the License from
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.atilika.kuromoji.ipadic;

import com.atilika.kuromoji.TokenizerBase;
import com.atilika.kuromoji.dict.*;
import com.atilika.kuromoji.ipadic.compile.DictionaryEntry;
import com.atilika.kuromoji.trie.DoubleArrayTrie;
import com.atilika.kuromoji.util.SimpleResourceResolver;
import com.atilika.kuromoji.viterbi.TokenFactory;
import com.atilika.kuromoji.viterbi.ViterbiNode;

import java.util.ArrayList;
import java.util.List;

/**
 * A tokenizer based on the IPADIC dictionary
 * 
 * See {@link Token} for details on the morphological features produced by this tokenizer
 * 

 * The following code example demonstrates how to use the Kuromoji tokenizer:
 * 
{@code
 * package com.atilika.kuromoji.example;
 * import com.atilika.kuromoji.ipadic.Token;
 * import com.atilika.kuromoji.ipadic.Tokenizer;
 * import java.util.List;
 *
 * public class KuromojiExample {
 *     public static void main(String[] args) {
 *         Tokenizer tokenizer = new Tokenizer() ;
 *         List tokens = tokenizer.tokenize("お寿司が食べたい。");
 *         for (Token token : tokens) {
 *             System.out.println(token.getSurface() + "\t" + token.getAllFeatures());
 *         }
 *     }
 * }
 * }
 * 
 */
public class Tokenizer extends TokenizerBase {

    /**
     * Construct a default tokenizer
     */
    public Tokenizer() {
        this(new Builder());
    }

    /**
     * Construct a customized tokenizer
     * 
     * See {@see com.atilika.kuromoji.ipadic.Tokenizer#Builder}
     */
    private Tokenizer(Builder builder) {
        configure(builder);
    }

    /**
     * Tokenizes the provided text and returns a list of tokens with various feature information
     * 

     * This method is thread safe
     *
     * @param text  text to tokenize
     * @return list of Token, not null
     */
    @Override
    public List tokenize(String text) {
        return createTokenList(text);
    }

    /**
     * Builder class for creating a customized tokenizer instance
     */
    public static class Builder extends TokenizerBase.Builder {

        private static final int DEFAULT_KANJI_LENGTH_THRESHOLD = 2;
        private static final int DEFAULT_OTHER_LENGTH_THRESHOLD = 7;
        private static final int DEFAULT_KANJI_PENALTY = 3000;
        private static final int DEFAULT_OTHER_PENALTY = 1700;

        private int kanjiPenaltyLengthTreshold = DEFAULT_KANJI_LENGTH_THRESHOLD;
        private int kanjiPenalty = DEFAULT_KANJI_PENALTY;
        private int otherPenaltyLengthThreshold = DEFAULT_OTHER_LENGTH_THRESHOLD;
        private int otherPenalty = DEFAULT_OTHER_PENALTY;

        private boolean nakaguroSplit = false;

        /**
         * Creates a default builder
         */
        public Builder() {
            totalFeatures = DictionaryEntry.TOTAL_FEATURES;
            readingFeature = DictionaryEntry.READING_FEATURE;
            partOfSpeechFeature = DictionaryEntry.PART_OF_SPEECH_FEATURE;

            tokenFactory = new TokenFactory() {
                @Override
                public Token createToken(int wordId,
                                         String surface,
                                         ViterbiNode.Type type,
                                         int position,
                                         Dictionary dictionary) {
                    return new Token(wordId, surface, type, position, dictionary);
                }
            };
        }

        /**
         * Sets the tokenization mode
         * 

         * The tokenization mode defines how Available modes are as follows:
         * 

         * {@link com.atilika.kuromoji.TokenizerBase.Mode#NORMAL} - The default mode
         * 
{@link com.atilika.kuromoji.TokenizerBase.Mode#SEARCH} - Uses a heuristic to segment compound nouns (複合名詞) into their parts
         * 
{@link com.atilika.kuromoji.TokenizerBase.Mode#EXTENDED} - Same as SEARCH, but emits unigram tokens for unknown terms
         * 
         * See {@link #kanjiPenalty} and {@link #otherPenalty} for how to adjust costs used by SEARCH and EXTENDED mode
         *
         * @param mode  tokenization mode
         * @return this builder, not null
         */
        public Builder mode(Mode mode) {
            this.mode = mode;
            return this;
        }

        /**
         * Sets a custom kanji penalty
         * 
         * This is an expert feature used with {@link com.atilika.kuromoji.TokenizerBase.Mode#SEARCH} and {@link com.atilika.kuromoji.TokenizerBase.Mode#EXTENDED} modes that sets a length threshold and an additional costs used when running the Viterbi search.
         * The additional cost is applicable for kanji candidate tokens longer than the length threshold specified.
         * 

         * This is an expert feature and you usually would not need to change this.
         *
         * @param lengthThreshold  length threshold applicable for this penalty
         * @param penalty  cost added to Viterbi nodes for long kanji candidate tokens
         * @return this builder, not null
         */
        public Builder kanjiPenalty(int lengthThreshold, int penalty) {
            this.kanjiPenaltyLengthTreshold = lengthThreshold;
            this.kanjiPenalty = penalty;
            return this;
        }

        /**
         * Sets a custom non-kanji penalty
         * 

         * This is an expert feature used with {@link com.atilika.kuromoji.TokenizerBase.Mode#SEARCH} and {@link com.atilika.kuromoji.TokenizerBase.Mode#EXTENDED} modes that sets a length threshold and an additional costs used when running the Viterbi search.
         * The additional cost is applicable for non-kanji candidate tokens longer than the length threshold specified.
         * 

         * This is an expert feature and you usually would not need to change this.
         *
         * @param lengthThreshold  length threshold applicable for this penalty
         * @param penalty  cost added to Viterbi nodes for long non-kanji candidate tokens
         * @return this builder, not null
         */
        public Builder otherPenalty(int lengthThreshold, int penalty) {
            this.otherPenaltyLengthThreshold = lengthThreshold;
            this.otherPenalty = penalty;
            return this;
        }

        /**
         * Predictate that splits unknown words on the middle dot character (U+30FB KATAKANA MIDDLE DOT)
         * 
         * This feature is off by default.
         * This is an expert feature sometimes used with {@link com.atilika.kuromoji.TokenizerBase.Mode#SEARCH} and {@link com.atilika.kuromoji.TokenizerBase.Mode#EXTENDED} mode.
         *
         * @param split  predicate to indicate split on middle dot
         * @return this builder, not null
         */
        public Builder isSplitOnNakaguro(boolean split) {
            this.nakaguroSplit = split;
            return this;
        }

        /**
         * Creates the custom tokenizer instance
         *
         * @return tokenizer instance, not null
         */
        @Override
        public Tokenizer build() {
            return new Tokenizer(this);
        }

        @Override
        protected void loadDictionaries() {
            penalties = new ArrayList<>();
            penalties.add(kanjiPenaltyLengthTreshold);
            penalties.add(kanjiPenalty);
            penalties.add(otherPenaltyLengthThreshold);
            penalties.add(otherPenalty);

            resolver = new SimpleResourceResolver(this.getClass());

            try {
                doubleArrayTrie = DoubleArrayTrie.newInstance(resolver);
                connectionCosts = ConnectionCosts.newInstance(resolver);
                tokenInfoDictionary = TokenInfoDictionary.newInstance(resolver);
                characterDefinitions = CharacterDefinitions.newInstance(resolver);

                if (nakaguroSplit) {
                    characterDefinitions.setCategories('・', new String[]{"SYMBOL"});
                }

                unknownDictionary = UnknownDictionary.newInstance(
                    resolver, characterDefinitions, totalFeatures
                );
                insertedDictionary = new InsertedDictionary(totalFeatures);
            } catch (Exception ouch) {
                throw new RuntimeException("Could not load dictionaries.", ouch);
            }
        }
    }
}