All downloads are free. Search and download functionality uses the official Maven repository.

com.atilika.kuromoji.ipadic.Tokenizer Maven / Gradle / Ivy

/**
 * Copyright © 2010-2015 Atilika Inc. and contributors (see CONTRIBUTORS.md)
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.  A copy of the
 * License is distributed with this work in the LICENSE.md file.  You may
 * also obtain a copy of the License from
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.atilika.kuromoji.ipadic;

import com.atilika.kuromoji.TokenizerBase;
import com.atilika.kuromoji.dict.CharacterDefinitions;
import com.atilika.kuromoji.dict.ConnectionCosts;
import com.atilika.kuromoji.dict.Dictionary;
import com.atilika.kuromoji.dict.InsertedDictionary;
import com.atilika.kuromoji.dict.TokenInfoDictionary;
import com.atilika.kuromoji.dict.UnknownDictionary;
import com.atilika.kuromoji.ipadic.compile.DictionaryEntry;
import com.atilika.kuromoji.trie.DoubleArrayTrie;
import com.atilika.kuromoji.util.SimpleResourceResolver;
import com.atilika.kuromoji.viterbi.TokenFactory;
import com.atilika.kuromoji.viterbi.ViterbiNode;

import java.util.ArrayList;
import java.util.List;

/**
 * A tokenizer based on the IPADIC dictionary
 * 

* See {@link Token} for details on the morphological features produced by this tokenizer *

* The following code example demonstrates how to use the Kuromoji tokenizer: *

{@code
 * package com.atilika.kuromoji.example;
 * import com.atilika.kuromoji.ipadic.Token;
 * import com.atilika.kuromoji.ipadic.Tokenizer;
 * import java.util.List;
 *
 * public class KuromojiExample {
 *     public static void main(String[] args) {
 *         Tokenizer tokenizer = new Tokenizer() ;
 *         List tokens = tokenizer.tokenize("お寿司が食べたい。");
 *         for (Token token : tokens) {
 *             System.out.println(token.getSurface() + "\t" + token.getAllFeatures());
 *         }
 *     }
 * }
 * }
 * 
*/ public class Tokenizer extends TokenizerBase { /** * Construct a default tokenizer */ public Tokenizer() { this(new Builder()); } /** * Construct a customized tokenizer *

* See {@see com.atilika.kuromoji.ipadic.Tokenizer#Builder} */ private Tokenizer(Builder builder) { configure(builder); } /** * Tokenizes the provided text and returns a list of tokens with various feature information *

* This method is thread safe * * @param text text to tokenize * @return list of Token, not null */ @Override public List tokenize(String text) { return createTokenList(text); } /** * Builder class for creating a customized tokenizer instance */ public static class Builder extends TokenizerBase.Builder { private static final int DEFAULT_KANJI_LENGTH_THRESHOLD = 2; private static final int DEFAULT_OTHER_LENGTH_THRESHOLD = 7; private static final int DEFAULT_KANJI_PENALTY = 3000; private static final int DEFAULT_OTHER_PENALTY = 1700; private int kanjiPenaltyLengthTreshold = DEFAULT_KANJI_LENGTH_THRESHOLD; private int kanjiPenalty = DEFAULT_KANJI_PENALTY; private int otherPenaltyLengthThreshold = DEFAULT_OTHER_LENGTH_THRESHOLD; private int otherPenalty = DEFAULT_OTHER_PENALTY; private boolean nakaguroSplit = false; /** * Creates a default builder */ public Builder() { totalFeatures = DictionaryEntry.TOTAL_FEATURES; readingFeature = DictionaryEntry.READING_FEATURE; partOfSpeechFeature = DictionaryEntry.PART_OF_SPEECH_FEATURE; tokenFactory = new TokenFactory() { @Override public Token createToken(int wordId, String surface, ViterbiNode.Type type, int position, Dictionary dictionary) { return new Token(wordId, surface, type, position, dictionary); } }; } /** * Sets the tokenization mode *

* The tokenization mode defines how Available modes are as follows: *

    *
  • {@link com.atilika.kuromoji.TokenizerBase.Mode#NORMAL} - The default mode *
  • {@link com.atilika.kuromoji.TokenizerBase.Mode#SEARCH} - Uses a heuristic to segment compound nouns (複合名詞) into their parts *
  • {@link com.atilika.kuromoji.TokenizerBase.Mode#EXTENDED} - Same as SEARCH, but emits unigram tokens for unknown terms *
* See {@link #kanjiPenalty} and {@link #otherPenalty} for how to adjust costs used by SEARCH and EXTENDED mode * * @param mode tokenization mode * @return this builder, not null */ public Builder mode(Mode mode) { this.mode = mode; return this; } /** * Sets a custom kanji penalty *

* This is an expert feature used with {@link com.atilika.kuromoji.TokenizerBase.Mode#SEARCH} and {@link com.atilika.kuromoji.TokenizerBase.Mode#EXTENDED} modes that sets a length threshold and an additional costs used when running the Viterbi search. * The additional cost is applicable for kanji candidate tokens longer than the length threshold specified. *

* This is an expert feature and you usually would not need to change this. * * @param lengthThreshold length threshold applicable for this penalty * @param penalty cost added to Viterbi nodes for long kanji candidate tokens * @return this builder, not null */ public Builder kanjiPenalty(int lengthThreshold, int penalty) { this.kanjiPenaltyLengthTreshold = lengthThreshold; this.kanjiPenalty = penalty; return this; } /** * Sets a custom non-kanji penalty *

* This is an expert feature used with {@link com.atilika.kuromoji.TokenizerBase.Mode#SEARCH} and {@link com.atilika.kuromoji.TokenizerBase.Mode#EXTENDED} modes that sets a length threshold and an additional costs used when running the Viterbi search. * The additional cost is applicable for non-kanji candidate tokens longer than the length threshold specified. *

* This is an expert feature and you usually would not need to change this. * * @param lengthThreshold length threshold applicable for this penalty * @param penalty cost added to Viterbi nodes for long non-kanji candidate tokens * @return this builder, not null */ public Builder otherPenalty(int lengthThreshold, int penalty) { this.otherPenaltyLengthThreshold = lengthThreshold; this.otherPenalty = penalty; return this; } /** * Predictate that splits unknown words on the middle dot character (U+30FB KATAKANA MIDDLE DOT) *

* This feature is off by default. * This is an expert feature sometimes used with {@link com.atilika.kuromoji.TokenizerBase.Mode#SEARCH} and {@link com.atilika.kuromoji.TokenizerBase.Mode#EXTENDED} mode. * * @param split predicate to indicate split on middle dot * @return this builder, not null */ public Builder isSplitOnNakaguro(boolean split) { this.nakaguroSplit = split; return this; } /** * Creates the custom tokenizer instance * * @return tokenizer instance, not null */ @Override public Tokenizer build() { return new Tokenizer(this); } @Override protected void loadDictionaries() { penalties = new ArrayList<>(); penalties.add(kanjiPenaltyLengthTreshold); penalties.add(kanjiPenalty); penalties.add(otherPenaltyLengthThreshold); penalties.add(otherPenalty); resolver = new SimpleResourceResolver(this.getClass()); try { doubleArrayTrie = DoubleArrayTrie.newInstance(resolver); connectionCosts = ConnectionCosts.newInstance(resolver); tokenInfoDictionary = TokenInfoDictionary.newInstance(resolver); characterDefinitions = CharacterDefinitions.newInstance(resolver); if (nakaguroSplit) { characterDefinitions.setCategories('・', new String[]{"SYMBOL"}); } unknownDictionary = UnknownDictionary.newInstance( resolver, characterDefinitions, totalFeatures ); insertedDictionary = new InsertedDictionary(totalFeatures); } catch (Exception ouch) { throw new RuntimeException("Could not load dictionaries.", ouch); } } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy