All downloads are FREE. Search and download functionality uses the official Maven repository.

com.atilika.kuromoji.jumandic.Tokenizer Maven / Gradle / Ivy

/**
 * Copyright © 2010-2015 Atilika Inc. and contributors (see CONTRIBUTORS.md)
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.  A copy of the
 * License is distributed with this work in the LICENSE.md file.  You may
 * also obtain a copy of the License from
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.atilika.kuromoji.jumandic;

import com.atilika.kuromoji.TokenizerBase;
import com.atilika.kuromoji.dict.Dictionary;
import com.atilika.kuromoji.jumandic.compile.DictionaryEntry;
import com.atilika.kuromoji.util.SimpleResourceResolver;
import com.atilika.kuromoji.viterbi.TokenFactory;
import com.atilika.kuromoji.viterbi.ViterbiNode;

import java.util.List;

/**
 * A tokenizer based on the JUMAN DIC dictionary
 * 

* See {@link Token} for details on the morphological features produced by this tokenizer *

* The following code example demonstrates how to use the Kuromoji tokenizer: *

{@code
 * package com.atilika.kuromoji.example;
 * import com.atilika.kuromoji.jumandic.Token;
 * import com.atilika.kuromoji.jumandic.Tokenizer;
 * import java.util.List;
 * 

* public class KuromojiExample { * public static void main(String[] args) { * Tokenizer tokenizer = new Tokenizer() ; * List tokens = tokenizer.tokenize("お寿司が食べたい。"); * for (Token token : tokens) { * System.out.println(token.getSurface() + "\t" + token.getAllFeatures()); * } * } * } * } *

*/ public class Tokenizer extends TokenizerBase { /** * Construct a default tokenizer */ public Tokenizer() { this(new Builder()); } /** * Construct a customized tokenizer *

* See {@see com.atilika.kuromoji.jumandic.Tokenizer#Builder} */ private Tokenizer(Builder builder) { configure(builder); } /** * Tokenizes the provided text and returns a list of tokens with various feature information *

* This method is thread safe * * @param text text to tokenize * @return list of Token, not null */ @Override public List tokenize(String text) { return createTokenList(text); } /** * Builder class for creating a customized tokenizer instance */ public static class Builder extends TokenizerBase.Builder { /** * Creates a default builder */ public Builder() { totalFeatures = DictionaryEntry.TOTAL_FEATURES; readingFeature = DictionaryEntry.READING_FEATURE; partOfSpeechFeature = DictionaryEntry.PART_OF_SPEECH_FEATURE; resolver = new SimpleResourceResolver(this.getClass()); tokenFactory = new TokenFactory() { @Override public Token createToken(int wordId, String surface, ViterbiNode.Type type, int position, Dictionary dictionary) { return new Token(wordId, surface, type, position, dictionary); } }; } /** * Creates the custom tokenizer instance * * @return tokenizer instance, not null */ @Override public Tokenizer build() { return new Tokenizer(this); } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy