// com.atilika.kuromoji.jumandic.Tokenizer Maven / Gradle / Ivy
/**
* Copyright © 2010-2015 Atilika Inc. and contributors (see CONTRIBUTORS.md)
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. A copy of the
* License is distributed with this work in the LICENSE.md file. You may
* also obtain a copy of the License from
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.atilika.kuromoji.jumandic;
import com.atilika.kuromoji.TokenizerBase;
import com.atilika.kuromoji.dict.Dictionary;
import com.atilika.kuromoji.jumandic.compile.DictionaryEntry;
import com.atilika.kuromoji.util.SimpleResourceResolver;
import com.atilika.kuromoji.viterbi.TokenFactory;
import com.atilika.kuromoji.viterbi.ViterbiNode;
import java.util.List;
/**
 * A tokenizer based on the JUMAN DIC dictionary.
 * <p>
 * See {@link Token} for details on the morphological features produced by this tokenizer.
 * <p>
 * The following code example demonstrates how to use the Kuromoji tokenizer:
 *
 * <pre>{@code
 * package com.atilika.kuromoji.example;
 *
 * import com.atilika.kuromoji.jumandic.Token;
 * import com.atilika.kuromoji.jumandic.Tokenizer;
 * import java.util.List;
 *
 * public class KuromojiExample {
 *     public static void main(String[] args) {
 *         Tokenizer tokenizer = new Tokenizer();
 *         List<Token> tokens = tokenizer.tokenize("お寿司が食べたい。");
 *         for (Token token : tokens) {
 *             System.out.println(token.getSurface() + "\t" + token.getAllFeatures());
 *         }
 *     }
 * }
 * }</pre>
 */
public class Tokenizer extends TokenizerBase {

    /**
     * Constructs a default tokenizer, configured from a default {@link Builder}.
     */
    public Tokenizer() {
        this(new Builder());
    }

    /**
     * Constructs a customized tokenizer from the settings held by the given builder.
     * <p>
     * This constructor is private; instances are obtained through
     * {@link Builder#build()} or the public no-argument constructor.
     *
     * @param builder builder carrying the configuration passed to {@code configure}
     * @see Builder
     */
    private Tokenizer(Builder builder) {
        configure(builder);
    }

    /**
     * Tokenizes the provided text and returns a list of tokens with various feature information
     * <p>
     * This method is thread safe
     *
     * @param text  text to tokenize
     * @return list of Token, not null
     */
    @Override
    public List<Token> tokenize(String text) {
        // Typed as List<Token> (not a raw List) so callers can iterate tokens without casts.
        return createTokenList(text);
    }

    /**
     * Builder class for creating a customized tokenizer instance
     */
    public static class Builder extends TokenizerBase.Builder {

        /**
         * Creates a default builder.
         * <p>
         * Sets the feature settings from the {@link DictionaryEntry} constants, installs a
         * {@link SimpleResourceResolver} and a token factory that produces {@link Token}
         * instances.
         */
        public Builder() {
            totalFeatures = DictionaryEntry.TOTAL_FEATURES;
            readingFeature = DictionaryEntry.READING_FEATURE;
            partOfSpeechFeature = DictionaryEntry.PART_OF_SPEECH_FEATURE;

            // NOTE(review): getClass() yields the runtime class, so a subclass of this
            // Builder would hand its own class to the resolver - confirm this is intended.
            resolver = new SimpleResourceResolver(this.getClass());

            // Parameterized with Token so the factory's element type is explicit
            // rather than relying on a raw TokenFactory.
            tokenFactory = new TokenFactory<Token>() {
                @Override
                public Token createToken(int wordId,
                                         String surface,
                                         ViterbiNode.Type type,
                                         int position,
                                         Dictionary dictionary) {
                    return new Token(wordId, surface, type, position, dictionary);
                }
            };
        }

        /**
         * Creates the custom tokenizer instance
         *
         * @return tokenizer instance, not null
         */
        @Override
        public Tokenizer build() {
            return new Tokenizer(this);
        }
    }
}