// org.deeplearning4j.nlp.japanese.tokenization.tokenizer.JapaneseTokenizer (Maven / Gradle / Ivy)
/*******************************************************************************
* Copyright (c) 2015-2018 Skymind, Inc.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
package org.deeplearning4j.nlp.japanese.tokenization.tokenizer;
import com.atilika.kuromoji.ipadic.Token;
import com.atilika.kuromoji.ipadic.Tokenizer;
import org.deeplearning4j.text.tokenization.tokenizer.TokenPreProcess;
import java.util.ArrayList;
import java.util.List;
import java.util.NoSuchElementException;
/**
* A thin wrapper for the Japanese morphological analyzer Kuromoji (ver. 0.9.0).
* It tokenizes text written in languages
* in which words are not separated by whitespace.
*
* In theory, Kuromoji is a language-independent morphological analyzer library,
* so if you want to tokenize non-Japanese text (Chinese, Korean, etc.),
* you can do so with a MeCab-style dictionary for each language.
*/
public class JapaneseTokenizer implements org.deeplearning4j.text.tokenization.tokenizer.Tokenizer {

    /** Kuromoji tokens for the input text; computed once in the constructor. */
    private final List<Token> tokens;

    /** If true, tokens are reported via {@code Token.getBaseForm()}, otherwise via {@code Token.getSurface()}. */
    private final boolean useBaseForm;

    /** Cached size of {@link #tokens}. */
    private final int tokenCount;

    /** Index of the token that the next call to {@link #nextToken()} will return. */
    private int currentToken;

    /** Optional transformation applied to every returned token; {@code null} means tokens are returned as-is. */
    private TokenPreProcess preProcessor;

    /**
     * Tokenize the string with Kuromoji, optionally using baseForms.
     *
     * Note: It is safe to create new instances from multiple threads.
     *
     * @param kuromoji The kuromoji instance.
     * @param toTokenize The string to tokenize.
     * @param useBaseForm normalize conjugations "走った" -> "走る" instead of "走っ"
     */
    public JapaneseTokenizer(Tokenizer kuromoji, String toTokenize, boolean useBaseForm) {
        this.useBaseForm = useBaseForm;
        // The whole input is tokenized eagerly; iteration afterwards only walks the list.
        this.tokens = kuromoji.tokenize(toTokenize);
        this.tokenCount = this.tokens.size();
        this.currentToken = 0;
    }

    /**
     * @return {@code true} while {@link #nextToken()} has not yet returned every token.
     */
    @Override
    public boolean hasMoreTokens() {
        return currentToken < tokenCount;
    }

    /**
     * @return the total number of tokens, regardless of how many have already been consumed.
     */
    @Override
    public int countTokens() {
        return tokenCount;
    }

    /**
     * Returns the string form of the token at the given index: base form or surface form
     * depending on {@link #useBaseForm}, passed through the pre-processor when one is set.
     *
     * @param i index into {@link #tokens}
     * @return the (possibly pre-processed) token string
     */
    private String getToken(int i) {
        Token t = tokens.get(i);
        String ret = (useBaseForm) ? t.getBaseForm() : t.getSurface();
        return (preProcessor == null) ? ret : preProcessor.preProcess(ret);
    }

    /**
     * Returns the token at the current position and advances the position by one.
     *
     * @return the next token string
     * @throws NoSuchElementException if all tokens have already been returned
     */
    @Override
    public String nextToken() {
        if (!hasMoreTokens()) {
            throw new NoSuchElementException();
        }
        return getToken(currentToken++);
    }

    /**
     * Returns every token as a new list. The iteration position used by
     * {@link #nextToken()} is not read or modified, so the list always starts
     * from the first token.
     *
     * @return a freshly allocated list holding all token strings in order
     */
    @Override
    public List<String> getTokens() {
        List<String> ret = new ArrayList<>(tokenCount);
        for (int i = 0; i < tokenCount; i++) {
            ret.add(getToken(i));
        }
        return ret;
    }

    /**
     * Sets the pre-processor applied to tokens returned after this call.
     *
     * @param tokenPreProcessor the pre-processor to apply, or {@code null} to return tokens unchanged
     */
    @Override
    public void setTokenPreProcessor(TokenPreProcess tokenPreProcessor) {
        this.preProcessor = tokenPreProcessor;
    }
}