org.bigml.mimir.nlp.tokenization.JapaneseTokenStream Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of mimir Show documentation
Show all versions of mimir Show documentation
Java/Clojure Prediction Code for BigML
package org.bigml.mimir.nlp.tokenization;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
/**
 * A TokenStream for Japanese. This stream relies almost entirely on
 * the underlying tokenizer provided by Apache Lucene.
 *
 * @author [email protected]
 *
 */
public class JapaneseTokenStream extends LuceneTokenStream {

    /**
     * Creates a Japanese token stream; both arguments are handed
     * unchanged to the {@code LuceneTokenStream} constructor.
     *
     * @param in the input string passed to the superclass
     * @param caseSensitive the case-sensitivity flag passed to the superclass
     */
    public JapaneseTokenStream(String in, boolean caseSensitive) {
        super(in, caseSensitive);
    }

    /**
     * Supplies the analyzer for this stream by delegating to
     * {@link #createAnalyzer()}.
     *
     * @return a newly created {@link JapaneseAnalyzer}
     */
    @Override
    protected Analyzer getAnalyzer() {
        return createAnalyzer();
    }

    /**
     * Builds a {@link JapaneseAnalyzer} in {@code Mode.SEARCH} with no
     * user dictionary, an empty stop-word set and an empty stop-tag set.
     *
     * @return a new analyzer instance; a fresh one is created on every call
     */
    public static JapaneseAnalyzer createAnalyzer() {
        // Third constructor argument: stop words. Empty and case-sensitive
        // (ignoreCase == false), so no word is configured as a stop word.
        CharArraySet stopWords = new CharArraySet(0, false);

        // Fourth constructor argument: part-of-speech stop tags (not stop
        // words). Left empty so no tag is configured for filtering.
        Set<String> stopTags = new HashSet<>();

        // A null user dictionary means only Lucene's built-in dictionary is used.
        return new JapaneseAnalyzer(null, Mode.SEARCH, stopWords, stopTags);
    }
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy