All downloads are free. Search and download functionality uses the official Maven repository.

org.bigml.mimir.nlp.tokenization.JapaneseTokenStream Maven / Gradle / Ivy

There is a newer version: 0.8.3
Show newest version
package org.bigml.mimir.nlp.tokenization;

import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;

/**
 * A TokenStream for Japanese.  This stream relies almost entirely on
 * the underlying tokenizer provided by Apache Lucene.
 *
 * @author [email protected]
 *
 */
public class JapaneseTokenStream extends LuceneTokenStream {
    /**
     * Creates a Japanese token stream.  Both arguments are passed
     * unchanged to the {@link LuceneTokenStream} constructor.
     *
     * @param in the input string (presumably the text to tokenize;
     *        see {@link LuceneTokenStream})
     * @param caseSensitive forwarded to the superclass; its exact effect
     *        is defined by {@link LuceneTokenStream}
     */
    public JapaneseTokenStream(String in, boolean caseSensitive) {
        super(in, caseSensitive);
    }

    /**
     * Supplies the analyzer for this stream by delegating to
     * {@link #createAnalyzer()}.  A new analyzer instance is built on
     * every call.
     *
     * @return a newly created {@link JapaneseAnalyzer}
     */
    @Override
    protected Analyzer getAnalyzer() {
        return createAnalyzer();
    }

    /**
     * Builds a new {@link JapaneseAnalyzer} configured with no user
     * dictionary, {@link Mode#SEARCH} segmentation, an empty stop-word
     * set and an empty set of part-of-speech stop tags, so that no tokens
     * are dropped by the analyzer's stop-word or stop-tag filters.
     *
     * @return a freshly constructed analyzer; each call returns a new
     *         instance
     */
    public static JapaneseAnalyzer createAnalyzer() {
        // Empty, case-sensitive stop-word set: no words are filtered out.
        CharArraySet stopSet = new CharArraySet(0, false);
        // The fourth constructor argument is the set of part-of-speech
        // tags to filter, not stop words; it is left empty as well.
        Set<String> stopTags = new HashSet<>();
        // A null user dictionary means only the built-in dictionary is used.
        return new JapaneseAnalyzer(null, Mode.SEARCH, stopSet, stopTags);
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy