// io.bdrc.lucene.sa.SanskritAnalyzer (lucene-sa artifact; Maven / Gradle / Ivy)
/*******************************************************************************
* Copyright (c) 2017 Buddhist Digital Resource Center (BDRC)
*
* If this file is a derivation of another work the license header will appear
* below; otherwise, this work is licensed under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with the
* License.
*
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package io.bdrc.lucene.sa;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.util.IOUtils;
/**
* An Analyzer that uses {@link SkrtSyllableTokenizer} and {@link SkrtWordTokenizer} and filters with StopFilter
*
* Derived from Lucene 6.4.1 analysis.core.WhitespaceAnalyzer.java
*
* @author Chris Tomlinson
* @author Hélios Hildt
**/
public final class SanskritAnalyzer extends Analyzer {
String mode = null;
String inputEncoding = null;
String lenient = null;
String stopFilename = null;
boolean mergePrepositions = true;
boolean filterGeminates = false;
CharArraySet skrtStopWords = null;
/**
*
* @param mode `space`, `syl` or `word`
* @param inputEncoding `SLP`, `deva` or `roman`
* @param stopFilename path to the file
*
* @throws IOException the file containing the stoplist can not be found
*/
public SanskritAnalyzer(String mode, String inputEncoding, String stopFilename) throws IOException {
this.mode = mode;
this.inputEncoding = inputEncoding;
if (stopFilename != null) {
InputStream stream = null;
stream = CommonHelpers.getResourceOrFile(stopFilename);
this.skrtStopWords = StopFilter.makeStopSet(getWordList(stream, "#"));
}
}
/**
*
* Uses the list of stopwords defined here:
* gist.github.com/Akhilesh28
*
* @param mode `space`, `syl` or `word`
* @param inputEncoding `SLP`, `deva` or `roman`
*
* @throws IOException the file containing the stoplist can not be read
*/
public SanskritAnalyzer(String mode, String inputEncoding) throws IOException {
this(mode, inputEncoding, "skrt-stopwords.txt");
}
/**
*
* Allows to change the default value(true) of mergePrepositions.
*
*
*
* Prepositions can either be merged or kept as separate tokens.
* Eventually, we will want to have a more refined treatment of the prepositions to account for cases where they should be standalone tokens.
*
*
*
* "(...) in the classical language the usage is mainly restricted to prati, anu, and ā.",
* (1125.b. of Whitney)
*
* @param mode `space`, `syl` or `word`
* @param inputEncoding `SLP`, `deva` or `roman`
* @param mergePrepositions concatenates the token containing the preposition with the next one if true.
* @param filterGeminates true or false (false by default)
*
* @throws IOException the file containing the stoplist can not be found
*/
public SanskritAnalyzer(String mode, String inputEncoding, boolean mergePrepositions, boolean filterGeminates) throws IOException {
this(mode, inputEncoding);
this.filterGeminates = filterGeminates;
if (mode == "word") {
this.mergePrepositions = mergePrepositions;
} else if (mergePrepositions){
CommonHelpers.logger.error("Can only merge prepositions if mode == word");
return;
}
}
/**
*
* @param mode `space`, `syl` or `word`
* @param inputEncoding `SLP`, `deva` or `roman`
* @param mergePrepositions concatenates the token containing the preposition with the next one if true.
* @param filterGeminates true or false (true by default)
* @param lenient `index` or `query`
*
* @throws IOException the file containing the stoplist can not be found
*/
public SanskritAnalyzer(String mode, String inputEncoding, boolean mergePrepositions, boolean filterGeminates, String lenient) throws IOException {
this(mode, inputEncoding, mergePrepositions, filterGeminates);
this.lenient = lenient;
}
/**
* @param inputStream Stream containing the list of stopwords
* @param comment The string representing a comment.
* @return the {@link ArrayList} of stopwords
* @throws IOException the input file couldn't be read
*/
public static ArrayList getWordList(InputStream inputStream, String comment) throws IOException {
ArrayList result = new ArrayList();
BufferedReader br = null;
try {
br = new BufferedReader(new InputStreamReader(inputStream));
String word = null;
while ((word = br.readLine()) != null) {
word = word.replace("\t", "");
if (word.contains(comment)) {
if (!word.startsWith(comment)) {
word = word.substring(0, word.indexOf(comment));
word = word.trim();
if (!word.isEmpty()) result.add(word);
}
} else {
word = word.trim();
if (!word.isEmpty()) result.add(word);
}
}
}
finally {
IOUtils.close(br);
}
return result;
}
@Override
protected Reader initReader(String fieldName, Reader reader) {
if (this.inputEncoding == "deva") {
reader = new Deva2SlpFilter(reader);
reader = new VedicFilter(reader);
} else if (this.inputEncoding == "roman") {
reader = new Roman2SlpFilter(reader);
} else if (this.inputEncoding != "SLP"){
CommonHelpers.logger.error("wrong value for `mode`");
return null;
}
if (this.filterGeminates == true) {
reader = new GeminateNormalizingFilter(reader);
}
if (this.lenient == "query") {
reader = new LenientCharFilter(reader);
}
return super.initReader(fieldName, reader);
}
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
Tokenizer source = null;
TokenStream filter = null;
if (mode == "word") {
try {
source = new SkrtWordTokenizer();
} catch (Exception e) {
CommonHelpers.logger.error("cannot initialize SkrtWordTokenizer", e);
return null;
}
} else if (mode == "syl") {
source = new SkrtSyllableTokenizer();
} else if (mode == "space") {
source = new WhitespaceTokenizer();
}
if (skrtStopWords != null) { // a stop list was parsed
filter = new StopFilter(source, skrtStopWords);
} else {
filter = (TokenStream) source;
}
if (mergePrepositions) {
filter = new PrepositionMergingFilter(filter);
}
if (lenient == "index") {
filter = new Slp2RomanFilter(filter);
filter = new LenientTokenFilter(filter);
}
return new TokenStreamComponents(source, filter);
}
}