/***************************************************************************
 *   Copyright (C) 2010-2015 by                                             *
 *   Itamar Syn-Hershko                                                     *
 *                                                                          *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU Affero General Public License           *
 *   version 3, as published by the Free Software Foundation.              *
 *                                                                          *
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          *
 *   GNU Affero General Public License for more details.                   *
 *                                                                          *
 *   You should have received a copy of the GNU Affero General Public      *
 *   License along with this program; if not, see                          *
 *   <http://www.gnu.org/licenses/>.                                       *
 **************************************************************************/

package org.apache.lucene.analysis.hebrew;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.hebrew.TokenFilters.NiqqudFilter;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
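
/**
 * A simple Hebrew {@link Analyzer}: text is tokenized by {@link HebrewTokenizer},
 * niqqud (Hebrew vowel points) is stripped by {@link NiqqudFilter}, and tokens are
 * lower-cased by {@link LowerCaseFilter}. No dictionary-based lemmatization is applied.
 * <p>
 * A minimal usage sketch. The empty prefix map below is only a placeholder; in practice
 * the prefixes tree is built from HebMorph's prefix data before constructing the
 * analyzer, and {@code CharTermAttribute} comes from
 * {@code org.apache.lucene.analysis.tokenattributes}. The constructors declare
 * {@code IOException}.
 * <pre>{@code
 * HashMap prefixes = new HashMap();   // placeholder; normally loaded from HebMorph prefix data
 * Analyzer analyzer = new SimpleAnalyzer(prefixes);
 * try (TokenStream ts = analyzer.tokenStream("content", "שָׁלוֹם עוֹלָם")) {
 *     CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
 *     ts.reset();
 *     while (ts.incrementToken()) {
 *         System.out.println(term.toString());
 *     }
 *     ts.end();
 * }
 * }</pre>
 */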
public final class SimpleAnalyzer extends Analyzer {
    /**
     * An unmodifiable set containing some common Hebrew words that are usually not
     * useful for searching.
     */
    private final CharArraySet commonWords;

    /** Optional suffixes keyed by token type name; see {@link #registerSuffix(String, String)}. */
    private Map<String, char[]> suffixByTokenType = null;

    /** Prefix tree handed to the {@link HebrewTokenizer}. */
    private HashMap prefixesTree;
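
    /**
     * Creates the analyzer with the given prefixes tree and no common-words set.
     *
     * @param prefixes the Hebrew prefixes tree handed to {@link HebrewTokenizer}
     * @throws IOException declared for API compatibility; not thrown by this implementation
     */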
    public SimpleAnalyzer(final HashMap prefixes) throws IOException {
        this(prefixes, null);
    }
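
    /**
     * Creates the analyzer with the given prefixes tree and an optional set of common
     * Hebrew words. The set is retained but is not applied as a filter by
     * {@link #createComponents(String)}.
     *
     * @param prefixes    the Hebrew prefixes tree handed to {@link HebrewTokenizer}
     * @param commonWords common Hebrew words to keep for later use, or {@code null}
     * @throws IOException declared for API compatibility; not thrown by this implementation
     */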
    public SimpleAnalyzer(final HashMap prefixes, final CharArraySet commonWords) throws IOException {
        this.commonWords = commonWords;
        this.prefixesTree = prefixes;
    }
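
    /**
     * Builds the analysis chain for a field: {@link HebrewTokenizer}, then
     * {@link NiqqudFilter} to strip niqqud, then {@link LowerCaseFilter}.
     */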
    @Override
    protected TokenStreamComponents createComponents(final String fieldName) {
        final HebrewTokenizer src = new HebrewTokenizer(prefixesTree);
        TokenStream tok = new NiqqudFilter(src);
        tok = new LowerCaseFilter(tok);
        // consider adding a suffix filter?
        return new TokenStreamComponents(src, tok);
    }
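
    /**
     * Registers a suffix for the given token type. Only the first suffix registered for a
     * token type is kept; later registrations for the same type are ignored.
     *
     * @param tokenType the token type name the suffix applies to
     * @param suffix    the suffix characters to associate with that token type
     */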
    public void registerSuffix(String tokenType, String suffix) {
        if (suffixByTokenType == null)
            suffixByTokenType = new HashMap<>();
        if (!suffixByTokenType.containsKey(tokenType))
            suffixByTokenType.put(tokenType, suffix.toCharArray());
    }
}