// org.apache.lucene.analysis.hebrew.HebrewQueryAnalyzer Maven / Gradle / Ivy
/**
* ************************************************************************
* Copyright (C) 2010-2015 by *
* Itamar Syn-Hershko *
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU Affero General Public License *
* version 3, as published by the Free Software Foundation. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Affero General Public License for more details. *
* *
* You should have received a copy of the GNU Affero General Public *
* License along with this program; if not, see *
* <http://www.gnu.org/licenses/>. *
* ************************************************************************
*/
package org.apache.lucene.analysis.hebrew;
import com.code972.hebmorph.datastructures.DictHebMorph;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.hebrew.TokenFilters.AddSuffixTokenFilter;
import org.apache.lucene.analysis.hebrew.TokenFilters.HebrewLemmatizerTokenFilter;
import org.apache.lucene.analysis.hebrew.TokenFilters.NiqqudFilter;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import java.io.IOException;
/**
 * Hebrew analyzer variant used on the query side.
 *
 * <p>The analysis chain built by {@link #createComponents(String)} is:
 * {@code HebrewTokenizer -> NiqqudFilter -> ASCIIFoldingFilter -> LowerCaseFilter
 * -> HebrewLemmatizerTokenFilter -> AddSuffixTokenFilter}.
 */
public class HebrewQueryAnalyzer extends HebrewAnalyzer {

    /**
     * Creates a query analyzer backed by the given dictionary.
     *
     * @param dict dictionary passed through to the {@code HebrewAnalyzer} constructor
     * @throws IOException if the parent constructor throws it
     */
    public HebrewQueryAnalyzer(DictHebMorph dict) throws IOException {
        super(dict);
    }

    /**
     * Creates a query analyzer using the parent's no-argument constructor.
     *
     * @throws IOException if the parent constructor throws it
     */
    public HebrewQueryAnalyzer() throws IOException {
        super();
    }

    /**
     * Builds the tokenizer and filter chain used at query time.
     *
     * <p>Intended query-time behavior (per the original author's notes): a term marked as a
     * keyword does not keep its original form; otherwise only lemmatized forms are emitted
     * (no suffixed original). A word that terminates with {@code $} is output as
     * {@code word$}; any other word is output as all of its lemmas, or as {@code word$}
     * when it is out of vocabulary.
     *
     * @param fieldName name of the field being analyzed; not used by this implementation
     * @return the components wrapping the source tokenizer and the end of the filter chain
     */
    @Override
    protected TokenStreamComponents createComponents(final String fieldName) {
        // Source tokenizer, configured with the exact-match suffix inherited from the parent.
        final HebrewTokenizer tokenizer =
                new HebrewTokenizer(dict.getPref(), SPECIAL_TOKENIZATION_CASES);
        tokenizer.setSuffixForExactMatch(originalTermSuffix);

        // Each stage wraps the previous one; the construction order is the processing order.
        final TokenStream niqqudFiltered = new NiqqudFilter(tokenizer);
        final TokenStream asciiFolded = new ASCIIFoldingFilter(niqqudFiltered);
        final TokenStream lowerCased = new LowerCaseFilter(asciiFolded);
        final TokenStream lemmatized =
                new HebrewLemmatizerTokenFilter(lowerCased, dict, false, true);
        final TokenStream suffixed = new AddSuffixTokenFilter(lemmatized, '$');

        return new TokenStreamComponents(tokenizer, suffixed);
    }
}