// org.wikimedia.search.glent.analysis.CirrusNearMatchNormalizer (Maven / Gradle / Ivy)
package org.wikimedia.search.glent.analysis;
import java.io.Reader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter;
import org.apache.lucene.analysis.miscellaneous.TrimFilter;
import com.ibm.icu.text.Normalizer2;
/**
 * Normalizer that rewrites a fixed set of "near space" characters to a plain
 * space, applies ICU NFKC case folding to the character stream, and wraps the
 * resulting token stream in a {@link TrimFilter}.
 *
 * <p>NOTE(review): the class name suggests this is meant to mirror the
 * CirrusSearch near-match analysis chain; confirm against that configuration.
 * How {@code GlentNormalizer} wires {@link #initReader} and {@link #filters}
 * together is defined in the superclass, which is not visible here.
 */
public class CirrusNearMatchNormalizer extends GlentNormalizer {
    /** Replacement emitted for every near-space character: a single U+0020 space. */
    private static final String SPACE = "\u0020";

    /** Shared ICU normalizer instance for NFKC normalization with case folding. */
    private static final Normalizer2 NFKCCF = Normalizer2.getNFKCCasefoldInstance();

    /** Character map that rewrites each near-space character to {@link #SPACE}. */
    private static final NormalizeCharMap NEAR_SPACE_FLATTENER;

    static {
        // Characters that should be treated as if they were an ordinary space.
        String[] nearSpaces = {
            "'",       // apostrophe
            "\u2019",  // right single quote
            "\u02BC",  // modifier letter apostrophe
            "_",       // underscore
            "-",       // hyphen
        };
        NormalizeCharMap.Builder mappings = new NormalizeCharMap.Builder();
        for (String nearSpace : nearSpaces) {
            mappings.add(nearSpace, SPACE);
        }
        NEAR_SPACE_FLATTENER = mappings.build();
    }

    /**
     * Wraps the given token stream in a {@link TrimFilter}.
     *
     * @param source the token stream to decorate
     * @return {@code source} wrapped in a {@code TrimFilter}
     */
    @Override
    protected TokenStream filters(TokenStream source) {
        TokenStream trimmed = new TrimFilter(source);
        return trimmed;
    }

    /**
     * Chains the character filters applied before tokenization: first the
     * near-space flattening map, then ICU NFKC case folding.
     *
     * @param fieldName name of the field being analyzed; not used by this implementation
     * @param reader the raw character source
     * @return {@code reader} wrapped in the mapping filter and then the ICU normalizer filter
     */
    @Override
    protected Reader initReader(String fieldName, Reader reader) {
        // Flatten near-space characters first so the ICU filter sees plain spaces.
        Reader flattened = new MappingCharFilter(NEAR_SPACE_FLATTENER, reader);
        return new ICUNormalizer2CharFilter(flattened, NFKCCF);
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy