/***************************************************************************
 *   Copyright (C) 2010-2015 by                                             *
 *   Itamar Syn-Hershko                                                     *
 *                                                                          *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU Affero General Public License           *
 *   version 3, as published by the Free Software Foundation.              *
 *                                                                          *
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          *
 *   GNU Affero General Public License for more details.                   *
 *                                                                          *
 *   You should have received a copy of the GNU Affero General Public      *
 *   License along with this program; if not, see                          *
 *   <http://www.gnu.org/licenses/>.                                       *
 **************************************************************************/

package org.apache.lucene.analysis.hebrew;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.hebrew.TokenFilters.NiqqudFilter;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
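
/**
 * A simple Hebrew {@link Analyzer}: text is tokenized by {@link HebrewTokenizer},
 * niqqud (Hebrew vowel points) is stripped by {@link NiqqudFilter}, and tokens are
 * lower-cased by {@link LowerCaseFilter}. No dictionary-based lemmatization is applied.
 * <p>
 * A minimal usage sketch. The empty prefix map below is only a placeholder; in practice
 * the prefixes tree is built from HebMorph's prefix data before constructing the
 * analyzer, and {@code CharTermAttribute} comes from
 * {@code org.apache.lucene.analysis.tokenattributes}. The constructors declare
 * {@code IOException}.
 * <pre>{@code
 * HashMap prefixes = new HashMap();   // placeholder; normally loaded from HebMorph prefix data
 * Analyzer analyzer = new SimpleAnalyzer(prefixes);
 * try (TokenStream ts = analyzer.tokenStream("content", "שָׁלוֹם עוֹלָם")) {
 *     CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
 *     ts.reset();
 *     while (ts.incrementToken()) {
 *         System.out.println(term.toString());
 *     }
 *     ts.end();
 * }
 * }</pre>
 */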
public final class SimpleAnalyzer extends Analyzer {
    /**
     * An unmodifiable set containing some common Hebrew words that are usually not
     * useful for searching.
     */
    private final CharArraySet commonWords;

    /** Optional suffixes keyed by token type name; see {@link #registerSuffix(String, String)}. */
    private Map<String, char[]> suffixByTokenType = null;

    /** Prefix tree handed to the {@link HebrewTokenizer}. */
    private HashMap prefixesTree;
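
    /**
     * Creates the analyzer with the given prefixes tree and no common-words set.
     *
     * @param prefixes the Hebrew prefixes tree handed to {@link HebrewTokenizer}
     * @throws IOException declared for API compatibility; not thrown by this implementation
     */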
    public SimpleAnalyzer(final HashMap prefixes) throws IOException {
        this(prefixes, null);
    }
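
    /**
     * Creates the analyzer with the given prefixes tree and an optional set of common
     * Hebrew words. The set is retained but is not applied as a filter by
     * {@link #createComponents(String)}.
     *
     * @param prefixes    the Hebrew prefixes tree handed to {@link HebrewTokenizer}
     * @param commonWords common Hebrew words to keep for later use, or {@code null}
     * @throws IOException declared for API compatibility; not thrown by this implementation
     */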
    public SimpleAnalyzer(final HashMap prefixes, final CharArraySet commonWords) throws IOException {
        this.commonWords = commonWords;
        this.prefixesTree = prefixes;
    }
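
    /**
     * Builds the analysis chain for a field: {@link HebrewTokenizer}, then
     * {@link NiqqudFilter} to strip niqqud, then {@link LowerCaseFilter}.
     */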
    @Override
    protected TokenStreamComponents createComponents(final String fieldName) {
        final HebrewTokenizer src = new HebrewTokenizer(prefixesTree);
        TokenStream tok = new NiqqudFilter(src);
        tok = new LowerCaseFilter(tok);
        // consider adding a suffix filter?
        return new TokenStreamComponents(src, tok);
    }
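
    /**
     * Registers a suffix for the given token type. Only the first suffix registered for a
     * token type is kept; later registrations for the same type are ignored.
     *
     * @param tokenType the token type name the suffix applies to
     * @param suffix    the suffix characters to associate with that token type
     */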
    public void registerSuffix(String tokenType, String suffix) {
        if (suffixByTokenType == null)
            suffixByTokenType = new HashMap<>();
        if (!suffixByTokenType.containsKey(tokenType))
            suffixByTokenType.put(tokenType, suffix.toCharArray());
    }
}