org.fryske_akademy.exist.lucene.InsensitivePunctAnalyzer Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of exist-db-addons Show documentation

Functions for exist-db: module to load properties from filesystem, cronjobs to import data from filesystem to collection, some analyzers for lucene.

The newest version!

package org.fryske_akademy.exist.lucene;

/*-
 * #%L
 * exist-db-addons
 * %%
 * Copyright (C) 2020 - 2021 Fryske Akademy
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */


import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.util.Version;

import java.io.Reader;

/**
 * Analyzer that separates tokens on whitespace and punctuation, converts tokens to lowercase and ascii
 */
public class InsensitivePunctAnalyzer extends Analyzer {
    private final boolean sensitive;

    public InsensitivePunctAnalyzer(boolean sensitive) {
        this.sensitive = sensitive;
    }

    public InsensitivePunctAnalyzer() {
        this.sensitive = false;
    }
    public InsensitivePunctAnalyzer(Version version) {
        this();
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new NoPunctuationTokenizer(reader);
        TokenStream filter = null;
        if (!sensitive) {
            filter = new LowerCaseFilter(tokenizer);
            filter = new ASCIIFoldingFilter(filter);
        }
        return filter == null ? new TokenStreamComponents(tokenizer) : new TokenStreamComponents(tokenizer, filter);
    }
}