com.s24.search.solr.analyzers.AnalyzingSentenceTokenizerFactory Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of solr-analyzers Show documentation
Show all versions of solr-analyzers Show documentation
A repository for all open sourced tokenizers and filters from shopping24.
The newest version!
package com.s24.search.solr.analyzers;
import java.io.IOException;
import java.util.Map;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.annotations.VisibleForTesting;
/**
 * Factory that builds {@link AnalyzingSentenceTokenizer} instances from the
 * arguments configured for the tokenizer.
 *
 * <p>Recognized arguments (all optional):
 * <ul>
 * <li>{@code filter} - boolean flag handed to the tokenizer (remove bad
 * sentences?), defaults to {@code false}</li>
 * <li>{@code commaWordThreshold} - float, defaults to
 * {@link #DEFAULT_COMMA_WORD_THRESHOLD}</li>
 * <li>{@code maxStopwordRatio} - float, defaults to
 * {@link #DEFAULT_MAX_STOPWORD_RATIO}</li>
 * <li>{@code minSentenceLength} - int, defaults to
 * {@link #DEFAULT_MIN_SENTENCE_LENGTH}</li>
 * <li>{@code stopwordfile} - resource path of the stop word list; a warning
 * is logged when it is missing and an empty stop word set is used</li>
 * </ul>
 *
 * @see AnalyzingSentenceTokenizer
 *
 * @author Shopping24 GmbH
 */
public class AnalyzingSentenceTokenizerFactory extends TokenizerFactory implements ResourceLoaderAware {

    private static final Logger logger = LoggerFactory.getLogger(AnalyzingSentenceTokenizerFactory.class);

    // remove bad sentences?
    private static final String FILTER_ARG = "filter";
    private boolean filter = false;

    // stopwords
    private static final String STOP_WORD_FILE = "stopwordfile";
    private String stopWordFilePath;
    // stays null until inform(ResourceLoader) has been called
    private CharArraySet stopWords = null;

    // the magic threshold
    @VisibleForTesting
    static final float DEFAULT_COMMA_WORD_THRESHOLD = 0.2f;
    private static final String COMMA_WORD_THRESHOLD_ARG = "commaWordThreshold";
    private float commaWordThreshold = DEFAULT_COMMA_WORD_THRESHOLD;

    @VisibleForTesting
    static final float DEFAULT_MAX_STOPWORD_RATIO = 0.21f;
    private static final String MAX_STOPWORD_RATIO_ARG = "maxStopwordRatio";
    private float maxStopwordRatio = DEFAULT_MAX_STOPWORD_RATIO;

    @VisibleForTesting
    static final int DEFAULT_MIN_SENTENCE_LENGTH = 5;
    private static final String MIN_SENTENCE_LENGTH_ARG = "minSentenceLength";
    private int minSentenceLength = DEFAULT_MIN_SENTENCE_LENGTH;

    /**
     * Tokenizer gets constructed with the configured args.
     *
     * @param args
     *            the configured arguments, keyed by argument name; every
     *            argument that is absent keeps its default value
     * @throws NumberFormatException
     *             if a numeric argument is present but cannot be parsed
     */
    public AnalyzingSentenceTokenizerFactory(Map<String, String> args) {
        super(args);

        if (args.containsKey(FILTER_ARG)) {
            filter = Boolean.parseBoolean(args.get(FILTER_ARG));
        }
        if (args.containsKey(COMMA_WORD_THRESHOLD_ARG)) {
            commaWordThreshold = Float.parseFloat(args.get(COMMA_WORD_THRESHOLD_ARG));
        }
        if (args.containsKey(MAX_STOPWORD_RATIO_ARG)) {
            maxStopwordRatio = Float.parseFloat(args.get(MAX_STOPWORD_RATIO_ARG));
        }
        if (args.containsKey(MIN_SENTENCE_LENGTH_ARG)) {
            minSentenceLength = Integer.parseInt(args.get(MIN_SENTENCE_LENGTH_ARG));
        }

        if (args.containsKey(STOP_WORD_FILE)) {
            stopWordFilePath = args.get(STOP_WORD_FILE);
        } else {
            // not fatal: inform() falls back to an empty stop word set
            logger.warn(
                    "The {} param is not set. The sentences could not be analyzed (due to wrong calcuation of the information gain).",
                    STOP_WORD_FILE);
        }
    }

    /**
     * Reload the stop words.
     *
     * <p>Reads the configured stop word file through the given loader
     * (third argument {@code true} of {@code getWordSet}), or installs an
     * empty set when no file was configured.
     *
     * @param loader
     *            the resource loader used to resolve the stop word file
     * @throws RuntimeException
     *             wrapping the {@link IOException} raised while reading the
     *             stop word file
     */
    @Override
    public void inform(ResourceLoader loader) throws IOException {
        if (stopWordFilePath == null) {
            stopWords = new CharArraySet(0, false);
            return;
        }

        try {
            stopWords = getWordSet(loader, stopWordFilePath, true);
        } catch (IOException e) {
            // exception type kept for existing callers; the path is added so
            // the failure can be traced back to the configuration
            throw new RuntimeException("Could not load stop words from '" + stopWordFilePath + "'", e);
        }
    }

    /**
     * Create the tokenizer.
     *
     * @param factory
     *            the attribute factory passed on to the tokenizer
     * @return a new {@link AnalyzingSentenceTokenizer} carrying the settings
     *         of this factory
     */
    @Override
    public Tokenizer create(AttributeFactory factory) {
        return new AnalyzingSentenceTokenizer(factory, filter, stopWords, commaWordThreshold, maxStopwordRatio,
                minSentenceLength);
    }
}