com.s24.search.solr.analyzers.AnalyzingSentenceTokenizer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of solr-analyzers Show documentation
Show all versions of solr-analyzers Show documentation
A repository for all open sourced tokenizers and filters from shopping24.
The newest version!
package com.s24.search.solr.analyzers;
import java.io.IOException;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.AttributeFactory;
import com.google.common.base.CharMatcher;
import com.google.common.base.Splitter;
import com.google.common.io.CharStreams;
/**
* Tokenizer which splits the input into sentences and emits only those sentences that do not contain too many
* stopwords. Sentences that contain many commas are split into their comma-separated parts and analyzed per part. If
* the input contains only a single sentence, it is always emitted.
*
* @author Shopping24 GmbH
*/
public class AnalyzingSentenceTokenizer extends Tokenizer {
// Sentence boundary: a run of whitespace that is preceded by one of . ? ! | ; - and followed by an
// uppercase letter. Only the whitespace is consumed by the match; both neighbours are look-arounds.
private static final Pattern SENTENCE_PATTERN = Pattern.compile("(?<=[.?!\\|;-])\\s+(?=\\p{Lu})");
// Splits a cleaned sentence into words on whitespace characters, trimming each piece.
private static final Splitter SPACE_SPLITTER = Splitter.on(CharMatcher.WHITESPACE).trimResults();
// Characters removed from a sentence before its words are compared against the stopword set:
// digits plus the punctuation and symbol characters listed below.
private static final CharMatcher SENTENCE_NOISE = CharMatcher.DIGIT.or(
CharMatcher.anyOf(",;.:$!?%&/<>™®\\-–'\"|"));
// Sub-sentence separator: a run of commas that is followed by a non-digit or preceded by a
// non-digit, or a semicolon. A comma with digits on both sides (e.g. "1,5") does not match.
private static final Pattern COMMA_PATTERN = Pattern.compile("(,+(?=\\D))|((?<=\\D),+)|;");
// One or more whitespace characters; used to collapse whitespace runs into a single space.
private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");
// register attributes to manipulate: term text, character offsets and position increment
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute positionIncrement = addAttribute(PositionIncrementAttribute.class);
// this is the internal state
// Holds the complete input; filled in reset().
private final StringBuilder inputBuffer = new StringBuilder();
// Matcher for SENTENCE_PATTERN; re-bound to inputBuffer in reset().
private final Matcher sentenceMatcher;
// Position in inputBuffer of the first character that has not been consumed yet.
private int index;
// True after a candidate was cut at a comma/semicolon; forces further comma splitting until a
// candidate without any separator is seen.
private boolean lastSentenceFromCommaSplit = false;
// configuration
// If false, every sentence is emitted regardless of its stopword ratio.
private final boolean removeBadSentences;
// Words counted as stopwords when a sentence is analyzed.
private final CharArraySet stopWords;
// Comma-to-word ratio above which a sentence is split at its commas.
private final float commaWordThreshold;
// Highest stopword ratio a sentence may have and still count as a quality sentence.
private final float maxStopwordRatio;
// Sentences with fewer words than this always count as quality sentences.
private final int minSentenceLength;
/**
 * Creates a sentence tokenizer whose attributes are created by the supplied {@link AttributeFactory}.
 *
 * @param factory
 *           the attribute factory handed to the {@link Tokenizer} super class.
 * @param removeBadSentences
 *           {@code true} to drop sentences whose stopword ratio is too high; {@code false} to emit every
 *           sentence.
 * @param stopWords
 *           the set of words that count as stopwords.
 * @param commaWordThreshold
 *           the "comma density" limit; a sentence exceeding it is cut into comma-separated parts which are
 *           then analyzed one by one.
 * @param maxStopwordRatio
 *           the highest stopword ratio a sentence may have without being filtered out.
 * @param minSentenceLength
 *           the minimum number of words a sentence needs in order to be analyzed at all; shorter sentences
 *           are always emitted.
 */
public AnalyzingSentenceTokenizer(AttributeFactory factory, boolean removeBadSentences, CharArraySet stopWords,
        float commaWordThreshold, float maxStopwordRatio, int minSentenceLength) {
    super(factory);

    // No input is read here: the matcher starts on an empty string and is bound to the
    // buffered input when reset() is called.
    this.sentenceMatcher = SENTENCE_PATTERN.matcher("");

    // quality filter settings
    this.stopWords = stopWords;
    this.maxStopwordRatio = maxStopwordRatio;
    this.minSentenceLength = minSentenceLength;
    this.removeBadSentences = removeBadSentences;

    // comma splitting setting
    this.commaWordThreshold = commaWordThreshold;
}
/**
 * {@inheritDoc}
 *
 * Reports the final offset of the stream; the internal tokenizer state is left untouched.
 */
@Override
public void end() throws IOException {
    super.end();

    // The final offset is the (corrected) length of the buffered input, used for both start
    // and end. The original author notes this approach is taken from PatternTokenizer.
    final int finalOffset = correctOffset(inputBuffer.length());
    offsetAtt.setOffset(finalOffset, finalOffset);
}
/**
 * {@inheritDoc}
 *
 * Called after the input has been set. Reads the whole input into the internal buffer and resets all
 * internal state (matcher, read index and comma-split flag) so that nothing carries over from a
 * previously tokenized input.
 *
 * @throws IOException
 *            if reading the input fails.
 */
@Override
public void reset() throws IOException {
    super.reset();

    // Read the full input into the buffer. This is not very memory efficient, but the sentence
    // pattern needs a view of the whole document.
    inputBuffer.setLength(0);
    inputBuffer.append(CharStreams.toString(input));

    // Point the sentence matcher at the new content.
    sentenceMatcher.reset(inputBuffer);

    // Start reading from the beginning.
    index = 0;

    // Forget whether the previous input ended in the middle of a comma-split sentence; otherwise
    // the first sentence of this input would be split at its first comma regardless of the
    // configured threshold.
    lastSentenceFromCommaSplit = false;
}
/**
 * {@inheritDoc}
 *
 * Advances through the buffered input, skipping sentences that are filtered out, until a sentence is
 * emitted or the input is exhausted.
 *
 * @return {@code true} to indicate to the caller to read the current attribute state, {@code false} to
 *         indicate the end of the token stream.
 */
@Override
public final boolean incrementToken() throws IOException {
    // The TokenStream contract requires producers to clear all attributes before setting new
    // values, so that attributes this tokenizer does not set itself never leak from the
    // previous token.
    clearAttributes();

    // Keep consuming candidate sentences while there is unread input. Each call to
    // incrementTokenInternal() advances the index, whether or not the candidate is emitted.
    while (index < inputBuffer.length()) {
        if (incrementTokenInternal()) {
            return true;
        }
    }
    return false;
}
/**
 * Consumes the next candidate sentence (or comma-separated part of a sentence) starting at the current
 * index, decides whether it should be emitted and advances the index past it.
 *
 * @return {@code true} if the current attribute state should be emitted, {@code false} if the candidate
 *         was skipped.
 * @throws IOException
 *            declared for subclasses; this implementation does not perform I/O.
 */
protected boolean incrementTokenInternal() throws IOException {
    // The candidate runs from the current index up to the end of the next sentence-boundary
    // whitespace, or to the end of the input if no further boundary exists.
    String sentence;
    if (sentenceMatcher.find(index)) {
        sentence = inputBuffer.substring(index, sentenceMatcher.end());
    } else {
        sentence = inputBuffer.substring(index, inputBuffer.length());
    }

    Matcher commaMatcher = COMMA_PATTERN.matcher(sentence);
    if (commaMatcher.find()) {
        // Count all comma/semicolon separators in the candidate (the first one is already found).
        int commaCount = 1;
        while (commaMatcher.find()) {
            commaCount++;
        }

        // Comma density relative to the number of whitespace gaps. A candidate with at most one
        // whitespace character has no usable gap count: treat it as maximally comma-dense instead
        // of dividing by zero or by a negative number (which produced a negative ratio and
        // prevented splitting of fragments such as "a,b,c").
        int wordGaps = CharMatcher.WHITESPACE.countIn(sentence) - 1;
        float commaToWordRatio = wordGaps > 0 ? commaCount / (float) wordGaps : Float.POSITIVE_INFINITY;

        // The ratio exceeds the threshold, or we are already in the middle of a comma-split
        // sentence: cut the candidate right after its first separator.
        if (commaToWordRatio > commaWordThreshold || lastSentenceFromCommaSplit) {
            commaMatcher.reset();
            commaMatcher.find();
            sentence = sentence.substring(0, commaMatcher.end());
            lastSentenceFromCommaSplit = true;
        }
    } else {
        // No separator left: comma-split mode ends with this candidate.
        lastSentenceFromCommaSplit = false;
    }

    // Is this the only sentence in the input? Such a sentence is always emitted.
    boolean isOnlySentence = sentence.length() == inputBuffer.length();

    // Should we emit the current sentence?
    boolean emit = isQualitySentence(sentence) || isOnlySentence || !removeBadSentences;
    if (emit) {
        // Must happen before the index moves, since the offsets are derived from it.
        emitSentence(sentence);
    }

    // Emitted or not, move the working index past the candidate.
    index = index + sentence.length();
    return emit;
}
/**
 * Decides whether a sentence is good enough to be emitted.
 *
 * @param sentence
 *           the sentence to judge.
 * @return {@code true} if the sentence has fewer words than the configured minimum sentence length, or if
 *         its stopword ratio does not exceed the configured maximum.
 */
private boolean isQualitySentence(CharSequence sentence) {
    final SentenceStatistics stats = analyzeSentence(sentence);

    // Short sentences are accepted without looking at their stopword ratio.
    if (stats.getWordCount() < minSentenceLength) {
        return true;
    }

    // Otherwise the sentence must carry enough information, i.e. few enough stopwords.
    return stats.getStopwordsRatio() <= maxStopwordRatio;
}
/**
 * Publishes the given sentence as the current token: sets the term text, the corrected start and end
 * offsets (derived from the current index) and a position increment of 1.
 *
 * @param sentence
 *           the sentence to emit.
 */
private void emitSentence(CharSequence sentence) {
    // term text
    termAtt.setEmpty().append(sentence);

    // offsets: the sentence starts at the current index and spans its own length
    final int startOffset = correctOffset(index);
    final int endOffset = correctOffset(index + sentence.length());
    offsetAtt.setOffset(startOffset, endOffset);

    // each emitted sentence occupies the next position
    positionIncrement.setPositionIncrement(1);
}
/**
 * Counts the words and stopwords of a sentence. Before counting, the sentence is trimmed, stripped of
 * noise characters (digits, punctuation, symbols), whitespace runs are collapsed to a single space and
 * the text is lower-cased using the German locale, so that words match the stopword set as often as
 * possible.
 *
 * @param sentence
 *           the sentence to analyze.
 * @return the word and stopword counts of the cleaned sentence.
 */
private SentenceStatistics analyzeSentence(CharSequence sentence) {
    // remove noise: trim, noise (|<>:;...), multiple whitespace, and convert to lower case
    String trimmed = CharMatcher.WHITESPACE.trimFrom(sentence);
    String withoutNoise = SENTENCE_NOISE.removeFrom(trimmed);
    String cleanSentence = WHITESPACE_PATTERN.matcher(withoutNoise).replaceAll(" ")
            .toLowerCase(Locale.GERMAN);

    // split sentence into words
    Iterable<String> words = SPACE_SPLITTER.split(cleanSentence);

    int stopWordCount = 0;
    int wordCount = 0;
    for (String w : words) {
        // Removing noise at the edges of the sentence (e.g. a trailing number or a leading dash)
        // leaves leading/trailing spaces behind, which the splitter turns into empty strings.
        // Those are not words and must not inflate the word count.
        if (w.isEmpty()) {
            continue;
        }
        if (stopWords.contains(w)) {
            stopWordCount++;
        }
        wordCount++;
    }

    return new SentenceStatistics(wordCount, stopWordCount);
}
/**
 * Immutable holder for the word and stopword counts of a single sentence.
 */
private static class SentenceStatistics {

    /** Total number of words in the sentence. */
    private final int wordCount;

    /** Number of those words that are stopwords. */
    private final int stopwordCount;

    public SentenceStatistics(int wordCount, int stopwordCount) {
        this.stopwordCount = stopwordCount;
        this.wordCount = wordCount;
    }

    /**
     * @return the total number of words.
     */
    public int getWordCount() {
        return wordCount;
    }

    /**
     * @return the fraction of words that are stopwords, or 0 if the word count is not positive.
     */
    public float getStopwordsRatio() {
        if (wordCount <= 0) {
            return 0f;
        }
        return stopwordCount / (float) wordCount;
    }
}
}