querqy.lucene.contrib.rewrite.wordbreak.SuffixGroup Maven / Gradle / Ivy
Show all versions of querqy-lucene Show documentation
package querqy.lucene.contrib.rewrite.wordbreak;
import querqy.CompoundCharSequence;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
/**
* A SuffixGroup represents all word forms that can be generated once a suffix has been stripped off.
*
* For example, German has a suffix -en that is added to the modifier word in compounding:
*
* strauß + ei -> straußenei (strauß +en ei)
*
* There are further structures that use this suffix:
*
* stadion + verbot -> stadienverbot (stadion -on +en verbot)
* aphorismus + schatz -> aphorismenschatz (aphorismus -us +en schatz)
* ...
*
*
* All word forms using the +en would be combined into a single SuffixGroup and the different structures (Null, -on,
* -us etc.) will be kept in the {@link #generatorAndWeights} list.
*
* Suffixes can overlap: -ien/-nen are both contained in -en, -en is contained in -n, and finally, all suffixes are
* contained in the null (or, zero-length) suffix. This relationship is expressed in the {@link #next} property of
* a SuffixGroup. -n is a 'next' of the zero-length SuffixGroup, -en is a 'next' of -n, and -ien and -nen are both
* 'nexts' of -en. This organisation of suffixes helps speed up the lookup, as we can stop the lookup as soon as the
* first element in the 'next' list could be matched in the index.
*
* @author renekrie
*/
public class SuffixGroup {

    /** Suffix a word must end with for this group to apply; {@code null} is treated as the zero-length suffix. */
    private final CharSequence suffix;

    /** Length of {@link #suffix}, or 0 if the suffix is {@code null}. */
    private final int suffixLength;

    /**
     * Generators that are applied to the word once {@link #suffix} has been stripped off.
     *
     * NOTE(review): the type arguments were missing from this listing. WordGeneratorAndWeight is assumed to be the
     * class in this package that provides generateSuggestion(CharSequence) returning Optional&lt;Suggestion&gt; -
     * confirm against the package before merging.
     */
    private final List<WordGeneratorAndWeight> generatorAndWeights;

    /** Groups that are tried after this one; they are expected to have suffixes that end with {@link #suffix}. */
    private final List<SuffixGroup> next;

    /**
     * Creates a group for a single suffix.
     *
     * @param suffix the suffix that gets stripped off, or {@code null} for the zero-length suffix
     * @param generatorAndWeights generators to apply to the word after the suffix was stripped off
     * @param next groups for longer suffixes that end with {@code suffix}
     */
    public SuffixGroup(final CharSequence suffix, final List<WordGeneratorAndWeight> generatorAndWeights,
                       final SuffixGroup... next) {
        this.suffix = suffix;
        this.suffixLength = suffix == null ? 0 : suffix.length();
        this.generatorAndWeights = generatorAndWeights;
        // Arrays.asList is only a view of the array: clone it so that later changes to the caller's array
        // cannot alter this group.
        this.next = Arrays.asList(next.clone());
    }

    /**
     * Generates suggestions for the given word from this group and, recursively, from its {@link #next} groups.
     *
     * @param left the word from which the suffix is stripped off
     * @return the suggestions of this group followed by those of the next groups; empty if {@code left} is not
     *         longer than the suffix or does not end with it
     */
    public List<Suggestion> generateSuggestions(final CharSequence left) {
        return generateSuggestions(left, 0);
    }

    /**
     * Does the work for {@link #generateSuggestions(CharSequence)}.
     *
     * @param left the word from which the suffix is stripped off
     * @param matchingFromEndOfLeft number of trailing characters of {@code left} that the calling group has already
     *                              compared successfully and that are not compared again
     * @return the generated suggestions, possibly empty
     */
    private List<Suggestion> generateSuggestions(final CharSequence left, final int matchingFromEndOfLeft) {

        final int leftLength = left.length();

        // Stripping the suffix must leave at least one character
        if (leftLength <= suffixLength) {
            return Collections.emptyList();
        }

        // Compare from the end of the word, skipping what the calling group has already verified. The loop body
        // never runs for the zero-length suffix, so a null suffix is not dereferenced.
        for (int i = matchingFromEndOfLeft + 1; i <= suffixLength; i++) {
            if (left.charAt(leftLength - i) != suffix.charAt(suffixLength - i)) {
                return Collections.emptyList();
            }
        }

        final CharSequence reduced = suffixLength == 0 ? left : left.subSequence(0, leftLength - suffixLength);

        // Collect into an explicit ArrayList: Collectors.toList() does not guarantee a mutable result
        final List<Suggestion> suggestions = new ArrayList<>();

        for (final WordGeneratorAndWeight generatorAndWeight : generatorAndWeights) {
            generatorAndWeight.generateSuggestion(reduced).ifPresent(suggestions::add);
        }

        // Groups with longer suffixes only need to compare the characters in front of this group's suffix
        for (final SuffixGroup group : next) {
            suggestions.addAll(group.generateSuggestions(left, suffixLength));
        }

        return suggestions;
    }

    /**
     * Generates suggestions for {@code left} and joins the first sequence element of each of them with
     * {@code right} into a single {@link CompoundCharSequence}, keeping the score of the suggestion.
     *
     * @param left the word from which the suffix is stripped off
     * @param right the word that is appended to each generated form
     * @return one single-element suggestion per suggestion generated for {@code left}
     */
    public List<Suggestion> generateCompoundSuggestions(final CharSequence left, final CharSequence right) {
        return generateSuggestions(left)
                .stream()
                .map(suggestion -> new Suggestion(
                        // first argument is null - presumably meaning 'no separator'; confirm in CompoundCharSequence
                        new CharSequence[]{new CompoundCharSequence(null, suggestion.sequence[0], right)},
                        suggestion.score))
                .collect(Collectors.toList());
    }
}