All Downloads are FREE. Search and download functionalities are using the official Maven repository.

querqy.lucene.contrib.rewrite.wordbreak.SuffixGroup Maven / Gradle / Ivy

The newest version!
package querqy.lucene.contrib.rewrite.wordbreak;

import querqy.CompoundCharSequence;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;

/**
 * <p>A SuffixGroup represents all word forms that can be generated once a suffix has been stripped off.</p>
 *
 * <p>For example, German has a suffix -en that is added to the modifier word in compounding:</p>
 * <pre>
 *     strauß + ei -&gt; straußenei (strauß +en ei)
 * </pre>
 * <p>There are further structures that use this suffix:</p>
 * <pre>
 *     stadion + verbot -&gt; stadienverbot (stadion -on +en verbot)
 *     aphorismus + schatz -&gt; aphorismenschatz (aphorismus -us +en schatz)
 *     ...
 * </pre>
 *
 * <p>All word forms using the +en would be combined into a single SuffixGroup and the different structures (Null,
 * -on, -us etc.) will be kept in the {@link #generatorAndWeights} list.</p>
 *
 * <p>Suffixes can overlap: -ien/-nen are both contained in -en, -en is contained in -n, and finally, all suffixes
 * are contained in the null (or, zero-length) suffix. This relationship is expressed in the {@link #next} property
 * of a SuffixGroup. -n is a 'next' of the zero-length SuffixGroup, -en is a 'next' of -n, and -ien and -nen are
 * both 'nexts' of -en. This organisation of suffixes helps speed up the lookup: a group whose suffix does not match
 * the end of the word returns immediately, so none of the longer suffixes below it are tested at all.</p>
 *
 * @author renekrie
 */
public class SuffixGroup {

    /** The suffix that is stripped off by this group; {@code null} is treated as the zero-length suffix. */
    private final CharSequence suffix;

    /** Cached length of {@link #suffix} (0 for a {@code null} suffix). */
    private final int suffixLength;

    // NOTE(review): the generic type arguments were missing in the reviewed copy of this file. The element type
    // WordGeneratorAndWeight is assumed (a type of this package exposing
    // generateSuggestion(CharSequence) -> Optional<Suggestion>) - confirm against the package.
    /** Generators that are applied to the word once the suffix has been removed. */
    private final List<WordGeneratorAndWeight> generatorAndWeights;

    /** Groups for longer suffixes that end with this group's suffix. */
    private final List<SuffixGroup> next;

    /**
     * Creates a group for the given suffix.
     *
     * @param suffix the suffix to strip off; {@code null} or an empty sequence means 'no suffix'
     * @param generatorAndWeights the generators to apply to the word after the suffix was stripped off
     * @param next groups for longer suffixes; each of them is expected to end with {@code suffix}, as only the
     *             additional leading characters are compared when descending into them
     */
    public SuffixGroup(final CharSequence suffix, final List<WordGeneratorAndWeight> generatorAndWeights,
                       final SuffixGroup... next) {
        this.suffix = suffix;
        this.generatorAndWeights = generatorAndWeights;
        this.next = Arrays.asList(next);
        this.suffixLength = suffix == null ? 0 : suffix.length();
    }

    /**
     * Generates suggestions for the given word from this group and, recursively, from all {@link #next} groups.
     *
     * @param left the word to strip the suffix from
     * @return the suggestions of this group followed by those of the next groups; empty if the word is not longer
     *         than the suffix or does not end with it
     */
    public List<Suggestion> generateSuggestions(final CharSequence left) {
        return generateSuggestions(left, 0);
    }

    /**
     * Generates the suggestions, skipping the comparison of the trailing characters that were already matched.
     *
     * @param left the word to strip the suffix from
     * @param matchingFromEndOfLeft number of characters at the end of {@code left} that the calling group has
     *                              already compared successfully and that need not be compared again
     * @return the suggestions of this group followed by those of the next groups, or an empty list
     */
    private List<Suggestion> generateSuggestions(final CharSequence left, final int matchingFromEndOfLeft) {

        final int leftLength = left.length();
        // The word must be strictly longer than the suffix so that something remains after stripping.
        if (leftLength <= suffixLength) {
            return Collections.emptyList();
        }

        // Compare from the end of the word towards its start; the last matchingFromEndOfLeft characters were
        // already checked by the parent group. The loop does not run at all for the zero-length suffix.
        for (int i = 1 + matchingFromEndOfLeft; i <= suffixLength; i++) {
            if (left.charAt(leftLength - i) != suffix.charAt(suffixLength - i)) {
                return Collections.emptyList();
            }
        }

        final CharSequence reduced = suffixLength == 0 ? left : left.subSequence(0, leftLength - suffixLength);

        // Collect into an explicit ArrayList: Collectors.toList() gives no guarantee that its result is mutable,
        // so appending to it is not covered by its contract.
        final List<Suggestion> res = new ArrayList<>();
        for (final WordGeneratorAndWeight generatorAndWeight : generatorAndWeights) {
            generatorAndWeight.generateSuggestion(reduced).ifPresent(res::add);
        }

        // Longer suffixes only need to compare the characters in front of the suffix matched here.
        for (final SuffixGroup group : next) {
            res.addAll(group.generateSuggestions(left, suffixLength));
        }

        return res;
    }

    /**
     * Generates suggestions for {@code left} and turns each of them into a single-element suggestion, made of the
     * first sequence of the original suggestion combined with {@code right}. The score is kept.
     *
     * @param left the word to strip the suffix from
     * @param right the word to combine with each suggestion generated for {@code left}
     * @return one compound suggestion per suggestion generated for {@code left}
     */
    public List<Suggestion> generateCompoundSuggestions(final CharSequence left, final CharSequence right) {
        return generateSuggestions(left, 0).stream()
                .map(suggestion -> new Suggestion(
                        // NOTE(review): the null first argument is presumably the separator - confirm against
                        // CompoundCharSequence.
                        new CharSequence[] {new CompoundCharSequence(null, suggestion.sequence[0], right)},
                        suggestion.score))
                .collect(Collectors.toList());
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy