querqy.lucene.contrib.rewrite.wordbreak.MorphologicalCompounder Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of querqy-lucene Show documentation
Show all versions of querqy-lucene Show documentation
Querqy library for query rewriting for Lucene
The newest version!
package querqy.lucene.contrib.rewrite.wordbreak;
import org.apache.lucene.index.IndexReader;
import querqy.model.Term;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.*;
import java.util.stream.Collectors;
import static querqy.lucene.LuceneQueryUtil.toLuceneTerm;
/**
 * {@link LuceneCompounder} that obtains compound candidates for a pair of terms from a
 * {@link Morphology} and keeps only those candidates whose document frequency in a dictionary
 * field of the index reaches a configured minimum.
 */
public class MorphologicalCompounder implements LuceneCompounder {

    /** Candidate limit applied by the constructor that takes no explicit limit. */
    private static final int DEFAULT_MAX_COMPOUND_EXPANSIONS = 10;

    /** Name of the index field in which compound candidates are looked up. */
    private final String dictionaryField;

    /** Whether the input terms are lower-cased before compounds are suggested. */
    private final boolean lowerCaseInput;

    /** Minimum document frequency a candidate needs in {@link #dictionaryField} to be returned. */
    private final int minSuggestionFrequency;

    /** Source of compound candidates for a left/right term pair. */
    private final Morphology morphology;

    /** Maximum number of candidates that are checked against the index per call. */
    private final int maxCompoundExpansions;

    /**
     * Creates a compounder with an explicit candidate limit.
     *
     * @param morphology the source of compound candidates
     * @param dictionaryField the index field in which candidates are looked up
     * @param lowerCaseInput whether input terms are lower-cased before compounds are suggested
     * @param minSuggestionFrequency the minimum document frequency a candidate must have
     * @param maxCompoundExpansions the maximum number of candidates checked against the index;
     *                              values below 1 make {@link #combine} return no suggestions
     */
    public MorphologicalCompounder(final Morphology morphology,
                                   final String dictionaryField,
                                   final boolean lowerCaseInput,
                                   final int minSuggestionFrequency,
                                   final int maxCompoundExpansions) {
        this.dictionaryField = dictionaryField;
        this.lowerCaseInput = lowerCaseInput;
        this.minSuggestionFrequency = minSuggestionFrequency;
        this.morphology = morphology;
        this.maxCompoundExpansions = maxCompoundExpansions;
    }

    /**
     * Creates a compounder that uses the default candidate limit
     * ({@code DEFAULT_MAX_COMPOUND_EXPANSIONS}).
     *
     * @param morphology the source of compound candidates
     * @param dictionaryField the index field in which candidates are looked up
     * @param lowerCaseInput whether input terms are lower-cased before compounds are suggested
     * @param minSuggestionFrequency the minimum document frequency a candidate must have
     */
    public MorphologicalCompounder(final Morphology morphology,
                                   final String dictionaryField,
                                   final boolean lowerCaseInput,
                                   final int minSuggestionFrequency) {
        this(morphology, dictionaryField, lowerCaseInput, minSuggestionFrequency, DEFAULT_MAX_COMPOUND_EXPANSIONS);
    }

    /**
     * Suggests compounds for the first two of the given terms.
     *
     * <p>Candidates from the morphology are sorted in reverse natural order, cut to
     * {@code maxCompoundExpansions} and only then checked against the index, so the result may
     * contain fewer entries than the limit even if later candidates would have qualified.
     *
     * @param terms the input terms; only the elements at index 0 and 1 are combined, but the whole
     *              array is passed on to every returned {@code CompoundTerm}
     * @param indexReader the reader used to look up document frequencies
     * @param reverse if {@code true}, the term at index 1 is used as the left part and the term at
     *                index 0 as the right part
     * @return the candidates that reach the minimum document frequency; empty if fewer than two
     *         terms are given or if {@code maxCompoundExpansions} is below 1
     * @throws UncheckedIOException if reading a document frequency from the index fails
     */
    @Override
    public List<CompoundTerm> combine(final Term[] terms, final IndexReader indexReader, final boolean reverse) {
        // A non-positive limit can never yield a suggestion. Returning early also avoids the
        // IllegalArgumentException that Stream.limit raises for negative sizes.
        if (terms.length < 2 || maxCompoundExpansions < 1) {
            return Collections.emptyList();
        }

        final int leftIdx = reverse ? 1 : 0;
        final int rightIdx = reverse ? 0 : 1;
        final Term left = lowerCaseInput ? terms[leftIdx].toLowerCaseTerm() : terms[leftIdx];
        final Term right = lowerCaseInput ? terms[rightIdx].toLowerCaseTerm() : terms[rightIdx];

        // Sorting the array stream directly is sufficient; no intermediate queue is required
        // because the full candidate list is ordered before the limit is applied.
        return Arrays.stream(morphology.suggestCompounds(left, right))
                .sorted(Comparator.reverseOrder())
                .limit(maxCompoundExpansions)
                .map(compound -> new CompoundTerm(compound.compound, terms))
                .filter(candidate -> hasMinimumFrequency(candidate, indexReader))
                .collect(Collectors.toList());
    }

    /**
     * Tells whether a compound candidate occurs often enough in the dictionary field.
     *
     * @param candidate the compound candidate to look up
     * @param indexReader the reader used to look up the document frequency
     * @return {@code true} if the document frequency of the candidate's value in the dictionary
     *         field is at least {@code minSuggestionFrequency}
     * @throws UncheckedIOException if reading the document frequency fails
     */
    private boolean hasMinimumFrequency(final CompoundTerm candidate, final IndexReader indexReader) {
        final org.apache.lucene.index.Term dictionaryTerm = toLuceneTerm(dictionaryField, candidate.value, false);
        try {
            return indexReader.docFreq(dictionaryTerm) >= minSuggestionFrequency;
        } catch (final IOException e) {
            throw new UncheckedIOException(e);
        }
    }
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy