querqy.lucene.contrib.rewrite.wordbreak.Collector (querqy-lucene)
Querqy library for query rewriting for Lucene
package querqy.lucene.contrib.rewrite.wordbreak;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.util.BytesRef;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Optional;
import java.util.PriorityQueue;
import java.util.Queue;
/**
* A Collector receives the de-compounding candidates, checks whether they exist in the index, and, optionally,
* verifies that they co-occur in a document. It collects the candidates that match these requirements, ranks them and
* keeps up to 'maxDecompoundExpansions' of them. The number of index lookups is restricted by the 'maxEvaluations'
* property.
*
* Candidates are scored as follows:
*
* The score depends on two main variables: a 'prior' score that reflects the general popularity of the morphological
* structure in compound creation (see the PRIOR... constants in {@link GermanDecompoundingMorphology}), and a score
* that depends on the document frequencies (df) of the two terms that form the compound in the index. The df-dependent
* score is calculated as:
*
*
* score_df = -log(count(term1) / N) - log(count(term2) / N)
*
*
* where a smaller value will be better.
*
* To avoid issues with missing terms, we use add-1 smoothing:
*
*
* score_df = -log((count(term1) + 1) / (N + 1)) - log((count(term2) + 1) / (N + 1))
*
*
* which can be reformulated into:
*
*
* score_df = 2*log(N + 1) - (log(count(term1) + 1) + log(count(term2) + 1))
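*
* For example, in an index with N = 999,999 documents, count(term1) = 99 and count(term2) = 9 (using the
* natural logarithm, as the implementation below does):
*
* score_df = 2*log(10^6) - (log(100) + log(10)) = 27.63 - 6.91 = 20.72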
*
*
* We combine it with the score from the prior (score_prior) in a weighted manner:
*
*
* score = score_prior / score_df^w
*
* where w is the weight of the df observation (the constructor parameter weightDfObservation). For w = 0 the
* df-based score is ignored and the candidate scores score_prior.
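*
* For example, with score_prior = 1.0, w = 0.2, and score_df = 20.72 from above:
*
* score = 1.0 / 20.72^0.2 = 1.0 / 1.83 = 0.55 (approximately)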
*
* The approach to the calculation of score_df follows:
*
* - Schiller, A.: German compound analysis with wfsc. In: Proceedings of Finite State Methods and Natural
* Language Processing 2005, Helsinki (2005)
* - Marek, T.: Analysis of German compounds using weighted finite state transducers. Technical report, BA thesis,
* Universität Tübingen (2006)
* - Both of the above cited in: Alfonseca, E. & Pharies, S.: German Decompounding in a Difficult Corpus.
* CICLing 2008
*
*
* @author renekrie
*/
public class Collector {
/**
* A call to {@link #collect(CharSequence, CharSequence, Term, int, float)} returns a CollectionState, containing
* the information about whether the maximum number of evaluations has been reached and whether the terms could be
* found in the index (fulfilling all requirements regarding verification and minimum index frequency).
*/
enum CollectionState {
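    // matched == null means the candidate was not evaluated at all because the budget of
    // maxEvaluations had already been used up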
MAX_EVALUATIONS_REACHED(null, true),
MATCHED_MAX_EVALUATIONS_REACHED(true, true),
MATCHED_MAX_EVALUATIONS_NOT_REACHED(true, false),
NOT_MATCHED_MAX_EVALUATIONS_REACHED(false, true),
NOT_MATCHED_MAX_EVALUATIONS_NOT_REACHED(false, false);
private final Boolean matched;
private final boolean maxEvaluationsReached;
CollectionState(final Boolean matched, final boolean maxEvaluationsReached) {
this.matched = matched;
this.maxEvaluationsReached = maxEvaluationsReached;
}
boolean isMaxEvaluationsReached() {
return maxEvaluationsReached;
}
Optional<Boolean> getMatched() {
return Optional.ofNullable(matched);
}
}
private final Queue<Suggestion> collection;
private final int minSuggestionFrequency;
private final boolean verifyCollation;
private final IndexReader indexReader;
private final String dictionaryField;
private final float weightDfObservation;
private final float totalDocsNorm;
private final int maxDecompoundExpansions;
private final IndexSearcher searcher;
private final int maxEvaluations;
private int evaluations = 0;
/**
*
* @param minSuggestionFrequency Minimum frequency of each split term in the index
* @param maxDecompoundExpansions Maximum number of decompound structures to return
* @param maxEvaluations Maximum number of lookups in the index
* @param verifyCollation Iff true, the compound parts must co-occur in a document in the index
* @param indexReader The index reader
* @param dictionaryField The document field to use for the lookup
* @param weightDfObservation The weight of the observed document frequencies when combining with the score of the morphological compound pattern.
*/
public Collector(final int minSuggestionFrequency, final int maxDecompoundExpansions, final int maxEvaluations,
final boolean verifyCollation, final IndexReader indexReader, final String dictionaryField,
final float weightDfObservation) {
final int queueInitialCapacity = Math.min(maxDecompoundExpansions, 10);
collection = new PriorityQueue<>(queueInitialCapacity);
this.minSuggestionFrequency = minSuggestionFrequency;
this.maxDecompoundExpansions = maxDecompoundExpansions;
this.verifyCollation = verifyCollation;
this.indexReader = indexReader;
searcher = new IndexSearcher(indexReader);
this.dictionaryField = dictionaryField;
this.weightDfObservation = weightDfObservation;
this.maxEvaluations = maxEvaluations;
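    // pre-compute 2 * log(N + 1), the constant part of the add-1-smoothed score_df (see class Javadoc)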
this.totalDocsNorm = 2f * (float) Math.log(1 + indexReader.numDocs());
}
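// A minimal usage sketch (hypothetical names: 'reader' is an open IndexReader with a "dictionary"
// field, and the candidate split "spielwagen" -> "spiel" + "wagen" comes from the morphology):
//
//   final Collector collector = new Collector(1, 5, 100, true, reader, "dictionary", 0.8f);
//   final Term rightTerm = new Term("dictionary", "wagen");
//   final Collector.CollectionState state =
//           collector.collect("spiel", "wagen", rightTerm, reader.docFreq(rightTerm), 1.0f);
//   final List<CharSequence[]> splits = collector.flushResults();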
/**
*
* @param left The modifier character sequence
* @param right The head character sequence
* @param rightTerm The head character sequence as a term in the dictionary field
* @param rightDf The document frequency of the rightTerm
* @param weightMorphologicalPattern The weight of this specific morphological pattern.
* @return The state of candidate collection
*/
public CollectionState collect(final CharSequence left, final CharSequence right, final Term rightTerm,
final int rightDf, final float weightMorphologicalPattern) {
if (maxEvaluations <= evaluations) {
return CollectionState.MAX_EVALUATIONS_REACHED;
}
evaluations++;
final Term leftTerm = new Term(dictionaryField, new BytesRef(left));
final int leftDf;
try {
leftDf = indexReader.docFreq(leftTerm);
if (leftDf >= minSuggestionFrequency) {
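        // combine the prior of the morphological pattern with the df-based score: score_prior / score_df^w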
final float score = weightDfObservation == 0f ? weightMorphologicalPattern
: weightMorphologicalPattern /
((float) Math.pow(totalDocsNorm - Math.log(leftDf + 1) - Math.log(rightDf + 1),
weightDfObservation));
if (verifyCollation) {
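            // only pay for the collation lookup if the candidate could enter the top-k at all: the head
            // of the min-heap holds the lowest-scored suggestion collected so far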
if (((collection.size() < maxDecompoundExpansions) || (score > collection.element().score))
&& hasMinMatches(1, leftTerm, rightTerm)) {
collection.offer(new Suggestion(new CharSequence[]{left, right},
score));
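                // bound the min-heap to maxDecompoundExpansions entries by evicting the lowest-scored suggestion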
if (collection.size() > maxDecompoundExpansions) {
collection.poll();
}
return evaluations == maxEvaluations
? CollectionState.MATCHED_MAX_EVALUATIONS_REACHED
: CollectionState.MATCHED_MAX_EVALUATIONS_NOT_REACHED;
}
} else {
collection.offer(new Suggestion(new CharSequence[]{left, right},
score));
if (collection.size() > maxDecompoundExpansions) {
collection.poll();
}
return evaluations == maxEvaluations
? CollectionState.MATCHED_MAX_EVALUATIONS_REACHED
: CollectionState.MATCHED_MAX_EVALUATIONS_NOT_REACHED;
}
}
} catch (final IOException e) {
throw new UncheckedIOException(e);
}
return evaluations == maxEvaluations
? CollectionState.NOT_MATCHED_MAX_EVALUATIONS_REACHED
: CollectionState.NOT_MATCHED_MAX_EVALUATIONS_NOT_REACHED;
}
public boolean maxEvaluationsReached() {
return evaluations >= maxEvaluations;
}
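// Illustrative caller loop (hypothetical 'candidates' iterable and 'Candidate' holder): stop
// producing candidates as soon as the evaluation budget is exhausted.
//
//   for (final Candidate c : candidates) {
//       if (collector.collect(c.left, c.right, c.rightTerm, c.rightDf, c.weight).isMaxEvaluationsReached()) {
//           break;
//       }
//   }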
/**
* Get the collected results ordered by decreasing score. This resets the internal result queue.
*
* @return The collected results.
*/
public List<CharSequence[]> flushResults() {
if (collection.isEmpty()) {
return Collections.emptyList();
}
final LinkedList<CharSequence[]> result = new LinkedList<>();
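    // the PriorityQueue is a min-heap, so removing yields ascending scores; addFirst reverses this
    // into the required descending order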
while (collection.size() > 0) {
result.addFirst(collection.remove().sequence);
}
return result;
}
private boolean hasMinMatches(final int minCount, final Term term1, final Term term2)
throws IOException {
final IndexReaderContext topReaderContext = searcher.getTopReaderContext();
final IndexReader indexReader = topReaderContext.reader();
// TODO: deleted documents?
final int numDocs = indexReader.numDocs();
if (minCount > numDocs) {
return false;
}
final int df1 = indexReader.docFreq(term1);
if (minCount > df1) {
return false;
}
final int df2 = indexReader.docFreq(term2);
if (minCount > df2) {
return false;
}
int count = 0;
for (final LeafReaderContext context : topReaderContext.leaves()) {
final Terms terms1 = context.reader().terms(term1.field());
if (terms1 != null) {
final Terms terms2 = context.reader().terms(term2.field());
if (terms2 != null) {
final TermsEnum termsEnum1 = terms1.iterator();
if (!termsEnum1.seekExact(term1.bytes())) {
continue;
}
final TermsEnum termsEnum2 = terms2.iterator();
if (!termsEnum2.seekExact(term2.bytes())) {
continue;
}
final PostingsEnum postings1 = termsEnum1.postings(null, PostingsEnum.NONE);
final PostingsEnum postings2 = termsEnum2.postings(null, PostingsEnum.NONE);
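                // leapfrog the two postings lists: repeatedly advance one list to the other's current
                // doc id and count the doc ids on which both land (= documents containing both terms)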
int doc1 = postings1.nextDoc();
while (doc1 != DocIdSetIterator.NO_MORE_DOCS) {
int doc2 = postings2.advance(doc1);
if (doc2 == DocIdSetIterator.NO_MORE_DOCS) {
break;
}
if (doc2 == doc1) {
    count++;
    if (count >= minCount) {
        return true;
    }
    // advance the first iterator, otherwise the next advance(doc1) call would target the
    // document that postings2 is already positioned on
    doc1 = postings1.nextDoc();
} else if (doc2 > doc1) {
    doc1 = postings1.advance(doc2);
    if (doc2 == doc1) {
        count++;
        if (count >= minCount) {
            return true;
        }
        doc1 = postings1.nextDoc();
    }
}
}
}
}
}
return false;
}
}