querqy.lucene.rewrite.TermSubQueryBuilder Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of querqy-lucene Show documentation
Show all versions of querqy-lucene Show documentation
Querqy library for query rewriting for Lucene
The newest version!
/**
*
*/
package querqy.lucene.rewrite;
import java.io.IOException;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import org.apache.commons.io.input.CharSequenceReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.util.BytesRef;
import querqy.lucene.rewrite.cache.CacheKey;
import querqy.lucene.rewrite.cache.TermQueryCache;
import querqy.lucene.rewrite.cache.TermQueryCacheValue;
import querqy.lucene.rewrite.prms.PRMSAndQuery;
import querqy.lucene.rewrite.prms.PRMSDisjunctionMaxQuery;
import querqy.lucene.rewrite.prms.PRMSQuery;
import querqy.lucene.rewrite.prms.PRMSTermQuery;
import querqy.model.Term;
import querqy.rewrite.commonrules.model.PositionSequence;
/**
* @author rene
*
*/
public class TermSubQueryBuilder {
private final TermQueryCache termQueryCache;
private final Analyzer analyzer;
public TermSubQueryBuilder(final Analyzer analyzer, final TermQueryCache termQueryCache) {
this.termQueryCache = termQueryCache;
this.analyzer = analyzer;
}
public TermSubQueryFactory termToFactory(final String fieldname, final Term sourceTerm, final FieldBoost boost)
throws IOException {
final CacheKey cacheKey;
if (termQueryCache != null) {
cacheKey = new CacheKey(fieldname, sourceTerm);
TermQueryCacheValue cacheValue = termQueryCache.get(cacheKey);
if (cacheValue != null) {
// The cache references factories with pre-analyzed terms, or cache entries without a
// query factory if the term does not exist in the index. cacheValue.hasQuery() returns
// true/false correspondingly.
// Cache entries don't have a boost factor, it is only added later via the queryFactory.
return (cacheValue.hasQuery())
? new TermSubQueryFactory(cacheValue, boost, sourceTerm, fieldname)
: null;
}
} else {
cacheKey = null;
}
final LuceneQueryFactoryAndPRMSQuery root;
TokenStream ts = null;
try {
ts = analyzer.tokenStream(fieldname, new CharSequenceReader(sourceTerm));
final CharTermAttribute termAttr = ts.addAttribute(CharTermAttribute.class);
final PositionIncrementAttribute posIncAttr = ts.addAttribute(PositionIncrementAttribute.class);
ts.reset();
final PositionSequence sequence = new PositionSequence<>();
while (ts.incrementToken()) {
final int inc = posIncAttr.getPositionIncrement();
if (inc > 0 || sequence.isEmpty()) {
sequence.nextPosition();
}
sequence.addElement(new org.apache.lucene.index.Term(fieldname, new BytesRef(termAttr)));
}
root = positionSequenceToQueryFactoryAndPRMS(sequence, sourceTerm);
} finally {
if (ts != null) {
try {
ts.close();
} catch (IOException e) {
}
}
}
putQueryFactoryAndPRMSQueryIntoCache(cacheKey, root);
return root == null ? null : new TermSubQueryFactory(root, boost, sourceTerm, fieldname);
}
protected void putQueryFactoryAndPRMSQueryIntoCache(final CacheKey cacheKey, final LuceneQueryFactoryAndPRMSQuery value) {
if (cacheKey != null && value != null && termQueryCache != null) {
termQueryCache.put(cacheKey, new TermQueryCacheValue(value));
}
}
public LuceneQueryFactoryAndPRMSQuery positionSequenceToQueryFactoryAndPRMS(
final PositionSequence sequence, final Term sourceTerm) {
switch (sequence.size()) {
case 0: return null;
case 1:
final List first = sequence.getFirst();
return first.isEmpty() ? null: newPosition(sequence.iterator(), null, null, sourceTerm);
default:
return newPosition(sequence.iterator(), null, null, sourceTerm);
}
}
protected LuceneQueryFactoryAndPRMSQuery newPosition(final Iterator> seqIterator,
final BooleanQueryFactory incomingBq, final List incomingPrmsClauses, final Term sourceTerm) {
final List position = seqIterator.next();
switch (position.size()) {
case 0: throw new IllegalArgumentException("Sequence must not contain an empty position");
case 1: {
final org.apache.lucene.index.Term term = position.get(0);
final TermQueryFactory tqf = createTermQueryFactory(term, sourceTerm);
final PRMSTermQuery prmsTermQuery = new PRMSTermQuery(term);
if (incomingBq != null) {
incomingBq.add(tqf, Occur.MUST);
incomingPrmsClauses.add(prmsTermQuery);
if (seqIterator.hasNext()) {
newPosition(seqIterator, incomingBq, incomingPrmsClauses, sourceTerm);
}
return null; // we are not the first position
} else {
if (seqIterator.hasNext()) {
final BooleanQueryFactory bq = new BooleanQueryFactory(true);
final List prmsClauses = new LinkedList<>();
bq.add(tqf, Occur.MUST);
prmsClauses.add(prmsTermQuery);
newPosition(seqIterator, bq, prmsClauses, sourceTerm);
return new LuceneQueryFactoryAndPRMSQuery(bq, new PRMSAndQuery(prmsClauses));
} else {
return new LuceneQueryFactoryAndPRMSQuery(tqf, prmsTermQuery);
}
}
}
default:
final boolean hasNextPosition = seqIterator.hasNext();
// the dmq for this position
final DisjunctionMaxQueryFactory dmq = new DisjunctionMaxQueryFactory(0f);
final List prmsClauses = new LinkedList<>();
if (!hasNextPosition) {
for (final org.apache.lucene.index.Term term: position) {
dmq.add(createTermQueryFactory(term, sourceTerm));
prmsClauses.add(new PRMSTermQuery(term));
}
} else {
final Iterator posIterator = position.iterator();
while (posIterator.hasNext()) {
final org.apache.lucene.index.Term term = posIterator.next();
final TermQueryFactory tqf = createTermQueryFactory(term, sourceTerm);
final PRMSTermQuery prmsTermQuery = new PRMSTermQuery(term);
if (posIterator.hasNext()) {
dmq.add(tqf);
prmsClauses.add(prmsTermQuery);
} else {
final BooleanQueryFactory bq = new BooleanQueryFactory(true);
final List bqPrmsClauses = new LinkedList<>();
bq.add(tqf, Occur.MUST);
bqPrmsClauses.add(prmsTermQuery);
newPosition(seqIterator, bq, bqPrmsClauses, sourceTerm);
dmq.add(bq);
prmsClauses.add(new PRMSAndQuery(bqPrmsClauses));
}
}
}
return new LuceneQueryFactoryAndPRMSQuery(dmq, new PRMSDisjunctionMaxQuery(prmsClauses));
}
}
protected TermQueryFactory createTermQueryFactory(final org.apache.lucene.index.Term term, final Term sourceTerm) {
return new TermQueryFactory(term, sourceTerm);
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy