
/**
 * Copyright (c) 2016, SIREn Solutions. All Rights Reserved.
 *
 * This file is part of the SIREn project.
 *
 * SIREn is a free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * SIREn is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public
 * License along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package solutions.siren.join.index.query;

import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.util.*;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;

import solutions.siren.join.action.terms.collector.*;

import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Set;
/**
 * Specialization for a disjunction over many terms, encoded in a byte array, which scans the terms dictionary
 * using a {@link TermsEnum} to collect document ids.
 * It behaves like a {@link ConstantScoreQuery} over a {@link BooleanQuery} containing only
 * {@link org.apache.lucene.search.BooleanClause.Occur#SHOULD} clauses.
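 * <p>
 * Usage sketch (illustrative only; it assumes the byte array was produced by the serialization counterpart of
 * {@link TermsSet#readFrom}, and that the caller computes a stable cache key for the terms set):
 * <pre>{@code
 * byte[] encodedTerms = ...; // serialized terms set
 * long cacheKey = ...;       // stable identifier for this terms set
 * Query query = new TermsEnumTermsQuery(encodedTerms, "foreign_key", cacheKey);
 * TopDocs hits = searcher.search(query, 10);
 * }</pre>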
 */
public class TermsEnumTermsQuery extends Query implements Accountable {
  private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(TermsEnumTermsQuery.class);

  /**
   * Reference to the encoded list of terms, kept for late decoding.
   */
  private byte[] encodedTerms;

  /**
   * The set of terms after decoding.
   */
  private BytesRefTermsSet termsSet;

  /**
   * The field to enumerate.
   */
  protected String field;

  /**
   * The cache key for this query.
   */
  protected final long cacheKey;

  private static final ESLogger logger = Loggers.getLogger(TermsEnumTermsQuery.class);

  /**
   * Creates a new {@link TermsEnumTermsQuery} from the given encoded terms, field and cache key.
   */
  public TermsEnumTermsQuery(final byte[] encodedTerms, final String field, final long cacheKey) {
    this.encodedTerms = encodedTerms;
    this.cacheKey = cacheKey;
    this.field = field;
  }
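
  /**
   * A rough estimate: the shallow size of the query plus eight bytes per collected term. The exact footprint of
   * the underlying {@link BytesRefTermsSet} (hash table, byte blocks) is not accounted for.
   */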
  @Override
  public long ramBytesUsed() {
    BytesRefTermsSet termsSet = this.getTermsSet();
    return BASE_RAM_BYTES_USED + termsSet.size() * 8;
  }

  @Override
  public String toString(String defaultField) {
    BytesRefTermsSet termsSet = this.getTermsSet();
    final StringBuilder sb = new StringBuilder("TermsEnumTermsQuery:");
    return sb.append(defaultField)
             .append(":")
             // Do not serialise the full array, but instead the number of elements - see issue #168
             .append("[size=" + termsSet.size() + "]")
             .toString();
  }
  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (!super.equals(obj)) {
      return false;
    }
    TermsEnumTermsQuery other = (TermsEnumTermsQuery) obj;
    if (cacheKey != other.cacheKey) { // relies on the cache key instead of the encodedTerms for equality
      return false;
    }
    if (!field.equals(other.field)) {
      return false;
    }
    return true;
  }

  @Override
  public int hashCode() {
    int hashcode = super.hashCode();
    hashcode = 31 * hashcode + ((int) cacheKey); // relies on the cache key instead of the encodedTerms for hashcode
    hashcode = 31 * hashcode + field.hashCode();
    return hashcode;
  }
  @Override
  public Collection<Accountable> getChildResources() {
    return Collections.emptyList();
  }
  /**
   * Returns the set of terms. This method performs a late decoding of the encoded terms and then releases the
   * byte array. It must be synchronized, as each segment thread will call it concurrently.
   */
  protected synchronized BytesRefTermsSet getTermsSet() {
    if (encodedTerms != null) { // late decoding of the encoded terms
      long start = System.nanoTime();
      termsSet = (BytesRefTermsSet) TermsSet.readFrom(new BytesRef(encodedTerms));
      logger.debug("{}: Deserialized {} terms - took {} ms", new Object[] { Thread.currentThread().getName(),
        termsSet.size(), (System.nanoTime() - start) / 1000000 });
      encodedTerms = null; // release the reference to the byte array so that its memory can be reclaimed
    }
    return termsSet;
  }
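
  /**
   * Builds the {@link DocIdSet} of matching documents for the given segment: the matching terms are located with a
   * {@link SeekingTermSetTermsEnum} and their postings are collected into a {@link DocIdSetBuilder}. Returns
   * {@code null} if the field does not exist in this segment or if there are no terms to filter on.
   */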
  public DocIdSet getDocIdSet(LeafReaderContext context) throws IOException {
    final Terms terms = context.reader().terms(field);
    // make sure the field exists
    if (terms == null) return null;

    final BytesRefTermsSet termsSet = this.getTermsSet();
    // make sure there are terms to filter on
    if (termsSet == null || termsSet.isEmpty()) return null;

    SeekingTermSetTermsEnum termsEnum = new SeekingTermSetTermsEnum(terms.iterator(), termsSet);
    DocIdSetBuilder builder = new DocIdSetBuilder(context.reader().maxDoc());
    PostingsEnum docs = null;
    while (termsEnum.next() != null) {
      docs = termsEnum.postings(docs, PostingsEnum.NONE);
      builder.add(docs);
    }
    return builder.build();
  }
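
  /**
   * Returns a {@link ConstantScoreWeight} whose parent query is a lightweight {@link CacheKeyFieldDataTermsQuery},
   * so that the query cache keys on the cache key rather than on this query and its encoded terms.
   */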
  @Override
  public Weight createWeight(final IndexSearcher searcher, final boolean needsScores) throws IOException {
    return new ConstantScoreWeight(new CacheKeyFieldDataTermsQuery(cacheKey)) {

      @Override
      public void extractTerms(Set<Term> terms) {
        // No-op. This query is for abuse cases when the number of terms is too high to
        // run efficiently as a BooleanQuery. So, likewise, we hide its terms in order
        // to protect highlighters.
      }

      private Scorer scorer(DocIdSet set) throws IOException {
        if (set == null) {
          return null;
        }
        final DocIdSetIterator disi = set.iterator();
        if (disi == null) {
          return null;
        }
        return new ConstantScoreScorer(this, score(), disi);
      }

      @Override
      public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
        final Scorer scorer = scorer(TermsEnumTermsQuery.this.getDocIdSet(context));
        if (scorer == null) {
          return null;
        }
        return new DefaultBulkScorer(scorer);
      }

      @Override
      public Scorer scorer(LeafReaderContext context) throws IOException {
        return scorer(TermsEnumTermsQuery.this.getDocIdSet(context));
      }

    };
  }
  /**
   * This query will be returned by the {@link ConstantScoreWeight} instead of the {@link TermsEnumTermsQuery},
   * and used by the {@link LRUQueryCache.CachingWrapperWeight} to cache the query. This is necessary in order
   * to avoid caching the byte array and the decoded terms set, which is neither memory-friendly nor very
   * efficient.
   * <p>
   * It extends {@link MultiTermQuery} in order to be detected as a "costly" query by the
   * {@link UsageTrackingQueryCachingPolicy} and trigger early caching.
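   * <p>
   * Illustrative behaviour: two instances sharing a cache key are equal, regardless of the size of the encoded
   * terms they stand in for.
   * <pre>{@code
   * Query a = new CacheKeyFieldDataTermsQuery(42L);
   * Query b = new CacheKeyFieldDataTermsQuery(42L);
   * assert a.equals(b) && a.hashCode() == b.hashCode();
   * }</pre>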
   */
  private static class CacheKeyFieldDataTermsQuery extends MultiTermQuery {

    private final long cacheKey;

    public CacheKeyFieldDataTermsQuery(long cacheKey) {
      super("");
      this.cacheKey = cacheKey;
    }

    @Override
    public String toString(String field) {
      final StringBuilder sb = new StringBuilder("CacheKeyFieldDataTermsQuery:");
      return sb.append(field).append(":").append("[cacheKey=" + cacheKey + "]").toString();
    }

    @Override
    public boolean equals(Object o) {
      if (!(o instanceof CacheKeyFieldDataTermsQuery)) return false;
      CacheKeyFieldDataTermsQuery other = (CacheKeyFieldDataTermsQuery) o;
      return super.equals(o) && this.cacheKey == other.cacheKey;
    }

    @Override
    public int hashCode() {
      final int prime = 31;
      int result = 1;
      result = prime * result + ((int) cacheKey);
      return result;
    }

    @Override
    protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
      return null;
    }

  }
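
  /**
   * A {@link FilteredTermsEnum} that walks the segment's terms dictionary and the sorted set of query terms in
   * lockstep: it seeks the wrapped enum to the next candidate query term, and advances through the query terms
   * until it finds one that is equal to or greater than the enumerated term.
   */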
  static class SeekingTermSetTermsEnum extends FilteredTermsEnum {

    private final BytesRefHash terms;
    private final int[] ords;
    private final int lastElement;
    private final BytesRef lastTerm;
    private final BytesRef spare = new BytesRef();

    private BytesRef seekTerm;
    private int upto = 0;

    SeekingTermSetTermsEnum(TermsEnum tenum, BytesRefTermsSet termsSet) {
      super(tenum);
      this.terms = termsSet.getBytesRefHash();
      this.ords = this.terms.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
      lastElement = terms.size() - 1;
      lastTerm = terms.get(ords[lastElement], new BytesRef());
      seekTerm = terms.get(ords[upto], spare);
    }

    @Override
    protected BytesRef nextSeekTerm(BytesRef currentTerm) throws IOException {
      BytesRef temp = seekTerm;
      seekTerm = null;
      return temp;
    }

    @Override
    protected AcceptStatus accept(BytesRef term) throws IOException {
      if (term.compareTo(lastTerm) > 0) {
        return AcceptStatus.END;
      }

      BytesRef currentTerm = terms.get(ords[upto], spare);
      if (term.compareTo(currentTerm) == 0) {
        if (upto == lastElement) {
          return AcceptStatus.YES;
        } else {
          seekTerm = terms.get(ords[++upto], spare);
          return AcceptStatus.YES_AND_SEEK;
        }
      } else {
        if (upto == lastElement) {
          return AcceptStatus.NO;
        } else { // Our current term doesn't match the given term.
          int cmp;
          do { // We may be behind the given term by more than one step. Keep incrementing until we are equal or higher.
            if (upto == lastElement) {
              return AcceptStatus.NO;
            }
            // The terms dictionary is typically a superset of the query's terms, so it is unusual that we have
            // to skip many of our terms. Hence we do not do a binary search here.
            seekTerm = terms.get(ords[++upto], spare);
          } while ((cmp = seekTerm.compareTo(term)) < 0);
          if (cmp == 0) {
            if (upto == lastElement) {
              return AcceptStatus.YES;
            }
            seekTerm = terms.get(ords[++upto], spare);
            return AcceptStatus.YES_AND_SEEK;
          } else {
            return AcceptStatus.NO_AND_SEEK;
          }
        }
      }
    }

  }

}