solutions.siren.join.index.query.TermsEnumTermsQuery Maven / Gradle / Ivy

Go to download
/**
 * Copyright (c) 2016, SIREn Solutions. All Rights Reserved.
 *
 * This file is part of the SIREn project.
 *
 * SIREn is a free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * SIREn is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public
 * License along with this program. If not, see .
 */
package solutions.siren.join.index.query;

import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.util.*;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;
import solutions.siren.join.action.terms.collector.*;

import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Set;

/**
 * Specialization for a disjunction over many terms, encoded in a byte array, which scans the dictionary
 * using a {@link TermsEnum} to collect documents ids.
 * It behaves like a {@link ConstantScoreQuery} over a {@link BooleanQuery} containing only
 * {@link org.apache.lucene.search.BooleanClause.Occur#SHOULD} clauses.
 */
public class TermsEnumTermsQuery extends Query implements Accountable {

  private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(TermsEnumTermsQuery.class);

  /**
   * Reference to the encoded list of terms for late decoding.
   */
  private byte[] encodedTerms;

  /**
   * The set of terms after decoding
   */
  private BytesRefTermsSet termsSet;

  /**
   * The field to enumerate
   */
  protected String field;

  /**
   * The cache key for this query
   */
  protected final long cacheKey;

  private static final ESLogger logger = Loggers.getLogger(TermsEnumTermsQuery.class);

  /**
   * Creates a new {@link TermsEnumTermsQuery} from the given field data.
   */
  public TermsEnumTermsQuery(final byte[] encodedTerms, final String field, final long cacheKey) {
    this.encodedTerms = encodedTerms;
    this.cacheKey = cacheKey;
    this.field = field;
  }

  @Override
  public long ramBytesUsed() {
    BytesRefTermsSet termsSet = this.getTermsSet();
    return BASE_RAM_BYTES_USED + termsSet.size() * 8;
  }

  @Override
  public String toString(String defaultField) {
    BytesRefTermsSet termsSet = this.getTermsSet();
    final StringBuilder sb = new StringBuilder("TermsEnumTermsQuery:");
    return sb
            .append(defaultField)
            .append(":")
            // Do not serialise the full array, but instead the number of elements - see issue #168
            .append("[size=" + termsSet.size() + "]")
            .toString();
  }

  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (!super.equals(obj)) {
      return false;
    }
    if (cacheKey != ((TermsEnumTermsQuery) obj).cacheKey) { // relies on the cache key instead of the encodedTerms for equality
      return false;
    }
    if (!field.equals(((TermsEnumTermsQuery) obj).field)) {
      return false;
    }
    return true;
  }

  @Override
  public int hashCode() {
    int hashcode = super.hashCode();
    hashcode = 31 * hashcode + ((int) cacheKey); // relies on the cache key instead of the encodedTerms for hashcode
    hashcode = 31 * hashcode + field.hashCode();
    return hashcode;
  }

  @Override
  public Collection getChildResources() {
    return Collections.emptyList();
  }

  /**
   * Returns the set of terms. This method will perform a late-decoding of the encoded terms, and will release the
   * byte array. This method needs to be synchronized as each segment thread will call it concurrently.
   */
  protected synchronized BytesRefTermsSet getTermsSet() {
    if (encodedTerms != null) { // late decoding of the encoded terms
      long start = System.nanoTime();
      termsSet = (BytesRefTermsSet) TermsSet.readFrom(new BytesRef(encodedTerms));
      logger.debug("{}: Deserialized {} terms - took {} ms", new Object[] { Thread.currentThread().getName(), termsSet.size(), (System.nanoTime() - start) / 1000000 });
      encodedTerms = null; // release reference to the byte array to be able to reclaim memory
    }
    return termsSet;
  }

  public DocIdSet getDocIdSet(LeafReaderContext context) throws IOException {
    final Terms terms = context.reader().terms(field);
    // make sure the field exists
    if (terms == null) return null;

    final BytesRefTermsSet termsSet = this.getTermsSet();
    // make sure there are terms to filter on
    if (termsSet == null || termsSet.isEmpty()) return null;

    SeekingTermSetTermsEnum termsEnum = new SeekingTermSetTermsEnum(terms.iterator(), termsSet);

    DocIdSetBuilder builder = new DocIdSetBuilder(context.reader().maxDoc());
    PostingsEnum docs = null;
    while (termsEnum.next() != null) {
      docs = termsEnum.postings(docs, PostingsEnum.NONE);
      builder.add(docs);
    }

    return builder.build();
  }

  @Override
  public Weight createWeight(final IndexSearcher searcher, final boolean needsScores) throws IOException {
    return new ConstantScoreWeight(new CacheKeyFieldDataTermsQuery(cacheKey)) {

      @Override
      public void extractTerms(Set terms) {
        // no-op
        // This query is for abuse cases when the number of terms is too high to
        // run efficiently as a BooleanQuery. So likewise we hide its terms in
        // order to protect highlighters
      }

      private Scorer scorer(DocIdSet set) throws IOException {
        if (set == null) {
          return null;
        }
        final DocIdSetIterator disi = set.iterator();
        if (disi == null) {
          return null;
        }
        return new ConstantScoreScorer(this, score(), disi);
      }

      @Override
      public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
        final Scorer scorer = scorer(TermsEnumTermsQuery.this.getDocIdSet(context));
        if (scorer == null) {
          return null;
        }
        return new DefaultBulkScorer(scorer);
      }

      @Override
      public Scorer scorer(LeafReaderContext context) throws IOException {
        return scorer(TermsEnumTermsQuery.this.getDocIdSet(context));
      }
    };
  }

  /**
   * 
   *   This query will be returned by the {@link ConstantScoreWeight} instead of the {@link TermsEnumTermsQuery}
   *   and used by the
   *   {@link LRUQueryCache.CachingWrapperWeight} to cache the query.
   *   This is necessary in order to avoid caching the byte array and long hash set, which is not memory friendly
   *   and not very efficient.
   * 
   * 
   *   Extends MultiTermQuery in order to be detected as "costly" query by {@link UsageTrackingQueryCachingPolicy}
   *   and trigger early caching.
   * 
   */
  private static class CacheKeyFieldDataTermsQuery extends MultiTermQuery {

    private final long cacheKey;

    public CacheKeyFieldDataTermsQuery(long cacheKey) {
      super("");
      this.cacheKey = cacheKey;
    }

    @Override
    public String toString(String field) {
      final StringBuilder sb = new StringBuilder("CacheKeyFieldDataTermsQuery:");
      return sb.append(field).append(":").append("[cacheKey=" + cacheKey + "]").toString();
    }

    @Override
    public boolean equals(Object o) {
      if (!(o instanceof CacheKeyFieldDataTermsQuery)) return false;
      CacheKeyFieldDataTermsQuery other = (CacheKeyFieldDataTermsQuery) o;
      return super.equals(o) && this.cacheKey == other.cacheKey;
    }

    @Override
    protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
      return null;
    }

    @Override
    public int hashCode() {
      final int prime = 31;
      int result = 1;
      result = prime * result + ((int) cacheKey);
      return result;
    }

  }

  static class SeekingTermSetTermsEnum extends FilteredTermsEnum {

    private final BytesRefHash terms;
    private final int[] ords;
    private final int lastElement;

    private final BytesRef lastTerm;
    private final BytesRef spare = new BytesRef();

    private BytesRef seekTerm;
    private int upto = 0;

    SeekingTermSetTermsEnum(TermsEnum tenum, BytesRefTermsSet termsSet) {
      super(tenum);
      this.terms = termsSet.getBytesRefHash();
      this.ords = this.terms.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
      lastElement = terms.size() - 1;
      lastTerm = terms.get(ords[lastElement], new BytesRef());
      seekTerm = terms.get(ords[upto], spare);
    }

    @Override
    protected BytesRef nextSeekTerm(BytesRef currentTerm) throws IOException {
      BytesRef temp = seekTerm;
      seekTerm = null;
      return temp;
    }

    @Override
    protected AcceptStatus accept(BytesRef term) throws IOException {
      if (term.compareTo(lastTerm) > 0) {
        return AcceptStatus.END;
      }

      BytesRef currentTerm = terms.get(ords[upto], spare);
      if (term.compareTo(currentTerm) == 0) {
        if (upto == lastElement) {
          return AcceptStatus.YES;
        } else {
          seekTerm = terms.get(ords[++upto], spare);
          return AcceptStatus.YES_AND_SEEK;
        }
      } else {
        if (upto == lastElement) {
          return AcceptStatus.NO;
        } else { // Our current term doesn't match the the given term.
          int cmp;
          do { // We maybe are behind the given term by more than one step. Keep incrementing till we're the same or higher.
            if (upto == lastElement) {
              return AcceptStatus.NO;
            }
            // typically the terms dict is a superset of query's terms so it's unusual that we have to skip many of
            // our terms so we don't do a binary search here
            seekTerm = terms.get(ords[++upto], spare);
          } while ((cmp = seekTerm.compareTo(term)) < 0);
          if (cmp == 0) {
            if (upto == lastElement) {
              return AcceptStatus.YES;
            }
            seekTerm = terms.get(ords[++upto], spare);
            return AcceptStatus.YES_AND_SEEK;
          } else {
            return AcceptStatus.NO_AND_SEEK;
          }
        }
      }
    }

  }

}