solutions.siren.join.index.query.FieldDataTermsQuery Maven / Gradle / Ivy

Go to download
/**
 * Copyright (c) 2016, SIREn Solutions. All Rights Reserved.
 *
 * This file is part of the SIREn project.
 *
 * SIREn is a free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * SIREn is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public
 * License along with this program. If not, see .
 */
package solutions.siren.join.index.query;

import java.io.IOException;
import java.util.*;

import com.carrotsearch.hppc.LongHashSet;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.fielddata.IndexNumericFieldData;
import org.elasticsearch.index.fielddata.SortedBinaryDocValues;
import solutions.siren.join.action.terms.collector.LongBloomFilter;
import solutions.siren.join.action.terms.collector.NumericTermsSet;
import solutions.siren.join.action.terms.collector.TermsSet;

/**
 * Specialization for a disjunction over many terms, encoded in a byte array, which scans the
 * {@link IndexFieldData} to collect documents ids.
 * It behaves like a {@link ConstantScoreQuery} over a {@link BooleanQuery} containing only
 * {@link org.apache.lucene.search.BooleanClause.Occur#SHOULD} clauses.
 */
public abstract class FieldDataTermsQuery extends Query implements Accountable {

  private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(FieldDataTermsQuery.class);

  /**
   * Reference to the encoded list of terms for late decoding.
   */
  private byte[] encodedTerms;

  /**
   * The set of terms after decoding
   */
  private NumericTermsSet termsSet;

  /**
   * The field data for the field
   */
  protected final IndexFieldData fieldData;

  /**
   * The cache key for this query
   */
  protected final long cacheKey;

  private static final ESLogger logger = Loggers.getLogger(FieldDataTermsQuery.class);

  /**
   * Get a {@link FieldDataTermsQuery} that filters on non-floating point numeric terms found in a hppc
   * {@link LongHashSet}.
   *
   * @param encodedTerms  An encoded set of terms.
   * @param fieldData     The fielddata for the field.
   * @param cacheKey      A unique key to use for caching this query.
   * @return the query.
   */
  public static FieldDataTermsQuery newLongs(final byte[] encodedTerms, final IndexNumericFieldData fieldData, final long cacheKey) {
    return new LongsFieldDataTermsQuery(encodedTerms, fieldData, cacheKey);
  }

  /**
   * Get a {@link FieldDataTermsQuery} that filters on non-numeric terms found in a hppc {@link LongHashSet} of
   * {@link BytesRef}.
   *
   * @param encodedTerms  An encoded set of terms.
   * @param fieldData     The fielddata for the field.
   * @param cacheKey      A unique key to use for caching this query.
   * @return the query.
   */
  public static FieldDataTermsQuery newBytes(final byte[] encodedTerms, final IndexFieldData fieldData, final long cacheKey) {
    return new BytesFieldDataTermsQuery(encodedTerms, fieldData, cacheKey);
  }

  /**
   * Creates a new {@link FieldDataTermsQuery} from the given field data.
   */
  public FieldDataTermsQuery(final byte[] encodedTerms, final IndexFieldData fieldData, final long cacheKey) {
    this.encodedTerms = encodedTerms;
    this.fieldData = fieldData;
    this.cacheKey = cacheKey;
  }

  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (!super.equals(obj)) {
      return false;
    }
    if (cacheKey != ((FieldDataTermsQuery) obj).cacheKey) { // relies on the cache key instead of the encodedTerms for equality
      return false;
    }
    return true;
  }

  @Override
  public int hashCode() {
    int hashcode = super.hashCode();
    hashcode = 31 * hashcode + ((int) cacheKey); // relies on the cache key instead of the encodedTerms for hashcode
    return hashcode;
  }

  @Override
  public Collection getChildResources() {
    return Collections.emptyList();
  }

  /**
   * Returns the set of terms. This method will perform a late-decoding of the encoded terms, and will release the
   * byte array. This method needs to be synchronized as each segment thread will call it concurrently.
   */
  protected synchronized NumericTermsSet getTermsSet() {
    if (encodedTerms != null) { // late decoding of the encoded terms
      long start = System.nanoTime();
      termsSet = (NumericTermsSet) TermsSet.readFrom(new BytesRef(encodedTerms));
      logger.debug("{}: Deserialized {} terms - took {} ms", new Object[] { Thread.currentThread().getName(), termsSet.size(), (System.nanoTime() - start) / 1000000 });
      encodedTerms = null; // release reference to the byte array to be able to reclaim memory
    }
    return termsSet;
  }

  public abstract DocIdSet getDocIdSet(LeafReaderContext context) throws IOException;

  @Override
  public Weight createWeight(final IndexSearcher searcher, final boolean needsScores) throws IOException {
    return new ConstantScoreWeight(new CacheKeyFieldDataTermsQuery(cacheKey)) {

      @Override
      public void extractTerms(Set terms) {
        // no-op
        // This query is for abuse cases when the number of terms is too high to
        // run efficiently as a BooleanQuery. So likewise we hide its terms in
        // order to protect highlighters
      }

      private Scorer scorer(DocIdSet set) throws IOException {
        if (set == null) {
          return null;
        }
        final DocIdSetIterator disi = set.iterator();
        if (disi == null) {
          return null;
        }
        return new ConstantScoreScorer(this, score(), disi);
      }

      @Override
      public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
        final Scorer scorer = scorer(FieldDataTermsQuery.this.getDocIdSet(context));
        if (scorer == null) {
          return null;
        }
        return new DefaultBulkScorer(scorer);
      }

      @Override
      public Scorer scorer(LeafReaderContext context) throws IOException {
        return scorer(FieldDataTermsQuery.this.getDocIdSet(context));
      }
    };
  }

  /**
   * Filters on non-floating point numeric fields.
   */
  protected static class LongsFieldDataTermsQuery extends FieldDataTermsQuery {

    /**
     * Creates a new {@link FieldDataTermsQuery} from the given field data.
     *
     * @param fieldData
     */
    public LongsFieldDataTermsQuery(final byte[] encodedTerms, final IndexFieldData fieldData, final long cacheKey) {
      super(encodedTerms, fieldData, cacheKey);
    }

    @Override
    public long ramBytesUsed() {
      NumericTermsSet termsSet = this.getTermsSet();
      return BASE_RAM_BYTES_USED + termsSet.size() * 8;
    }

    @Override
    public String toString(String defaultField) {
      NumericTermsSet termsSet = this.getTermsSet();
      final StringBuilder sb = new StringBuilder("LongsFieldDataTermsQuery:");
      return sb
              .append(defaultField)
              .append(":")
              // Do not serialise the full array, but instead the number of elements - see issue #168
              .append("[size=" + termsSet.size() + "]")
              .toString();
    }

    @Override
    public DocIdSet getDocIdSet(LeafReaderContext context) throws IOException {
      final NumericTermsSet termsSet = this.getTermsSet();

      // make sure there are terms to filter on
      if (termsSet == null || termsSet.isEmpty()) return null;

      IndexNumericFieldData numericFieldData = (IndexNumericFieldData) fieldData;
      if (!numericFieldData.getNumericType().isFloatingPoint()) {
        final SortedNumericDocValues values = numericFieldData.load(context).getLongValues(); // load fielddata
        return new DocValuesDocIdSet(context.reader().maxDoc(), context.reader().getLiveDocs()) {
          @Override
          protected boolean matchDoc(int doc) {
            values.setDocument(doc);
            final int numVals = values.count();
            for (int i = 0; i < numVals; i++) {
              if (termsSet.contains(values.valueAt(i))) {
                return true;
              }
            }

            return false;
          }
        };
      }

      // only get here if wrong fielddata type in which case
      // no docs will match so we just return null.
      return null;
    }

  }

  /**
   * Filters on non-numeric fields. Uses Sip hash to hash byte values before comparison.
   */
  protected static class BytesFieldDataTermsQuery extends FieldDataTermsQuery {

    private final ESLogger logger = Loggers.getLogger(getClass());

    /**
     * Creates a new {@link BytesFieldDataTermsQuery} from the given field data.
     *
     * @param fieldData
     */
    public BytesFieldDataTermsQuery(final byte[] encodedTerms, final IndexFieldData fieldData, final long cacheKey) {
      super(encodedTerms, fieldData, cacheKey);
    }

    @Override
    public long ramBytesUsed() {
      NumericTermsSet termsSet = this.getTermsSet();
      return BASE_RAM_BYTES_USED + termsSet.size() * 8;
    }

    @Override
    public String toString(String defaultField) {
      NumericTermsSet termsSet = this.getTermsSet();
      final StringBuilder sb = new StringBuilder("BytesFieldDataTermsQuery:");
      return sb
              .append(defaultField)
              .append(":")
              // Do not serialise the full array, but instead the number of elements - see issue #168
              .append("[size=" + termsSet.size() * 8 + "]")
              .toString();
    }

    @Override
    public DocIdSet getDocIdSet(LeafReaderContext context) throws IOException {
      final NumericTermsSet termsSet = this.getTermsSet();

      // make sure there are terms to filter on
      if (termsSet == null || termsSet.isEmpty()) return null;

      final SortedBinaryDocValues values = fieldData.load(context).getBytesValues(); // load fielddata
      return new DocValuesDocIdSet(context.reader().maxDoc(), context.reader().getLiveDocs()) {
        @Override
        protected boolean matchDoc(int doc) {
          values.setDocument(doc);
          final int numVals = values.count();
          for (int i = 0; i < numVals; i++) {
            final BytesRef term = values.valueAt(i);
            long termHash = LongBloomFilter.hash3_x64_128(term.bytes, term.offset, term.length, 0);
            if (termsSet.contains(termHash)) {
              return true;
            }
          }

          return false;
        }
      };
    }

  }

  /**
   * 
   *   This query will be returned by the {@link ConstantScoreWeight} instead of the {@link FieldDataTermsQuery}
   *   and used by the
   *   {@link org.apache.lucene.search.LRUQueryCache.CachingWrapperWeight} to cache the query.
   *   This is necessary in order to avoid caching the byte array and long hash set, which is not memory friendly
   *   and not very efficient.
   * 
   * 
   *   Extends MultiTermQuery in order to be detected as "costly" query by {@link UsageTrackingQueryCachingPolicy}
   *   and trigger early caching.
   * 
   */
  private static class CacheKeyFieldDataTermsQuery extends MultiTermQuery {

    private final long cacheKey;

    public CacheKeyFieldDataTermsQuery(long cacheKey) {
      super("");
      this.cacheKey = cacheKey;
    }

    @Override
    public String toString(String field) {
      final StringBuilder sb = new StringBuilder("CacheKeyFieldDataTermsQuery:");
      return sb.append(field).append(":").append("[cacheKey=" + cacheKey + "]").toString();
    }

    @Override
    public boolean equals(Object o) {
      if (!(o instanceof CacheKeyFieldDataTermsQuery)) return false;
      CacheKeyFieldDataTermsQuery other = (CacheKeyFieldDataTermsQuery) o;
      return super.equals(o) && this.cacheKey == other.cacheKey;
    }

    @Override
    protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
      return null;
    }

    @Override
    public int hashCode() {
      final int prime = 31;
      int result = 1;
      result = prime * result + ((int) cacheKey);
      return result;
    }

  }

}