All Downloads are FREE. Search and download functionalities are using the official Maven repository.

solutions.siren.join.index.query.TermsEnumTermsQuery Maven / Gradle / Ivy

/**
 * Copyright (c) 2016, SIREn Solutions. All Rights Reserved.
 *
 * This file is part of the SIREn project.
 *
 * SIREn is a free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * SIREn is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public
 * License along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package solutions.siren.join.index.query;

import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.util.*;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;
import solutions.siren.join.action.terms.collector.*;

import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Set;

/**
 * Specialization for a disjunction over many terms, encoded in a byte array, which scans the dictionary
 * using a {@link TermsEnum} to collect documents ids.
 * It behaves like a {@link ConstantScoreQuery} over a {@link BooleanQuery} containing only
 * {@link org.apache.lucene.search.BooleanClause.Occur#SHOULD} clauses.
 */
public class TermsEnumTermsQuery extends Query implements Accountable {

  private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(TermsEnumTermsQuery.class);

  /**
   * Reference to the encoded list of terms for late decoding.
   */
  private byte[] encodedTerms;

  /**
   * The set of terms after decoding
   */
  private BytesRefTermsSet termsSet;

  /**
   * The field to enumerate
   */
  protected String field;

  /**
   * The cache key for this query
   */
  protected final long cacheKey;

  private static final ESLogger logger = Loggers.getLogger(TermsEnumTermsQuery.class);

  /**
   * Creates a new {@link TermsEnumTermsQuery} from the given field data.
   */
  public TermsEnumTermsQuery(final byte[] encodedTerms, final String field, final long cacheKey) {
    this.encodedTerms = encodedTerms;
    this.cacheKey = cacheKey;
    this.field = field;
  }

  @Override
  public long ramBytesUsed() {
    BytesRefTermsSet termsSet = this.getTermsSet();
    return BASE_RAM_BYTES_USED + termsSet.size() * 8;
  }

  @Override
  public String toString(String defaultField) {
    BytesRefTermsSet termsSet = this.getTermsSet();
    final StringBuilder sb = new StringBuilder("TermsEnumTermsQuery:");
    return sb
            .append(defaultField)
            .append(":")
            // Do not serialise the full array, but instead the number of elements - see issue #168
            .append("[size=" + termsSet.size() + "]")
            .toString();
  }

  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (!super.equals(obj)) {
      return false;
    }
    if (cacheKey != ((TermsEnumTermsQuery) obj).cacheKey) { // relies on the cache key instead of the encodedTerms for equality
      return false;
    }
    if (!field.equals(((TermsEnumTermsQuery) obj).field)) {
      return false;
    }
    return true;
  }

  @Override
  public int hashCode() {
    int hashcode = super.hashCode();
    hashcode = 31 * hashcode + ((int) cacheKey); // relies on the cache key instead of the encodedTerms for hashcode
    hashcode = 31 * hashcode + field.hashCode();
    return hashcode;
  }

  @Override
  public Collection getChildResources() {
    return Collections.emptyList();
  }

  /**
   * Returns the set of terms. This method will perform a late-decoding of the encoded terms, and will release the
   * byte array. This method needs to be synchronized as each segment thread will call it concurrently.
   */
  protected synchronized BytesRefTermsSet getTermsSet() {
    if (encodedTerms != null) { // late decoding of the encoded terms
      long start = System.nanoTime();
      termsSet = (BytesRefTermsSet) TermsSet.readFrom(new BytesRef(encodedTerms));
      logger.debug("{}: Deserialized {} terms - took {} ms", new Object[] { Thread.currentThread().getName(), termsSet.size(), (System.nanoTime() - start) / 1000000 });
      encodedTerms = null; // release reference to the byte array to be able to reclaim memory
    }
    return termsSet;
  }

  public DocIdSet getDocIdSet(LeafReaderContext context) throws IOException {
    final Terms terms = context.reader().terms(field);
    // make sure the field exists
    if (terms == null) return null;

    final BytesRefTermsSet termsSet = this.getTermsSet();
    // make sure there are terms to filter on
    if (termsSet == null || termsSet.isEmpty()) return null;

    SeekingTermSetTermsEnum termsEnum = new SeekingTermSetTermsEnum(terms.iterator(), termsSet);

    DocIdSetBuilder builder = new DocIdSetBuilder(context.reader().maxDoc());
    PostingsEnum docs = null;
    while (termsEnum.next() != null) {
      docs = termsEnum.postings(docs, PostingsEnum.NONE);
      builder.add(docs);
    }

    return builder.build();
  }

  @Override
  public Weight createWeight(final IndexSearcher searcher, final boolean needsScores) throws IOException {
    return new ConstantScoreWeight(new CacheKeyFieldDataTermsQuery(cacheKey)) {

      @Override
      public void extractTerms(Set terms) {
        // no-op
        // This query is for abuse cases when the number of terms is too high to
        // run efficiently as a BooleanQuery. So likewise we hide its terms in
        // order to protect highlighters
      }

      private Scorer scorer(DocIdSet set) throws IOException {
        if (set == null) {
          return null;
        }
        final DocIdSetIterator disi = set.iterator();
        if (disi == null) {
          return null;
        }
        return new ConstantScoreScorer(this, score(), disi);
      }

      @Override
      public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
        final Scorer scorer = scorer(TermsEnumTermsQuery.this.getDocIdSet(context));
        if (scorer == null) {
          return null;
        }
        return new DefaultBulkScorer(scorer);
      }

      @Override
      public Scorer scorer(LeafReaderContext context) throws IOException {
        return scorer(TermsEnumTermsQuery.this.getDocIdSet(context));
      }
    };
  }

  /**
   * 

* This query will be returned by the {@link ConstantScoreWeight} instead of the {@link TermsEnumTermsQuery} * and used by the * {@link LRUQueryCache.CachingWrapperWeight} to cache the query. * This is necessary in order to avoid caching the byte array and long hash set, which is not memory friendly * and not very efficient. *

*

* Extends MultiTermQuery in order to be detected as "costly" query by {@link UsageTrackingQueryCachingPolicy} * and trigger early caching. *

*/ private static class CacheKeyFieldDataTermsQuery extends MultiTermQuery { private final long cacheKey; public CacheKeyFieldDataTermsQuery(long cacheKey) { super(""); this.cacheKey = cacheKey; } @Override public String toString(String field) { final StringBuilder sb = new StringBuilder("CacheKeyFieldDataTermsQuery:"); return sb.append(field).append(":").append("[cacheKey=" + cacheKey + "]").toString(); } @Override public boolean equals(Object o) { if (!(o instanceof CacheKeyFieldDataTermsQuery)) return false; CacheKeyFieldDataTermsQuery other = (CacheKeyFieldDataTermsQuery) o; return super.equals(o) && this.cacheKey == other.cacheKey; } @Override protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException { return null; } @Override public int hashCode() { final int prime = 31; int result = 1; result = prime * result + ((int) cacheKey); return result; } } static class SeekingTermSetTermsEnum extends FilteredTermsEnum { private final BytesRefHash terms; private final int[] ords; private final int lastElement; private final BytesRef lastTerm; private final BytesRef spare = new BytesRef(); private BytesRef seekTerm; private int upto = 0; SeekingTermSetTermsEnum(TermsEnum tenum, BytesRefTermsSet termsSet) { super(tenum); this.terms = termsSet.getBytesRefHash(); this.ords = this.terms.sort(BytesRef.getUTF8SortedAsUnicodeComparator()); lastElement = terms.size() - 1; lastTerm = terms.get(ords[lastElement], new BytesRef()); seekTerm = terms.get(ords[upto], spare); } @Override protected BytesRef nextSeekTerm(BytesRef currentTerm) throws IOException { BytesRef temp = seekTerm; seekTerm = null; return temp; } @Override protected AcceptStatus accept(BytesRef term) throws IOException { if (term.compareTo(lastTerm) > 0) { return AcceptStatus.END; } BytesRef currentTerm = terms.get(ords[upto], spare); if (term.compareTo(currentTerm) == 0) { if (upto == lastElement) { return AcceptStatus.YES; } else { seekTerm = terms.get(ords[++upto], spare); return 
AcceptStatus.YES_AND_SEEK; } } else { if (upto == lastElement) { return AcceptStatus.NO; } else { // Our current term doesn't match the the given term. int cmp; do { // We maybe are behind the given term by more than one step. Keep incrementing till we're the same or higher. if (upto == lastElement) { return AcceptStatus.NO; } // typically the terms dict is a superset of query's terms so it's unusual that we have to skip many of // our terms so we don't do a binary search here seekTerm = terms.get(ords[++upto], spare); } while ((cmp = seekTerm.compareTo(term)) < 0); if (cmp == 0) { if (upto == lastElement) { return AcceptStatus.YES; } seekTerm = terms.get(ords[++upto], spare); return AcceptStatus.YES_AND_SEEK; } else { return AcceptStatus.NO_AND_SEEK; } } } } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy