/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.search.facet;

import java.io.Closeable;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.function.IntFunction;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiPostingsEnum;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.StringHelper;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.schema.TrieField;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.search.facet.SlotAcc.SlotContext;

/**
 * Enumerates indexed terms in order, in a streaming fashion. Streaming is possible because,
 * when the sort is index order, no data needs to be accumulated before buckets are emitted.
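 *
 * <p>Illustrative only (names here are hypothetical, not from this file): a JSON Facet
 * request of roughly this shape is what selects this processor, i.e. a terms facet sorted
 * in index order using the streaming method:
 *
 * <pre>
 *   { "cats": { "type": "terms", "field": "cat", "sort": "index asc", "method": "stream" } }
 * </pre>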
 */
class FacetFieldProcessorByEnumTermsStream extends FacetFieldProcessor implements Closeable {
  long effectiveLimit;
  long bucketsToSkip;
  long bucketsReturned;

  boolean closed;
  boolean countOnly;
  boolean hasSubFacets; // true if there are subfacets
  int minDfFilterCache;
  DocSet docs;
  Bits fastForRandomSet;
  TermsEnum termsEnum = null;
  SolrIndexSearcher.DocsEnumState deState = null;
  PostingsEnum postingsEnum;
  BytesRef startTermBytes;
  BytesRef term;
  AtomicBoolean shardHasMoreBuckets = new AtomicBoolean(false); // set after streaming has finished

  // at any point in processing where we need a SlotContext, all we care about is the current 'term'
  private IntFunction<SlotContext> slotContext =
      (slotNum) -> {
        assert null != term;
        return new SlotAcc.SlotContext(new TermQuery(new Term(sf.getName(), term)));
      };

  LeafReaderContext[] leaves;

  FacetFieldProcessorByEnumTermsStream(FacetContext fcontext, FacetField freq, SchemaField sf) {
    super(fcontext, freq, sf);
  }

  @Override
  public void close() throws IOException {
    if (!closed) {
      closed = true;
      // fcontext.base.decref();  // OFF-HEAP
    }
  }

  @Override
  public void process() throws IOException {
    super.process();

    // We need to keep the fcontext open after processing is done (since we will be streaming in the
    // response writer). But if the connection is broken, we want to clean up.
    // fcontext.base.incref();  // OFF-HEAP
    fcontext.qcontext.addCloseHook(this);

    setup();
    response = new SimpleOrderedMap<>();
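    // "buckets" is backed by a lazy iterator: each next() call below advances the terms
    // enumeration just far enough to produce one bucket, so the response writer that
    // serializes this map is what actually drives the streaming.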
    response.add(
        "buckets",
        new Iterator<>() {
          boolean retrieveNext = true;
          Object val;

          @Override
          public boolean hasNext() {
            if (retrieveNext) {
              val = nextBucket();
            }
            retrieveNext = false;
            return val != null;
          }

          @Override
          public Object next() {
            if (retrieveNext) {
              val = nextBucket();
            }
            retrieveNext = true;
            if (val == null) {
              // Last value, so clean up.  In the case that we are doing streaming facets within
              // streaming facets, the number of close hooks could grow very large, so we want to
              // remove ourselves.
              boolean removed =
                  fcontext.qcontext.removeCloseHook(FacetFieldProcessorByEnumTermsStream.this);
              assert removed;
              try {
                close();
              } catch (IOException e) {
                throw new SolrException(
                    SolrException.ErrorCode.BAD_REQUEST, "Error during facet streaming close", e);
              }
            }
            return val;
          }

          @Override
          public void remove() {
            throw new UnsupportedOperationException();
          }
        });
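    // shardHasMoreBuckets is only set while the bucket iterator above is drained, so the
    // "more" entry below must be serialized after "buckets"; the mutable AtomicBoolean
    // lets the written value reflect whatever streaming discovered.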
    if (fcontext.isShard()) {
      response.add("more", shardHasMoreBuckets); // lazily evaluated
    }
  }

  private void setup() throws IOException {

    countOnly =
        freq.facetStats.size() == 0
            || freq.facetStats.values().iterator().next() instanceof CountAgg;
    hasSubFacets = freq.subFacets.size() > 0;
    bucketsToSkip = freq.offset;

    effectiveLimit = freq.limit;
    if (freq.limit >= 0) {
      // TODO: consider only applying overrequest for `freq.limit > 0` (as opposed to `>=`)?
      //  corresponding change should be made in overrequest logic for `FacetFieldProcessor`.
      if (freq.overrequest < -1) {
        // other negative values are not supported
        throw new IllegalArgumentException("Illegal `overrequest` specified: " + freq.overrequest);
      } else if (freq.overrequest > 0 && (fcontext.isShard() || null != resort)) {
        // NOTE: "index sort" _never_ applies a default overrequest. In both the shard case and the
        // resort case, the default overrequest is `0`. However, if `overrequest` is explicitly
        // specified, we respect it except for non-distrib, no-resort request. Overrequest is
        // relevant for the `resort` case; but it can also be relevant in some edge cases of the
        // "shard" case, where it can affect the behavior of `isBucketComplete()` (see SOLR-14595).
        effectiveLimit += freq.overrequest;
      }
    }

    createAccs(-1, 1);

    // Minimum term docFreq in order to use the filterCache for that term.
    if (freq.cacheDf == -1) { // -1 means never cache
      minDfFilterCache = Integer.MAX_VALUE;
    } else if (freq.cacheDf == 0) { // default; compute as fraction of maxDoc
      // (minimum of 3 is for test coverage purposes)
      minDfFilterCache = Math.max(fcontext.searcher.maxDoc() >> 4, 3);
    } else {
      minDfFilterCache = freq.cacheDf;
    }
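    // For example, with the default cacheDf of 0 on a 1,000,000-doc index, minDfFilterCache
    // is 1000000 >> 4 = 62500, so only terms matching at least 62500 docs take the
    // filterCache path below (unless sub-facets force it).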

    docs = fcontext.base;
    fastForRandomSet = null;

    if (freq.prefix != null) {
      String indexedPrefix = sf.getType().toInternal(freq.prefix);
      startTermBytes = new BytesRef(indexedPrefix);
    } else if (sf.getType().getNumberType() != null) {
      String triePrefix = TrieField.getMainValuePrefix(sf.getType());
      if (triePrefix != null) {
        startTermBytes = new BytesRef(triePrefix);
      }
    }
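    // Legacy Trie numeric fields index each value at several precision levels; seeking to
    // the main value prefix limits enumeration to the full-precision terms, so each
    // distinct indexed value is visited exactly once.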

    Terms terms = fcontext.searcher.getSlowAtomicReader().terms(sf.getName());

    termsEnum = null;
    deState = null;
    term = null;

    if (terms != null) {

      termsEnum = terms.iterator();

      // TODO: OPT: if seek(ord) is supported for this termsEnum, then we could use it for
      // facet.offset when sorting by index order.

      if (startTermBytes != null) {
        if (termsEnum.seekCeil(startTermBytes) == TermsEnum.SeekStatus.END) {
          termsEnum = null;
        } else {
          term = termsEnum.term();
        }
      } else {
        // position termsEnum on first term
        term = termsEnum.next();
      }
    }

    List<LeafReaderContext> leafList = fcontext.searcher.getTopReaderContext().leaves();
    leaves = leafList.toArray(new LeafReaderContext[0]);
  }

  private SimpleOrderedMap<Object> nextBucket() {
    try {
      return _nextBucket();
    } catch (Exception e) {
      throw new SolrException(
          SolrException.ErrorCode.BAD_REQUEST, "Error during facet streaming", e);
    }
  }

  private SimpleOrderedMap<Object> _nextBucket() throws IOException {
    DocSet termSet = null;

    try {
      while (term != null) {

        if (startTermBytes != null && !StringHelper.startsWith(term, startTermBytes)) {
          break;
        }

        int df = termsEnum.docFreq();
        if (df < effectiveMincount) {
          term = termsEnum.next();
          continue;
        }

        if (termSet != null) {
          // termSet.decref(); // OFF-HEAP
          termSet = null;
        }

        int c = 0;

        if (hasSubFacets || df >= minDfFilterCache) {
          // use the filter cache
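          // (for high-df terms, intersecting a cached DocSet is cheaper than walking
          // postings doc-by-doc; sub-facets always need a real DocSet to use as their
          // domain, so they take this path regardless of df)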

          if (deState == null) {
            deState = new SolrIndexSearcher.DocsEnumState();
            deState.fieldName = sf.getName();
            deState.liveDocs = fcontext.searcher.getLiveDocsBits();
            deState.termsEnum = termsEnum;
            deState.postingsEnum = postingsEnum;
            deState.minSetSizeCached = minDfFilterCache;
          }

          if (hasSubFacets || !countOnly) {
            DocSet termsAll = fcontext.searcher.getDocSet(deState);
            termSet = docs.intersection(termsAll);
            // termsAll.decref(); // OFF-HEAP
            c = termSet.size();
          } else {
            c = fcontext.searcher.numDocs(docs, deState);
          }
          postingsEnum = deState.postingsEnum;

          resetStats();

          if (!countOnly) {
            collect(termSet, 0, slotContext);
          }

        } else {
          // We don't need the docset here (meaning no sub-facets).
          // If not countOnly, we also collect per-doc stats in the loops below.
          resetStats();

          // lazy convert to fastForRandomSet
          if (fastForRandomSet == null) {
            fastForRandomSet = docs.getBits();
          }
          // iterate over TermDocs to calculate the intersection
          postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);

          if (postingsEnum instanceof MultiPostingsEnum) {
            MultiPostingsEnum.EnumWithSlice[] subs = ((MultiPostingsEnum) postingsEnum).getSubs();
            int numSubs = ((MultiPostingsEnum) postingsEnum).getNumSubs();
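            // Each sub-enum iterates one segment with segment-local doc IDs;
            // sub.slice.start re-bases them to top-level IDs for the fastForRandomSet
            // check, while collect() still receives the segment-local ID after
            // setNextReader() has positioned the accumulators on that segment.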
            for (int subindex = 0; subindex < numSubs; subindex++) {
              MultiPostingsEnum.EnumWithSlice sub = subs[subindex];
              if (sub.postingsEnum == null) continue;
              int base = sub.slice.start;
              int docid;

              if (countOnly) {
                while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                  if (fastForRandomSet.get(docid + base)) c++;
                }
              } else {
                setNextReader(leaves[sub.slice.readerIndex]);
                while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                  if (fastForRandomSet.get(docid + base)) {
                    c++;
                    collect(docid, 0, slotContext);
                  }
                }
              }
            }
          } else {
            int docid;
            if (countOnly) {
              while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                if (fastForRandomSet.get(docid)) c++;
              }
            } else {
              setNextReader(leaves[0]);
              while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                if (fastForRandomSet.get(docid)) {
                  c++;
                  collect(docid, 0, slotContext);
                }
              }
            }
          }
        }

        if (c < effectiveMincount) {
          term = termsEnum.next();
          continue;
        }

        // handle offset and limit
        if (bucketsToSkip > 0) {
          bucketsToSkip--;
          term = termsEnum.next();
          continue;
        }

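        // Overflow is detected by counting one bucket past the effective limit; the extra
        // bucket is discarded, and the "more" flag (serialized lazily, after the bucket
        // iterator is drained) tells the coordinator this shard has more buckets.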
        if (effectiveLimit >= 0 && ++bucketsReturned > effectiveLimit) {
          shardHasMoreBuckets.set(true);
          return null;
        }

        // set count in case other stats depend on it
        countAcc.incrementCount(0, c);

        // OK, we have a good bucket to return... first get bucket value before moving to next term
        Object bucketVal = sf.getType().toObject(sf, term);
        TermQuery bucketQuery = hasSubFacets ? new TermQuery(new Term(freq.field, term)) : null;
        term = termsEnum.next();

        SimpleOrderedMap<Object> bucket = new SimpleOrderedMap<>();
        bucket.add("val", bucketVal);
        addStats(bucket, 0);
        if (hasSubFacets) {
          processSubs(bucket, bucketQuery, termSet, false, null);
        }

        // TODO... termSet needs to stick around for streaming sub-facets?

        return bucket;
      }

    } finally {
      if (termSet != null) {
        // termSet.decref();  // OFF-HEAP
        termSet = null;
      }
    }

    // end of the iteration
    return null;
  }
}