// NOTE(review): removed non-Java residue from a Maven repository web page
// ("... Maven / Gradle / Ivy" / "Show all versions ...") that preceded the license header.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.search.facet;
import java.io.Closeable;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.FixedBitSet;
import org.apache.solr.common.SolrException;
import org.apache.solr.index.SlowCompositeReaderWrapper;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.TrieField;
import org.apache.solr.search.BitDocSet;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.SolrCache;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.search.facet.SlotAcc.CountSlotAcc;
import org.apache.solr.search.facet.SlotAcc.SlotContext;
import org.apache.solr.search.facet.SlotAcc.SweepCountAccStruct;
import org.apache.solr.search.facet.SlotAcc.SweepingCountSlotAcc;
import org.apache.solr.search.facet.SweepCountAware.SegCountGlobal;
import org.apache.solr.search.facet.SweepDocIterator.SweepIteratorAndCounts;
import org.apache.solr.uninverting.DocTermOrds;
import org.apache.solr.util.TestInjection;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Final form of the un-inverted field: Each document points to a list of term numbers that are
* contained in that document.
*
* Term numbers are in sorted order, and are encoded as variable-length deltas from the previous
* term number. Real term numbers start at 2 since 0 and 1 are reserved. A term number of 0 signals
* the end of the termNumber list.
*
* <p>There is a single int[maxDoc()] which either contains a pointer into a byte[] for the
* termNumber lists, or directly contains the termNumber list if it fits in the 4 bytes of an
* integer. If the first byte in the integer is 1, the next 3 bytes are a pointer into a byte[]
* where the termNumber list starts.
*
* <p>There are actually 256 byte arrays, to compensate for the fact that the pointers into the byte
* arrays are only 3 bytes long. The correct byte array for a document is a function of its id.
*
*
* <p>To save space and speed up faceting, any term that matches enough documents will not be
* un-inverted... it will be skipped while building the un-inverted field structure, and will use a
* set intersection method during faceting.
*
* <p>To further save memory, the terms (the actual string values) are not all stored in memory, but
* a TermIndex is used to convert term numbers to term values only for the terms needed after
* faceting has completed. Only every 128th term value is stored, along with its corresponding term
* number, and this is used as an index to find the closest term and iterate until the desired
* number is hit (very much like Lucene's own internal term index).
*/
public class UnInvertedField extends DocTermOrds {
// Real term numbers start at 2 because 0 and 1 are reserved (0 terminates a termNumber list),
// so every decoded delta is offset by this constant. Declared final: it is a constant.
private static final int TNUM_OFFSET = 2;

private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
/**
 * A term whose docFreq exceeded the uninvert threshold (a "big term"). Big terms are skipped
 * during uninversion; faceting counts them via set intersection using {@link #termQuery}.
 */
static class TopTerm {
  Query termQuery; // query matching exactly this term
  BytesRef term; // the term bytes (deep copy, see visitTerm)
  int termNum; // ordinal of the term within the field

  /** Rough heap footprint of this entry, in bytes. */
  long memSize() {
    long bytes = 8L; // object header
    bytes += 8 + 8; // two references (termQuery, term)
    bytes += term.length; // term bytes
    bytes += 4; // termNum int
    return bytes;
  }
}
// Cached result of memSize(); 0 until first computed (the size is stable after construction).
long memsz;

final AtomicLong use = new AtomicLong(); // number of uses

/* The number of documents holding the term {@code maxDocs = maxTermCounts[termNum]}. */
int[] maxTermCounts = new int[1024];

/* termNum -> docIDs for big terms. Generic parameters restored: the raw {@code Map} broke
 * every {@code for (TopTerm tt : bigTerms.values())} use site. */
final Map<Integer, TopTerm> bigTerms = new LinkedHashMap<>();

// Lazily created in visitTerm() and reused for materializing DocSets of big terms.
private SolrIndexSearcher.DocsEnumState deState;

// Searcher this field was uninverted against; null only for the placeholder instance below.
private final SolrIndexSearcher searcher;

// Sentinel placed in the field-value cache; see checkUnInvertedField().
private static final UnInvertedField uifPlaceholder = new UnInvertedField();

private UnInvertedField() { // Dummy for synchronization.
  super("fake", 0, 0); // cheapest initialization I can find.
  searcher = null;
}
/**
 * Called for each term in the field being uninverted. Collects {@link #maxTermCounts} for all
 * bigTerms as well as storing them in {@link #bigTerms}.
 *
 * @param te positioned at the current term.
 * @param termNum the ID/pointer/ordinal of the current term. Monotonically increasing between
 *     calls.
 */
@Override
protected void visitTerm(TermsEnum te, int termNum) throws IOException {
  if (termNum >= maxTermCounts.length) {
    // resize by doubling - for very large number of unique terms, expanding
    // by 4K and resultant GC will dominate uninvert times. Resize at end if material.
    // Compute the new length in long arithmetic: length * 2 can overflow int for huge arrays.
    int[] newMaxTermCounts =
        new int[(int) Math.min(Integer.MAX_VALUE - 16L, maxTermCounts.length * 2L)];
    System.arraycopy(maxTermCounts, 0, newMaxTermCounts, 0, termNum);
    maxTermCounts = newMaxTermCounts;
  }
  final BytesRef term = te.term();
  if (te.docFreq() > maxTermDocFreq) {
    // "big" term: skip uninverting it; remember a query for set-intersection faceting instead
    Term t = new Term(field, term); // this makes a deep copy of the term bytes
    TopTerm topTerm = new TopTerm();
    topTerm.term = t.bytes();
    topTerm.termNum = termNum;
    topTerm.termQuery = new TermQuery(t);
    bigTerms.put(topTerm.termNum, topTerm);
    if (deState == null) {
      deState = new SolrIndexSearcher.DocsEnumState();
      deState.fieldName = field;
      deState.liveDocs = searcher.getLiveDocsBits();
      // TODO: check for MultiTermsEnum in SolrIndexSearcher could now fail?
      deState.termsEnum = te;
      deState.postingsEnum = postingsEnum;
      deState.minSetSizeCached = maxTermDocFreq;
    }
    postingsEnum = deState.postingsEnum;
    DocSet set = searcher.getDocSet(deState);
    maxTermCounts[termNum] = set.size();
  }
}
/** Callback from uninversion recording the actual document frequency of a (non-big) term. */
@Override
protected void setActualDocFreq(int termNum, int docFreq) {
maxTermCounts[termNum] = docFreq;
}
/** Estimated heap usage of this structure in bytes; cached after the first call. */
public long memSize() {
  // can cache the mem size since it shouldn't change
  if (memsz != 0) {
    return memsz;
  }
  long total = super.ramBytesUsed();
  total += 8 * 8 + 32; // local fields
  total += bigTerms.size() * 64L; // map overhead per big-term entry
  for (TopTerm bigTerm : bigTerms.values()) {
    total += bigTerm.memSize();
  }
  if (maxTermCounts != null) {
    total += maxTermCounts.length * 4L;
  }
  memsz = total;
  return total;
}
/**
 * Uninverts {@code field} against {@code searcher}'s index. Terms matching more than
 * maxDoc/20 + 2 documents are treated as "big terms" (see {@link #visitTerm}).
 *
 * @throws SolrException (BAD_REQUEST) if DocTermOrds refuses the field (IllegalStateException)
 */
public UnInvertedField(String field, SolrIndexSearcher searcher) throws IOException {
super(
field,
// threshold, over which we use set intersections instead of counting
// to (1) save memory, and (2) speed up faceting.
// Add 2 for testing purposes so that there will always be some terms under
// the threshold even when the index is very
// small.
searcher.maxDoc() / 20 + 2,
DEFAULT_INDEX_INTERVAL_BITS);
assert TestInjection.injectUIFOutOfMemoryError();
final String prefix = TrieField.getMainValuePrefix(searcher.getSchema().getFieldType(field));
this.searcher = searcher;
try {
// TODO: it's wasteful to create one of these each time
// but DocTermOrds will throw an exception if it thinks the field has doc values (which is
// faked by UnInvertingReader)
LeafReader r = SlowCompositeReaderWrapper.wrap(searcher.getRawReader());
uninvert(r, r.getLiveDocs(), prefix == null ? null : new BytesRef(prefix));
} catch (IllegalStateException ise) {
// surface DocTermOrds' refusal (e.g. field appears to have doc values) as a client error
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, ise);
}
if (tnums != null) {
// each byte[] bucket is addressed by 3-byte pointers, so warn when close to the 2^24 limit
for (byte[] target : tnums) {
if (target != null && target.length > (1 << 24) * .9) {
log.warn(
"Approaching too many values for UnInvertedField faceting on field '{}' : bucket size={}",
field,
target.length);
}
}
}
// free space if outrageously wasteful (tradeoff memory/cpu)
if ((maxTermCounts.length - numTermsInField) > 1024) { // too much waste!
int[] newMaxTermCounts = new int[numTermsInField];
System.arraycopy(maxTermCounts, 0, newMaxTermCounts, 0, numTermsInField);
maxTermCounts = newMaxTermCounts;
}
log.info("UnInverted multi-valued field {}", this);
// System.out.println("CREATED: " + toString() + " ti.index=" + ti.index);
}
/** Returns the number of terms discovered in the uninverted field. */
public int getNumTerms() {
return numTermsInField;
}
/**
 * Maps documents back to the terms they contain: big terms via precomputed DocSets, small terms
 * by decoding the per-document delta-encoded term-number lists.
 */
public class DocToTerm implements Closeable {
  private final DocSet[] bigTermSets; // doc sets per big term, parallel to bigTermNums
  private final int[] bigTermNums; // term ordinal per big term
  private TermsEnum te; // lazily created ord -> term lookup enum

  public DocToTerm() throws IOException {
    bigTermSets = new DocSet[bigTerms.size()];
    bigTermNums = new int[bigTerms.size()];
    int i = 0;
    for (TopTerm tt : bigTerms.values()) {
      bigTermSets[i] = searcher.getDocSet(tt.termQuery);
      bigTermNums[i] = tt.termNum;
      i++;
    }
  }

  /** Resolves a term ordinal to its value; may return a reused BytesRef. */
  public BytesRef lookupOrd(int ord) throws IOException {
    return getTermValue(getTermsEnum(), ord);
  }

  public TermsEnum getTermsEnum() throws IOException {
    if (te == null) {
      te = getOrdTermsEnum(searcher.getSlowAtomicReader());
    }
    return te;
  }

  /** Invokes {@code target} once for each big term that contains {@code doc}. */
  public void getBigTerms(int doc, Callback target) throws IOException {
    // bigTermSets is final and assigned in the constructor; the old null-check was dead code
    for (int i = 0; i < bigTermSets.length; i++) {
      if (bigTermSets[i].exists(doc)) {
        target.call(bigTermNums[i]);
      }
    }
  }

  /** Decodes {@code doc}'s term-number list, invoking {@code target} per (non-big) term. */
  public void getSmallTerms(int doc, Callback target) {
    if (termInstances > 0) {
      int code = index[doc];
      if ((code & 0x80000000) != 0) {
        // high bit set: remaining 31 bits point into the per-bucket byte[] of vint deltas
        int pos = code & 0x7fffffff;
        int whichArray = (doc >>> 16) & 0xff;
        byte[] arr = tnums[whichArray];
        int tnum = 0;
        for (; ; ) {
          int delta = 0;
          for (; ; ) {
            byte b = arr[pos++];
            delta = (delta << 7) | (b & 0x7f);
            if ((b & 0x80) == 0) break;
          }
          if (delta == 0) break; // a delta of 0 terminates the list
          tnum += delta - TNUM_OFFSET;
          target.call(tnum);
        }
      } else {
        // term-number list packed directly into the 4 bytes of code
        int tnum = 0;
        int delta = 0;
        for (; ; ) {
          delta = (delta << 7) | (code & 0x7f);
          if ((code & 0x80) == 0) {
            if (delta == 0) break;
            tnum += delta - TNUM_OFFSET;
            target.call(tnum);
            delta = 0;
          }
          code >>>= 8;
        }
      }
    }
  }

  @Override
  public void close() throws IOException {
    // Nothing to release while DocSets are on-heap; when off-heap DocSets return, each entry
    // of bigTermSets must be decref'd here. (Replaces an empty loop that had no effect.)
  }
}
/** Receives term ordinals as they are decoded for a document. */
public interface Callback {
  void call(int termNum);
}
private void getCounts(FacetFieldProcessorByArrayUIF processor) throws IOException {
DocSet docs = processor.fcontext.base;
int baseSize = docs.size();
int maxDoc = searcher.maxDoc();
// what about allBuckets?
if (baseSize < processor.effectiveMincount) {
return;
}
SweepCountAccStruct baseCountAccStruct = SweepingCountSlotAcc.baseStructOf(processor);
final List others = SweepingCountSlotAcc.otherStructsOf(processor);
final int[] index = this.index;
boolean doNegative =
baseSize > maxDoc >> 1
&& termInstances > 0
&& docs instanceof BitDocSet
&& baseCountAccStruct != null;
if (doNegative) {
FixedBitSet bs = ((BitDocSet) docs).getBits().clone();
bs.flip(0, maxDoc);
// TODO: when iterator across negative elements is available, use that
// instead of creating a new bitset and inverting.
docs = new BitDocSet(bs, maxDoc - baseSize);
// simply negating will mean that we have deleted docs in the set.
// that should be OK, as their entries in our table should be empty.
baseCountAccStruct = new SweepCountAccStruct(baseCountAccStruct, docs);
}
// For the biggest terms, do straight set intersections
for (TopTerm tt : bigTerms.values()) {
// TODO: counts could be deferred if sorting by index order
final int termOrd = tt.termNum;
Iterator othersIter = others.iterator();
SweepCountAccStruct entry =
baseCountAccStruct != null ? baseCountAccStruct : othersIter.next();
for (; ; ) {
entry.countAcc.incrementCount(termOrd, searcher.numDocs(tt.termQuery, entry.docSet));
if (!othersIter.hasNext()) {
break;
}
entry = othersIter.next();
}
}
// TODO: we could short-circuit counting altogether for sorted faceting
// where we already have enough terms from the bigTerms
if (termInstances > 0) {
final SweepIteratorAndCounts iterAndCounts =
SweepDocIterator.newInstance(baseCountAccStruct, others);
final SweepDocIterator iter = iterAndCounts.iter;
final SegCountGlobal counts = new SegCountGlobal(iterAndCounts.countAccs);
while (iter.hasNext()) {
int doc = iter.nextDoc();
int maxIdx = iter.registerCounts(counts);
int code = index[doc];
if ((code & 0x80000000) != 0) {
int pos = code & 0x7fffffff;
int whichArray = (doc >>> 16) & 0xff;
byte[] arr = tnums[whichArray];
int tnum = 0;
for (; ; ) {
int delta = 0;
for (; ; ) {
byte b = arr[pos++];
delta = (delta << 7) | (b & 0x7f);
if ((b & 0x80) == 0) break;
}
if (delta == 0) break;
tnum += delta - TNUM_OFFSET;
counts.incrementCount(tnum, 1, maxIdx);
}
} else {
int tnum = 0;
int delta = 0;
for (; ; ) {
delta = (delta << 7) | (code & 0x7f);
if ((code & 0x80) == 0) {
if (delta == 0) break;
tnum += delta - TNUM_OFFSET;
counts.incrementCount(tnum, 1, maxIdx);
delta = 0;
}
code >>>= 8;
}
}
}
}
if (doNegative) {
final CountSlotAcc baseCounts = processor.countAcc;
for (int i = 0; i < numTermsInField; i++) {
// counts[i] = maxTermCounts[i] - counts[i];
baseCounts.incrementCount(i, maxTermCounts[i] - (int) baseCounts.getCount(i) * 2);
}
}
/* TODO - future optimization to handle allBuckets
if (processor.allBucketsSlot >= 0) {
int all = 0; // overflow potential
for (int i=0; i= numTermsInField) {
getCounts(processor);
return;
}
collectDocsGeneric(processor);
}
// called from FieldFacetProcessor
// TODO: do a callback version that can be specialized!
/**
 * General first-phase collection: counts terms in [startTermIndex, endTermIndex) and collects
 * per-document values for the base docset. Big terms are handled via set intersection; small
 * terms by decoding each document's delta-encoded term-number list.
 */
public void collectDocsGeneric(FacetFieldProcessorByArrayUIF processor) throws IOException {
  use.incrementAndGet();

  int startTermIndex = processor.startTermIndex;
  int endTermIndex = processor.endTermIndex;
  int nTerms = processor.nTerms;
  DocSet docs = processor.fcontext.base;

  int uniqueTerms = 0; // big terms that actually matched at least one doc
  final CountSlotAcc countAcc = processor.countAcc;
  final SweepCountAccStruct baseCountAccStruct = SweepingCountSlotAcc.baseStructOf(processor);
  // generic parameter restored: raw List broke the typed enhanced-for below
  final List<SweepCountAccStruct> others = SweepingCountSlotAcc.otherStructsOf(processor);

  for (TopTerm tt : bigTerms.values()) {
    if (tt.termNum >= startTermIndex && tt.termNum < endTermIndex) {
      // handle the biggest terms
      DocSet termSet = searcher.getDocSet(tt.termQuery);
      DocSet intersection = termSet.intersection(docs);
      int collected =
          processor.collectFirstPhase(
              intersection,
              tt.termNum - startTermIndex,
              slotNum -> {
                return new SlotContext(tt.termQuery);
              });
      final int termOrd = tt.termNum - startTermIndex;
      countAcc.incrementCount(termOrd, collected);
      for (SweepCountAccStruct entry : others) {
        entry.countAcc.incrementCount(termOrd, termSet.intersectionSize(entry.docSet));
      }
      if (collected > 0) {
        uniqueTerms++;
      }
    }
  }

  if (termInstances > 0) {
    // generic parameters restored: raw List/Iterator broke the LeafReaderContext assignments
    final List<LeafReaderContext> leaves = searcher.getIndexReader().leaves();
    final Iterator<LeafReaderContext> ctxIt = leaves.iterator();
    LeafReaderContext ctx = null;
    int segBase = 0;
    int segMax;
    int adjustedMax = 0;

    // TODO: handle facet.prefix here!!!
    SweepIteratorAndCounts sweepIterAndCounts =
        SweepDocIterator.newInstance(baseCountAccStruct, others);
    final SweepDocIterator iter = sweepIterAndCounts.iter;
    final CountSlotAcc[] countAccs = sweepIterAndCounts.countAccs;
    final SegCountGlobal counts = new SegCountGlobal(countAccs);

    while (iter.hasNext()) {
      int doc = iter.nextDoc();
      int maxIdx = iter.registerCounts(counts);
      boolean collectBase = iter.collectBase();
      if (doc >= adjustedMax) {
        // advance to the leaf (segment) containing doc; docs arrive in increasing order
        do {
          ctx = ctxIt.next();
          if (ctx == null) {
            // should be impossible
            throw new RuntimeException("INTERNAL FACET ERROR");
          }
          segBase = ctx.docBase;
          segMax = ctx.reader().maxDoc();
          adjustedMax = segBase + segMax;
        } while (doc >= adjustedMax);
        assert doc >= ctx.docBase;
        processor.setNextReaderFirstPhase(ctx);
      }
      int segDoc = doc - segBase;

      int code = index[doc];

      if ((code & 0x80000000) != 0) {
        // high bit set: remaining 31 bits point into the per-bucket byte[] of vint deltas
        int pos = code & 0x7fffffff;
        int whichArray = (doc >>> 16) & 0xff;
        byte[] arr = tnums[whichArray];
        int tnum = 0;
        for (; ; ) {
          int delta = 0;
          for (; ; ) {
            byte b = arr[pos++];
            delta = (delta << 7) | (b & 0x7f);
            if ((b & 0x80) == 0) break;
          }
          if (delta == 0) break; // a delta of 0 terminates the list
          tnum += delta - TNUM_OFFSET;
          int arrIdx = tnum - startTermIndex;
          if (arrIdx < 0) continue;
          if (arrIdx >= nTerms) break; // term numbers ascend; nothing further can be in range
          counts.incrementCount(arrIdx, 1, maxIdx);
          if (collectBase) {
            processor.collectFirstPhase(segDoc, arrIdx, processor.slotContext);
          }
        }
      } else {
        // term-number list packed directly into the 4 bytes of code
        int tnum = 0;
        int delta = 0;
        for (; ; ) {
          delta = (delta << 7) | (code & 0x7f);
          if ((code & 0x80) == 0) {
            if (delta == 0) break;
            tnum += delta - TNUM_OFFSET;
            int arrIdx = tnum - startTermIndex;
            if (arrIdx >= 0) {
              if (arrIdx >= nTerms) break;
              counts.incrementCount(arrIdx, 1, maxIdx);
              if (collectBase) {
                processor.collectFirstPhase(segDoc, arrIdx, processor.slotContext);
              }
            }
            delta = 0;
          }
          code >>>= 8;
        }
      }
    }
  }
}
/** Converts an indexed term value into its external, human-readable form. */
String getReadableValue(BytesRef termval, FieldType ft, CharsRefBuilder charsRef) {
  final String readable = ft.indexedToReadable(termval, charsRef).toString();
  return readable;
}
/**
 * Resolves a term number to its value; may return a reused BytesRef.
 *
 * @param te ord-based terms enum used for regular (non-big) terms
 * @param termNum the term ordinal to resolve
 */
BytesRef getTermValue(TermsEnum te, int termNum) throws IOException {
  if (!bigTerms.isEmpty()) {
    // big terms were skipped during uninversion; serve them from the cached map
    TopTerm tt = bigTerms.get(termNum);
    if (tt != null) {
      return tt.term;
    }
  }
  return lookupTerm(te, termNum);
}
/** Diagnostic summary of this structure (sizes, timings, term statistics). */
@Override
public String toString() {
  long indexSize = 0;
  if (indexedTermsArray != null) {
    // four 8-byte fields, one 8-byte reference per indexed term, plus the string bytes
    indexSize = 8 + 8 + 8 + 8 + (indexedTermsArray.length << 3) + sizeOfIndexedStrings;
  }
  StringBuilder sb = new StringBuilder(160);
  sb.append("{field=").append(field);
  sb.append(",memSize=").append(memSize());
  sb.append(",tindexSize=").append(indexSize);
  sb.append(",time=").append(total_time);
  sb.append(",phase1=").append(phase1_time);
  sb.append(",nTerms=").append(numTermsInField);
  sb.append(",bigTerms=").append(bigTerms.size());
  sb.append(",termInstances=").append(termInstances);
  sb.append(",uses=").append(use.get());
  sb.append('}');
  return sb.toString();
}
//////////////////////////////////////////////////////////////////
//////////////////////////// caching /////////////////////////////
//////////////////////////////////////////////////////////////////
/**
 * Gets the uninverted representation of {@code field}, building and caching it in the searcher's
 * field-value cache if necessary.
 *
 * <p>Generic parameters restored on the cache: the raw {@code SolrCache} made
 * {@code computeIfAbsent} return {@code Object}, which cannot be returned as UnInvertedField.
 */
public static UnInvertedField getUnInvertedField(String field, SolrIndexSearcher searcher)
    throws IOException {
  SolrCache<String, UnInvertedField> cache = searcher.getFieldValueCache();
  if (cache == null) {
    // no field-value cache configured: build a fresh, uncached instance each time
    return new UnInvertedField(field, searcher);
  }

  return cache.computeIfAbsent(field, f -> new UnInvertedField(f, searcher));
}
/**
 * Returns the cached UnInvertedField for {@code field}, or null if it has not been populated yet
 * (or if there is no field-value cache). Never triggers uninversion.
 */
public static UnInvertedField checkUnInvertedField(String field, SolrIndexSearcher searcher)
    throws IOException {
  SolrCache<String, UnInvertedField> cache = searcher.getFieldValueCache();
  if (cache == null) {
    return null;
  }
  Object uif = cache.get(field); // cache is already synchronized, so no extra sync needed
  // placeholder is an implementation detail, keep it hidden and return null if that is what we
  // got
  return uif == uifPlaceholder || !(uif instanceof UnInvertedField)
      ? null
      : (UnInvertedField) uif;
  // TODO: SolrCache is not used safely in other places, but this might be simplified to:
  // return uif==uifPlaceholder ? null : uif;
}
}