// NOTE(review): the following lines are residue from the web page this file was
// scraped from; they are not Java and would break compilation, so they are
// commented out. They should simply be removed.
// All Downloads are FREE. Search and download functionalities are using the official Maven repository.
// org.apache.solr.search.facet.FacetFieldProcessorByArrayDV Maven / Gradle / Ivy
// There is a newer version: 9.7.0
// Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.search.facet;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.OrdinalMap;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LongValues;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.solr.common.SolrException;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.facet.SlotAcc.CountSlotAcc;
import org.apache.solr.search.facet.SlotAcc.SweepCountAccStruct;
import org.apache.solr.search.facet.SlotAcc.SweepingCountSlotAcc;
import org.apache.solr.search.facet.SweepCountAware.SegCountGlobal;
import org.apache.solr.search.facet.SweepCountAware.SegCountPerSeg;
import org.apache.solr.uninverting.FieldCacheImpl;

/**
 * Facet processor that reads per-document values from {@link DocValues}, for both single- and
 * multi-valued string fields, accumulating counts against a global-ordinal-indexed array.
 */
class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray {
  // When true, a SortedSetDocValues that actually wraps a single-valued field is unwrapped
  // (via DocValues.unwrapSingleton) to the cheaper SortedDocValues code path.
  static boolean unwrap_singleValued_multiDv = true; // only set to false for test coverage

  // True if the schema field is multiValued, or its type uses the multi-valued field cache.
  boolean multiValuedField;
  SortedSetDocValues si; // only used for term lookups (for both single and multi-valued)
  OrdinalMap ordinalMap = null; // maps per-segment ords to global ords

  /**
   * Creates a processor for a single field-facet request.
   *
   * @param fcontext facet computation context
   * @param freq the field facet request
   * @param sf the schema field being faceted on
   */
  FacetFieldProcessorByArrayDV(FacetContext fcontext, FacetField freq, SchemaField sf) {
    super(fcontext, freq, sf);
    // A field counts as multi-valued either by schema declaration or because its type
    // uses the multi-valued field cache.
    this.multiValuedField = sf.multiValued() || sf.getType().multiValuedFieldCache();
  }

  /**
   * Initializes {@code si}/{@code ordinalMap} from the top-level reader and computes the
   * global-ordinal window {@code [startTermIndex, endTermIndex)} (narrowed by the facet
   * prefix when one is set), along with {@code nTerms}.
   */
  @Override
  protected void findStartAndEndOrds() throws IOException {
    if (!multiValuedField) {
      // single-valued: fetch the sorted view, then expose it through a multi-valued facade
      SortedDocValues single = FieldUtil.getSortedDocValues(fcontext.qcontext, sf, null);
      si = DocValues.singleton(single);
      if (single instanceof MultiDocValues.MultiSortedDocValues) {
        ordinalMap = ((MultiDocValues.MultiSortedDocValues) single).mapping;
      }
    } else {
      si = FieldUtil.getSortedSetDocValues(fcontext.qcontext, sf, null);
      if (si instanceof MultiDocValues.MultiSortedSetDocValues) {
        ordinalMap = ((MultiDocValues.MultiSortedSetDocValues) si).mapping;
      }
    }

    // ords are held in int arrays below, so the value count must fit in an int
    if (si.getValueCount() >= Integer.MAX_VALUE) {
      throw new SolrException(
          SolrException.ErrorCode.BAD_REQUEST,
          "Field has too many unique values. field=" + sf + " nterms= " + si.getValueCount());
    }

    if (prefixRef == null) {
      // no prefix: facet over the entire ord range
      startTermIndex = 0;
      endTermIndex = (int) si.getValueCount();
    } else {
      // lookupTerm returns the ord if found, else (-insertionPoint - 1)
      final int startLookup = (int) si.lookupTerm(prefixRef.get());
      startTermIndex = (startLookup < 0) ? -startLookup - 1 : startLookup;
      // appending BIG_TERM gives a key greater than every term sharing the prefix,
      // so its insertion point is the exclusive end of the prefix range
      prefixRef.append(UnicodeUtil.BIG_TERM);
      final int endLookup = (int) si.lookupTerm(prefixRef.get());
      assert endLookup < 0;
      endTermIndex = -endLookup - 1;
    }

    nTerms = endTermIndex - startTermIndex;
  }

  /**
   * Walks every leaf of the index, choosing between per-segment counting (counts accumulated
   * against segment-local ords, mapped to global ords once at the end) and direct global
   * counting, then accumulates facet counts for the base domain and any sweep domains.
   *
   * <p>Fix: restored the generic type parameters on {@code others} and {@code leaves}; the raw
   * {@code List} declarations (an artifact of HTML extraction) made
   * {@code leaves.get(subIdx)} return {@code Object}, which does not compile when assigned to
   * {@link LeafReaderContext}.
   */
  @Override
  protected void collectDocs() throws IOException {
    int domainSize = fcontext.base.size();

    // TODO: what about allBuckets? missing bucket?
    if (nTerms <= 0 || domainSize < effectiveMincount) {
      return;
    }

    final SweepCountAccStruct base = SweepingCountSlotAcc.baseStructOf(this);
    final List<SweepCountAccStruct> others = SweepingCountSlotAcc.otherStructsOf(this);
    assert null != base;

    // TODO: refactor some of this logic into a base class
    boolean countOnly = collectAcc == null && allBucketsAcc == null;
    boolean fullRange = startTermIndex == 0 && endTermIndex == si.getValueCount();

    // Are we expecting many hits per bucket?
    // FUTURE: pro-rate for nTerms?
    // FUTURE: better take into account number of values in multi-valued fields.  This info is
    // available for indexed fields.
    // FUTURE: take into account that bigger ord maps are more expensive than smaller ones
    // One test: 5M doc index, faceting on a single-valued field with almost 1M unique values,
    // crossover point where global counting was slower than per-segment counting was a domain of
    // 658k docs.  At that point, top 10 buckets had 6-7 matches each. this was for heap docvalues
    // produced by UninvertingReader Since these values were randomly distributed, lets round our
    // domain multiplier up to account for less random real world data.
    long domainMultiplier = multiValuedField ? 4L : 2L;
    // +3 to increase test coverage with small tests
    boolean manyHitsPerBucket = domainSize * domainMultiplier > (si.getValueCount() + 3);

    // If we're only calculating counts, we're not prefixing, and we expect to collect many
    // documents per unique value, then collect per-segment before mapping to global ords at the
    // end.  This will save redundant seg->global ord mappings.
    // FUTURE: there are probably some other non "countOnly" cases where we can use this as well
    // (i.e. those where the docid is not used)
    boolean canDoPerSeg = countOnly && fullRange;
    boolean accumSeg = manyHitsPerBucket && canDoPerSeg;

    if (freq.perSeg != null)
      accumSeg = canDoPerSeg && freq.perSeg; // internal - override perSeg heuristic

    final int maxSize = others.size() + 1; // others + base
    final List<LeafReaderContext> leaves = fcontext.searcher.getIndexReader().leaves();
    final DocIdSetIterator[] subIterators = new DocIdSetIterator[maxSize];
    final CountSlotAcc[] activeCountAccs = new CountSlotAcc[maxSize];

    for (int subIdx = 0; subIdx < leaves.size(); subIdx++) {
      LeafReaderContext subCtx = leaves.get(subIdx);

      setNextReaderFirstPhase(subCtx);

      // disi is null when no domain (base or sweep) has any docs in this segment
      final SweepDISI disi =
          SweepDISI.newInstance(base, others, subIterators, activeCountAccs, subCtx);
      if (disi == null) {
        continue;
      }
      LongValues toGlobal = ordinalMap == null ? null : ordinalMap.getGlobalOrds(subIdx);

      SortedDocValues singleDv = null;
      SortedSetDocValues multiDv = null;
      if (multiValuedField) {
        // TODO: get sub from multi?
        multiDv = subCtx.reader().getSortedSetDocValues(sf.getName());
        if (multiDv == null) {
          if (countOnly) {
            continue; // no values in this segment and nothing but counts to collect
          } else {
            multiDv = DocValues.emptySortedSet();
          }
        } else if (countOnly && multiDv.getValueCount() < 1) {
          continue;
        }
        // some codecs may optimize SortedSet storage for single-valued fields
        // this will be null if this is not a wrapped single valued docvalues.
        if (unwrap_singleValued_multiDv) {
          singleDv = DocValues.unwrapSingleton(multiDv);
        }
      } else {
        singleDv = subCtx.reader().getSortedDocValues(sf.getName());
        if (singleDv == null) {
          if (countOnly) {
            continue;
          } else {
            singleDv = DocValues.emptySorted();
          }
        } else if (countOnly && singleDv.getValueCount() < 1) {
          continue;
        }
      }

      // dispatch to the single-valued or multi-valued implementation of the chosen strategy
      if (singleDv != null) {
        if (accumSeg) {
          collectPerSeg(singleDv, disi, toGlobal);
        } else {
          if (canDoPerSeg && toGlobal != null) {
            collectCounts(singleDv, disi, toGlobal);
          } else {
            collectDocs(singleDv, disi, toGlobal);
          }
        }
      } else {
        if (accumSeg) {
          collectPerSeg(multiDv, disi, toGlobal);
        } else {
          if (canDoPerSeg && toGlobal != null) {
            collectCounts(multiDv, disi, toGlobal);
          } else {
            collectDocs(multiDv, disi, toGlobal);
          }
        }
      }
    }

    Arrays.fill(reuse, null); // better GC
  }

  /** Resolves a global ordinal to its term bytes via the top-level sorted-set view. */
  @Override
  protected BytesRef lookupOrd(int ord) throws IOException {
    final BytesRef term = si.lookupOrd(ord);
    return term;
  }

  /**
   * Per-segment counting for a single-valued field: counts are accumulated against
   * segment-local ords, then mapped to global ords in one pass at the end.
   */
  private void collectPerSeg(SortedDocValues singleDv, SweepDISI disi, LongValues toGlobal)
      throws IOException {
    int segMax = singleDv.getValueCount();
    final SegCountPerSeg segCounter = getSegCountPerSeg(disi, segMax);

    /*
     * alternate trial implementations // ord // FieldUtil.visitOrds(singleDv, disi,
     * (doc,ord)->{counts[ord+1]++;} );
     *
     * FieldUtil.OrdValues ordValues = FieldUtil.getOrdValues(singleDv, disi); while
     * (ordValues.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { counts[ordValues.getOrd() + 1]++; }
     */

    // calculate segment-local counts
    int doc;
    if (singleDv instanceof FieldCacheImpl.SortedDocValuesImpl.Iter) {
      // fast path: random-access ord lookup avoids advanceExact
      FieldCacheImpl.SortedDocValuesImpl.Iter fc =
          (FieldCacheImpl.SortedDocValuesImpl.Iter) singleDv;
      while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
        final int segOrd = fc.getOrd(doc);
        if (segOrd >= 0) {
          final int maxIdx = disi.registerCounts(segCounter);
          segCounter.incrementCount(segOrd, 1, maxIdx);
        }
      }
    } else {
      while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
        if (singleDv.advanceExact(doc)) {
          final int segOrd = singleDv.ordValue();
          if (segOrd >= 0) {
            final int maxIdx = disi.registerCounts(segCounter);
            segCounter.incrementCount(segOrd, 1, maxIdx);
          }
        }
      }
    }

    // convert segment-local counts to global counts
    segCounter.register(disi.countAccs, toGlobal, segMax - 1);
  }

  private SegCountPerSeg getSegCountPerSeg(SweepDISI disi, int segMax) {
    final int size = disi.size;
    return new SegCountPerSeg(
        getSegmentCountArrays(segMax, size), getBoolArr(segMax), segMax, size);
  }

  private SegCountGlobal getSegCountGlobal(SweepDISI disi, SortedDocValues dv) {
    return new SegCountGlobal(disi.countAccs);
  }

  private SegCountGlobal getSegCountGlobal(SweepDISI disi, SortedSetDocValues dv) {
    return new SegCountGlobal(disi.countAccs);
  }

  /** Per-segment counting for a multi-valued field; see the single-valued overload. */
  private void collectPerSeg(SortedSetDocValues multiDv, SweepDISI disi, LongValues toGlobal)
      throws IOException {
    int segMax = (int) multiDv.getValueCount();
    final SegCountPerSeg segCounter = getSegCountPerSeg(disi, segMax);
    int doc;
    while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
      if (multiDv.advanceExact(doc)) {
        final int maxIdx = disi.registerCounts(segCounter);
        for (; ; ) {
          int segOrd = (int) multiDv.nextOrd();
          if (segOrd < 0) break; // no more ords for this doc
          segCounter.incrementCount(segOrd, 1, maxIdx);
        }
      }
    }
    segCounter.register(disi.countAccs, toGlobal, segMax - 1);
  }

  // scratch "seen" flags reused across segments (sized for the largest possible segment)
  private boolean[] reuseBool;

  private boolean[] getBoolArr(int maxNeeded) {
    if (reuseBool == null) {
      // make the count array large enough for any segment
      // FUTURE: (optionally) directly use the array of the CountAcc for an optimized index..
      reuseBool = new boolean[(int) si.getValueCount() + 1];
    } else {
      Arrays.fill(reuseBool, 0, maxNeeded, false);
    }
    return reuseBool;
  }

  // scratch count arrays reused across segments, one per count accumulator
  private int[][] reuse = new int[12][];

  private int[] getCountArr(int maxNeeded, int idx) {
    if (idx >= reuse.length) {
      reuse = Arrays.copyOf(reuse, idx + 1);
    }
    if (reuse[idx] == null) {
      // make the count array large enough for any segment
      // FUTURE: (optionally) directly use the array of the CountAcc for an optimized index..
      reuse[idx] = new int[(int) si.getValueCount() + 1];
    } else {
      Arrays.fill(reuse[idx], 0, maxNeeded, 0);
    }
    return reuse[idx];
  }

  private int[][] getSegmentCountArrays(int segMax, int size) {
    int[][] ret = new int[size][];
    int i = size - 1;
    do {
      ret[i] = getCountArr(segMax, i);
    } while (i-- > 0);
    return ret;
  }

  /** Global counting for a single-valued field when per-doc collection is required. */
  private void collectDocs(SortedDocValues singleDv, SweepDISI disi, LongValues toGlobal)
      throws IOException {
    int doc;
    final SegCountGlobal segCounter = getSegCountGlobal(disi, singleDv);
    while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
      if (singleDv.advanceExact(doc)) {
        final int maxIdx = disi.registerCounts(segCounter);
        int segOrd = singleDv.ordValue();
        collect(doc, segOrd, toGlobal, segCounter, maxIdx, disi.collectBase());
      }
    }
  }

  /** Count-only path for a single-valued field; maps each seg ord to global immediately. */
  private void collectCounts(SortedDocValues singleDv, SweepDISI disi, LongValues toGlobal)
      throws IOException {
    final SegCountGlobal segCounter = getSegCountGlobal(disi, singleDv);
    int doc;
    if (singleDv instanceof FieldCacheImpl.SortedDocValuesImpl.Iter) {
      // fast path: random-access ord lookup avoids advanceExact
      FieldCacheImpl.SortedDocValuesImpl.Iter fc =
          (FieldCacheImpl.SortedDocValuesImpl.Iter) singleDv;
      while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
        int segOrd = fc.getOrd(doc);
        if (segOrd < 0) continue; // doc has no value
        int ord = (int) toGlobal.get(segOrd);
        int maxIdx = disi.registerCounts(segCounter);
        segCounter.incrementCount(ord, 1, maxIdx);
      }
    } else {
      while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
        if (singleDv.advanceExact(doc)) {
          int segOrd = singleDv.ordValue();
          int ord = (int) toGlobal.get(segOrd);
          int maxIdx = disi.registerCounts(segCounter);
          segCounter.incrementCount(ord, 1, maxIdx);
        }
      }
    }
  }

  /** Global counting for a multi-valued field when per-doc collection is required. */
  private void collectDocs(SortedSetDocValues multiDv, SweepDISI disi, LongValues toGlobal)
      throws IOException {
    final SegCountGlobal segCounter = getSegCountGlobal(disi, multiDv);
    int doc;
    while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
      if (multiDv.advanceExact(doc)) {
        final int maxIdx = disi.registerCounts(segCounter);
        final boolean collectBase = disi.collectBase();
        for (; ; ) {
          int segOrd = (int) multiDv.nextOrd();
          if (segOrd < 0) break; // no more ords for this doc
          collect(doc, segOrd, toGlobal, segCounter, maxIdx, collectBase);
        }
      }
    }
  }

  /** Count-only path for a multi-valued field; maps each seg ord to global immediately. */
  private void collectCounts(SortedSetDocValues multiDv, SweepDISI disi, LongValues toGlobal)
      throws IOException {
    final SegCountGlobal segCounter = getSegCountGlobal(disi, multiDv);
    int doc;
    while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
      if (multiDv.advanceExact(doc)) {
        final int maxIdx = disi.registerCounts(segCounter);
        for (; ; ) {
          int segOrd = (int) multiDv.nextOrd();
          if (segOrd < 0) break; // no more ords for this doc
          int ord = (int) toGlobal.get(segOrd);
          segCounter.incrementCount(ord, 1, maxIdx);
        }
      }
    }
  }

  /**
   * Records a single (doc, segment-ord) hit: maps to a global ord if needed, translates to a
   * slot index relative to {@code startTermIndex}, and feeds the count/slot accumulators.
   */
  private void collect(
      int doc,
      int segOrd,
      LongValues toGlobal,
      SegCountGlobal segCounter,
      int maxIdx,
      boolean collectBase)
      throws IOException {
    int ord = (toGlobal != null && segOrd >= 0) ? (int) toGlobal.get(segOrd) : segOrd;

    int arrIdx = ord - startTermIndex;
    // This code handles faceting prefixes, which narrows the range of ords we want to collect.
    // It's not an error for an ord to fall outside this range... we simply want to skip it.
    if (arrIdx >= 0 && arrIdx < nTerms) {
      segCounter.incrementCount(arrIdx, 1, maxIdx);
      if (collectBase) {
        if (collectAcc != null) {
          collectAcc.collect(doc, arrIdx, slotContext);
        }
        if (allBucketsAcc != null) {
          allBucketsAcc.collect(doc, arrIdx, slotContext);
        }
      }
    }
  }
}





// NOTE(review): web-page footer residue, not Java — commented out; should be removed.
// (c) 2015 - 2024 Weber Informatics LLC | Privacy Policy