/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.request;

import java.io.IOException;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.CompletionService;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Executor;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.Future;
import java.util.function.Predicate;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.FacetParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.schema.FieldType;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.BoundedTreeSet;

/** A class which performs per-segment field faceting for single-valued string fields. */
class PerSegmentSingleValuedFaceting {

  // input params
  SolrIndexSearcher searcher;
  DocSet docs;
  String fieldName;
  int offset;
  int limit;
  int mincount;
  boolean missing;
  String sort;
  String prefix;

  private final Predicate<BytesRef> termFilter;

  int nThreads;

  public PerSegmentSingleValuedFaceting(
      SolrIndexSearcher searcher,
      DocSet docs,
      String fieldName,
      int offset,
      int limit,
      int mincount,
      boolean missing,
      String sort,
      String prefix,
      String contains,
      boolean ignoreCase) {
    this(
        searcher,
        docs,
        fieldName,
        offset,
        limit,
        mincount,
        missing,
        sort,
        prefix,
        new SubstringBytesRefFilter(contains, ignoreCase));
  }

  public PerSegmentSingleValuedFaceting(
      SolrIndexSearcher searcher,
      DocSet docs,
      String fieldName,
      int offset,
      int limit,
      int mincount,
      boolean missing,
      String sort,
      String prefix,
      Predicate<BytesRef> filter) {
    this.searcher = searcher;
    this.docs = docs;
    this.fieldName = fieldName;
    this.offset = offset;
    this.limit = limit;
    this.mincount = mincount;
    this.missing = missing;
    this.sort = sort;
    this.prefix = prefix;
    this.termFilter = filter;
  }

  public void setNumThreads(int threads) {
    nThreads = threads;
  }

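  /**
   * Computes the facet counts. Each leaf (segment) is counted by a {@link SegFacet} task run on
   * the supplied {@link Executor}; the per-segment results are then merged term by term via a
   * priority queue over the segments' term enumerators.
   */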
  NamedList<Integer> getFacetCounts(Executor executor) throws IOException {

    CompletionService<SegFacet> completionService = new ExecutorCompletionService<>(executor);

    final List<LeafReaderContext> leaves = searcher.getTopReaderContext().leaves();
    // The list of pending tasks that aren't immediately submitted
    // TODO: Is there a completion service, or a delegating executor that can
    // limit the number of concurrent tasks submitted to a bigger executor?
    Deque<Callable<SegFacet>> pending = new ArrayDeque<>();

    int threads = nThreads <= 0 ? Integer.MAX_VALUE : nThreads;

    for (final LeafReaderContext leave : leaves) {
      final SegFacet segFacet = new SegFacet(leave);

      Callable<SegFacet> task =
          () -> {
            segFacet.countTerms();
            return segFacet;
          };

      // TODO: if limiting threads, submit by largest segment first?

      if (--threads >= 0) {
        completionService.submit(task);
      } else {
        pending.add(task);
      }
    }

    // now merge the per-segment results
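    // Each completed segment is positioned on its first in-range term; the queue orders segments
    // by that term so equal terms from different segments can be summed together before being
    // handed to the collector.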
    PriorityQueue<SegFacet> queue =
        new PriorityQueue<>(leaves.size()) {
          @Override
          protected boolean lessThan(SegFacet a, SegFacet b) {
            return a.tempBR.compareTo(b.tempBR) < 0;
          }
        };

    boolean hasMissingCount = false;
    int missingCount = 0;
    for (int i = 0, c = leaves.size(); i < c; i++) {
      SegFacet seg = null;

      try {
        Future<SegFacet> future = completionService.take();
        seg = future.get();
        if (!pending.isEmpty()) {
          completionService.submit(pending.removeFirst());
        }
      } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
      } catch (ExecutionException e) {
        Throwable cause = e.getCause();
        if (cause instanceof RuntimeException) {
          throw (RuntimeException) cause;
        } else {
          throw new SolrException(
              SolrException.ErrorCode.SERVER_ERROR,
              "Error in per-segment faceting on field: " + fieldName,
              cause);
        }
      }

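      // With no prefix, startTermIndex is -1 and counts[0] holds the docs that have no value for
      // the field (the "missing" bucket); pull that out before merging real terms.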
      if (seg.startTermIndex < seg.endTermIndex) {
        if (seg.startTermIndex == -1) {
          hasMissingCount = true;
          missingCount += seg.counts[0];
          seg.pos = 0;
        } else {
          seg.pos = seg.startTermIndex;
        }
        if (seg.pos < seg.endTermIndex && (mincount < 1 || seg.hasAnyCount)) {
          seg.tenum = seg.si.termsEnum();
          seg.tenum.seekExact(seg.pos);
          seg.tempBR = seg.tenum.term();
          queue.add(seg);
        }
      }
    }

    if (limit == 0) {
      NamedList<Integer> res = new NamedList<>();
      return finalize(res, missingCount, hasMissingCount);
    }

    FacetCollector collector;
    if (sort.equals(FacetParams.FACET_SORT_COUNT)
        || sort.equals(FacetParams.FACET_SORT_COUNT_LEGACY)) {
      collector = new CountSortedFacetCollector(offset, limit, mincount);
    } else {
      collector = new IndexSortedFacetCollector(offset, limit, mincount);
    }

    BytesRefBuilder val = new BytesRefBuilder();

    while (queue.size() > 0) {
      SegFacet seg = queue.top();

      boolean collect = termFilter == null || termFilter.test(seg.tempBR);

      // we will normally end up advancing the term enum for this segment
      // while still using "val", so we need to make a copy since the BytesRef
      // may be shared across calls.
      if (collect) {
        val.copyBytes(seg.tempBR);
      }

      int count = 0;

      do {
        if (collect) {
          count += seg.counts[seg.pos - seg.startTermIndex];
        }

        // if mincount>0 then seg.pos++ can skip ahead to the next non-zero entry.
        do {
          ++seg.pos;
        } while ((seg.pos < seg.endTermIndex) // stop incrementing before we run off the end
            && (seg.tenum.next() != null
                || true) // move term enum forward with position -- don't care about value
            && (mincount > 0) // only skip ahead if mincount > 0
            && (seg.counts[seg.pos - seg.startTermIndex] == 0) // check zero count
        );

        if (seg.pos >= seg.endTermIndex) {
          queue.pop();
          seg = queue.top();
        } else {
          seg.tempBR = seg.tenum.term();
          seg = queue.updateTop();
        }
      } while (seg != null && val.get().compareTo(seg.tempBR) == 0);

      if (collect) {
        boolean stop = collector.collect(val.get(), count);
        if (stop) break;
      }
    }

    NamedList<Integer> res = collector.getFacetCounts();

    // convert labels to readable form
    FieldType ft = searcher.getSchema().getFieldType(fieldName);
    int sz = res.size();
    for (int i = 0; i < sz; i++) {
      res.setName(i, ft.indexedToReadable(res.getName(i)));
    }

    return finalize(res, missingCount, hasMissingCount);
  }

  private NamedList<Integer> finalize(
      NamedList<Integer> res, int missingCount, boolean hasMissingCount) throws IOException {
    if (missing) {
      if (!hasMissingCount) {
        missingCount = SimpleFacets.getFieldMissingCount(searcher, docs, fieldName);
      }
      res.add(null, missingCount);
    }

    return res;
  }

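  /**
   * Per-segment faceting state: ordinal counts for a single leaf reader, plus the cursor fields
   * (pos, tenum, tempBR) used while merging segments.
   */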
  class SegFacet {
    LeafReaderContext context;

    SegFacet(LeafReaderContext context) {
      this.context = context;
    }

    SortedDocValues si;
    int startTermIndex;
    int endTermIndex;
    int[] counts;

    // whether this segment has any non-zero term counts
    // used to ignore insignificant segments when mincount>0
    boolean hasAnyCount = false;

    int pos; // only used when merging
    TermsEnum tenum; // only used when merging

    BytesRef tempBR = new BytesRef();

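    /**
     * Tallies, for each document of the base DocSet that belongs to this segment, the ordinal of
     * its value in the field's SortedDocValues. With a prefix, only ordinals inside
     * [startTermIndex, endTermIndex) are counted; without one, ord+1 indexing reserves counts[0]
     * for documents missing a value.
     */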
    void countTerms() throws IOException {
      si = DocValues.getSorted(context.reader(), fieldName);
      // SolrCore.log.info("reader= " + reader + "  FC=" + System.identityHashCode(si));

      if (prefix != null) {
        BytesRefBuilder prefixRef = new BytesRefBuilder();
        prefixRef.copyChars(prefix);
        startTermIndex = si.lookupTerm(prefixRef.get());
        if (startTermIndex < 0) startTermIndex = -startTermIndex - 1;
        prefixRef.append(UnicodeUtil.BIG_TERM);
        // TODO: we could constrain the lower endpoint if we had a binarySearch method that allowed
        // passing start/end
        endTermIndex = si.lookupTerm(prefixRef.get());
        assert endTermIndex < 0;
        endTermIndex = -endTermIndex - 1;
      } else {
        startTermIndex = -1;
        endTermIndex = si.getValueCount();
      }
      final int nTerms = endTermIndex - startTermIndex;
      if (nTerms == 0) return;

      // count collection array only needs to be as big as the number of terms we are
      // going to collect counts for.
      final int[] counts = this.counts = new int[nTerms];
      final DocIdSetIterator iter = docs.iterator(context); // this set only includes live docs

      if (iter == null) {
        // `DocIdSet.iterator()` can return null instead of empty DocIdSetIterator
        return;
      }

      if (prefix == null) {
        // specialized version when collecting counts for all terms
        int doc;
        while ((doc = iter.nextDoc()) < DocIdSetIterator.NO_MORE_DOCS) {
          int t;
          if (si.advanceExact(doc)) {
            t = 1 + si.ordValue();
          } else {
            t = 0;
          }
          hasAnyCount = hasAnyCount || t > 0; // counts[0] == missing counts
          counts[t]++;
        }
      } else {
        // version that adjusts term numbers because we aren't collecting the full range
        int doc;
        while ((doc = iter.nextDoc()) < DocIdSetIterator.NO_MORE_DOCS) {
          int term;
          if (si.advanceExact(doc)) {
            term = si.ordValue();
          } else {
            term = -1;
          }
          int arrIdx = term - startTermIndex;
          if (arrIdx >= 0 && arrIdx < nTerms) {
            counts[arrIdx]++;
            hasAnyCount = true;
          }
        }
      }
    }
  }
}

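// Receives the merged (term, count) pairs and accumulates them into the response NamedList.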
abstract class FacetCollector {
  /** return true to stop collection */
  public abstract boolean collect(BytesRef term, int count);

  public abstract NamedList<Integer> getFacetCounts();
}

// This collector expects facets to be collected in index order
class CountSortedFacetCollector extends FacetCollector {
  private final CharsRefBuilder spare = new CharsRefBuilder();

  final int offset;
  final int limit;
  final int maxsize;
  final BoundedTreeSet<SimpleFacets.CountPair<String, Integer>> queue;

  int min; // the smallest value in the top 'N' values

  public CountSortedFacetCollector(int offset, int limit, int mincount) {
    this.offset = offset;
    this.limit = limit;
    maxsize = limit > 0 ? offset + limit : Integer.MAX_VALUE - 1;
    queue = new BoundedTreeSet<>(maxsize);
    min = mincount - 1; // the smallest value in the top 'N' values
  }

  @Override
  public boolean collect(BytesRef term, int count) {
    if (count > min) {
      // NOTE: we use c>min rather than c>=min as an optimization because we are going in
      // index order, so we already know that the keys are ordered.  This can be very
      // important if a lot of the counts are repeated (like zero counts would be).
      spare.copyUTF8Bytes(term);
      queue.add(new SimpleFacets.CountPair<>(spare.toString(), count));
      if (queue.size() >= maxsize) min = queue.last().val;
    }
    return false;
  }

  @Override
  public NamedList<Integer> getFacetCounts() {
    NamedList<Integer> res = new NamedList<>();
    int off = offset;
    int lim = limit >= 0 ? limit : Integer.MAX_VALUE;
    // now select the right page from the results
    for (SimpleFacets.CountPair<String, Integer> p : queue) {
      if (--off >= 0) continue;
      if (--lim < 0) break;
      res.add(p.key, p.val);
    }
    return res;
  }
}

// This collector expects facets to be collected in index order
class IndexSortedFacetCollector extends FacetCollector {
  private final CharsRefBuilder spare = new CharsRefBuilder();

  int offset;
  int limit;
  final int mincount;
  final NamedList<Integer> res = new NamedList<>();

  public IndexSortedFacetCollector(int offset, int limit, int mincount) {
    this.offset = offset;
    this.limit = limit > 0 ? limit : Integer.MAX_VALUE;
    this.mincount = mincount;
  }

  @Override
  public boolean collect(BytesRef term, int count) {
    if (count < mincount) {
      return false;
    }

    if (offset > 0) {
      offset--;
      return false;
    }

    if (limit > 0) {
      spare.copyUTF8Bytes(term);
      res.add(spare.toString(), count);
      limit--;
    }

    return limit <= 0;
  }

  @Override
  public NamedList<Integer> getFacetCounts() {
    return res;
  }
}