org.apache.lucene.facet.range.OverlappingLongRangeCounter Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-facet Show documentation
Apache Lucene (module: facet)
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.facet.range;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.lucene.internal.hppc.IntArrayList;
import org.apache.lucene.internal.hppc.IntCursor;
import org.apache.lucene.internal.hppc.LongArrayList;
import org.apache.lucene.internal.hppc.LongIntHashMap;
import org.apache.lucene.util.FixedBitSet;
/**
 * This implementation supports requested ranges that overlap. Because of this, we use a
 * segment-tree to more efficiently aggregate counts into ranges at the end of processing. We also
 * need to worry about double-counting issues since it's possible that multiple elementary
 * intervals, although mutually-exclusive, can roll-up to the same requested range. This creates
 * some complexity with how we need to handle multi-valued documents.
 */
class OverlappingLongRangeCounter extends LongRangeCounter {

  /** segment tree root node */
  private final LongRangeNode root;

  /** elementary interval boundaries used for efficient counting (bsearch to find interval) */
  private final long[] boundaries;

  /**
   * whether-or-not there are elementary interval counts that still need to be rolled up at the end
   */
  private boolean hasUnflushedCounts;

  // Needed only for counting single-valued docs:
  /** counts seen in each elementary interval */
  private int[] singleValuedElementaryIntervalCounts;

  // Needed only for counting multi-valued docs:
  /** whether-or-not an elementary interval has seen at least one match for a single doc */
  private FixedBitSet multiValuedDocElementaryIntervalHits;

  /** whether-or-not a requested range has seen at least one match for a single doc */
  private FixedBitSet multiValuedDocRangeHits;

  // Used during rollup
  private int elementaryIntervalUpto;

  /** number of counted documents that haven't matched any requested ranges */
  private int missingCount;

  OverlappingLongRangeCounter(LongRange[] ranges, int[] countBuffer) {
    super(countBuffer);

    // Build elementary intervals:
    List<InclusiveRange> elementaryIntervals = buildElementaryIntervals(ranges);

    // Build binary tree on top of intervals:
    root = split(0, elementaryIntervals.size(), elementaryIntervals);

    // Set outputs, so we know which range to output for each node in the tree:
    for (int i = 0; i < ranges.length; i++) {
      root.addOutputs(i, ranges[i]);
    }

    // Keep track of elementary interval max boundaries for bsearch:
    boundaries = new long[elementaryIntervals.size()];
    for (int i = 0; i < boundaries.length; i++) {
      boundaries[i] = elementaryIntervals.get(i).end;
    }
  }

  @Override
  void startMultiValuedDoc() {
    super.startMultiValuedDoc();
    // Lazy init a bitset to track the elementary intervals we see of a multi-valued doc:
    if (multiValuedDocElementaryIntervalHits == null) {
      multiValuedDocElementaryIntervalHits = new FixedBitSet(boundaries.length);
    } else {
      multiValuedDocElementaryIntervalHits.clear();
    }
  }

  @Override
  boolean endMultiValuedDoc() {
    assert multiValuedDocElementaryIntervalHits != null : "must call startDoc() first";

    // Short-circuit if the caller didn't specify any ranges to count:
    if (rangeCount() == 0) {
      return false;
    }

    // Do the rollup for this doc:
    // Lazy init a bitset to track the requested ranges seen for this multi-valued doc:
    if (multiValuedDocRangeHits == null) {
      multiValuedDocRangeHits = new FixedBitSet(rangeCount());
    } else {
      multiValuedDocRangeHits.clear();
    }
    elementaryIntervalUpto = 0;
    rollupMultiValued(root);

    // Actually increment the count for each matching range, and see if the doc contributed to
    // at least one:
    boolean docContributedToAtLeastOneRange = false;
    for (int i = multiValuedDocRangeHits.nextSetBit(0); i < multiValuedDocRangeHits.length(); ) {
      increment(i);
      docContributedToAtLeastOneRange = true;
      if (++i < multiValuedDocRangeHits.length()) {
        i = multiValuedDocRangeHits.nextSetBit(i);
      }
    }

    return docContributedToAtLeastOneRange;
  }

  @Override
  int finish() {
    if (hasUnflushedCounts) {
      // Rollup any outstanding counts from single-valued cases:
      missingCount = 0;
      elementaryIntervalUpto = 0;
      rollupSingleValued(root, false);
      return missingCount;
    } else {
      return 0;
    }
  }

  @Override
  protected long[] boundaries() {
    return boundaries;
  }

  @Override
  protected void processSingleValuedHit(int elementaryIntervalNum) {
    // Lazy init:
    if (singleValuedElementaryIntervalCounts == null) {
      singleValuedElementaryIntervalCounts = new int[boundaries.length];
    }

    singleValuedElementaryIntervalCounts[elementaryIntervalNum]++;
    hasUnflushedCounts = true;
  }

  @Override
  protected void processMultiValuedHit(int elementaryIntervalNum) {
    assert multiValuedDocElementaryIntervalHits != null : "must call startDoc() first";
    multiValuedDocElementaryIntervalHits.set(elementaryIntervalNum);
  }

  /**
   * Builds the segment tree recursively: leaves wrap a single elementary interval, inner nodes
   * cover the union of their children. Uses {@code >>> 1} for an overflow-safe midpoint.
   */
  private static LongRangeNode split(int start, int end, List<InclusiveRange> elementaryIntervals) {
    if (start == end - 1) {
      // leaf
      InclusiveRange range = elementaryIntervals.get(start);
      return new LongRangeNode(range.start, range.end, null, null, start);
    } else {
      int mid = (start + end) >>> 1;
      LongRangeNode left = split(start, mid, elementaryIntervals);
      LongRangeNode right = split(mid, end, elementaryIntervals);
      return new LongRangeNode(left.start, right.end, left, right, -1);
    }
  }

  /**
   * Rolls up all the single-valued doc counts. Note that this is done once at the end of processing
   * all documents (as part of {@link #finish()}). This is done in bulk at the end for efficiency
   * purposes (vs. after every document). This works only for cases where documents have a
   * single-value. Multi-valued docs need to get rolled up after each document to ensure there's no
   * double-counting (see {@link #rollupMultiValued(LongRangeNode)})
   */
  private int rollupSingleValued(LongRangeNode node, boolean sawOutputs) {
    int count;
    sawOutputs |= node.outputs != null;
    if (node.left != null) {
      count = rollupSingleValued(node.left, sawOutputs);
      count += rollupSingleValued(node.right, sawOutputs);
    } else {
      // Leaf:
      count = singleValuedElementaryIntervalCounts[elementaryIntervalUpto];
      elementaryIntervalUpto++;
      if (sawOutputs == false) {
        // This is a missing count (no output ranges were seen "above" us):
        missingCount += count;
      }
    }
    if (node.outputs != null) {
      for (IntCursor rangeIndex : node.outputs) {
        increment(rangeIndex.value, count);
      }
    }

    return count;
  }

  /**
   * Rolls up all the multi-valued doc counts. Note that this is done at the end of each document
   * (as part of {@link #endMultiValuedDoc()}). All of the counts contributed by a single document
   * get rolled up into the appropriate ranges in this step. It must be done after each document so
   * that counts don't get double-counted, and so we know whether-or-not an individual doc actually
   * contributed to any of the user-requested ranges.
   */
  private boolean rollupMultiValued(LongRangeNode node) {
    boolean containedHit;
    if (node.left != null) {
      containedHit = rollupMultiValued(node.left);
      containedHit |= rollupMultiValued(node.right);
    } else {
      // Leaf:
      containedHit = multiValuedDocElementaryIntervalHits.get(elementaryIntervalUpto);
      elementaryIntervalUpto++;
    }
    if (containedHit && node.outputs != null) {
      for (IntCursor rangeIndex : node.outputs) {
        multiValuedDocRangeHits.set(rangeIndex.value);
      }
    }

    return containedHit;
  }

  /**
   * Computes the mutually-exclusive elementary intervals (a 1D Venn diagram) induced by the
   * requested ranges' endpoints, covering all of {@code long} space from {@link Long#MIN_VALUE} to
   * {@link Long#MAX_VALUE}.
   */
  private static List<InclusiveRange> buildElementaryIntervals(LongRange[] ranges) {
    // Maps all range inclusive endpoints to int flags; 1
    // = start of interval, 2 = end of interval. We need to
    // track the start vs end case separately because if a
    // given point is both, then it must be its own
    // elementary interval:
    LongIntHashMap endsMap = new LongIntHashMap();

    endsMap.put(Long.MIN_VALUE, 1);
    endsMap.put(Long.MAX_VALUE, 2);

    for (LongRange range : ranges) {
      int index = endsMap.indexOf(range.min);
      if (index < 0) {
        endsMap.indexInsert(index, range.min, 1);
      } else {
        endsMap.indexReplace(index, endsMap.indexGet(index) | 1);
      }
      index = endsMap.indexOf(range.max);
      if (index < 0) {
        endsMap.indexInsert(index, range.max, 2);
      } else {
        endsMap.indexReplace(index, endsMap.indexGet(index) | 2);
      }
    }

    LongArrayList endsList = new LongArrayList(endsMap.size());
    endsList.addAll(endsMap.keys());
    Arrays.sort(endsList.buffer, 0, endsList.size());

    // Build elementaryIntervals (a 1D Venn diagram):
    List<InclusiveRange> elementaryIntervals = new ArrayList<>();
    int upto = 1;
    long v = endsList.get(0);
    long prev;
    if (endsMap.get(v) == 3) {
      // This point is both a start and an end, so it gets its own singleton interval:
      elementaryIntervals.add(new InclusiveRange(v, v));
      prev = v + 1;
    } else {
      prev = v;
    }

    while (upto < endsList.size()) {
      v = endsList.get(upto);
      int flags = endsMap.get(v);
      if (flags == 3) {
        // This point is both an end and a start; we need to
        // separate it:
        if (v > prev) {
          elementaryIntervals.add(new InclusiveRange(prev, v - 1));
        }
        elementaryIntervals.add(new InclusiveRange(v, v));
        prev = v + 1;
      } else if (flags == 1) {
        // This point is only the start of an interval;
        // attach it to next interval:
        if (v > prev) {
          elementaryIntervals.add(new InclusiveRange(prev, v - 1));
        }
        prev = v;
      } else {
        assert flags == 2;
        // This point is only the end of an interval; attach
        // it to last interval:
        elementaryIntervals.add(new InclusiveRange(prev, v));
        prev = v + 1;
      }
      upto++;
    }

    return elementaryIntervals;
  }

  /** Holds one node of the segment tree. */
  public static final class LongRangeNode {
    final LongRangeNode left;
    final LongRangeNode right;

    // Our range, inclusive:
    final long start;
    final long end;

    // If we are a leaf, the index into elementary ranges that we point to:
    final int elementaryIntervalIndex;

    // Which range indices to output when a query goes
    // through this node:
    IntArrayList outputs;

    public LongRangeNode(
        long start,
        long end,
        LongRangeNode left,
        LongRangeNode right,
        int elementaryIntervalIndex) {
      this.start = start;
      this.end = end;
      this.left = left;
      this.right = right;
      this.elementaryIntervalIndex = elementaryIntervalIndex;
    }

    @Override
    public String toString() {
      StringBuilder sb = new StringBuilder();
      toString(sb, 0);
      return sb.toString();
    }

    static void indent(StringBuilder sb, int depth) {
      for (int i = 0; i < depth; i++) {
        sb.append(" ");
      }
    }

    /** Recursively assigns range outputs to each node. */
    void addOutputs(int index, LongRange range) {
      if (start >= range.min && end <= range.max) {
        // Our range is fully included in the incoming
        // range; add to our output list:
        if (outputs == null) {
          outputs = new IntArrayList();
        }
        outputs.add(index);
      } else if (left != null) {
        assert right != null;
        // Recurse:
        left.addOutputs(index, range);
        right.addOutputs(index, range);
      }
    }

    void toString(StringBuilder sb, int depth) {
      indent(sb, depth);
      if (left == null) {
        assert right == null;
        sb.append("leaf: ").append(start).append(" to ").append(end);
      } else {
        sb.append("node: ").append(start).append(" to ").append(end);
      }
      if (outputs != null) {
        sb.append(" outputs=");
        sb.append(outputs);
      }
      sb.append('\n');

      if (left != null) {
        assert right != null;
        left.toString(sb, depth + 1);
        right.toString(sb, depth + 1);
      }
    }
  }
}