com.sindicetech.siren.search.spans.NearSpans Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of siren-core Show documentation
SIREn core module
The newest version!
/**
 * Copyright (c) 2014, Sindice Limited. All Rights Reserved.
 *
 * This file is part of the SIREn project.
 *
 * SIREn is a free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * SIREn is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public
 * License along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package com.sindicetech.siren.search.spans;

import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.IntsRef;

import com.sindicetech.siren.index.DocsAndNodesIterator;
import com.sindicetech.siren.util.NodeUtils;

import java.io.IOException;
import java.util.List;

/**
 * A spans that is formed from the sub-spans of a NearSpanQuery
 * where the sub-spans have a maximum slop between them.
 *
 * <p>This base class keeps all the sub-spans aligned on the same candidate
 * document and on the same node. The current match position is exposed through
 * {@link #doc()}, {@link #node()}, {@link #start()} and {@link #end()};
 * {@link #matchStart} and {@link #matchEnd} are only ever reset to -1 here, so
 * they are presumably assigned by the concrete subclasses.
 *
 * <p>The score is computed as the sum of the sub-spans' score weighted by the sloppy weight.
 */
abstract class NearSpans extends Spans {

  /**
   * The slop value received at construction time.
   */
  protected final int allowedSlop;

  /**
   * The spans in the same order as the NearSpanQuery
   */
  protected final Spans[] subSpans;

  /**
   * The spans array that will be used for the sorting. This is necessary as we do not want to lose the order of
   * spans in {@link #subSpans}.
   */
  private final Spans[] subSpansSort;

  /** Current candidate document; -1 before the first advance. */
  private int matchDoc = -1;
  /** Current node; {@link DocsAndNodesIterator#NULL_NODE} until the sub-spans are aligned on a node. */
  private IntsRef matchNode = DocsAndNodesIterator.NULL_NODE;
  /** Start of the current match, or the sentinel value -1 when reset. */
  protected int matchStart = -1;
  /** End of the current match, or the sentinel value -1 when reset. */
  protected int matchEnd = -1;

  // Even though the array is probably almost sorted, InPlaceMergeSorter will likely
  // perform better since it has a lower overhead than TimSorter for small arrays
  // TODO: Investigate if a priority queue will not be more performant than multiple sorters

  /**
   * A spans array sorter based on the document identifier
   */
  private final InPlaceMergeSorter docSorter = new InPlaceMergeSorter() {
    @Override
    protected void swap(final int i, final int j) {
      ArrayUtil.swap(subSpansSort, i, j);
    }
    @Override
    protected int compare(final int i, final int j) {
      // Explicit three-way comparison instead of a subtraction: the difference
      // of two ints can overflow and report the wrong ordering.
      final int docI = subSpansSort[i].doc();
      final int docJ = subSpansSort[j].doc();
      return (docI < docJ) ? -1 : ((docI == docJ) ? 0 : 1);
    }
  };

  /**
   * A spans array sorter based on the node
   */
  private final InPlaceMergeSorter nodeSorter = new InPlaceMergeSorter() {
    @Override
    protected void swap(final int i, final int j) {
      ArrayUtil.swap(subSpansSort, i, j);
    }
    @Override
    protected int compare(int i, int j) {
      return NodeUtils.compare(subSpansSort[i].node(), subSpansSort[j].node());
    }
  };

  /**
   * Create a near spans over the given sub-spans.
   *
   * @param spans the sub-spans, in the same order as the clauses of the query
   * @param slop the allowed slop, stored in {@link #allowedSlop}
   * @throws IllegalArgumentException if less than two sub-spans are given
   * @throws IOException declared for the benefit of subclasses; not thrown by this constructor itself
   */
  public NearSpans(final List<? extends Spans> spans, final int slop) throws IOException {
    if (spans.size() < 2) {
      throw new IllegalArgumentException("Less than 2 spans clauses");
    }

    allowedSlop = slop;

    subSpans = new Spans[spans.size()];
    subSpansSort = new Spans[spans.size()];
    for (int i = 0; i < spans.size(); i++) {
      subSpans[i] = spans.get(i);
      subSpansSort[i] = subSpans[i]; // reordered in toSameCandidateDocument() and toSameNode()
    }
  }

  /**
   * @return the current candidate document, -1 before the first advance, or
   *         {@link DocsAndNodesIterator#NO_MORE_DOC} once a sub-spans is exhausted
   */
  @Override
  public int doc() { return matchDoc; }

  /**
   * @return the current node, {@link DocsAndNodesIterator#NULL_NODE} after moving to a
   *         new candidate document, or {@link DocsAndNodesIterator#NO_MORE_NOD} once a
   *         sub-spans has no more node
   */
  @Override
  public IntsRef node() { return matchNode; }

  /**
   * @return the start of the current match, or -1 when no match is set
   */
  @Override
  public int start() { return matchStart; }

  /**
   * @return the end of the current match, or -1 when no match is set
   */
  @Override
  public int end() { return matchEnd; }

  /**
   * @return the internal array of sub-spans (not a copy), in the order of the query clauses
   */
  public Spans[] getSubSpans() {
    return subSpans;
  }

  /**
   * @return the smallest cost among the sub-spans
   */
  @Override
  public long cost() {
    long minCost = Long.MAX_VALUE;
    for (int i = 0; i < subSpans.length; i++) {
      minCost = Math.min(minCost, subSpans[i].cost());
    }
    return minCost;
  }

  /**
   * Move every sub-spans to its next candidate document, then align all of
   * them on a common candidate document.
   *
   * @return false, with {@link #doc()} set to {@link DocsAndNodesIterator#NO_MORE_DOC},
   *         as soon as one sub-spans is exhausted; true otherwise
   */
  @Override
  public boolean nextCandidateDocument() throws IOException {
    // reset matchNode, matchStart and matchEnd
    this.resetMatchNode();
    this.resetStartAndEnd();

    // Advance all the spans to their next candidate
    for (int i = 0; i < subSpans.length; i++) {
      if (!subSpans[i].nextCandidateDocument()) {
        matchDoc = DocsAndNodesIterator.NO_MORE_DOC;
        return false;
      }
    }

    // Ensure that all the spans are on the same candidate
    return toSameCandidateDocument();
  }

  /**
   * Move every sub-spans to or after the target document, then align all of
   * them on a common candidate document.
   *
   * @param target the document to skip to
   * @return false, with {@link #doc()} set to {@link DocsAndNodesIterator#NO_MORE_DOC},
   *         as soon as one sub-spans is exhausted; true otherwise
   */
  @Override
  public boolean skipToCandidate(final int target) throws IOException {
    // reset matchNode, matchStart and matchEnd
    this.resetMatchNode();
    this.resetStartAndEnd();

    // Advance all the spans to or after the target
    for (int i = 0; i < subSpans.length; i++) {
      if (!subSpans[i].skipToCandidate(target)) {
        matchDoc = DocsAndNodesIterator.NO_MORE_DOC;
        return false;
      }
    }

    // Ensure that all the spans are on the same candidate
    return toSameCandidateDocument();
  }

  /**
   * Advance the spans to the same candidate document
   *
   * <p>The sub-spans are sorted by document, then the ones that are behind are
   * skipped, in a round-robin fashion, to the largest document seen so far
   * until all of them report the same document.
   *
   * @return false if one sub-spans is exhausted while skipping; true once all
   *         the sub-spans are on the same document
   */
  private boolean toSameCandidateDocument() throws IOException {
    docSorter.sort(0, subSpansSort.length);
    int firstIndex = 0;
    // after the sort, the last entry holds the largest document
    int maxDoc = subSpansSort[subSpansSort.length - 1].doc();

    while (subSpansSort[firstIndex].doc() != maxDoc) {
      if (!subSpansSort[firstIndex].skipToCandidate(maxDoc)) {
        matchDoc = DocsAndNodesIterator.NO_MORE_DOC;
        return false;
      }
      // the skipped spans may have overshot: it becomes the new reference
      maxDoc = subSpansSort[firstIndex].doc();
      if (++firstIndex == subSpansSort.length) {
        firstIndex = 0;
      }
    }

    // assert doc - loop will be removed by jit if assert is disabled
    for (int i = 0; i < subSpansSort.length; i++) {
      assert (subSpansSort[i].doc() == maxDoc)
              : " NearSpans.toSameCandidateDocument() spans " + subSpansSort[i]
              + "\n at doc " + subSpansSort[i].doc()
              + ", but should be at " + maxDoc;
    }

    matchDoc = subSpans[0].doc();
    return true;
  }

  /**
   * Move every sub-spans to its next node, then align all of them on a common
   * node.
   *
   * @return false, with {@link #node()} set to {@link DocsAndNodesIterator#NO_MORE_NOD},
   *         as soon as one sub-spans has no more node; true otherwise
   */
  @Override
  public boolean nextNode() throws IOException {
    // reset matchStart and matchEnd
    this.resetStartAndEnd();

    // Advance all the spans to their next node
    for (int i = 0; i < subSpans.length; i++) {
      if (!subSpans[i].nextNode()) {
        matchNode = DocsAndNodesIterator.NO_MORE_NOD;
        return false;
      }
    }

    // Ensure that all the spans are on the same node
    return toSameNode();
  }

  /**
   * Advance the spans to the same node
   *
   * <p>Same principle as {@link #toSameCandidateDocument()}, except that the
   * sub-spans that are behind are advanced with repeated calls to
   * {@link Spans#nextNode()} until they reach or pass the largest node.
   *
   * @return false if one sub-spans has no more node; true once all the
   *         sub-spans are on the same node
   */
  private boolean toSameNode() throws IOException {
    nodeSorter.sort(0, subSpansSort.length);
    int firstIndex = 0;
    // after the sort, the last entry holds the largest node
    IntsRef maxNode = subSpansSort[subSpansSort.length - 1].node();

    while (NodeUtils.compare(subSpansSort[firstIndex].node(), maxNode) != 0) {
      // Advance the first span to or after the max node
      do {
        if (!subSpansSort[firstIndex].nextNode()) {
          matchNode = DocsAndNodesIterator.NO_MORE_NOD;
          return false;
        }
      // ensure that we have advanced to or after the max node
      } while (NodeUtils.compare(subSpansSort[firstIndex].node(), maxNode) < 0);

      // the advanced spans may have overshot: it becomes the new reference
      maxNode = subSpansSort[firstIndex].node();
      if (++firstIndex == subSpansSort.length) {
        firstIndex = 0;
      }
    }

    // assert node - loop will be removed by jit if assert is disabled
    for (int i = 0; i < subSpansSort.length; i++) {
      assert (subSpansSort[i].node().equals(maxNode))
              : " NearSpans.toSameNode() spans " + subSpansSort[i]
              + "\n at node " + subSpansSort[i].node()
              + ", but should be at " + maxNode;
    }

    matchNode = subSpans[0].node();
    return true;
  }

  /**
   * Compute the score of the current node.
   *
   * @return the sum of the sub-spans' scores in the node, multiplied by the
   *         sloppy weight obtained for the current slop (both
   *         {@code sloppyWeight} and {@code getSlop} are declared outside of
   *         this class)
   */
  @Override
  public float scoreInNode() throws IOException {
    float score = 0.0f;

    // Sum the score of the sub-spans
    for (int i = 0; i < subSpans.length; i++) {
      score += subSpans[i].scoreInNode();
    }

    // compute the sloppy weight
    float sloppyWeight = this.sloppyWeight(this.getSlop());
    return score * sloppyWeight;
  }

  /**
   * Reset the {@link #matchNode} to {@link DocsAndNodesIterator#NULL_NODE}
   */
  private void resetMatchNode() {
    this.matchNode = DocsAndNodesIterator.NULL_NODE;
  }

  /**
   * Reset the {@link #matchStart} and {@link #matchEnd} to sentinel value -1
   */
  private void resetStartAndEnd() {
    this.matchStart = -1;
    this.matchEnd = -1;
  }

  /**
   * Check whether two {@link com.sindicetech.siren.search.spans.Spans} in the same document are ordered.
   *
   * @param start1 start position of the first spans
   * @param end1 end position of the first spans
   * @param start2 start position of the second spans
   * @param end2 end position of the second spans
   * @return true iff spans1 starts before spans2
   *              or the spans start at the same position,
   *              and spans1 ends before spans2.
   */
  static final boolean isSpansOrdered(final int start1, final int end1, final int start2, final int end2) {
    return (start1 == start2) ? (end1 < end2) : (start1 < start2);
  }

  /**
   * @return the class name followed by "@" and either "START" (no document
   *         reached yet), "END" (no more document) or the current
   *         doc-node-start-end position
   */
  @Override
  public String toString() {
    int doc = this.doc();
    IntsRef node = this.node();
    int start = this.start();
    int end = this.end();

    return getClass().getName() + "@" +
      (doc == -1 ? "START" : (doc == DocsAndNodesIterator.NO_MORE_DOC) ? "END" : doc + "-" + node + "-" + start + "-" + end);
  }

}