org.apache.lucene.codecs.blocktreeords.OrdsIntersectTermsEnum Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-codecs Show documentation
Codecs and postings formats for Apache Lucene.
The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.blocktreeords;

import java.io.IOException;
import org.apache.lucene.codecs.blocktreeords.FSTOrdsOutputs.Output;
import org.apache.lucene.index.BaseTermsEnum;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.TermState;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.automaton.ByteRunnable;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.TransitionAccessor;
import org.apache.lucene.util.fst.FST;

// NOTE: cannot seek!
final class OrdsIntersectTermsEnum extends BaseTermsEnum {
  final IndexInput in;

  private OrdsIntersectTermsEnumFrame[] stack;

  @SuppressWarnings({"rawtypes", "unchecked"})
  private FST.Arc[] arcs = new FST.Arc[5];

  final ByteRunnable byteRunnable;
  protected final TransitionAccessor transitionAccessor;
  private final BytesRef commonSuffixRef;

  private OrdsIntersectTermsEnumFrame currentFrame;

  private final BytesRef term = new BytesRef();

  private final FST.BytesReader fstReader;

  final OrdsFieldReader fr;

  private BytesRef savedStartTerm;

  // TODO: in some cases we can filter by length?  eg
  // regexp foo*bar must be at least length 6 bytes
  public OrdsIntersectTermsEnum(OrdsFieldReader fr, CompiledAutomaton compiled, BytesRef startTerm)
      throws IOException {
    // if (DEBUG) {
    //   System.out.println("\nintEnum.init seg=" + segment + " commonSuffix=" +
    // ToStringUtils.bytesRefToString(compiled.commonSuffixRef));
    // }
    this.fr = fr;
    this.byteRunnable = compiled.getByteRunnable();
    this.transitionAccessor = compiled.getTransitionAccessor();
    commonSuffixRef = compiled.commonSuffixRef;
    in = fr.parent.in.clone();
    stack = new OrdsIntersectTermsEnumFrame[5];
    for (int idx = 0; idx < stack.length; idx++) {
      stack[idx] = new OrdsIntersectTermsEnumFrame(this, idx);
    }
    for (int arcIdx = 0; arcIdx < arcs.length; arcIdx++) {
      arcs[arcIdx] = new FST.Arc<>();
    }

    if (fr.index == null) {
      fstReader = null;
    } else {
      fstReader = fr.index.getBytesReader();
    }

    // TODO: if the automaton is "smallish" we really
    // should use the terms index to seek at least to
    // the initial term and likely to subsequent terms
    // (or, maybe just fallback to ATE for such cases).
    // Else the seek cost of loading the frames will be
    // too costly.

    final FST.Arc arc = fr.index.getFirstArc(arcs[0]);
    // Empty string prefix must have an output in the index!
    assert arc.isFinal();

    // Special pushFrame since it's the first one:
    final OrdsIntersectTermsEnumFrame f = stack[0];
    f.fp = f.fpOrig = fr.rootBlockFP;
    f.prefix = 0;
    f.setState(0);
    f.arc = arc;
    f.outputPrefix = arc.output();
    f.load(fr.rootCode);

    // for assert:
    assert setSavedStartTerm(startTerm);

    currentFrame = f;
    if (startTerm != null) {
      seekToStartTerm(startTerm);
    }
  }

  // only for assert:
  private boolean setSavedStartTerm(BytesRef startTerm) {
    savedStartTerm = startTerm == null ? null : BytesRef.deepCopyOf(startTerm);
    return true;
  }

  @Override
  public TermState termState() throws IOException {
    currentFrame.decodeMetaData();
    return currentFrame.termState.clone();
  }

  private OrdsIntersectTermsEnumFrame getFrame(int ord) throws IOException {
    if (ord >= stack.length) {
      final OrdsIntersectTermsEnumFrame[] next =
          new OrdsIntersectTermsEnumFrame
              [ArrayUtil.oversize(1 + ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
      System.arraycopy(stack, 0, next, 0, stack.length);
      for (int stackOrd = stack.length; stackOrd < next.length; stackOrd++) {
        next[stackOrd] = new OrdsIntersectTermsEnumFrame(this, stackOrd);
      }
      stack = next;
    }
    assert stack[ord].ord == ord;
    return stack[ord];
  }

  private FST.Arc getArc(int ord) {
    if (ord >= arcs.length) {
      @SuppressWarnings({"rawtypes", "unchecked"})
      final FST.Arc[] next =
          new FST.Arc[ArrayUtil.oversize(1 + ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
      System.arraycopy(arcs, 0, next, 0, arcs.length);
      for (int arcOrd = arcs.length; arcOrd < next.length; arcOrd++) {
        next[arcOrd] = new FST.Arc<>();
      }
      arcs = next;
    }
    return arcs[ord];
  }

  private OrdsIntersectTermsEnumFrame pushFrame(int state) throws IOException {
    final OrdsIntersectTermsEnumFrame f = getFrame(currentFrame == null ? 0 : 1 + currentFrame.ord);

    f.fp = f.fpOrig = currentFrame.lastSubFP;
    f.prefix = currentFrame.prefix + currentFrame.suffix;
    // if (DEBUG) System.out.println("    pushFrame state=" + state + " prefix=" + f.prefix);
    f.setState(state);

    // Walk the arc through the index -- we only
    // "bother" with this so we can get the floor data
    // from the index and skip floor blocks when
    // possible:
    FST.Arc arc = currentFrame.arc;
    int idx = currentFrame.prefix;
    assert currentFrame.suffix > 0;
    Output output = currentFrame.outputPrefix;
    while (idx < f.prefix) {
      final int target = term.bytes[idx] & 0xff;
      // TODO: we could be more efficient for the next()
      // case by using current arc as starting point,
      // passed to findTargetArc
      arc = fr.index.findTargetArc(target, arc, getArc(1 + idx), fstReader);
      assert arc != null;
      output = OrdsBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output());
      idx++;
    }

    f.arc = arc;
    f.outputPrefix = output;
    assert arc.isFinal();
    f.load(OrdsBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput()));
    return f;
  }

  @Override
  public BytesRef term() {
    return term;
  }

  // TODO: do we need ord() here?  OrdsIntersectTermsEnumFrame tracks termOrd but it may be buggy!

  @Override
  public int docFreq() throws IOException {
    // if (DEBUG) System.out.println("BTIR.docFreq");
    currentFrame.decodeMetaData();
    // if (DEBUG) System.out.println("  return " + currentFrame.termState.docFreq);
    return currentFrame.termState.docFreq;
  }

  @Override
  public long totalTermFreq() throws IOException {
    currentFrame.decodeMetaData();
    return currentFrame.termState.totalTermFreq;
  }

  @Override
  public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
    currentFrame.decodeMetaData();
    return fr.parent.postingsReader.postings(fr.fieldInfo, currentFrame.termState, reuse, flags);
  }

  @Override
  public ImpactsEnum impacts(int flags) throws IOException {
    currentFrame.decodeMetaData();
    return fr.parent.postingsReader.impacts(fr.fieldInfo, currentFrame.termState, flags);
  }

  private int getState() {
    int state = currentFrame.state;
    for (int idx = 0; idx < currentFrame.suffix; idx++) {
      state =
          byteRunnable.step(
              state, currentFrame.suffixBytes[currentFrame.startBytePos + idx] & 0xff);
      assert state != -1;
    }
    return state;
  }

  // NOTE: specialized to only doing the first-time
  // seek, but we could generalize it to allow
  // arbitrary seekExact/Ceil.  Note that this is a
  // seekFloor!
  private void seekToStartTerm(BytesRef target) throws IOException {
    // if (DEBUG) System.out.println("seek to startTerm=" + target.utf8ToString());
    assert currentFrame.ord == 0;
    if (term.length < target.length) {
      term.bytes = ArrayUtil.grow(term.bytes, target.length);
    }
    FST.Arc arc = arcs[0];
    assert arc == currentFrame.arc;

    for (int idx = 0; idx <= target.length; idx++) {

      while (true) {
        final int savePos = currentFrame.suffixesReader.getPosition();
        final int saveStartBytePos = currentFrame.startBytePos;
        final int saveSuffix = currentFrame.suffix;
        final long saveLastSubFP = currentFrame.lastSubFP;
        final int saveTermBlockOrd = currentFrame.termState.termBlockOrd;

        final boolean isSubBlock = currentFrame.next();

        // if (DEBUG) System.out.println("    cycle ent=" + currentFrame.nextEnt + " (of " +
        // currentFrame.entCount + ") prefix=" + currentFrame.prefix + " suffix=" +
        // currentFrame.suffix + " isBlock=" + isSubBlock + " firstLabel=" + (currentFrame.suffix ==
        // 0 ? "" : (currentFrame.suffixBytes[currentFrame.startBytePos])&0xff));
        term.length = currentFrame.prefix + currentFrame.suffix;
        if (term.bytes.length < term.length) {
          term.bytes = ArrayUtil.grow(term.bytes, term.length);
        }
        System.arraycopy(
            currentFrame.suffixBytes,
            currentFrame.startBytePos,
            term.bytes,
            currentFrame.prefix,
            currentFrame.suffix);

        if (isSubBlock && StringHelper.startsWith(target, term)) {
          // Recurse
          // if (DEBUG) System.out.println("      recurse!");
          currentFrame = pushFrame(getState());
          break;
        } else {
          final int cmp = term.compareTo(target);
          if (cmp < 0) {
            if (currentFrame.nextEnt == currentFrame.entCount) {
              if (!currentFrame.isLastInFloor) {
                // if (DEBUG) System.out.println("  load floorBlock");
                currentFrame.loadNextFloorBlock();
                continue;
              } else {
                // if (DEBUG) System.out.println("  return term=" +
                // ToStringUtils.bytesRefToString(term));
                return;
              }
            }
            continue;
          } else if (cmp == 0) {
            // if (DEBUG) System.out.println("  return term=" +
            // ToStringUtils.bytesRefToString(term));
            return;
          } else {
            // Fallback to prior entry: the semantics of
            // this method is that the first call to
            // next() will return the term after the
            // requested term
            currentFrame.nextEnt--;
            currentFrame.lastSubFP = saveLastSubFP;
            currentFrame.startBytePos = saveStartBytePos;
            currentFrame.suffix = saveSuffix;
            currentFrame.suffixesReader.setPosition(savePos);
            currentFrame.termState.termBlockOrd = saveTermBlockOrd;
            System.arraycopy(
                currentFrame.suffixBytes,
                currentFrame.startBytePos,
                term.bytes,
                currentFrame.prefix,
                currentFrame.suffix);
            term.length = currentFrame.prefix + currentFrame.suffix;
            // If the last entry was a block we don't
            // need to bother recursing and pushing to
            // the last term under it because the first
            // next() will simply skip the frame anyway
            return;
          }
        }
      }
    }

    assert false;
  }

  @Override
  public BytesRef next() throws IOException {

    // if (DEBUG) {
    //   System.out.println("\nintEnum.next seg=" + segment);
    //   System.out.println("  frame ord=" + currentFrame.ord + " prefix=" +
    // ToStringUtils.bytesRefToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) +
    // " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor +
    // " fp=" + currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? "n/a" :
    // currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" +
    // currentFrame.outputPrefix);
    // }

    nextTerm:
    while (true) {
      // Pop finished frames
      while (currentFrame.nextEnt == currentFrame.entCount) {
        if (!currentFrame.isLastInFloor) {
          // if (DEBUG) System.out.println("    next-floor-block");
          currentFrame.loadNextFloorBlock();
          // if (DEBUG) System.out.println("\n  frame ord=" + currentFrame.ord + " prefix=" +
          // ToStringUtils.bytesRefToString(new BytesRef(term.bytes, term.offset,
          // currentFrame.prefix)) +
          // " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor +
          // " fp=" + currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? "n/a" :
          // currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" +
          // currentFrame.outputPrefix);
        } else {
          // if (DEBUG) System.out.println("  pop frame");
          if (currentFrame.ord == 0) {
            return null;
          }
          final long lastFP = currentFrame.fpOrig;
          currentFrame = stack[currentFrame.ord - 1];
          assert currentFrame.lastSubFP == lastFP;
          // if (DEBUG) System.out.println("\n  frame ord=" + currentFrame.ord + " prefix=" +
          // ToStringUtils.bytesRefToString(new BytesRef(term.bytes, term.offset,
          // currentFrame.prefix)) +
          // " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor +
          // " fp=" + currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? "n/a" :
          // currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" +
          // currentFrame.outputPrefix);
        }
      }

      final boolean isSubBlock = currentFrame.next();
      // if (DEBUG) {
      //   final BytesRef suffixRef = new BytesRef();
      //   suffixRef.bytes = currentFrame.suffixBytes;
      //   suffixRef.offset = currentFrame.startBytePos;
      //   suffixRef.length = currentFrame.suffix;
      //   System.out.println("    " + (isSubBlock ? "sub-block" : "term") + " " +
      // currentFrame.nextEnt + " (of " + currentFrame.entCount + ") suffix=" +
      // ToStringUtils.bytesRefToString(suffixRef));
      // }

      if (currentFrame.suffix != 0) {
        final int label = currentFrame.suffixBytes[currentFrame.startBytePos] & 0xff;
        while (label > currentFrame.curTransitionMax) {
          if (currentFrame.transitionIndex >= currentFrame.transitionCount - 1) {
            // Stop processing this frame -- no further
            // matches are possible because we've moved
            // beyond what the max transition will allow
            // if (DEBUG) System.out.println("      break: trans=" +
            // (currentFrame.transitions.length == 0 ? "n/a" :
            // currentFrame.transitions[currentFrame.transitionIndex]));

            // sneaky!  forces a pop above
            currentFrame.isLastInFloor = true;
            currentFrame.nextEnt = currentFrame.entCount;
            continue nextTerm;
          }
          currentFrame.transitionIndex++;
          transitionAccessor.getNextTransition(currentFrame.transition);
          currentFrame.curTransitionMax = currentFrame.transition.max;
          // if (DEBUG) System.out.println("      next trans=" +
          // currentFrame.transitions[currentFrame.transitionIndex]);
        }
      }

      // First test the common suffix, if set:
      if (commonSuffixRef != null && !isSubBlock) {
        final int termLen = currentFrame.prefix + currentFrame.suffix;
        if (termLen < commonSuffixRef.length) {
          // No match
          // if (DEBUG) {
          //   System.out.println("      skip: common suffix length");
          // }
          continue nextTerm;
        }

        final byte[] suffixBytes = currentFrame.suffixBytes;
        final byte[] commonSuffixBytes = commonSuffixRef.bytes;

        final int lenInPrefix = commonSuffixRef.length - currentFrame.suffix;
        assert commonSuffixRef.offset == 0;
        int suffixBytesPos;
        int commonSuffixBytesPos = 0;

        if (lenInPrefix > 0) {
          // A prefix of the common suffix overlaps with
          // the suffix of the block prefix so we first
          // test whether the prefix part matches:
          final byte[] termBytes = term.bytes;
          int termBytesPos = currentFrame.prefix - lenInPrefix;
          assert termBytesPos >= 0;
          final int termBytesPosEnd = currentFrame.prefix;
          while (termBytesPos < termBytesPosEnd) {
            if (termBytes[termBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) {
              // if (DEBUG) {
              //   System.out.println("      skip: common suffix mismatch (in prefix)");
              // }
              continue nextTerm;
            }
          }
          suffixBytesPos = currentFrame.startBytePos;
        } else {
          suffixBytesPos = currentFrame.startBytePos + currentFrame.suffix - commonSuffixRef.length;
        }

        // Test overlapping suffix part:
        final int commonSuffixBytesPosEnd = commonSuffixRef.length;
        while (commonSuffixBytesPos < commonSuffixBytesPosEnd) {
          if (suffixBytes[suffixBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) {
            // if (DEBUG) {
            //   System.out.println("      skip: common suffix mismatch");
            // }
            continue nextTerm;
          }
        }
      }

      // TODO: maybe we should do the same linear test
      // that AutomatonTermsEnum does, so that if we
      // reach a part of the automaton where .* is
      // "temporarily" accepted, we just blindly .next()
      // until the limit

      // See if the term prefix matches the automaton:
      int state = currentFrame.state;
      for (int idx = 0; idx < currentFrame.suffix; idx++) {
        state =
            byteRunnable.step(
                state, currentFrame.suffixBytes[currentFrame.startBytePos + idx] & 0xff);
        if (state == -1) {
          // No match
          // System.out.println("    no s=" + state);
          continue nextTerm;
        } else {
          // System.out.println("    c s=" + state);
        }
      }

      if (isSubBlock) {
        // Match!  Recurse:
        // if (DEBUG) System.out.println("      sub-block match to state=" + state + "; recurse fp="
        // + currentFrame.lastSubFP);
        copyTerm();
        currentFrame = pushFrame(state);
        // if (DEBUG) System.out.println("\n  frame ord=" + currentFrame.ord + " prefix=" +
        // ToStringUtils.bytesRefToString(new BytesRef(term.bytes, term.offset,
        // currentFrame.prefix)) +
        // " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" +
        // currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? "n/a" :
        // currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" +
        // currentFrame.outputPrefix);
      } else if (byteRunnable.isAccept(state)) {
        copyTerm();
        // if (DEBUG) System.out.println("      term match to state=" + state + "; return term=" +
        // ToStringUtils.bytesRefToString(term));
        assert savedStartTerm == null || term.compareTo(savedStartTerm) > 0
            : "saveStartTerm=" + savedStartTerm.utf8ToString() + " term=" + term.utf8ToString();
        return term;
      } else {
        // System.out.println("    no s=" + state);
      }
    }
  }

  private void copyTerm() {
    // System.out.println("      copyTerm cur.prefix=" + currentFrame.prefix + " cur.suffix=" +
    // currentFrame.suffix + " first=" + (char)
    // currentFrame.suffixBytes[currentFrame.startBytePos]);
    final int len = currentFrame.prefix + currentFrame.suffix;
    if (term.bytes.length < len) {
      term.bytes = ArrayUtil.grow(term.bytes, len);
    }
    System.arraycopy(
        currentFrame.suffixBytes,
        currentFrame.startBytePos,
        term.bytes,
        currentFrame.prefix,
        currentFrame.suffix);
    term.length = len;
  }

  @Override
  public boolean seekExact(BytesRef text) {
    throw new UnsupportedOperationException();
  }

  @Override
  public void seekExact(long ord) {
    throw new UnsupportedOperationException();
  }

  @Override
  public long ord() {
    throw new UnsupportedOperationException();
  }

  @Override
  public SeekStatus seekCeil(BytesRef text) {
    throw new UnsupportedOperationException();
  }
}