All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.codecs.blocktree.IntersectTermsEnum Maven / Gradle / Ivy

There is a newer version: 4.0.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.blocktree;


import java.io.IOException;

import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.RunAutomaton;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Outputs;

/** This is used to implement efficient {@link Terms#intersect} for
 *  block-tree.  Note that it cannot seek, except for the initial term on
 *  init.  It just "nexts" through the intersection of the automaton and
 *  the terms.  It does not use the terms index at all: on init, it
 *  loads the root block, and scans its way to the initial term.
 *  Likewise, in next it scans until it finds a term that matches the
 *  current automaton transition.  If the index has auto-prefix terms
 *  (only for DOCS_ONLY fields currently) it will visit these terms
 *  when possible and then skip the real terms that auto-prefix term
 *  matched. */

final class IntersectTermsEnum extends TermsEnum {

  //static boolean DEBUG = BlockTreeTermsWriter.DEBUG;

  final IndexInput in;
  final static Outputs fstOutputs = ByteSequenceOutputs.getSingleton();

  IntersectTermsEnumFrame[] stack;
      
  @SuppressWarnings({"rawtypes","unchecked"}) private FST.Arc[] arcs = new FST.Arc[5];

  final RunAutomaton runAutomaton;
  final Automaton automaton;
  final BytesRef commonSuffix;

  private IntersectTermsEnumFrame currentFrame;
  private Transition currentTransition;

  private final BytesRef term = new BytesRef();

  private final FST.BytesReader fstReader;

  private final boolean allowAutoPrefixTerms;

  final FieldReader fr;

  /** Which state in the automaton accepts all possible suffixes. */
  private final int sinkState;

  private BytesRef savedStartTerm;
      
  /** True if we did return the current auto-prefix term */
  private boolean useAutoPrefixTerm;

  // TODO: in some cases we can filter by length?  eg
  // regexp foo*bar must be at least length 6 bytes
  public IntersectTermsEnum(FieldReader fr, Automaton automaton, RunAutomaton runAutomaton, BytesRef commonSuffix, BytesRef startTerm, int sinkState) throws IOException {
    this.fr = fr;
    this.sinkState = sinkState;

    assert automaton != null;
    assert runAutomaton != null;

    this.runAutomaton = runAutomaton;
    this.allowAutoPrefixTerms = sinkState != -1;
    this.automaton = automaton;
    this.commonSuffix = commonSuffix;

    in = fr.parent.termsIn.clone();
    stack = new IntersectTermsEnumFrame[5];
    for(int idx=0;idx();
    }

    if (fr.index == null) {
      fstReader = null;
    } else {
      fstReader = fr.index.getBytesReader();
    }

    // TODO: if the automaton is "smallish" we really
    // should use the terms index to seek at least to
    // the initial term and likely to subsequent terms
    // (or, maybe just fallback to ATE for such cases).
    // Else the seek cost of loading the frames will be
    // too costly.

    final FST.Arc arc = fr.index.getFirstArc(arcs[0]);
    // Empty string prefix must have an output in the index!
    assert arc.isFinal();

    // Special pushFrame since it's the first one:
    final IntersectTermsEnumFrame f = stack[0];
    f.fp = f.fpOrig = fr.rootBlockFP;
    f.prefix = 0;
    f.setState(0);
    f.arc = arc;
    f.outputPrefix = arc.output;
    f.load(fr.rootCode);

    // for assert:
    assert setSavedStartTerm(startTerm);

    currentFrame = f;
    if (startTerm != null) {
      seekToStartTerm(startTerm);
    }
    currentTransition = currentFrame.transition;
  }

  // only for assert:
  private boolean setSavedStartTerm(BytesRef startTerm) {
    savedStartTerm = startTerm == null ? null : BytesRef.deepCopyOf(startTerm);
    return true;
  }

  @Override
  public TermState termState() throws IOException {
    currentFrame.decodeMetaData();
    return currentFrame.termState.clone();
  }

  private IntersectTermsEnumFrame getFrame(int ord) throws IOException {
    if (ord >= stack.length) {
      final IntersectTermsEnumFrame[] next = new IntersectTermsEnumFrame[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
      System.arraycopy(stack, 0, next, 0, stack.length);
      for(int stackOrd=stack.length;stackOrd getArc(int ord) {
    if (ord >= arcs.length) {
      @SuppressWarnings({"rawtypes","unchecked"}) final FST.Arc[] next =
      new FST.Arc[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
      System.arraycopy(arcs, 0, next, 0, arcs.length);
      for(int arcOrd=arcs.length;arcOrd();
      }
      arcs = next;
    }
    return arcs[ord];
  }

  private IntersectTermsEnumFrame pushFrame(int state) throws IOException {
    assert currentFrame != null;

    final IntersectTermsEnumFrame f = getFrame(currentFrame == null ? 0 : 1+currentFrame.ord);
        
    f.fp = f.fpOrig = currentFrame.lastSubFP;
    f.prefix = currentFrame.prefix + currentFrame.suffix;
    f.setState(state);

    // Walk the arc through the index -- we only
    // "bother" with this so we can get the floor data
    // from the index and skip floor blocks when
    // possible:
    FST.Arc arc = currentFrame.arc;
    int idx = currentFrame.prefix;
    assert currentFrame.suffix > 0;
    BytesRef output = currentFrame.outputPrefix;
    while (idx < f.prefix) {
      final int target = term.bytes[idx] & 0xff;
      // TODO: we could be more efficient for the next()
      // case by using current arc as starting point,
      // passed to findTargetArc
      arc = fr.index.findTargetArc(target, arc, getArc(1+idx), fstReader);
      assert arc != null;
      output = fstOutputs.add(output, arc.output);
      idx++;
    }

    f.arc = arc;
    f.outputPrefix = output;
    assert arc.isFinal();
    f.load(fstOutputs.add(output, arc.nextFinalOutput));
    return f;
  }

  @Override
  public BytesRef term() {
    return term;
  }

  @Override
  public int docFreq() throws IOException {
    currentFrame.decodeMetaData();
    return currentFrame.termState.docFreq;
  }

  @Override
  public long totalTermFreq() throws IOException {
    currentFrame.decodeMetaData();
    return currentFrame.termState.totalTermFreq;
  }

  @Override
  public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
    currentFrame.decodeMetaData();
    return fr.parent.postingsReader.postings(fr.fieldInfo, currentFrame.termState, reuse, flags);
  }

  private int getState() {
    int state = currentFrame.state;
    for(int idx=0;idx arc = arcs[0];
    assert arc == currentFrame.arc;

    for(int idx=0;idx<=target.length;idx++) {

      while (true) {
        final int savNextEnt = currentFrame.nextEnt;
        final int savePos = currentFrame.suffixesReader.getPosition();
        final int saveStartBytePos = currentFrame.startBytePos;
        final int saveSuffix = currentFrame.suffix;
        final long saveLastSubFP = currentFrame.lastSubFP;
        final int saveTermBlockOrd = currentFrame.termState.termBlockOrd;
        final boolean saveIsAutoPrefixTerm = currentFrame.isAutoPrefixTerm;

        final boolean isSubBlock = currentFrame.next();

        term.length = currentFrame.prefix + currentFrame.suffix;
        if (term.bytes.length < term.length) {
          term.bytes = ArrayUtil.grow(term.bytes, term.length);
        }
        System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes, currentFrame.prefix, currentFrame.suffix);

        if (isSubBlock && StringHelper.startsWith(target, term)) {
          // Recurse
          currentFrame = pushFrame(getState());
          break;
        } else {
          final int cmp = term.compareTo(target);
          if (cmp < 0) {
            if (currentFrame.nextEnt == currentFrame.entCount) {
              if (!currentFrame.isLastInFloor) {
                // Advance to next floor block
                currentFrame.loadNextFloorBlock();
                continue;
              } else {
                return;
              }
            }
            continue;
          } else if (cmp == 0) {
            if (allowAutoPrefixTerms == false && currentFrame.isAutoPrefixTerm) {
              continue;
            }
            return;
          } else if (allowAutoPrefixTerms || currentFrame.isAutoPrefixTerm == false) {
            // Fallback to prior entry: the semantics of
            // this method is that the first call to
            // next() will return the term after the
            // requested term
            currentFrame.nextEnt = savNextEnt;
            currentFrame.lastSubFP = saveLastSubFP;
            currentFrame.startBytePos = saveStartBytePos;
            currentFrame.suffix = saveSuffix;
            currentFrame.suffixesReader.setPosition(savePos);
            currentFrame.termState.termBlockOrd = saveTermBlockOrd;
            currentFrame.isAutoPrefixTerm = saveIsAutoPrefixTerm;
            System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes, currentFrame.prefix, currentFrame.suffix);
            term.length = currentFrame.prefix + currentFrame.suffix;
            // If the last entry was a block we don't
            // need to bother recursing and pushing to
            // the last term under it because the first
            // next() will simply skip the frame anyway
            return;
          }
        }
      }
    }

    assert false;
  }

  private boolean popPushNext() throws IOException {
    // Pop finished frames
    while (currentFrame.nextEnt == currentFrame.entCount) {
      if (!currentFrame.isLastInFloor) {
        // Advance to next floor block
        currentFrame.loadNextFloorBlock();
        break;
      } else {
        if (currentFrame.ord == 0) {
          throw NoMoreTermsException.INSTANCE;
        }
        final long lastFP = currentFrame.fpOrig;
        currentFrame = stack[currentFrame.ord-1];
        currentTransition = currentFrame.transition;
        assert currentFrame.lastSubFP == lastFP;
      }
    }

    return currentFrame.next();
  }

  private boolean skipPastLastAutoPrefixTerm() throws IOException {
    assert currentFrame.isAutoPrefixTerm;
    useAutoPrefixTerm = false;

    // If we last returned an auto-prefix term, we must now skip all
    // actual terms sharing that prefix.  At most, that skipping
    // requires popping one frame, but it can also require simply
    // scanning ahead within the current frame.  This scanning will
    // skip sub-blocks that contain many terms, which is why the
    // optimization "works":
    int floorSuffixLeadEnd = currentFrame.floorSuffixLeadEnd;

    boolean isSubBlock;

    if (floorSuffixLeadEnd == -1) {
      // An ordinary prefix, e.g. foo*
      int prefix = currentFrame.prefix;
      int suffix = currentFrame.suffix;
      if (suffix == 0) {

        // Easy case: the prefix term's suffix is the empty string,
        // meaning the prefix corresponds to all terms in the
        // current block, so we just pop this entire block:
        if (currentFrame.ord == 0) {
          throw NoMoreTermsException.INSTANCE;
        }
        currentFrame = stack[currentFrame.ord-1];
        currentTransition = currentFrame.transition;

        return popPushNext();

      } else {

        // Just next() until we hit an entry that doesn't share this
        // prefix.  The first next should be a sub-block sharing the
        // same prefix, because if there are enough terms matching a
        // given prefix to warrant an auto-prefix term, then there
        // must also be enough to make a sub-block (assuming
        // minItemsInPrefix > minItemsInBlock):
        scanPrefix:
        while (true) {
          if (currentFrame.nextEnt == currentFrame.entCount) {
            if (currentFrame.isLastInFloor == false) {
              currentFrame.loadNextFloorBlock();
            } else if (currentFrame.ord == 0) {
              throw NoMoreTermsException.INSTANCE;
            } else {
              // Pop frame, which also means we've moved beyond this
              // auto-prefix term:
              currentFrame = stack[currentFrame.ord-1];
              currentTransition = currentFrame.transition;

              return popPushNext();
            }
          }
          isSubBlock = currentFrame.next();
          for(int i=0;i floorSuffixLeadEnd
        //assert currentFrame.prefix == prefix-1;
        //prefix = currentFrame.prefix;

        // In case when we pop, and the parent block is not just prefix-1, e.g. in block 417* on
        // its first term = floor prefix term 41[7-9], popping to block 4*:
        prefix = currentFrame.prefix;

        suffix = term.length - currentFrame.prefix;
      } else {
        // No need to pop; just scan in currentFrame:
      }

      // Now we scan until the lead suffix byte is > floorSuffixLeadEnd
      scanFloor:
      while (true) {
        if (currentFrame.nextEnt == currentFrame.entCount) {
          if (currentFrame.isLastInFloor == false) {
            currentFrame.loadNextFloorBlock();
          } else if (currentFrame.ord == 0) {
            throw NoMoreTermsException.INSTANCE;
          } else {
            // Pop frame, which also means we've moved beyond this
            // auto-prefix term:
            currentFrame = stack[currentFrame.ord-1];
            currentTransition = currentFrame.transition;

            return popPushNext();
          }
        }
        isSubBlock = currentFrame.next();
        for(int i=0;i= suffix && (currentFrame.suffixBytes[currentFrame.startBytePos+suffix-1]&0xff) > floorSuffixLeadEnd) {
          // Done scanning: we are now on the first term after all
          // terms matched by this auto-prefix term
          break;
        }
      }
    }

    return isSubBlock;
  }

  // Only used internally when there are no more terms in next():
  private static final class NoMoreTermsException extends RuntimeException {

    // Only used internally when there are no more terms in next():
    public static final NoMoreTermsException INSTANCE = new NoMoreTermsException();

    private NoMoreTermsException() {
    }

    @Override
    public Throwable fillInStackTrace() {
      // Do nothing:
      return this;
    }    
  }

  @Override
  public BytesRef next() throws IOException {
    try {
      return _next();
    } catch (NoMoreTermsException eoi) {
      // Provoke NPE if we are (illegally!) called again:
      currentFrame = null;
      return null;
    }
  }

  private BytesRef _next() throws IOException {

    boolean isSubBlock;

    if (useAutoPrefixTerm) {
      // If the current term was an auto-prefix term, we have to skip past it:
      isSubBlock = skipPastLastAutoPrefixTerm();
      assert useAutoPrefixTerm == false;
    } else {
      isSubBlock = popPushNext();
    }

    nextTerm:

    while (true) {
      assert currentFrame.transition == currentTransition;

      int state;
      int lastState;

      // NOTE: suffix == 0 can only happen on the first term in a block, when
      // there is a term exactly matching a prefix in the index.  If we
      // could somehow re-org the code so we only checked this case immediately
      // after pushing a frame...
      if (currentFrame.suffix != 0) {

        final byte[] suffixBytes = currentFrame.suffixBytes;

        // This is the first byte of the suffix of the term we are now on:
        final int label = suffixBytes[currentFrame.startBytePos] & 0xff;

        if (label < currentTransition.min) {
          // Common case: we are scanning terms in this block to "catch up" to
          // current transition in the automaton:
          int minTrans = currentTransition.min;
          while (currentFrame.nextEnt < currentFrame.entCount) {
            isSubBlock = currentFrame.next();
            if ((suffixBytes[currentFrame.startBytePos] & 0xff) >= minTrans) {
              continue nextTerm;
            }
          }

          // End of frame:
          isSubBlock = popPushNext();
          continue nextTerm;
        }

        // Advance where we are in the automaton to match this label:

        while (label > currentTransition.max) {
          if (currentFrame.transitionIndex >= currentFrame.transitionCount-1) {
            // Pop this frame: no further matches are possible because
            // we've moved beyond what the max transition will allow
            if (currentFrame.ord == 0) {
              // Provoke NPE if we are (illegally!) called again:
              currentFrame = null;
              return null;
            }
            currentFrame = stack[currentFrame.ord-1];
            currentTransition = currentFrame.transition;
            isSubBlock = popPushNext();
            continue nextTerm;
          }
          currentFrame.transitionIndex++;
          automaton.getNextTransition(currentTransition);

          if (label < currentTransition.min) {
            int minTrans = currentTransition.min;
            while (currentFrame.nextEnt < currentFrame.entCount) {
              isSubBlock = currentFrame.next();
              if ((suffixBytes[currentFrame.startBytePos] & 0xff) >= minTrans) {
                continue nextTerm;
              }
            }

            // End of frame:
            isSubBlock = popPushNext();
            continue nextTerm;
          }
        }

        if (commonSuffix != null && !isSubBlock) {
          final int termLen = currentFrame.prefix + currentFrame.suffix;
          if (termLen < commonSuffix.length) {
            // No match
            isSubBlock = popPushNext();
            continue nextTerm;
          }

          final byte[] commonSuffixBytes = commonSuffix.bytes;

          final int lenInPrefix = commonSuffix.length - currentFrame.suffix;
          assert commonSuffix.offset == 0;
          int suffixBytesPos;
          int commonSuffixBytesPos = 0;

          if (lenInPrefix > 0) {
            // A prefix of the common suffix overlaps with
            // the suffix of the block prefix so we first
            // test whether the prefix part matches:
            final byte[] termBytes = term.bytes;
            int termBytesPos = currentFrame.prefix - lenInPrefix;
            assert termBytesPos >= 0;
            final int termBytesPosEnd = currentFrame.prefix;
            while (termBytesPos < termBytesPosEnd) {
              if (termBytes[termBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) {
                isSubBlock = popPushNext();
                continue nextTerm;
              }
            }
            suffixBytesPos = currentFrame.startBytePos;
          } else {
            suffixBytesPos = currentFrame.startBytePos + currentFrame.suffix - commonSuffix.length;
          }

          // Test overlapping suffix part:
          final int commonSuffixBytesPosEnd = commonSuffix.length;
          while (commonSuffixBytesPos < commonSuffixBytesPosEnd) {
            if (suffixBytes[suffixBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) {
              isSubBlock = popPushNext();
              continue nextTerm;
            }
          }
        }

        // TODO: maybe we should do the same linear test
        // that AutomatonTermsEnum does, so that if we
        // reach a part of the automaton where .* is
        // "temporarily" accepted, we just blindly .next()
        // until the limit

        // See if the term suffix matches the automaton:

        // We know from above that the first byte in our suffix (label) matches
        // the current transition, so we step from the 2nd byte
        // in the suffix:
        lastState = currentFrame.state;
        state = currentTransition.dest;

        int end = currentFrame.startBytePos + currentFrame.suffix;
        for (int idx=currentFrame.startBytePos+1;idx 0: "saveStartTerm=" + savedStartTerm.utf8ToString() + " term=" + term.utf8ToString();
        return term;
      } else {
        // This term is a prefix of a term accepted by the automaton, but is not itself acceptd
      }

      isSubBlock = popPushNext();
    }
  }

  private final Transition scratchTransition = new Transition();

  /** Returns true if, from this state, the automaton accepts any suffix
   *  starting with a label between start and end, inclusive.  We just
   *  look for a transition, matching this range, to the sink state.  */
  private boolean acceptsSuffixRange(int state, int start, int end) {

    int count = automaton.initTransition(state, scratchTransition);
    for(int i=0;i= scratchTransition.min && end <= scratchTransition.max && scratchTransition.dest == sinkState) {
        return true;
      }
    }

    return false;
  }

  // for debugging
  @SuppressWarnings("unused")
  static String brToString(BytesRef b) {
    try {
      return b.utf8ToString() + " " + b;
    } catch (Throwable t) {
      // If BytesRef isn't actually UTF8, or it's eg a
      // prefix of UTF8 that ends mid-unicode-char, we
      // fallback to hex:
      return b.toString();
    }
  }

  private void copyTerm() {
    final int len = currentFrame.prefix + currentFrame.suffix;
    if (term.bytes.length < len) {
      term.bytes = ArrayUtil.grow(term.bytes, len);
    }
    System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes, currentFrame.prefix, currentFrame.suffix);
    term.length = len;
  }

  @Override
  public boolean seekExact(BytesRef text) {
    throw new UnsupportedOperationException();
  }

  @Override
  public void seekExact(long ord) {
    throw new UnsupportedOperationException();
  }

  @Override
  public long ord() {
    throw new UnsupportedOperationException();
  }

  @Override
  public SeekStatus seekCeil(BytesRef text) {
    throw new UnsupportedOperationException();
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy