All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.util.fst.FSTEnum Maven / Gradle / Ivy

There is a newer version: 9.10.0
Show newest version
package org.apache.lucene.util.fst;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.RamUsageEstimator;

/** Can next() and advance() through the terms in an FST
 *
  * @lucene.experimental
*/

abstract class FSTEnum {
  protected final FST fst;

  @SuppressWarnings({"rawtypes","unchecked"}) protected FST.Arc[] arcs = new FST.Arc[10];
  // outputs are cumulative
  @SuppressWarnings({"rawtypes","unchecked"}) protected T[] output = (T[]) new Object[10];

  protected final T NO_OUTPUT;
  protected final FST.BytesReader fstReader;
  protected final FST.Arc scratchArc = new FST.Arc<>();

  protected int upto;
  protected int targetLength;

  /** doFloor controls the behavior of advance: if it's true
   *  doFloor is true, advance positions to the biggest
   *  term before target.  */
  protected FSTEnum(FST fst) {
    this.fst = fst;
    fstReader = fst.getBytesReader();
    NO_OUTPUT = fst.outputs.getNoOutput();
    fst.getFirstArc(getArc(0));
    output[0] = NO_OUTPUT;
  }

  protected abstract int getTargetLabel();
  protected abstract int getCurrentLabel();

  protected abstract void setCurrentLabel(int label);
  protected abstract void grow();

  /** Rewinds enum state to match the shared prefix between
   *  current term and target term */
  protected final void rewindPrefix() throws IOException {
    if (upto == 0) {
      //System.out.println("  init");
      upto = 1;
      fst.readFirstTargetArc(getArc(0), getArc(1), fstReader);
      return;
    }
    //System.out.println("  rewind upto=" + upto + " vs targetLength=" + targetLength);

    final int currentLimit = upto;
    upto = 1;
    while (upto < currentLimit && upto <= targetLength+1) {
      final int cmp = getCurrentLabel() - getTargetLabel();
      if (cmp < 0) {
        // seek forward
        //System.out.println("    seek fwd");
        break;
      } else if (cmp > 0) {
        // seek backwards -- reset this arc to the first arc
        final FST.Arc arc = getArc(upto);
        fst.readFirstTargetArc(getArc(upto-1), arc, fstReader);
        //System.out.println("    seek first arc");
        break;
      }
      upto++;
    }
    //System.out.println("  fall through upto=" + upto);
  }

  protected void doNext() throws IOException {
    //System.out.println("FE: next upto=" + upto);
    if (upto == 0) {
      //System.out.println("  init");
      upto = 1;
      fst.readFirstTargetArc(getArc(0), getArc(1), fstReader);
    } else {
      // pop
      //System.out.println("  check pop curArc target=" + arcs[upto].target + " label=" + arcs[upto].label + " isLast?=" + arcs[upto].isLast());
      while (arcs[upto].isLast()) {
        upto--;
        if (upto == 0) {
          //System.out.println("  eof");
          return;
        }
      }
      fst.readNextArc(arcs[upto], fstReader);
    }

    pushFirst();
  }

  // TODO: should we return a status here (SEEK_FOUND / SEEK_NOT_FOUND /
  // SEEK_END)?  saves the eq check above?

  /** Seeks to smallest term that's >= target. */
  protected void doSeekCeil() throws IOException {

    //System.out.println("    advance len=" + target.length + " curlen=" + current.length);

    // TODO: possibly caller could/should provide common
    // prefix length?  ie this work may be redundant if
    // caller is in fact intersecting against its own
    // automaton

    //System.out.println("FE.seekCeil upto=" + upto);

    // Save time by starting at the end of the shared prefix
    // b/w our current term & the target:
    rewindPrefix();
    //System.out.println("  after rewind upto=" + upto);

    FST.Arc arc = getArc(upto);
    int targetLabel = getTargetLabel();
    //System.out.println("  init targetLabel=" + targetLabel);

    // Now scan forward, matching the new suffix of the target
    while(true) {

      //System.out.println("  cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char) arc.label + ") vs targetLabel=" + targetLabel);

      if (arc.bytesPerArc != 0 && arc.label != -1) {

        // Arcs are fixed array -- use binary search to find
        // the target.

        final FST.BytesReader in = fst.getBytesReader();
        int low = arc.arcIdx;
        int high = arc.numArcs-1;
        int mid = 0;
        //System.out.println("do arc array low=" + low + " high=" + high + " targetLabel=" + targetLabel);
        boolean found = false;
        while (low <= high) {
          mid = (low + high) >>> 1;
          in.setPosition(arc.posArcsStart);
          in.skipBytes(arc.bytesPerArc*mid+1);
          final int midLabel = fst.readLabel(in);
          final int cmp = midLabel - targetLabel;
          //System.out.println("  cycle low=" + low + " high=" + high + " mid=" + mid + " midLabel=" + midLabel + " cmp=" + cmp);
          if (cmp < 0)
            low = mid + 1;
          else if (cmp > 0)
            high = mid - 1;
          else {
            found = true;
            break;
          }
        }

        // NOTE: this code is dup'd w/ the code below (in
        // the outer else clause):
        if (found) {
          // Match
          arc.arcIdx = mid-1;
          fst.readNextRealArc(arc, in);
          assert arc.arcIdx == mid;
          assert arc.label == targetLabel: "arc.label=" + arc.label + " vs targetLabel=" + targetLabel + " mid=" + mid;
          output[upto] = fst.outputs.add(output[upto-1], arc.output);
          if (targetLabel == FST.END_LABEL) {
            return;
          }
          setCurrentLabel(arc.label);
          incr();
          arc = fst.readFirstTargetArc(arc, getArc(upto), fstReader);
          targetLabel = getTargetLabel();
          continue;
        } else if (low == arc.numArcs) {
          // Dead end
          arc.arcIdx = arc.numArcs-2;
          fst.readNextRealArc(arc, in);
          assert arc.isLast();
          // Dead end (target is after the last arc);
          // rollback to last fork then push
          upto--;
          while(true) {
            if (upto == 0) {
              return;
            }
            final FST.Arc prevArc = getArc(upto);
            //System.out.println("  rollback upto=" + upto + " arc.label=" + prevArc.label + " isLast?=" + prevArc.isLast());
            if (!prevArc.isLast()) {
              fst.readNextArc(prevArc, fstReader);
              pushFirst();
              return;
            }
            upto--;
          }
        } else {
          arc.arcIdx = (low > high ? low : high)-1;
          fst.readNextRealArc(arc, in);
          assert arc.label > targetLabel;
          pushFirst();
          return;
        }
      } else {
        // Arcs are not array'd -- must do linear scan:
        if (arc.label == targetLabel) {
          // recurse
          output[upto] = fst.outputs.add(output[upto-1], arc.output);
          if (targetLabel == FST.END_LABEL) {
            return;
          }
          setCurrentLabel(arc.label);
          incr();
          arc = fst.readFirstTargetArc(arc, getArc(upto), fstReader);
          targetLabel = getTargetLabel();
        } else if (arc.label > targetLabel) {
          pushFirst();
          return;
        } else if (arc.isLast()) {
          // Dead end (target is after the last arc);
          // rollback to last fork then push
          upto--;
          while(true) {
            if (upto == 0) {
              return;
            }
            final FST.Arc prevArc = getArc(upto);
            //System.out.println("  rollback upto=" + upto + " arc.label=" + prevArc.label + " isLast?=" + prevArc.isLast());
            if (!prevArc.isLast()) {
              fst.readNextArc(prevArc, fstReader);
              pushFirst();
              return;
            }
            upto--;
          }
        } else {
          // keep scanning
          //System.out.println("    next scan");
          fst.readNextArc(arc, fstReader);
        }
      }
    }
  }

  // TODO: should we return a status here (SEEK_FOUND / SEEK_NOT_FOUND /
  // SEEK_END)?  saves the eq check above?
  /** Seeks to largest term that's <= target. */
  protected void doSeekFloor() throws IOException {

    // TODO: possibly caller could/should provide common
    // prefix length?  ie this work may be redundant if
    // caller is in fact intersecting against its own
    // automaton
    //System.out.println("FE: seek floor upto=" + upto);

    // Save CPU by starting at the end of the shared prefix
    // b/w our current term & the target:
    rewindPrefix();

    //System.out.println("FE: after rewind upto=" + upto);

    FST.Arc arc = getArc(upto);
    int targetLabel = getTargetLabel();

    //System.out.println("FE: init targetLabel=" + targetLabel);

    // Now scan forward, matching the new suffix of the target
    while(true) {
      //System.out.println("  cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char) arc.label + ") targetLabel=" + targetLabel + " isLast?=" + arc.isLast() + " bba=" + arc.bytesPerArc);

      if (arc.bytesPerArc != 0 && arc.label != FST.END_LABEL) {
        // Arcs are fixed array -- use binary search to find
        // the target.

        final FST.BytesReader in = fst.getBytesReader();
        int low = arc.arcIdx;
        int high = arc.numArcs-1;
        int mid = 0;
        //System.out.println("do arc array low=" + low + " high=" + high + " targetLabel=" + targetLabel);
        boolean found = false;
        while (low <= high) {
          mid = (low + high) >>> 1;
          in.setPosition(arc.posArcsStart);
          in.skipBytes(arc.bytesPerArc*mid+1);
          final int midLabel = fst.readLabel(in);
          final int cmp = midLabel - targetLabel;
          //System.out.println("  cycle low=" + low + " high=" + high + " mid=" + mid + " midLabel=" + midLabel + " cmp=" + cmp);
          if (cmp < 0) {
            low = mid + 1;
          } else if (cmp > 0) {
            high = mid - 1;
          } else {
            found = true;
            break;
          }
        }

        // NOTE: this code is dup'd w/ the code below (in
        // the outer else clause):
        if (found) {
          // Match -- recurse
          //System.out.println("  match!  arcIdx=" + mid);
          arc.arcIdx = mid-1;
          fst.readNextRealArc(arc, in);
          assert arc.arcIdx == mid;
          assert arc.label == targetLabel: "arc.label=" + arc.label + " vs targetLabel=" + targetLabel + " mid=" + mid;
          output[upto] = fst.outputs.add(output[upto-1], arc.output);
          if (targetLabel == FST.END_LABEL) {
            return;
          }
          setCurrentLabel(arc.label);
          incr();
          arc = fst.readFirstTargetArc(arc, getArc(upto), fstReader);
          targetLabel = getTargetLabel();
          continue;
        } else if (high == -1) {
          //System.out.println("  before first");
          // Very first arc is after our target
          // TODO: if each arc could somehow read the arc just
          // before, we can save this re-scan.  The ceil case
          // doesn't need this because it reads the next arc
          // instead:
          while(true) {
            // First, walk backwards until we find a first arc
            // that's before our target label:
            fst.readFirstTargetArc(getArc(upto-1), arc, fstReader);
            if (arc.label < targetLabel) {
              // Then, scan forwards to the arc just before
              // the targetLabel:
              while(!arc.isLast() && fst.readNextArcLabel(arc, in) < targetLabel) {
                fst.readNextArc(arc, fstReader);
              }
              pushLast();
              return;
            }
            upto--;
            if (upto == 0) {
              return;
            }
            targetLabel = getTargetLabel();
            arc = getArc(upto);
          }
        } else {
          // There is a floor arc:
          arc.arcIdx = (low > high ? high : low)-1;
          //System.out.println(" hasFloor arcIdx=" + (arc.arcIdx+1));
          fst.readNextRealArc(arc, in);
          assert arc.isLast() || fst.readNextArcLabel(arc, in) > targetLabel;
          assert arc.label < targetLabel: "arc.label=" + arc.label + " vs targetLabel=" + targetLabel;
          pushLast();
          return;
        }        
      } else {

        if (arc.label == targetLabel) {
          // Match -- recurse
          output[upto] = fst.outputs.add(output[upto-1], arc.output);
          if (targetLabel == FST.END_LABEL) {
            return;
          }
          setCurrentLabel(arc.label);
          incr();
          arc = fst.readFirstTargetArc(arc, getArc(upto), fstReader);
          targetLabel = getTargetLabel();
        } else if (arc.label > targetLabel) {
          // TODO: if each arc could somehow read the arc just
          // before, we can save this re-scan.  The ceil case
          // doesn't need this because it reads the next arc
          // instead:
          while(true) {
            // First, walk backwards until we find a first arc
            // that's before our target label:
            fst.readFirstTargetArc(getArc(upto-1), arc, fstReader);
            if (arc.label < targetLabel) {
              // Then, scan forwards to the arc just before
              // the targetLabel:
              while(!arc.isLast() && fst.readNextArcLabel(arc, fstReader) < targetLabel) {
                fst.readNextArc(arc, fstReader);
              }
              pushLast();
              return;
            }
            upto--;
            if (upto == 0) {
              return;
            }
            targetLabel = getTargetLabel();
            arc = getArc(upto);
          }
        } else if (!arc.isLast()) {
          //System.out.println("  check next label=" + fst.readNextArcLabel(arc) + " (" + (char) fst.readNextArcLabel(arc) + ")");
          if (fst.readNextArcLabel(arc, fstReader) > targetLabel) {
            pushLast();
            return;
          } else {
            // keep scanning
            fst.readNextArc(arc, fstReader);
          }
        } else {
          pushLast();
          return;
        }
      }
    }
  }

  /** Seeks to exactly target term. */
  protected boolean doSeekExact() throws IOException {

    // TODO: possibly caller could/should provide common
    // prefix length?  ie this work may be redundant if
    // caller is in fact intersecting against its own
    // automaton

    //System.out.println("FE: seek exact upto=" + upto);

    // Save time by starting at the end of the shared prefix
    // b/w our current term & the target:
    rewindPrefix();

    //System.out.println("FE: after rewind upto=" + upto);
    FST.Arc arc = getArc(upto-1);
    int targetLabel = getTargetLabel();

    final FST.BytesReader fstReader = fst.getBytesReader();

    while(true) {
      //System.out.println("  cycle target=" + (targetLabel == -1 ? "-1" : (char) targetLabel));
      final FST.Arc nextArc = fst.findTargetArc(targetLabel, arc, getArc(upto), fstReader);
      if (nextArc == null) {
        // short circuit
        //upto--;
        //upto = 0;
        fst.readFirstTargetArc(arc, getArc(upto), fstReader);
        //System.out.println("  no match upto=" + upto);
        return false;
      }
      // Match -- recurse:
      output[upto] = fst.outputs.add(output[upto-1], nextArc.output);
      if (targetLabel == FST.END_LABEL) {
        //System.out.println("  return found; upto=" + upto + " output=" + output[upto] + " nextArc=" + nextArc.isLast());
        return true;
      }
      setCurrentLabel(targetLabel);
      incr();
      targetLabel = getTargetLabel();
      arc = nextArc;
    }
  }

  private void incr() {
    upto++;
    grow();
    if (arcs.length <= upto) {
      @SuppressWarnings({"rawtypes","unchecked"}) final FST.Arc[] newArcs =
        new FST.Arc[ArrayUtil.oversize(1+upto, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
      System.arraycopy(arcs, 0, newArcs, 0, arcs.length);
      arcs = newArcs;
    }
    if (output.length <= upto) {
      @SuppressWarnings({"rawtypes","unchecked"}) final T[] newOutput =
        (T[]) new Object[ArrayUtil.oversize(1+upto, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
      System.arraycopy(output, 0, newOutput, 0, output.length);
      output = newOutput;
    }
  }

  // Appends current arc, and then recurses from its target,
  // appending first arc all the way to the final node
  private void pushFirst() throws IOException {

    FST.Arc arc = arcs[upto];
    assert arc != null;

    while (true) {
      output[upto] = fst.outputs.add(output[upto-1], arc.output);
      if (arc.label == FST.END_LABEL) {
        // Final node
        break;
      }
      //System.out.println("  pushFirst label=" + (char) arc.label + " upto=" + upto + " output=" + fst.outputs.outputToString(output[upto]));
      setCurrentLabel(arc.label);
      incr();
      
      final FST.Arc nextArc = getArc(upto);
      fst.readFirstTargetArc(arc, nextArc, fstReader);
      arc = nextArc;
    }
  }

  // Recurses from current arc, appending last arc all the
  // way to the first final node
  private void pushLast() throws IOException {

    FST.Arc arc = arcs[upto];
    assert arc != null;

    while (true) {
      setCurrentLabel(arc.label);
      output[upto] = fst.outputs.add(output[upto-1], arc.output);
      if (arc.label == FST.END_LABEL) {
        // Final node
        break;
      }
      incr();

      arc = fst.readLastTargetArc(arc, getArc(upto), fstReader);
    }
  }

  private FST.Arc getArc(int idx) {
    if (arcs[idx] == null) {
      arcs[idx] = new FST.Arc<>();
    }
    return arcs[idx];
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy