/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.List;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ja.dict.CharacterDefinition;
import org.apache.lucene.analysis.ja.dict.ConnectionCosts;
import org.apache.lucene.analysis.ja.dict.Dictionary;
import org.apache.lucene.analysis.ja.dict.TokenInfoDictionary;
import org.apache.lucene.analysis.ja.dict.TokenInfoFST;
import org.apache.lucene.analysis.ja.dict.UnknownDictionary;
import org.apache.lucene.analysis.ja.dict.UserDictionary;
import org.apache.lucene.analysis.ja.tokenattributes.*;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.util.RollingCharBuffer;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.IgnoreRandomChains;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.fst.FST;
// TODO: somehow factor out a reusable viterbi search here,
// so other decompounders/tokenizers can reuse...
/**
* Tokenizer for Japanese that uses morphological analysis.
*
* <p>This tokenizer sets a number of additional attributes:
*
* <ul>
*   <li>{@link BaseFormAttribute} containing base form for inflected adjectives and verbs.
*   <li>{@link PartOfSpeechAttribute} containing part-of-speech.
*   <li>{@link ReadingAttribute} containing reading and pronunciation.
*   <li>{@link InflectionAttribute} containing additional part-of-speech information for inflected
*       forms.
* </ul>
*
* <p>This tokenizer uses a rolling Viterbi search to find the least cost segmentation (path) of the
* incoming characters. For tokens that appear to be compound (> length 2 for all Kanji, or >
* length 7 for non-Kanji), we see if there is a 2nd best segmentation of that token after applying
* penalties to the long tokens. If so, and the Mode is {@link Mode#SEARCH}, we output the alternate
* segmentation as well.
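*
* <p>A minimal usage sketch (follows the standard {@link org.apache.lucene.analysis.TokenStream}
* consume cycle; error handling omitted):
*
* <pre>{@code
* try (JapaneseTokenizer tokenizer =
*     new JapaneseTokenizer(null, true, JapaneseTokenizer.DEFAULT_MODE)) {
*   tokenizer.setReader(new StringReader("今日はとても良い天気です。"));
*   CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
*   tokenizer.reset();
*   while (tokenizer.incrementToken()) {
*     System.out.println(term.toString());
*   }
*   tokenizer.end();
* }
* }</pre>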
*/
public final class JapaneseTokenizer extends Tokenizer {
/** Tokenization mode: this determines how the tokenizer handles compound and unknown words. */
public enum Mode {
/** Ordinary segmentation: no decomposition for compounds. */
NORMAL,
/**
* Segmentation geared towards search: this includes a decompounding process for long nouns,
* also including the full compound token as a synonym.
*/
SEARCH,
/**
* Extended mode outputs unigrams for unknown words.
*
* @lucene.experimental
*/
EXTENDED
}
/** Default tokenization mode. Currently this is {@link Mode#SEARCH}. */
public static final Mode DEFAULT_MODE = Mode.SEARCH;
/** Token type reflecting the original source of this token */
public enum Type {
/** Known words from the system dictionary. */
KNOWN,
/** Unknown words (heuristically segmented). */
UNKNOWN,
/** Known words from the user dictionary. */
USER
}
private static final boolean VERBOSE = false;
private static final int SEARCH_MODE_KANJI_LENGTH = 2;
private static final int SEARCH_MODE_OTHER_LENGTH = 7; // Must be >= SEARCH_MODE_KANJI_LENGTH
private static final int SEARCH_MODE_KANJI_PENALTY = 3000;
private static final int SEARCH_MODE_OTHER_PENALTY = 1700;
// For safety:
private static final int MAX_UNKNOWN_WORD_LENGTH = 1024;
private static final int MAX_BACKTRACE_GAP = 1024;
private final EnumMap<Type, Dictionary> dictionaryMap = new EnumMap<>(Type.class);
private final TokenInfoFST fst;
private final TokenInfoDictionary dictionary;
private final UnknownDictionary unkDictionary;
private final ConnectionCosts costs;
private final UserDictionary userDictionary;
private final CharacterDefinition characterDefinition;
private final FST.Arc<Long> arc = new FST.Arc<>();
private final FST.BytesReader fstReader;
private final IntsRef wordIdRef = new IntsRef();
private final FST.BytesReader userFSTReader;
private final TokenInfoFST userFST;
private final RollingCharBuffer buffer = new RollingCharBuffer();
private final WrappedPositionArray positions = new WrappedPositionArray();
private final boolean discardPunctuation;
private final boolean searchMode;
private final boolean extendedMode;
private final boolean outputCompounds;
private boolean outputNBest = false;
// Allowable cost difference for N-best output:
private int nBestCost = 0;
// True once we've hit the EOF from the input reader:
private boolean end;
// Last absolute position we backtraced from:
private int lastBackTracePos;
// Position of last token we returned; we use this to
// figure out whether to set posIncr to 0 or 1:
private int lastTokenPos;
// Next absolute position to process:
private int pos;
// Tokens already parsed but not yet passed to the caller:
private final List<Token> pending = new ArrayList<>();
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncAtt =
addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
private final BaseFormAttribute basicFormAtt = addAttribute(BaseFormAttribute.class);
private final PartOfSpeechAttribute posAtt = addAttribute(PartOfSpeechAttribute.class);
private final ReadingAttribute readingAtt = addAttribute(ReadingAttribute.class);
private final InflectionAttribute inflectionAtt = addAttribute(InflectionAttribute.class);
/**
* Create a new JapaneseTokenizer.
*
* <p>Uses the default AttributeFactory.
*
* @param userDictionary Optional: if non-null, user dictionary.
* @param discardPunctuation true if punctuation tokens should be dropped from the output.
* @param mode tokenization mode.
*/
public JapaneseTokenizer(UserDictionary userDictionary, boolean discardPunctuation, Mode mode) {
this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, userDictionary, discardPunctuation, true, mode);
}
/**
* Create a new JapaneseTokenizer.
*
* <p>Uses the default AttributeFactory.
*
* @param userDictionary Optional: if non-null, user dictionary.
* @param discardPunctuation true if punctuation tokens should be dropped from the output.
* @param discardCompoundToken true if compound tokens should be dropped from the output when
* tokenization mode is not NORMAL.
* @param mode tokenization mode.
*/
public JapaneseTokenizer(
UserDictionary userDictionary,
boolean discardPunctuation,
boolean discardCompoundToken,
Mode mode) {
this(
DEFAULT_TOKEN_ATTRIBUTE_FACTORY,
userDictionary,
discardPunctuation,
discardCompoundToken,
mode);
}
/**
* Create a new JapaneseTokenizer using the system and unknown dictionaries shipped with Lucene.
*
* @param factory the AttributeFactory to use
* @param userDictionary Optional: if non-null, user dictionary.
* @param discardPunctuation true if punctuation tokens should be dropped from the output.
* @param mode tokenization mode.
*/
public JapaneseTokenizer(
AttributeFactory factory,
UserDictionary userDictionary,
boolean discardPunctuation,
Mode mode) {
this(
factory,
TokenInfoDictionary.getInstance(),
UnknownDictionary.getInstance(),
ConnectionCosts.getInstance(),
userDictionary,
discardPunctuation,
true,
mode);
}
/**
* Create a new JapaneseTokenizer using the system and unknown dictionaries shipped with Lucene.
*
* @param factory the AttributeFactory to use
* @param userDictionary Optional: if non-null, user dictionary.
* @param discardPunctuation true if punctuation tokens should be dropped from the output.
* @param discardCompoundToken true if compound tokens should be dropped from the output when
* tokenization mode is not NORMAL.
* @param mode tokenization mode.
*/
public JapaneseTokenizer(
AttributeFactory factory,
UserDictionary userDictionary,
boolean discardPunctuation,
boolean discardCompoundToken,
Mode mode) {
this(
factory,
TokenInfoDictionary.getInstance(),
UnknownDictionary.getInstance(),
ConnectionCosts.getInstance(),
userDictionary,
discardPunctuation,
discardCompoundToken,
mode);
}
/**
* Create a new JapaneseTokenizer, supplying a custom system dictionary and unknown dictionary.
* This constructor provides an entry point for users that want to construct custom language
* models that can be used as input to {@link
* org.apache.lucene.analysis.ja.util.DictionaryBuilder}.
*
* @param factory the AttributeFactory to use
* @param systemDictionary a custom known token dictionary
* @param unkDictionary a custom unknown token dictionary
* @param connectionCosts custom token transition costs
* @param userDictionary Optional: if non-null, user dictionary.
* @param discardPunctuation true if punctuation tokens should be dropped from the output.
* @param discardCompoundToken true if compound tokens should be dropped from the output when
* tokenization mode is not NORMAL.
* @param mode tokenization mode.
* @lucene.experimental
*/
@IgnoreRandomChains(reason = "Parameters are too complex to be tested")
public JapaneseTokenizer(
AttributeFactory factory,
TokenInfoDictionary systemDictionary,
UnknownDictionary unkDictionary,
ConnectionCosts connectionCosts,
UserDictionary userDictionary,
boolean discardPunctuation,
boolean discardCompoundToken,
Mode mode) {
super(factory);
this.dictionary = systemDictionary;
this.fst = dictionary.getFST();
this.unkDictionary = unkDictionary;
this.characterDefinition = unkDictionary.getCharacterDefinition();
this.userDictionary = userDictionary;
this.costs = connectionCosts;
fstReader = fst.getBytesReader();
if (userDictionary != null) {
userFST = userDictionary.getFST();
userFSTReader = userFST.getBytesReader();
} else {
userFST = null;
userFSTReader = null;
}
this.discardPunctuation = discardPunctuation;
switch (mode) {
case SEARCH:
searchMode = true;
extendedMode = false;
outputCompounds = !discardCompoundToken;
break;
case EXTENDED:
searchMode = true;
extendedMode = true;
outputCompounds = !discardCompoundToken;
break;
case NORMAL:
default:
searchMode = false;
extendedMode = false;
outputCompounds = false;
break;
}
buffer.reset(this.input);
resetState();
dictionaryMap.put(Type.KNOWN, dictionary);
dictionaryMap.put(Type.UNKNOWN, unkDictionary);
dictionaryMap.put(Type.USER, userDictionary);
}
private GraphvizFormatter dotOut;
/** Expert: set this to produce graphviz (dot) output of the Viterbi lattice */
public void setGraphvizFormatter(GraphvizFormatter dotOut) {
this.dotOut = dotOut;
}
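// A minimal sketch of producing Graphviz output for the Viterbi lattice (assumes
// GraphvizFormatter's ConnectionCosts constructor and its finish() accessor for the
// accumulated dot text):
//
//   GraphvizFormatter gv = new GraphvizFormatter(ConnectionCosts.getInstance());
//   tokenizer.setGraphvizFormatter(gv);
//   ... consume the token stream ...
//   String dot = gv.finish();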
@Override
public void close() throws IOException {
super.close();
buffer.reset(input);
}
@Override
public void reset() throws IOException {
super.reset();
buffer.reset(input);
resetState();
}
private void resetState() {
positions.reset();
pos = 0;
end = false;
lastBackTracePos = 0;
lastTokenPos = -1;
pending.clear();
// Add BOS:
positions.get(0).add(0, 0, -1, -1, -1, Type.KNOWN);
}
@Override
public void end() throws IOException {
super.end();
// Set final offset
int finalOffset = correctOffset(pos);
offsetAtt.setOffset(finalOffset, finalOffset);
}
// Returns the added cost that a 2nd best segmentation is
// allowed to have. Ie, if we see path with cost X,
// ending in a compound word, and this method returns
// threshold > 0, then we will also find the 2nd best
// segmentation and if its path score is within this
// threshold of X, we'll include it in the output:
private int computeSecondBestThreshold(int pos, int length) throws IOException {
// TODO: maybe we do something else here, instead of just
// using the penalty...? EG we can be more aggressive on
// when to also test for 2nd best path
return computePenalty(pos, length);
}
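// For example, with the constants above an all-Kanji candidate of length 5 is penalized by
// (5 - 2) * 3000 = 9000, while a 9-character non-Kanji candidate is penalized by
// (9 - 7) * 1700 = 3400.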
private int computePenalty(int pos, int length) throws IOException {
if (length > SEARCH_MODE_KANJI_LENGTH) {
boolean allKanji = true;
// check if node consists of only kanji
final int endPos = pos + length;
for (int pos2 = pos; pos2 < endPos; pos2++) {
if (!characterDefinition.isKanji((char) buffer.get(pos2))) {
allKanji = false;
break;
}
}
if (allKanji) { // Process only Kanji keywords
return (length - SEARCH_MODE_KANJI_LENGTH) * SEARCH_MODE_KANJI_PENALTY;
} else if (length > SEARCH_MODE_OTHER_LENGTH) {
return (length - SEARCH_MODE_OTHER_LENGTH) * SEARCH_MODE_OTHER_PENALTY;
}
}
return 0;
}
// Holds all back pointers arriving to this position:
static final class Position {
int pos;
int count;
// maybe single int array * 5?
int[] costs = new int[8];
int[] lastRightID = new int[8];
int[] backPos = new int[8];
int[] backIndex = new int[8];
int[] backID = new int[8];
Type[] backType = new Type[8];
// Only used when finding 2nd best segmentation under a
// too-long token:
int forwardCount;
int[] forwardPos = new int[8];
int[] forwardID = new int[8];
int[] forwardIndex = new int[8];
Type[] forwardType = new Type[8];
public void grow() {
costs = ArrayUtil.grow(costs, 1 + count);
lastRightID = ArrayUtil.grow(lastRightID, 1 + count);
backPos = ArrayUtil.grow(backPos, 1 + count);
backIndex = ArrayUtil.grow(backIndex, 1 + count);
backID = ArrayUtil.grow(backID, 1 + count);
// NOTE: sneaky: grow separately because
// ArrayUtil.grow will otherwise pick a different
// length than the int[]s we just grew:
final Type[] newBackType = new Type[backID.length];
System.arraycopy(backType, 0, newBackType, 0, backType.length);
backType = newBackType;
}
public void growForward() {
forwardPos = ArrayUtil.grow(forwardPos, 1 + forwardCount);
forwardID = ArrayUtil.grow(forwardID, 1 + forwardCount);
forwardIndex = ArrayUtil.grow(forwardIndex, 1 + forwardCount);
// NOTE: sneaky: grow separately because
// ArrayUtil.grow will otherwise pick a different
// length than the int[]s we just grew:
final Type[] newForwardType = new Type[forwardPos.length];
System.arraycopy(forwardType, 0, newForwardType, 0, forwardType.length);
forwardType = newForwardType;
}
public void add(
int cost, int lastRightID, int backPos, int backIndex, int backID, Type backType) {
// NOTE: this isn't quite a true Viterbi search,
// because we should check if lastRightID is
// already present here, and only update if the new
// cost is less than the current cost, instead of
// simply appending. However, that will likely hurt
// performance (usually we add a lastRightID only once),
// and it means we actually create the full graph
// intersection instead of a "normal" Viterbi lattice:
if (count == costs.length) {
grow();
}
this.costs[count] = cost;
this.lastRightID[count] = lastRightID;
this.backPos[count] = backPos;
this.backIndex[count] = backIndex;
this.backID[count] = backID;
this.backType[count] = backType;
count++;
}
public void addForward(int forwardPos, int forwardIndex, int forwardID, Type forwardType) {
if (forwardCount == this.forwardID.length) {
growForward();
}
this.forwardPos[forwardCount] = forwardPos;
this.forwardIndex[forwardCount] = forwardIndex;
this.forwardID[forwardCount] = forwardID;
this.forwardType[forwardCount] = forwardType;
forwardCount++;
}
public void reset() {
count = 0;
// forwardCount naturally resets after it runs:
assert forwardCount == 0 : "pos=" + pos + " forwardCount=" + forwardCount;
}
}
private void add(
Dictionary dict, Position fromPosData, int endPos, int wordID, Type type, boolean addPenalty)
throws IOException {
final int wordCost = dict.getWordCost(wordID);
final int leftID = dict.getLeftId(wordID);
int leastCost = Integer.MAX_VALUE;
int leastIDX = -1;
assert fromPosData.count > 0;
for (int idx = 0; idx < fromPosData.count; idx++) {
// Cost is path cost so far, plus word cost (added at
// end of loop), plus bigram cost:
final int cost = fromPosData.costs[idx] + costs.get(fromPosData.lastRightID[idx], leftID);
if (VERBOSE) {
System.out.println(
" fromIDX="
+ idx
+ ": cost="
+ cost
+ " (prevCost="
+ fromPosData.costs[idx]
+ " wordCost="
+ wordCost
+ " bgCost="
+ costs.get(fromPosData.lastRightID[idx], leftID)
+ " leftID="
+ leftID
+ ")");
}
if (cost < leastCost) {
leastCost = cost;
leastIDX = idx;
if (VERBOSE) {
System.out.println(" **");
}
}
}
leastCost += wordCost;
if (VERBOSE) {
System.out.println(
" + cost="
+ leastCost
+ " wordID="
+ wordID
+ " leftID="
+ leftID
+ " leastIDX="
+ leastIDX
+ " toPos="
+ endPos
+ " toPos.idx="
+ positions.get(endPos).count);
}
if (addPenalty && type != Type.USER) {
final int penalty = computePenalty(fromPosData.pos, endPos - fromPosData.pos);
if (VERBOSE) {
if (penalty > 0) {
System.out.println(" + penalty=" + penalty + " cost=" + (leastCost + penalty));
}
}
leastCost += penalty;
}
// positions.get(endPos).add(leastCost, dict.getRightId(wordID), fromPosData.pos, leastIDX,
// wordID, type);
assert leftID == dict.getRightId(wordID);
positions.get(endPos).add(leastCost, leftID, fromPosData.pos, leastIDX, wordID, type);
}
@Override
public boolean incrementToken() throws IOException {
// parse() is able to return w/o producing any new
// tokens, when the tokens it had produced were entirely
// punctuation. So we loop here until we get a real
// token or we end:
while (pending.size() == 0) {
if (end) {
return false;
}
// Push Viterbi forward some more:
parse();
}
final Token token = pending.remove(pending.size() - 1);
int position = token.getPosition();
int length = token.getLength();
clearAttributes();
assert length > 0;
// System.out.println("off=" + token.getOffset() + " len=" + length + " vs " +
// token.getSurfaceForm().length);
termAtt.copyBuffer(token.getSurfaceForm(), token.getOffset(), length);
offsetAtt.setOffset(correctOffset(position), correctOffset(position + length));
basicFormAtt.setToken(token);
posAtt.setToken(token);
readingAtt.setToken(token);
inflectionAtt.setToken(token);
if (token.getPosition() == lastTokenPos) {
posIncAtt.setPositionIncrement(0);
posLengthAtt.setPositionLength(token.getPositionLength());
} else if (outputNBest) {
// The position length is always calculated if outputNBest is true.
assert token.getPosition() > lastTokenPos;
posIncAtt.setPositionIncrement(1);
posLengthAtt.setPositionLength(token.getPositionLength());
} else {
assert token.getPosition() > lastTokenPos;
posIncAtt.setPositionIncrement(1);
posLengthAtt.setPositionLength(1);
}
if (VERBOSE) {
System.out.println(Thread.currentThread().getName() + ": incToken: return token=" + token);
}
lastTokenPos = token.getPosition();
return true;
}
// TODO: make generic'd version of this "circular array"?
// It's a bit tricky because we do things to the Position
// (eg, set .pos = N on reuse)...
static final class WrappedPositionArray {
private Position[] positions = new Position[8];
public WrappedPositionArray() {
for (int i = 0; i < positions.length; i++) {
positions[i] = new Position();
}
}
// Next array index to write to in positions:
private int nextWrite;
// Next position to write:
private int nextPos;
// How many valid Position instances are held in the
// positions array:
private int count;
public void reset() {
nextWrite--;
while (count > 0) {
if (nextWrite == -1) {
nextWrite = positions.length - 1;
}
positions[nextWrite--].reset();
count--;
}
nextWrite = 0;
nextPos = 0;
count = 0;
}
/**
* Get Position instance for this absolute position; this is allowed to be arbitrarily far "in
* the future" but cannot be before the last freeBefore.
*/
public Position get(int pos) {
while (pos >= nextPos) {
// System.out.println("count=" + count + " vs len=" + positions.length);
if (count == positions.length) {
Position[] newPositions =
new Position[ArrayUtil.oversize(1 + count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
// System.out.println("grow positions " + newPositions.length);
System.arraycopy(positions, nextWrite, newPositions, 0, positions.length - nextWrite);
System.arraycopy(positions, 0, newPositions, positions.length - nextWrite, nextWrite);
for (int i = positions.length; i < newPositions.length; i++) {
newPositions[i] = new Position();
}
nextWrite = positions.length;
positions = newPositions;
}
if (nextWrite == positions.length) {
nextWrite = 0;
}
// Should have already been reset:
assert positions[nextWrite].count == 0;
positions[nextWrite++].pos = nextPos++;
count++;
}
assert inBounds(pos);
final int index = getIndex(pos);
assert positions[index].pos == pos;
return positions[index];
}
public int getNextPos() {
return nextPos;
}
// For assert:
private boolean inBounds(int pos) {
return pos < nextPos && pos >= nextPos - count;
}
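// Maps an absolute position to its slot in the circular array. For example, with
// positions.length == 8, nextPos == 10 and nextWrite == 2, pos == 7 maps to
// 2 - (10 - 7) = -1, which wraps around to index 7.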
private int getIndex(int pos) {
int index = nextWrite - (nextPos - pos);
if (index < 0) {
index += positions.length;
}
return index;
}
public void freeBefore(int pos) {
final int toFree = count - (nextPos - pos);
assert toFree >= 0;
assert toFree <= count;
int index = nextWrite - count;
if (index < 0) {
index += positions.length;
}
for (int i = 0; i < toFree; i++) {
if (index == positions.length) {
index = 0;
}
// System.out.println(" fb idx=" + index);
positions[index].reset();
index++;
}
count -= toFree;
}
}
/* Incrementally parse some more characters. This runs
* the viterbi search forwards "enough" so that we
* generate some more tokens. How much forward depends on
* the chars coming in, since some chars could cause
* longer-lasting ambiguity in the parsing. Once the
* ambiguity is resolved, then we back trace, produce
* the pending tokens, and return. */
private void parse() throws IOException {
if (VERBOSE) {
System.out.println("\nPARSE");
}
// Index of the last character of unknown word:
int unknownWordEndIndex = -1;
// Advances over each position (character):
while (true) {
if (buffer.get(pos) == -1) {
// End
break;
}
final Position posData = positions.get(pos);
final boolean isFrontier = positions.getNextPos() == pos + 1;
if (posData.count == 0) {
// No arcs arrive here; move to next position:
if (VERBOSE) {
System.out.println(" no arcs in; skip pos=" + pos);
}
pos++;
continue;
}
if (pos > lastBackTracePos && posData.count == 1 && isFrontier) {
// if (pos > lastBackTracePos && posData.count == 1 && isFrontier) {
// We are at a "frontier", and only one node is
// alive, so whatever the eventual best path is must
// come through this node. So we can safely commit
// to the prefix of the best path at this point:
if (outputNBest) {
backtraceNBest(posData, false);
}
backtrace(posData, 0);
if (outputNBest) {
fixupPendingList();
}
// Re-base cost so we don't risk int overflow:
posData.costs[0] = 0;
if (pending.size() != 0) {
return;
} else {
// This means the backtrace only produced
// punctuation tokens, so we must keep parsing.
}
}
if (pos - lastBackTracePos >= MAX_BACKTRACE_GAP) {
// Safety: if we've buffered too much, force a
// backtrace now. We find the least-cost partial
// path, across all paths, backtrace from it, and
// then prune all others. Note that this, in
// general, can produce the wrong result, if the
// total best path did not in fact back trace
// through this partial best path. But it's the
// best we can do... (short of not having a
// safety!).
// First pass: find least cost partial path so far,
// including ending at future positions:
int leastIDX = -1;
int leastCost = Integer.MAX_VALUE;
Position leastPosData = null;
for (int pos2 = pos; pos2 < positions.getNextPos(); pos2++) {
final Position posData2 = positions.get(pos2);
for (int idx = 0; idx < posData2.count; idx++) {
// System.out.println(" idx=" + idx + " cost=" + cost);
final int cost = posData2.costs[idx];
if (cost < leastCost) {
leastCost = cost;
leastIDX = idx;
leastPosData = posData2;
}
}
}
// We will always have at least one live path:
assert leastIDX != -1;
if (outputNBest) {
backtraceNBest(leastPosData, false);
}
// Second pass: prune all but the best path:
for (int pos2 = pos; pos2 < positions.getNextPos(); pos2++) {
final Position posData2 = positions.get(pos2);
if (posData2 != leastPosData) {
posData2.reset();
} else {
if (leastIDX != 0) {
posData2.costs[0] = posData2.costs[leastIDX];
posData2.lastRightID[0] = posData2.lastRightID[leastIDX];
posData2.backPos[0] = posData2.backPos[leastIDX];
posData2.backIndex[0] = posData2.backIndex[leastIDX];
posData2.backID[0] = posData2.backID[leastIDX];
posData2.backType[0] = posData2.backType[leastIDX];
}
posData2.count = 1;
}
}
backtrace(leastPosData, 0);
if (outputNBest) {
fixupPendingList();
}
// Re-base cost so we don't risk int overflow:
Arrays.fill(leastPosData.costs, 0, leastPosData.count, 0);
if (pos != leastPosData.pos) {
// We jumped into a future position:
assert pos < leastPosData.pos;
pos = leastPosData.pos;
}
if (pending.size() != 0) {
return;
} else {
// This means the backtrace only produced
// punctuation tokens, so we must keep parsing.
continue;
}
}
if (VERBOSE) {
System.out.println(
"\n extend @ pos="
+ pos
+ " char="
+ (char) buffer.get(pos)
+ " hex="
+ Integer.toHexString(buffer.get(pos)));
}
if (VERBOSE) {
System.out.println(" " + posData.count + " arcs in");
}
boolean anyMatches = false;
// First try user dict:
if (userFST != null) {
userFST.getFirstArc(arc);
int output = 0;
for (int posAhead = posData.pos; ; posAhead++) {
final int ch = buffer.get(posAhead);
if (ch == -1) {
break;
}
if (userFST.findTargetArc(ch, arc, arc, posAhead == posData.pos, userFSTReader) == null) {
break;
}
output += arc.output().intValue();
if (arc.isFinal()) {
if (VERBOSE) {
System.out.println(
" USER word "
+ new String(buffer.get(pos, posAhead - pos + 1))
+ " toPos="
+ (posAhead + 1));
}
add(
userDictionary,
posData,
posAhead + 1,
output + arc.nextFinalOutput().intValue(),
Type.USER,
false);
anyMatches = true;
}
}
}
// TODO: we can be more aggressive about user
// matches? if we are "under" a user match then don't
// extend KNOWN/UNKNOWN paths?
if (!anyMatches) {
// Next, try known dictionary matches
fst.getFirstArc(arc);
int output = 0;
for (int posAhead = posData.pos; ; posAhead++) {
final int ch = buffer.get(posAhead);
if (ch == -1) {
break;
}
// System.out.println(" match " + (char) ch + " posAhead=" + posAhead);
if (fst.findTargetArc(ch, arc, arc, posAhead == posData.pos, fstReader) == null) {
break;
}
output += arc.output().intValue();
// Optimization: for known words that are too-long
// (compound), we should pre-compute the 2nd
// best segmentation and store it in the
// dictionary instead of recomputing it each time a
// match is found.
if (arc.isFinal()) {
dictionary.lookupWordIds(output + arc.nextFinalOutput().intValue(), wordIdRef);
if (VERBOSE) {
System.out.println(
" KNOWN word "
+ new String(buffer.get(pos, posAhead - pos + 1))
+ " toPos="
+ (posAhead + 1)
+ " "
+ wordIdRef.length
+ " wordIDs");
}
for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
add(
dictionary,
posData,
posAhead + 1,
wordIdRef.ints[wordIdRef.offset + ofs],
Type.KNOWN,
false);
anyMatches = true;
}
}
}
}
// In normal mode, unknown words are not processed greedily.
if (!searchMode && unknownWordEndIndex > posData.pos) {
pos++;
continue;
}
final char firstCharacter = (char) buffer.get(pos);
if (!anyMatches || characterDefinition.isInvoke(firstCharacter)) {
// Find unknown match:
final int characterId = characterDefinition.getCharacterClass(firstCharacter);
final boolean isPunct = isPunctuation(firstCharacter);
// NOTE: copied from UnknownDictionary.lookup:
int unknownWordLength;
if (!characterDefinition.isGroup(firstCharacter)) {
unknownWordLength = 1;
} else {
// Extract unknown word. Characters with the same character class are considered to be
// part of the unknown word.
unknownWordLength = 1;
for (int posAhead = pos + 1; unknownWordLength < MAX_UNKNOWN_WORD_LENGTH; posAhead++) {
final int ch = buffer.get(posAhead);
if (ch == -1) {
break;
}
if (characterId == characterDefinition.getCharacterClass((char) ch)
&& isPunctuation((char) ch) == isPunct) {
unknownWordLength++;
} else {
break;
}
}
}
unkDictionary.lookupWordIds(
characterId, wordIdRef); // characters in input text are supposed to be the same
if (VERBOSE) {
System.out.println(
" UNKNOWN word len=" + unknownWordLength + " " + wordIdRef.length + " wordIDs");
}
for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
add(
unkDictionary,
posData,
posData.pos + unknownWordLength,
wordIdRef.ints[wordIdRef.offset + ofs],
Type.UNKNOWN,
false);
}
unknownWordEndIndex = posData.pos + unknownWordLength;
}
pos++;
}
end = true;
if (pos > 0) {
final Position endPosData = positions.get(pos);
int leastCost = Integer.MAX_VALUE;
int leastIDX = -1;
if (VERBOSE) {
System.out.println(" end: " + endPosData.count + " nodes");
}
for (int idx = 0; idx < endPosData.count; idx++) {
// Add EOS cost:
final int cost = endPosData.costs[idx] + costs.get(endPosData.lastRightID[idx], 0);
// System.out.println(" idx=" + idx + " cost=" + cost + " (pathCost=" +
// endPosData.costs[idx] + " bgCost=" + costs.get(endPosData.lastRightID[idx], 0) + ")
// backPos=" + endPosData.backPos[idx]);
if (cost < leastCost) {
leastCost = cost;
leastIDX = idx;
}
}
if (outputNBest) {
backtraceNBest(endPosData, true);
}
backtrace(endPosData, leastIDX);
if (outputNBest) {
fixupPendingList();
}
} else {
// No characters in the input string; return no tokens!
}
}
// Eliminates arcs from the lattice that are compound
// tokens (have a penalty) or are not congruent with the
// compound token we've matched (ie, span across the
// startPos). This should be fairly efficient, because we
// just keep the already intersected structure of the
// graph, eg we don't have to consult the FSTs again:
private void pruneAndRescore(int startPos, int endPos, int bestStartIDX) throws IOException {
if (VERBOSE) {
System.out.println(
" pruneAndRescore startPos="
+ startPos
+ " endPos="
+ endPos
+ " bestStartIDX="
+ bestStartIDX);
}
// First pass: walk backwards, building up the forward
// arcs and pruning inadmissible arcs:
for (int pos = endPos; pos > startPos; pos--) {
final Position posData = positions.get(pos);
if (VERBOSE) {
System.out.println(" back pos=" + pos);
}
for (int arcIDX = 0; arcIDX < posData.count; arcIDX++) {
final int backPos = posData.backPos[arcIDX];
if (backPos >= startPos) {
// Keep this arc:
// System.out.println(" keep backPos=" + backPos);
positions
.get(backPos)
.addForward(pos, arcIDX, posData.backID[arcIDX], posData.backType[arcIDX]);
} else {
if (VERBOSE) {
System.out.println(" prune");
}
}
}
if (pos != startPos) {
posData.count = 0;
}
}
// Second pass: walk forward, re-scoring:
for (int pos = startPos; pos < endPos; pos++) {
final Position posData = positions.get(pos);
if (VERBOSE) {
System.out.println(" forward pos=" + pos + " count=" + posData.forwardCount);
}
if (posData.count == 0) {
// No arcs arrive here...
if (VERBOSE) {
System.out.println(" skip");
}
posData.forwardCount = 0;
continue;
}
if (pos == startPos) {
// On the initial position, only consider the best
// path so we "force congruence": the
// sub-segmentation is "in context" of what the best
// path (compound token) had matched:
final int rightID;
if (startPos == 0) {
rightID = 0;
} else {
rightID =
getDict(posData.backType[bestStartIDX]).getRightId(posData.backID[bestStartIDX]);
}
final int pathCost = posData.costs[bestStartIDX];
for (int forwardArcIDX = 0; forwardArcIDX < posData.forwardCount; forwardArcIDX++) {
final Type forwardType = posData.forwardType[forwardArcIDX];
final Dictionary dict2 = getDict(forwardType);
final int wordID = posData.forwardID[forwardArcIDX];
final int toPos = posData.forwardPos[forwardArcIDX];
final int newCost =
pathCost
+ dict2.getWordCost(wordID)
+ costs.get(rightID, dict2.getLeftId(wordID))
+ computePenalty(pos, toPos - pos);
if (VERBOSE) {
System.out.println(
" + "
+ forwardType
+ " word "
+ new String(buffer.get(pos, toPos - pos))
+ " toPos="
+ toPos
+ " cost="
+ newCost
+ " penalty="
+ computePenalty(pos, toPos - pos)
+ " toPos.idx="
+ positions.get(toPos).count);
}
positions
.get(toPos)
.add(newCost, dict2.getRightId(wordID), pos, bestStartIDX, wordID, forwardType);
}
} else {
// On non-initial positions, we maximize score
// across all arriving lastRightIDs:
for (int forwardArcIDX = 0; forwardArcIDX < posData.forwardCount; forwardArcIDX++) {
final Type forwardType = posData.forwardType[forwardArcIDX];
final int toPos = posData.forwardPos[forwardArcIDX];
if (VERBOSE) {
System.out.println(
" + "
+ forwardType
+ " word "
+ new String(buffer.get(pos, toPos - pos))
+ " toPos="
+ toPos);
}
add(
getDict(forwardType),
posData,
toPos,
posData.forwardID[forwardArcIDX],
forwardType,
true);
}
}
posData.forwardCount = 0;
}
}
// yet another lattice data structure
private static final class Lattice {
char[] fragment;
EnumMap<Type, Dictionary> dictionaryMap;
boolean useEOS;
int rootCapacity = 0;
int rootSize = 0;
int rootBase = 0;
// Root pointers of the node chains (linked via nodeLeftChain) that share the same start offset.
int[] lRoot;
// Root pointers of the node chains (linked via nodeRightChain) that share the same end offset.
int[] rRoot;
int capacity = 0;
int nodeCount = 0;
// The arrays below hold per-node lattice data, indexed by node number.
Type[] nodeDicType;
int[] nodeWordID;
// nodeMark - -1:excluded, 0:unused, 1:bestpath, 2:2-best-path, ... N:N-best-path
int[] nodeMark;
int[] nodeLeftID;
int[] nodeRightID;
int[] nodeWordCost;
int[] nodeLeftCost;
int[] nodeRightCost;
// nodeLeftNode, nodeRightNode - left/right node numbers on the minimum cost path.
int[] nodeLeftNode;
int[] nodeRightNode;
// nodeLeft, nodeRight - start/end offset
int[] nodeLeft;
int[] nodeRight;
int[] nodeLeftChain;
int[] nodeRightChain;
private void setupRoot(int baseOffset, int lastOffset) {
assert baseOffset <= lastOffset;
int size = lastOffset - baseOffset + 1;
if (rootCapacity < size) {
int oversize = ArrayUtil.oversize(size, Integer.BYTES);
lRoot = new int[oversize];
rRoot = new int[oversize];
rootCapacity = oversize;
}
Arrays.fill(lRoot, 0, size, -1);
Arrays.fill(rRoot, 0, size, -1);
rootSize = size;
rootBase = baseOffset;
}
// Reserve at least N nodes.
private void reserve(int n) {
if (capacity < n) {
int oversize = ArrayUtil.oversize(n, Integer.BYTES);
nodeDicType = new Type[oversize];
nodeWordID = new int[oversize];
nodeMark = new int[oversize];
nodeLeftID = new int[oversize];
nodeRightID = new int[oversize];
nodeWordCost = new int[oversize];
nodeLeftCost = new int[oversize];
nodeRightCost = new int[oversize];
nodeLeftNode = new int[oversize];
nodeRightNode = new int[oversize];
nodeLeft = new int[oversize];
nodeRight = new int[oversize];
nodeLeftChain = new int[oversize];
nodeRightChain = new int[oversize];
capacity = oversize;
}
}
private void setupNodePool(int n) {
reserve(n);
nodeCount = 0;
if (VERBOSE) {
System.out.printf("DEBUG: setupNodePool: n = %d\n", n);
System.out.printf("DEBUG: setupNodePool: lattice.capacity = %d\n", capacity);
}
}
private int addNode(Type dicType, int wordID, int left, int right) {
if (VERBOSE) {
System.out.printf(
"DEBUG: addNode: dicType=%s, wordID=%d, left=%d, right=%d, str=%s\n",
dicType.toString(),
wordID,
left,
right,
left == -1 ? "BOS" : right == -1 ? "EOS" : new String(fragment, left, right - left));
}
assert nodeCount < capacity;
assert left == -1 || right == -1 || left < right;
assert left == -1 || (0 <= left && left < rootSize);
assert right == -1 || (0 <= right && right < rootSize);
int node = nodeCount++;
if (VERBOSE) {
System.out.printf("DEBUG: addNode: node=%d\n", node);
}
nodeDicType[node] = dicType;
nodeWordID[node] = wordID;
nodeMark[node] = 0;
if (wordID < 0) {
nodeWordCost[node] = 0;
nodeLeftCost[node] = 0;
nodeRightCost[node] = 0;
nodeLeftID[node] = 0;
nodeRightID[node] = 0;
} else {
Dictionary dic = dictionaryMap.get(dicType);
nodeWordCost[node] = dic.getWordCost(wordID);
nodeLeftID[node] = dic.getLeftId(wordID);
nodeRightID[node] = dic.getRightId(wordID);
}
if (VERBOSE) {
System.out.printf(
"DEBUG: addNode: wordCost=%d, leftID=%d, rightID=%d\n",
nodeWordCost[node], nodeLeftID[node], nodeRightID[node]);
}
nodeLeft[node] = left;
nodeRight[node] = right;
if (0 <= left) {
nodeLeftChain[node] = lRoot[left];
lRoot[left] = node;
} else {
nodeLeftChain[node] = -1;
}
if (0 <= right) {
nodeRightChain[node] = rRoot[right];
rRoot[right] = node;
} else {
nodeRightChain[node] = -1;
}
return node;
}
// Sum of positions.get(i).count in [beg, end) range.
// using stream:
// return IntStream.range(beg, end).map(i -> positions.get(i).count).sum();
private int positionCount(WrappedPositionArray positions, int beg, int end) {
int count = 0;
for (int i = beg; i < end; ++i) {
count += positions.get(i).count;
}
return count;
}
void setup(
char[] fragment,
EnumMap<Type, Dictionary> dictionaryMap,
WrappedPositionArray positions,
int prevOffset,
int endOffset,
boolean useEOS) {
assert positions.get(prevOffset).count == 1;
if (VERBOSE) {
System.out.printf("DEBUG: setup: prevOffset=%d, endOffset=%d\n", prevOffset, endOffset);
}
this.fragment = fragment;
this.dictionaryMap = dictionaryMap;
this.useEOS = useEOS;
// Initialize lRoot and rRoot.
setupRoot(prevOffset, endOffset);
// "+ 2" for first/last record.
setupNodePool(positionCount(positions, prevOffset + 1, endOffset + 1) + 2);
// substitute for BOS = 0
Position first = positions.get(prevOffset);
if (addNode(first.backType[0], first.backID[0], -1, 0) != 0) {
assert false;
}
// EOS = 1
if (addNode(Type.KNOWN, -1, endOffset - rootBase, -1) != 1) {
assert false;
}
for (int offset = endOffset; prevOffset < offset; --offset) {
int right = offset - rootBase;
// optimize: exclude disconnected nodes.
if (0 <= lRoot[right]) {
Position pos = positions.get(offset);
for (int i = 0; i < pos.count; ++i) {
addNode(pos.backType[i], pos.backID[i], pos.backPos[i] - rootBase, right);
}
}
}
}
// set mark = -1 for unreachable nodes.
void markUnreachable() {
for (int index = 1; index < rootSize - 1; ++index) {
if (rRoot[index] < 0) {
for (int node = lRoot[index]; 0 <= node; node = nodeLeftChain[node]) {
if (VERBOSE) {
System.out.printf("DEBUG: markUnreachable: node=%d\n", node);
}
nodeMark[node] = -1;
}
}
}
}
int connectionCost(ConnectionCosts costs, int left, int right) {
int leftID = nodeLeftID[right];
return ((leftID == 0 && !useEOS) ? 0 : costs.get(nodeRightID[left], leftID));
}
void calcLeftCost(ConnectionCosts costs) {
for (int index = 0; index < rootSize; ++index) {
for (int node = lRoot[index]; 0 <= node; node = nodeLeftChain[node]) {
if (0 <= nodeMark[node]) {
int leastNode = -1;
int leastCost = Integer.MAX_VALUE;
for (int leftNode = rRoot[index]; 0 <= leftNode; leftNode = nodeRightChain[leftNode]) {
if (0 <= nodeMark[leftNode]) {
int cost =
nodeLeftCost[leftNode]
+ nodeWordCost[leftNode]
+ connectionCost(costs, leftNode, node);
if (cost < leastCost) {
leastCost = cost;
leastNode = leftNode;
}
}
}
assert 0 <= leastNode;
nodeLeftNode[node] = leastNode;
nodeLeftCost[node] = leastCost;
if (VERBOSE) {
System.out.printf(
"DEBUG: calcLeftCost: node=%d, leftNode=%d, leftCost=%d\n",
node, nodeLeftNode[node], nodeLeftCost[node]);
}
}
}
}
}
void calcRightCost(ConnectionCosts costs) {
for (int index = rootSize - 1; 0 <= index; --index) {
for (int node = rRoot[index]; 0 <= node; node = nodeRightChain[node]) {
if (0 <= nodeMark[node]) {
int leastNode = -1;
int leastCost = Integer.MAX_VALUE;
for (int rightNode = lRoot[index];
0 <= rightNode;
rightNode = nodeLeftChain[rightNode]) {
if (0 <= nodeMark[rightNode]) {
int cost =
nodeRightCost[rightNode]
+ nodeWordCost[rightNode]
+ connectionCost(costs, node, rightNode);
if (cost < leastCost) {
leastCost = cost;
leastNode = rightNode;
}
}
}
assert 0 <= leastNode;
nodeRightNode[node] = leastNode;
nodeRightCost[node] = leastCost;
if (VERBOSE) {
System.out.printf(
"DEBUG: calcRightCost: node=%d, rightNode=%d, rightCost=%d\n",
node, nodeRightNode[node], nodeRightCost[node]);
}
}
}
}
}
// Mark all nodes that have the same text but different part-of-speech or reading.
void markSameSpanNode(int refNode, int value) {
int left = nodeLeft[refNode];
int right = nodeRight[refNode];
for (int node = lRoot[left]; 0 <= node; node = nodeLeftChain[node]) {
if (nodeRight[node] == right) {
nodeMark[node] = value;
}
}
}
List<Integer> bestPathNodeList() {
List<Integer> list = new ArrayList<>();
for (int node = nodeRightNode[0]; node != 1; node = nodeRightNode[node]) {
list.add(node);
markSameSpanNode(node, 1);
}
return list;
}
private int cost(int node) {
return nodeLeftCost[node] + nodeWordCost[node] + nodeRightCost[node];
}
List<Integer> nBestNodeList(int N) {
List<Integer> list = new ArrayList<>();
int leastCost = Integer.MAX_VALUE;
int leastLeft = -1;
int leastRight = -1;
for (int node = 2; node < nodeCount; ++node) {
if (nodeMark[node] == 0) {
int cost = cost(node);
if (cost < leastCost) {
leastCost = cost;
leastLeft = nodeLeft[node];
leastRight = nodeRight[node];
list.clear();
list.add(node);
} else if (cost == leastCost
&& (nodeLeft[node] != leastLeft || nodeRight[node] != leastRight)) {
list.add(node);
}
}
}
for (int node : list) {
markSameSpanNode(node, N);
}
return list;
}
int bestCost() {
return nodeLeftCost[1];
}
int probeDelta(int start, int end) {
int left = start - rootBase;
int right = end - rootBase;
if (left < 0 || rootSize < right) {
return Integer.MAX_VALUE;
}
int probedCost = Integer.MAX_VALUE;
for (int node = lRoot[left]; 0 <= node; node = nodeLeftChain[node]) {
if (nodeRight[node] == right) {
probedCost = Math.min(probedCost, cost(node));
}
}
return probedCost - bestCost();
}
void debugPrint() {
if (VERBOSE) {
for (int node = 0; node < nodeCount; ++node) {
System.out.printf(
"DEBUG NODE: node=%d, mark=%d, cost=%d, left=%d, right=%d\n",
node, nodeMark[node], cost(node), nodeLeft[node], nodeRight[node]);
}
}
}
}
private Lattice lattice = null;
private void registerNode(int node, char[] fragment) {
int left = lattice.nodeLeft[node];
int right = lattice.nodeRight[node];
Type type = lattice.nodeDicType[node];
if (!discardPunctuation || !isPunctuation(fragment[left])) {
if (type == Type.USER) {
// The code below is based on backtrace().
//
// Expand the phraseID we recorded into the actual segmentation:
final int[] wordIDAndLength = userDictionary.lookupSegmentation(lattice.nodeWordID[node]);
int wordID = wordIDAndLength[0];
pending.add(
new Token(
wordID,
fragment,
left,
right - left,
Type.USER,
lattice.rootBase + left,
userDictionary));
// Output compound
int current = 0;
for (int j = 1; j < wordIDAndLength.length; j++) {
final int len = wordIDAndLength[j];
if (len < right - left) {
pending.add(
new Token(
wordID + j - 1,
fragment,
current + left,
len,
Type.USER,
lattice.rootBase + current + left,
userDictionary));
}
current += len;
}
} else {
pending.add(
new Token(
lattice.nodeWordID[node],
fragment,
left,
right - left,
type,
lattice.rootBase + left,
getDict(type)));
}
}
}
// Sort pending tokens and set their position length values.
private void fixupPendingList() {
// Sort so that duplicate tokens become adjacent.
// A USER token should sort ahead of a normal one.
Collections.sort(
pending,
new Comparator<Token>() {
@Override
public int compare(Token a, Token b) {
int aOff = a.getOffset();
int bOff = b.getOffset();
if (aOff != bOff) {
return aOff - bOff;
}
int aLen = a.getLength();
int bLen = b.getLength();
if (aLen != bLen) {
return aLen - bLen;
}
// order of Type is KNOWN, UNKNOWN, USER,
// so we use reversed comparison here.
return b.getType().ordinal() - a.getType().ordinal();
}
});
// Remove duplicate tokens.
for (int i = 1; i < pending.size(); ++i) {
Token a = pending.get(i - 1);
Token b = pending.get(i);
if (a.getOffset() == b.getOffset() && a.getLength() == b.getLength()) {
pending.remove(i);
// It is important to decrement "i" here, because the next token may also need to be removed.
--i;
}
}
// offset=>position map
HashMap<Integer, Integer> map = new HashMap<>();
for (Token t : pending) {
map.put(t.getOffset(), 0);
map.put(t.getOffset() + t.getLength(), 0);
}
// Get a unique, sorted list of all token edge offsets.
Integer[] offsets = map.keySet().toArray(new Integer[0]);
Arrays.sort(offsets);
// Set each map value to its index in the sorted offsets, i.e. its N-th position from the beginning.
for (int i = 0; i < offsets.length; ++i) {
map.put(offsets[i], i);
}
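// For example, tokens spanning offsets [0,2), [2,5) and the compound [0,5) yield edge
// offsets {0, 2, 5}, mapped to positions {0, 1, 2}; the compound token then gets
// positionLength = map(5) - map(0) = 2 below.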
// Now we can assign each token's position length.
for (Token t : pending) {
t.setPositionLength(map.get(t.getOffset() + t.getLength()) - map.get(t.getOffset()));
}
// Reverse pending so it matches how incrementToken consumes it (last token first).
// As a possible speedup, the sort above could instead be done in reverse order
// at the start of this method.
Collections.reverse(pending);
}
private int probeDelta(String inText, String requiredToken) throws IOException {
int start = inText.indexOf(requiredToken);
if (start < 0) {
// -1 when no requiredToken.
return -1;
}
int delta = Integer.MAX_VALUE;
int saveNBestCost = nBestCost;
setReader(new StringReader(inText));
reset();
try {
setNBestCost(1);
int prevRootBase = -1;
while (incrementToken()) {
if (lattice.rootBase != prevRootBase) {
prevRootBase = lattice.rootBase;
delta = Math.min(delta, lattice.probeDelta(start, start + requiredToken.length()));
}
}
} finally {
// reset & end
end();
// setReader & close
close();
setNBestCost(saveNBestCost);
}
if (VERBOSE) {
System.out.printf("JapaneseTokenizer: delta = %d: %s-%s\n", delta, inText, requiredToken);
}
return delta == Integer.MAX_VALUE ? -1 : delta;
}
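/**
* Expert: compute an N-best cost from examples.
*
* <p>Each example has the form {@code text-token}; multiple examples are separated by {@code
* '/'}. For each example the tokenizer parses {@code text} and measures how far the lattice
* cost must be relaxed for {@code token} to appear in the N-best output; the maximum such
* delta over all examples is returned. A hypothetical call (the example string below is
* illustrative only):
*
* <pre>{@code
* int cost = tokenizer.calcNBestCost("東京スカイツリー-スカイツリー");
* tokenizer.setNBestCost(cost);
* }</pre>
*/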
public int calcNBestCost(String examples) {
int maxDelta = 0;
for (String example : examples.split("/")) {
if (!example.isEmpty()) {
String[] pair = example.split("-");
if (pair.length != 2) {
throw new RuntimeException("Unexpected example form: " + example + " (expected two '-')");
} else {
try {
maxDelta = Math.max(maxDelta, probeDelta(pair[0], pair[1]));
} catch (IOException e) {
throw new RuntimeException(
"Internal error calculating best costs from examples. Got ", e);
}
}
}
}
return maxDelta;
}
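/**
* Expert: set the allowable cost difference for N-best output. Alternate segmentations whose
* lattice cost is within this delta of the best path cost are also emitted; 0 (the default)
* disables N-best output.
*/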
public void setNBestCost(int value) {
nBestCost = value;
outputNBest = 0 < nBestCost;
}
private void backtraceNBest(final Position endPosData, final boolean useEOS) throws IOException {
if (lattice == null) {
lattice = new Lattice();
}
final int endPos = endPosData.pos;
char[] fragment = buffer.get(lastBackTracePos, endPos - lastBackTracePos);
lattice.setup(fragment, dictionaryMap, positions, lastBackTracePos, endPos, useEOS);
lattice.markUnreachable();
lattice.calcLeftCost(costs);
lattice.calcRightCost(costs);
int bestCost = lattice.bestCost();
if (VERBOSE) {
System.out.printf("DEBUG: 1-BEST COST: %d\n", bestCost);
}
for (int node : lattice.bestPathNodeList()) {
registerNode(node, fragment);
}
for (int n = 2; ; ++n) {
List<Integer> nbest = lattice.nBestNodeList(n);
if (nbest.isEmpty()) {
break;
}
int cost = lattice.cost(nbest.get(0));
if (VERBOSE) {
System.out.printf("DEBUG: %d-BEST COST: %d\n", n, cost);
}
if (bestCost + nBestCost < cost) {
break;
}
for (int node : nbest) {
registerNode(node, fragment);
}
}
if (VERBOSE) {
lattice.debugPrint();
}
}
// Backtrace from the provided position, back to the last
// time we back-traced, accumulating the resulting tokens to
// the pending list. The pending list is then in-reverse
// (last token should be returned first).
private void backtrace(final Position endPosData, final int fromIDX) throws IOException {
final int endPos = endPosData.pos;
/**
* LUCENE-10059: If the endPos is the same as lastBackTracePos, we don't want to backtrace to
* avoid an assertion error {@link RollingCharBuffer#get(int)} when it tries to generate an
* empty buffer
*/
if (endPos == lastBackTracePos) {
return;
}
if (VERBOSE) {
System.out.println(
"\n backtrace: endPos="
+ endPos
+ " pos="
+ pos
+ "; "
+ (pos - lastBackTracePos)
+ " characters; last="
+ lastBackTracePos
+ " cost="
+ endPosData.costs[fromIDX]);
}
final char[] fragment = buffer.get(lastBackTracePos, endPos - lastBackTracePos);
if (dotOut != null) {
dotOut.onBacktrace(this, positions, lastBackTracePos, endPosData, fromIDX, fragment, end);
}
int pos = endPos;
int bestIDX = fromIDX;
Token altToken = null;
// We trace backwards, so this will be the leftWordID of
// the token after the one we are now on:
int lastLeftWordID = -1;
int backCount = 0;
// TODO: sort of silly to make Token instances here; the
// back trace has all info needed to generate the
// token. So, we could just directly set the attrs,
// from the backtrace, in incrementToken w/o ever
// creating Token; we'd have to defer calling freeBefore
// until after the backtrace was fully "consumed" by
// incrementToken.
while (pos > lastBackTracePos) {
// System.out.println("BT: back pos=" + pos + " bestIDX=" + bestIDX);
final Position posData = positions.get(pos);
assert bestIDX < posData.count;
int backPos = posData.backPos[bestIDX];
assert backPos >= lastBackTracePos
: "backPos=" + backPos + " vs lastBackTracePos=" + lastBackTracePos;
int length = pos - backPos;
Type backType = posData.backType[bestIDX];
int backID = posData.backID[bestIDX];
int nextBestIDX = posData.backIndex[bestIDX];
if (searchMode && altToken == null && backType != Type.USER) {
// In searchMode, if best path had picked a too-long
// token, we use the "penalty" to compute the allowed
// max cost of an alternate back-trace. If we find an
// alternate back trace with cost below that
// threshold, we pursue it instead (but also output
// the long token).
// System.out.println(" 2nd best backPos=" + backPos + " pos=" + pos);
final int penalty = computeSecondBestThreshold(backPos, pos - backPos);
if (penalty > 0) {
if (VERBOSE) {
System.out.println(
" compound="
+ new String(buffer.get(backPos, pos - backPos))
+ " backPos="
+ backPos
+ " pos="
+ pos
+ " penalty="
+ penalty
+ " cost="
+ posData.costs[bestIDX]
+ " bestIDX="
+ bestIDX
+ " lastLeftID="
+ lastLeftWordID);
}
// Use the penalty to set maxCost on the 2nd best
// segmentation:
int maxCost = posData.costs[bestIDX] + penalty;
if (lastLeftWordID != -1) {
maxCost += costs.get(getDict(backType).getRightId(backID), lastLeftWordID);
}
// Now, prune all too-long tokens from the graph:
pruneAndRescore(backPos, pos, posData.backIndex[bestIDX]);
// Finally, find 2nd best back-trace and resume
// backtrace there:
int leastCost = Integer.MAX_VALUE;
int leastIDX = -1;
for (int idx = 0; idx < posData.count; idx++) {
int cost = posData.costs[idx];
// System.out.println(" idx=" + idx + " prevCost=" + cost);
if (lastLeftWordID != -1) {
cost +=
costs.get(
getDict(posData.backType[idx]).getRightId(posData.backID[idx]),
lastLeftWordID);
// System.out.println(" += bgCost=" +
// costs.get(getDict(posData.backType[idx]).getRightId(posData.backID[idx]),
// lastLeftWordID) + " -> " + cost);
}
// System.out.println("penalty " + posData.backPos[idx] + " to " + pos);
// cost += computePenalty(posData.backPos[idx], pos - posData.backPos[idx]);
if (cost < leastCost) {
// System.out.println(" ** ");
leastCost = cost;
leastIDX = idx;
}
}
// System.out.println(" leastIDX=" + leastIDX);
if (VERBOSE) {
System.out.println(
" afterPrune: "
+ posData.count
+ " arcs arriving; leastCost="
+ leastCost
+ " vs threshold="
+ maxCost
+ " lastLeftWordID="
+ lastLeftWordID);
}
if (leastIDX != -1 && leastCost <= maxCost && posData.backPos[leastIDX] != backPos) {
// We should have pruned the altToken from the graph:
assert posData.backPos[leastIDX] != backPos;
// Save the current compound token, to output when
// this alternate path joins back:
altToken =
new Token(
backID,
fragment,
backPos - lastBackTracePos,
length,
backType,
backPos,
getDict(backType));
// Redirect our backtrace to 2nd best:
bestIDX = leastIDX;
nextBestIDX = posData.backIndex[bestIDX];
backPos = posData.backPos[bestIDX];
length = pos - backPos;
backType = posData.backType[bestIDX];
backID = posData.backID[bestIDX];
backCount = 0;
// System.out.println(" do alt token!");
} else {
// I think in theory it's possible there is no
// 2nd best path, which is fine; in this case we
// only output the compound token:
// System.out.println(" no alt token! bestIDX=" + bestIDX);
}
}
}
final int offset = backPos - lastBackTracePos;
assert offset >= 0;
if (altToken != null && altToken.getPosition() >= backPos) {
if (outputCompounds) {
// We've backtraced to the position where the
// compound token starts; add it now:
// The pruning we did when we created the altToken
// ensures that the back trace will align back with
// the start of the altToken:
assert altToken.getPosition() == backPos : altToken.getPosition() + " vs " + backPos;
// NOTE: not quite right: the compound token may
// have had all punctuation back traced so far, but
// then the decompounded token at this position is
// not punctuation. In this case backCount is 0,
// but we should maybe add the altToken anyway...?
if (backCount > 0) {
backCount++;
altToken.setPositionLength(backCount);
if (VERBOSE) {
System.out.println(" add altToken=" + altToken);
}
pending.add(altToken);
} else {
// This means alt token was all punct tokens:
if (VERBOSE) {
System.out.println(" discard all-punctuation altToken=" + altToken);
}
assert discardPunctuation;
}
}
altToken = null;
}
final Dictionary dict = getDict(backType);
if (backType == Type.USER) {
// Expand the phraseID we recorded into the actual
// segmentation:
final int[] wordIDAndLength = userDictionary.lookupSegmentation(backID);
int wordID = wordIDAndLength[0];
int current = 0;
for (int j = 1; j < wordIDAndLength.length; j++) {
final int len = wordIDAndLength[j];
// System.out.println(" add user: len=" + len);
pending.add(
new Token(
wordID + j - 1,
fragment,
current + offset,
len,
Type.USER,
current + backPos,
dict));
if (VERBOSE) {
System.out.println(" add USER token=" + pending.get(pending.size() - 1));
}
current += len;
}
// Reverse the tokens we just added, because when we
// serve them up from incrementToken we serve in
// reverse:
Collections.reverse(
pending.subList(pending.size() - (wordIDAndLength.length - 1), pending.size()));
backCount += wordIDAndLength.length - 1;
} else {
if (extendedMode && backType == Type.UNKNOWN) {
// In EXTENDED mode we convert unknown word into
// unigrams:
int unigramTokenCount = 0;
for (int i = length - 1; i >= 0; i--) {
int charLen = 1;
if (i > 0 && Character.isLowSurrogate(fragment[offset + i])) {
i--;
charLen = 2;
}
// System.out.println(" extended tok offset="
// + (offset + i));
if (!discardPunctuation || !isPunctuation(fragment[offset + i])) {
pending.add(
new Token(
CharacterDefinition.NGRAM,
fragment,
offset + i,
charLen,
Type.UNKNOWN,
backPos + i,
unkDictionary));
unigramTokenCount++;
}
}
backCount += unigramTokenCount;
} else if (!discardPunctuation || length == 0 || !isPunctuation(fragment[offset])) {
pending.add(new Token(backID, fragment, offset, length, backType, backPos, dict));
if (VERBOSE) {
System.out.println(" add token=" + pending.get(pending.size() - 1));
}
backCount++;
} else {
if (VERBOSE) {
System.out.println(
" skip punctuation token=" + new String(fragment, offset, length));
}
}
}
lastLeftWordID = dict.getLeftId(backID);
pos = backPos;
bestIDX = nextBestIDX;
}
lastBackTracePos = endPos;
if (VERBOSE) {
System.out.println(" freeBefore pos=" + endPos);
}
// Notify the circular buffers that we are done with
// these positions:
buffer.freeBefore(endPos);
positions.freeBefore(endPos);
}
Dictionary getDict(Type type) {
return dictionaryMap.get(type);
}
private static boolean isPunctuation(char ch) {
switch (Character.getType(ch)) {
case Character.SPACE_SEPARATOR:
case Character.LINE_SEPARATOR:
case Character.PARAGRAPH_SEPARATOR:
case Character.CONTROL:
case Character.FORMAT:
case Character.DASH_PUNCTUATION:
case Character.START_PUNCTUATION:
case Character.END_PUNCTUATION:
case Character.CONNECTOR_PUNCTUATION:
case Character.OTHER_PUNCTUATION:
case Character.MATH_SYMBOL:
case Character.CURRENCY_SYMBOL:
case Character.MODIFIER_SYMBOL:
case Character.OTHER_SYMBOL:
case Character.INITIAL_QUOTE_PUNCTUATION:
case Character.FINAL_QUOTE_PUNCTUATION:
return true;
default:
return false;
}
}
}