org.apache.lucene.analysis.ja.JapaneseTokenizer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-analyzers-kuromoji Show documentation
Lucene Kuromoji Japanese Morphological Analyzer
There is a newer version: 8.11.4
Show newest version
package org.apache.lucene.analysis.ja;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumMap;
import java.util.List;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ja.dict.CharacterDefinition;
import org.apache.lucene.analysis.ja.dict.ConnectionCosts;
import org.apache.lucene.analysis.ja.dict.Dictionary;
import org.apache.lucene.analysis.ja.dict.TokenInfoDictionary;
import org.apache.lucene.analysis.ja.dict.TokenInfoFST;
import org.apache.lucene.analysis.ja.dict.UnknownDictionary;
import org.apache.lucene.analysis.ja.dict.UserDictionary;
import org.apache.lucene.analysis.ja.tokenattributes.*;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.util.RollingCharBuffer;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.fst.FST;

// TODO: somehow factor out a reusable viterbi search here,
// so other decompounders/tokenizers can reuse...

/**
 * Tokenizer for Japanese that uses morphological analysis.
 * 
 * This tokenizer sets a number of additional attributes:
 * 

 *   {@link BaseFormAttribute} containing base form for inflected
 *       adjectives and verbs.
 *   
{@link PartOfSpeechAttribute} containing part-of-speech.
 *   
{@link ReadingAttribute} containing reading and pronunciation.
 *   
{@link InflectionAttribute} containing additional part-of-speech
 *       information for inflected forms.
 * 
 * 
 * This tokenizer uses a rolling Viterbi search to find the 
 * least cost segmentation (path) of the incoming characters.  
 * For tokens that appear to be compound (> length 2 for all
 * Kanji, or > length 7 for non-Kanji), we see if there is a
 * 2nd best segmentation of that token after applying
 * penalties to the long tokens.  If so, and the Mode is
 * {@link Mode#SEARCH}, we output the alternate segmentation 
 * as well.
 */
public final class JapaneseTokenizer extends Tokenizer {

  /**
   * Tokenization mode: this determines how the tokenizer handles
   * compound and unknown words.
   */
  public static enum Mode {
    /**
     * Ordinary segmentation: no decomposition for compounds,
     */
    NORMAL, 

    /**
     * Segmentation geared towards search: this includes a 
     * decompounding process for long nouns, also including
     * the full compound token as a synonym.
     */
    SEARCH, 

    /**
     * Extended mode outputs unigrams for unknown words.
     * @lucene.experimental
     */
    EXTENDED
  }

  /**
   * Default tokenization mode. Currently this is {@link Mode#SEARCH}.
   */
  public static final Mode DEFAULT_MODE = Mode.SEARCH;

  /**
   * Token type reflecting the original source of this token
   */
  public enum Type {
    /**
     * Known words from the system dictionary.
     */
    KNOWN,
    /**
     * Unknown words (heuristically segmented).
     */
    UNKNOWN,
    /**
     * Known words from the user dictionary.
     */
    USER
  }

  private static final boolean VERBOSE = false;

  private static final int SEARCH_MODE_KANJI_LENGTH = 2;

  private static final int SEARCH_MODE_OTHER_LENGTH = 7; // Must be >= SEARCH_MODE_KANJI_LENGTH

  private static final int SEARCH_MODE_KANJI_PENALTY = 3000;

  private static final int SEARCH_MODE_OTHER_PENALTY = 1700;

  // For safety:
  private static final int MAX_UNKNOWN_WORD_LENGTH = 1024;
  private static final int MAX_BACKTRACE_GAP = 1024;

  private final EnumMap dictionaryMap = new EnumMap(Type.class);

  private final TokenInfoFST fst;
  private final TokenInfoDictionary dictionary;
  private final UnknownDictionary unkDictionary;
  private final ConnectionCosts costs;
  private final UserDictionary userDictionary;
  private final CharacterDefinition characterDefinition;

  private final FST.Arc arc = new FST.Arc();
  private final FST.BytesReader fstReader;
  private final IntsRef wordIdRef = new IntsRef();

  private final FST.BytesReader userFSTReader;
  private final TokenInfoFST userFST;

  private final RollingCharBuffer buffer = new RollingCharBuffer();

  private final WrappedPositionArray positions = new WrappedPositionArray();

  private final boolean discardPunctuation;
  private final boolean searchMode;
  private final boolean extendedMode;
  private final boolean outputCompounds;

  // Index of the last character of unknown word:
  private int unknownWordEndIndex = -1;

  // True once we've hit the EOF from the input reader:
  private boolean end;

  // Last absolute position we backtraced from:
  private int lastBackTracePos;

  // Position of last token we returned; we use this to
  // figure out whether to set posIncr to 0 or 1:
  private int lastTokenPos;

  // Next absolute position to process:
  private int pos;

  // Already parsed, but not yet passed to caller, tokens:
  private final List pending = new ArrayList();

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
  private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
  private final BaseFormAttribute basicFormAtt = addAttribute(BaseFormAttribute.class);
  private final PartOfSpeechAttribute posAtt = addAttribute(PartOfSpeechAttribute.class);
  private final ReadingAttribute readingAtt = addAttribute(ReadingAttribute.class);
  private final InflectionAttribute inflectionAtt = addAttribute(InflectionAttribute.class);

  /**
   * Create a new JapaneseTokenizer.
   * 
   * Uses the default AttributeFactory.
   * 
   * @param input Reader containing text
   * @param userDictionary Optional: if non-null, user dictionary.
   * @param discardPunctuation true if punctuation tokens should be dropped from the output.
   * @param mode tokenization mode.
   */
  public JapaneseTokenizer(Reader input, UserDictionary userDictionary, boolean discardPunctuation, Mode mode) {
    this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input, userDictionary, discardPunctuation, mode);
  }

  /**
   * Create a new JapaneseTokenizer.
   *
   * @param factory the AttributeFactory to use
   * @param input Reader containing text
   * @param userDictionary Optional: if non-null, user dictionary.
   * @param discardPunctuation true if punctuation tokens should be dropped from the output.
   * @param mode tokenization mode.
   */
  public JapaneseTokenizer
      (AttributeFactory factory, Reader input, UserDictionary userDictionary, boolean discardPunctuation, Mode mode) {
    super(factory, input);
    dictionary = TokenInfoDictionary.getInstance();
    fst = dictionary.getFST();
    unkDictionary = UnknownDictionary.getInstance();
    characterDefinition = unkDictionary.getCharacterDefinition();
    this.userDictionary = userDictionary;
    costs = ConnectionCosts.getInstance();
    fstReader = fst.getBytesReader();
    if (userDictionary != null) {
      userFST = userDictionary.getFST();
      userFSTReader = userFST.getBytesReader();
    } else {
      userFST = null;
      userFSTReader = null;
    }
    this.discardPunctuation = discardPunctuation;
    switch(mode){
      case SEARCH:
        searchMode = true;
        extendedMode = false;
        outputCompounds = true;
        break;
      case EXTENDED:
        searchMode = true;
        extendedMode = true;
        outputCompounds = false;
        break;
      default:
        searchMode = false;
        extendedMode = false;
        outputCompounds = false;
        break;
    }
    buffer.reset(this.input);

    resetState();

    dictionaryMap.put(Type.KNOWN, dictionary);
    dictionaryMap.put(Type.UNKNOWN, unkDictionary);
    dictionaryMap.put(Type.USER, userDictionary);
  }

  private GraphvizFormatter dotOut;

  /** Expert: set this to produce graphviz (dot) output of
   *  the Viterbi lattice */
  public void setGraphvizFormatter(GraphvizFormatter dotOut) {
    this.dotOut = dotOut;
  }

  @Override
  public void close() throws IOException {
    super.close();
    buffer.reset(input);
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    buffer.reset(input);
    resetState();
  }

  private void resetState() {
    positions.reset();
    unknownWordEndIndex = -1;
    pos = 0;
    end = false;
    lastBackTracePos = 0;
    lastTokenPos = -1;
    pending.clear();

    // Add BOS:
    positions.get(0).add(0, 0, -1, -1, -1, Type.KNOWN);
  }

  @Override
  public void end() throws IOException {
    super.end();
    // Set final offset
    int finalOffset = correctOffset(pos);
    offsetAtt.setOffset(finalOffset, finalOffset);
  }

  // Returns the added cost that a 2nd best segmentation is
  // allowed to have.  Ie, if we see path with cost X,
  // ending in a compound word, and this method returns
  // threshold > 0, then we will also find the 2nd best
  // segmentation and if its path score is within this
  // threshold of X, we'll include it in the output:
  private int computeSecondBestThreshold(int pos, int length) throws IOException {
    // TODO: maybe we do something else here, instead of just
    // using the penalty...?  EG we can be more aggressive on
    // when to also test for 2nd best path
    return computePenalty(pos, length);
  }

  private int computePenalty(int pos, int length) throws IOException {
    if (length > SEARCH_MODE_KANJI_LENGTH) {
      boolean allKanji = true;
      // check if node consists of only kanji
      final int endPos = pos + length;
      for (int pos2 = pos; pos2 < endPos; pos2++) {
        if (!characterDefinition.isKanji((char) buffer.get(pos2))) {
          allKanji = false;
          break;
        }
      }
      if (allKanji) {  // Process only Kanji keywords
        return (length - SEARCH_MODE_KANJI_LENGTH) * SEARCH_MODE_KANJI_PENALTY;
      } else if (length > SEARCH_MODE_OTHER_LENGTH) {
        return (length - SEARCH_MODE_OTHER_LENGTH) * SEARCH_MODE_OTHER_PENALTY;
      }
    }
    return 0;
  }

  // Holds all back pointers arriving to this position:
  final static class Position {

    int pos;

    int count;

    // maybe single int array * 5?
    int[] costs = new int[8];
    int[] lastRightID = new int[8];
    int[] backPos = new int[8];
    int[] backIndex = new int[8];
    int[] backID = new int[8];
    Type[] backType = new Type[8];

    // Only used when finding 2nd best segmentation under a
    // too-long token:
    int forwardCount;
    int[] forwardPos = new int[8];
    int[] forwardID = new int[8];
    int[] forwardIndex = new int[8];
    Type[] forwardType = new Type[8];

    public void grow() {
      costs = ArrayUtil.grow(costs, 1+count);
      lastRightID = ArrayUtil.grow(lastRightID, 1+count);
      backPos = ArrayUtil.grow(backPos, 1+count);
      backIndex = ArrayUtil.grow(backIndex, 1+count);
      backID = ArrayUtil.grow(backID, 1+count);

      // NOTE: sneaky: grow separately because
      // ArrayUtil.grow will otherwise pick a different
      // length than the int[]s we just grew:
      final Type[] newBackType = new Type[backID.length];
      System.arraycopy(backType, 0, newBackType, 0, backType.length);
      backType = newBackType;
    }

    public void growForward() {
      forwardPos = ArrayUtil.grow(forwardPos, 1+forwardCount);
      forwardID = ArrayUtil.grow(forwardID, 1+forwardCount);
      forwardIndex = ArrayUtil.grow(forwardIndex, 1+forwardCount);

      // NOTE: sneaky: grow separately because
      // ArrayUtil.grow will otherwise pick a different
      // length than the int[]s we just grew:
      final Type[] newForwardType = new Type[forwardPos.length];
      System.arraycopy(forwardType, 0, newForwardType, 0, forwardType.length);
      forwardType = newForwardType;
    }

    public void add(int cost, int lastRightID, int backPos, int backIndex, int backID, Type backType) {
      // NOTE: this isn't quite a true Viterbi search,
      // because we should check if lastRightID is
      // already present here, and only update if the new
      // cost is less than the current cost, instead of
      // simply appending.  However, that will likely hurt
      // performance (usually we add a lastRightID only once),
      // and it means we actually create the full graph
      // intersection instead of a "normal" Viterbi lattice:
      if (count == costs.length) {
        grow();
      }
      this.costs[count] = cost;
      this.lastRightID[count] = lastRightID;
      this.backPos[count] = backPos;
      this.backIndex[count] = backIndex;
      this.backID[count] = backID;
      this.backType[count] = backType;
      count++;
    }

    public void addForward(int forwardPos, int forwardIndex, int forwardID, Type forwardType) {
      if (forwardCount == this.forwardID.length) {
        growForward();
      }
      this.forwardPos[forwardCount] = forwardPos;
      this.forwardIndex[forwardCount] = forwardIndex;
      this.forwardID[forwardCount] = forwardID;
      this.forwardType[forwardCount] = forwardType;
      forwardCount++;
    }

    public void reset() {
      count = 0;
      // forwardCount naturally resets after it runs:
      assert forwardCount == 0: "pos=" + pos + " forwardCount=" + forwardCount;
    }
  }

  private void add(Dictionary dict, Position fromPosData, int endPos, int wordID, Type type, boolean addPenalty) throws IOException {
    final int wordCost = dict.getWordCost(wordID);
    final int leftID = dict.getLeftId(wordID);
    int leastCost = Integer.MAX_VALUE;
    int leastIDX = -1;
    assert fromPosData.count > 0;
    for(int idx=0;idx lastTokenPos;
      posIncAtt.setPositionIncrement(1);
      posLengthAtt.setPositionLength(1);
    }
    if (VERBOSE) {
      System.out.println(Thread.currentThread().getName() + ":    incToken: return token=" + token);
    }
    lastTokenPos = token.getPosition();
    return true;
  }

  // TODO: make generic'd version of this "circular array"?
  // It's a bit tricky because we do things to the Position
  // (eg, set .pos = N on reuse)...
  static final class WrappedPositionArray {
    private Position[] positions = new Position[8];

    public WrappedPositionArray() {
      for(int i=0;i 0) {
        if (nextWrite == -1) {
          nextWrite = positions.length - 1;
        }
        positions[nextWrite--].reset();
        count--;
      }
      nextWrite = 0;
      nextPos = 0;
      count = 0;
    }

    /** Get Position instance for this absolute position;
     *  this is allowed to be arbitrarily far "in the
     *  future" but cannot be before the last freeBefore. */
    public Position get(int pos) {
      while(pos >= nextPos) {
        //System.out.println("count=" + count + " vs len=" + positions.length);
        if (count == positions.length) {
          Position[] newPositions = new Position[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
          //System.out.println("grow positions " + newPositions.length);
          System.arraycopy(positions, nextWrite, newPositions, 0, positions.length-nextWrite);
          System.arraycopy(positions, 0, newPositions, positions.length-nextWrite, nextWrite);
          for(int i=positions.length;i= nextPos - count;
    }

    private int getIndex(int pos) {
      int index = nextWrite - (nextPos - pos);
      if (index < 0) {
        index += positions.length;
      }
      return index;
    }

    public void freeBefore(int pos) {
      final int toFree = count - (nextPos - pos);
      assert toFree >= 0;
      assert toFree <= count;
      int index = nextWrite - count;
      if (index < 0) {
        index += positions.length;
      }
      for(int i=0;i= MAX_BACKTRACE_GAP) {
        // Safety: if we've buffered too much, force a
        // backtrace now.  We find the least-cost partial
        // path, across all paths, backtrace from it, and
        // then prune all others.  Note that this, in
        // general, can produce the wrong result, if the
        // total best path did not in fact back trace
        // through this partial best path.  But it's the
        // best we can do... (short of not having a
        // safety!).

        // First pass: find least cost partial path so far,
        // including ending at future positions:
        int leastIDX = -1;
        int leastCost = Integer.MAX_VALUE;
        Position leastPosData = null;
        for(int pos2=pos;pos2 posData.pos) {
        pos++;
        continue;
      }

      final char firstCharacter = (char) buffer.get(pos);
      if (!anyMatches || characterDefinition.isInvoke(firstCharacter)) {

        // Find unknown match:
        final int characterId = characterDefinition.getCharacterClass(firstCharacter);
        final boolean isPunct = isPunctuation(firstCharacter);

        // NOTE: copied from UnknownDictionary.lookup:
        int unknownWordLength;
        if (!characterDefinition.isGroup(firstCharacter)) {
          unknownWordLength = 1;
        } else {
          // Extract unknown word. Characters with the same character class are considered to be part of unknown word
          unknownWordLength = 1;
          for (int posAhead=pos+1;unknownWordLength 0) {

      final Position endPosData = positions.get(pos);
      int leastCost = Integer.MAX_VALUE;
      int leastIDX = -1;
      if (VERBOSE) {
        System.out.println("  end: " + endPosData.count + " nodes");
      }
      for(int idx=0;idx lastBackTracePos) {
      //System.out.println("BT: back pos=" + pos + " bestIDX=" + bestIDX);
      final Position posData = positions.get(pos);
      assert bestIDX < posData.count;

      int backPos = posData.backPos[bestIDX];
      assert backPos >= lastBackTracePos: "backPos=" + backPos + " vs lastBackTracePos=" + lastBackTracePos;
      int length = pos - backPos;
      Type backType = posData.backType[bestIDX];
      int backID = posData.backID[bestIDX];
      int nextBestIDX = posData.backIndex[bestIDX];

      if (outputCompounds && searchMode && altToken == null && backType != Type.USER) {
        
        // In searchMode, if best path had picked a too-long
        // token, we use the "penalty" to compute the allowed
        // max cost of an alternate back-trace.  If we find an
        // alternate back trace with cost below that
        // threshold, we pursue it instead (but also output
        // the long token).
        //System.out.println("    2nd best backPos=" + backPos + " pos=" + pos);

        final int penalty = computeSecondBestThreshold(backPos, pos-backPos);
        
        if (penalty > 0) {
          if (VERBOSE) {
            System.out.println("  compound=" + new String(buffer.get(backPos, pos-backPos)) + " backPos=" + backPos + " pos=" + pos + " penalty=" + penalty + " cost=" + posData.costs[bestIDX] + " bestIDX=" + bestIDX + " lastLeftID=" + lastLeftWordID);
          }

          // Use the penalty to set maxCost on the 2nd best
          // segmentation:
          int maxCost = posData.costs[bestIDX] + penalty;
          if (lastLeftWordID != -1) {
            maxCost += costs.get(getDict(backType).getRightId(backID), lastLeftWordID);
          }

          // Now, prune all too-long tokens from the graph:
          pruneAndRescore(backPos, pos,
                          posData.backIndex[bestIDX]);

          // Finally, find 2nd best back-trace and resume
          // backtrace there:
          int leastCost = Integer.MAX_VALUE;
          int leastIDX = -1;
          for(int idx=0;idx " + cost);
            }
            //System.out.println("penalty " + posData.backPos[idx] + " to " + pos);
            //cost += computePenalty(posData.backPos[idx], pos - posData.backPos[idx]);
            if (cost < leastCost) {
              //System.out.println("      ** ");
              leastCost = cost;
              leastIDX = idx;
            }
          }
          //System.out.println("  leastIDX=" + leastIDX);

          if (VERBOSE) {
            System.out.println("  afterPrune: " + posData.count + " arcs arriving; leastCost=" + leastCost + " vs threshold=" + maxCost + " lastLeftWordID=" + lastLeftWordID);
          }

          if (leastIDX != -1 && leastCost <= maxCost && posData.backPos[leastIDX] != backPos) {
            // We should have pruned the altToken from the graph:
            assert posData.backPos[leastIDX] != backPos;

            // Save the current compound token, to output when
            // this alternate path joins back:
            altToken = new Token(backID,
                                 fragment,
                                 backPos - lastBackTracePos,
                                 length,
                                 backType,
                                 backPos,
                                 getDict(backType));

            // Redirect our backtrace to 2nd best:
            bestIDX = leastIDX;
            nextBestIDX = posData.backIndex[bestIDX];

            backPos = posData.backPos[bestIDX];
            length = pos - backPos;
            backType = posData.backType[bestIDX];
            backID = posData.backID[bestIDX];
            backCount = 0;
            //System.out.println("  do alt token!");
            
          } else {
            // I think in theory it's possible there is no
            // 2nd best path, which is fine; in this case we
            // only output the compound token:
            //System.out.println("  no alt token! bestIDX=" + bestIDX);
          }
        }
      }

      final int offset = backPos - lastBackTracePos;
      assert offset >= 0;

      if (altToken != null && altToken.getPosition() >= backPos) {

        // We've backtraced to the position where the
        // compound token starts; add it now:

        // The pruning we did when we created the altToken
        // ensures that the back trace will align back with
        // the start of the altToken:
        assert altToken.getPosition() == backPos: altToken.getPosition() + " vs " + backPos;

        // NOTE: not quite right: the compound token may
        // have had all punctuation back traced so far, but
        // then the decompounded token at this position is
        // not punctuation.  In this case backCount is 0,
        // but we should maybe add the altToken anyway...?

        if (backCount > 0) {
          backCount++;
          altToken.setPositionLength(backCount);
          if (VERBOSE) {
            System.out.println("    add altToken=" + altToken);
          }
          pending.add(altToken);
        } else {
          // This means alt token was all punct tokens:
          if (VERBOSE) {
            System.out.println("    discard all-punctuation altToken=" + altToken);
          }
          assert discardPunctuation;
        }
        altToken = null;
      }

      final Dictionary dict = getDict(backType);

      if (backType == Type.USER) {

        // Expand the phraseID we recorded into the actual
        // segmentation:
        final int[] wordIDAndLength = userDictionary.lookupSegmentation(backID);
        int wordID = wordIDAndLength[0];
        int current = 0;
        for(int j=1; j < wordIDAndLength.length; j++) {
          final int len = wordIDAndLength[j];
          //System.out.println("    add user: len=" + len);
          pending.add(new Token(wordID+j-1,
                                fragment,
                                current + offset,
                                len,
                                Type.USER,
                                current + backPos,
                                dict));
          if (VERBOSE) {
            System.out.println("    add USER token=" + pending.get(pending.size()-1));
          }
          current += len;
        }

        // Reverse the tokens we just added, because when we
        // serve them up from incrementToken we serve in
        // reverse:
        Collections.reverse(pending.subList(pending.size() - (wordIDAndLength.length - 1),
                                            pending.size()));

        backCount += wordIDAndLength.length-1;
      } else {

        if (extendedMode && backType == Type.UNKNOWN) {
          // In EXTENDED mode we convert unknown word into
          // unigrams:
          int unigramTokenCount = 0;
          for(int i=length-1;i>=0;i--) {
            int charLen = 1;
            if (i > 0 && Character.isLowSurrogate(fragment[offset+i])) {
              i--;
              charLen = 2;
            }
            //System.out.println("    extended tok offset="
            //+ (offset + i));
            if (!discardPunctuation || !isPunctuation(fragment[offset+i])) {
              pending.add(new Token(CharacterDefinition.NGRAM,
                                    fragment,
                                    offset + i,
                                    charLen,
                                    Type.UNKNOWN,
                                    backPos + i,
                                    unkDictionary));
              unigramTokenCount++;
            }
          }
          backCount += unigramTokenCount;
          
        } else if (!discardPunctuation || length == 0 || !isPunctuation(fragment[offset])) {
          pending.add(new Token(backID,
                                fragment,
                                offset,
                                length,
                                backType,
                                backPos,
                                dict));
          if (VERBOSE) {
            System.out.println("    add token=" + pending.get(pending.size()-1));
          }
          backCount++;
        } else {
          if (VERBOSE) {
            System.out.println("    skip punctuation token=" + new String(fragment, offset, length));
          }
        }
      }

      lastLeftWordID = dict.getLeftId(backID);
      pos = backPos;
      bestIDX = nextBestIDX;
    }

    lastBackTracePos = endPos;

    if (VERBOSE) {
      System.out.println("  freeBefore pos=" + endPos);
    }
    // Notify the circular buffers that we are done with
    // these positions:
    buffer.freeBefore(endPos);
    positions.freeBefore(endPos);
  }

  Dictionary getDict(Type type) {
    return dictionaryMap.get(type);
  }

  private static boolean isPunctuation(char ch) {
    switch(Character.getType(ch)) {
      case Character.SPACE_SEPARATOR:
      case Character.LINE_SEPARATOR:
      case Character.PARAGRAPH_SEPARATOR:
      case Character.CONTROL:
      case Character.FORMAT:
      case Character.DASH_PUNCTUATION:
      case Character.START_PUNCTUATION:
      case Character.END_PUNCTUATION:
      case Character.CONNECTOR_PUNCTUATION:
      case Character.OTHER_PUNCTUATION:
      case Character.MATH_SYMBOL:
      case Character.CURRENCY_SYMBOL:
      case Character.MODIFIER_SYMBOL:
      case Character.OTHER_SYMBOL:
      case Character.INITIAL_QUOTE_PUNCTUATION:
      case Character.FINAL_QUOTE_PUNCTUATION:
        return true;
      default:
        return false;
    }
  }
}