/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ja.dict.CharacterDefinition;
import org.apache.lucene.analysis.ja.dict.ConnectionCosts;
import org.apache.lucene.analysis.ja.dict.JaMorphData;
import org.apache.lucene.analysis.ja.dict.TokenInfoDictionary;
import org.apache.lucene.analysis.ja.dict.TokenInfoFST;
import org.apache.lucene.analysis.ja.dict.UnknownDictionary;
import org.apache.lucene.analysis.ja.dict.UserDictionary;
import org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute;
import org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute;
import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
import org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute;
import org.apache.lucene.analysis.morph.GraphvizFormatter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.IgnoreRandomChains;
import org.apache.lucene.util.fst.FST;
/**
* Tokenizer for Japanese that uses morphological analysis.
*
* <p>This tokenizer sets a number of additional attributes:
*
* <ul>
*   <li>{@link BaseFormAttribute} containing base form for inflected adjectives and verbs.
*   <li>{@link PartOfSpeechAttribute} containing part-of-speech.
*   <li>{@link ReadingAttribute} containing reading and pronunciation.
*   <li>{@link InflectionAttribute} containing additional part-of-speech information for inflected
*       forms.
* </ul>
*
* <p>This tokenizer uses a rolling Viterbi search to find the least cost segmentation (path) of
* the incoming characters. For tokens that appear to be compound (&gt; length 2 for all Kanji, or
* &gt; length 7 for non-Kanji), we see if there is a 2nd best segmentation of that token after
* applying penalties to the long tokens. If so, and the Mode is {@link Mode#SEARCH}, we output the
* alternate segmentation as well.
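*
* <p>A minimal usage sketch (illustrative only; assumes the default dictionaries bundled with
* lucene-analysis-kuromoji and the standard TokenStream consumer contract):
*
* <pre class="prettyprint">
* try (JapaneseTokenizer tokenizer =
*     new JapaneseTokenizer(null, true, JapaneseTokenizer.DEFAULT_MODE)) {
*   tokenizer.setReader(new StringReader("関西国際空港に行った"));
*   CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
*   tokenizer.reset();
*   while (tokenizer.incrementToken()) {
*     System.out.println(termAtt.toString());
*   }
*   tokenizer.end();
* }
* </pre>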
*/
public final class JapaneseTokenizer extends Tokenizer {
/** Tokenization mode: this determines how the tokenizer handles compound and unknown words. */
public enum Mode {
/** Ordinary segmentation: no decomposition for compounds. */
NORMAL,
/**
* Segmentation geared towards search: this includes a decompounding process for long nouns,
* also including the full compound token as a synonym.
*/
SEARCH,
/**
* Extended mode outputs unigrams for unknown words.
*
* @lucene.experimental
*/
EXTENDED
}
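// Illustrative note: in SEARCH mode a long compound such as 関西国際空港
// ("Kansai International Airport") is typically decompounded into 関西/国際/空港,
// and, unless compound tokens are discarded, the full compound is also emitted
// at the same position, spanning its parts via positionLength.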
/** Default tokenization mode. Currently this is {@link Mode#SEARCH}. */
public static final Mode DEFAULT_MODE = Mode.SEARCH;
private static final boolean VERBOSE = false;
// Position of last token we returned; we use this to
// figure out whether to set posIncr to 0 or 1:
private int lastTokenPos;
private final ViterbiNBest viterbi;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncAtt =
addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
private final BaseFormAttribute basicFormAtt = addAttribute(BaseFormAttribute.class);
private final PartOfSpeechAttribute posAtt = addAttribute(PartOfSpeechAttribute.class);
private final ReadingAttribute readingAtt = addAttribute(ReadingAttribute.class);
private final InflectionAttribute inflectionAtt = addAttribute(InflectionAttribute.class);
/**
* Create a new JapaneseTokenizer.
*
* <p>Uses the default AttributeFactory.
*
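* <p>A hedged sketch of supplying a user dictionary (the CSV entry below follows the Kuromoji
* user-dictionary format of surface, segmentation, readings, and part-of-speech name; the entry
* values are illustrative):
*
* <pre class="prettyprint">
* String entries = "関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム名詞\n";
* UserDictionary userDict = UserDictionary.open(new StringReader(entries));
* JapaneseTokenizer tokenizer = new JapaneseTokenizer(userDict, true, Mode.SEARCH);
* </pre>
*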
* @param userDictionary Optional: if non-null, user dictionary.
* @param discardPunctuation true if punctuation tokens should be dropped from the output.
* @param mode tokenization mode.
*/
public JapaneseTokenizer(UserDictionary userDictionary, boolean discardPunctuation, Mode mode) {
this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, userDictionary, discardPunctuation, true, mode);
}
/**
* Create a new JapaneseTokenizer.
*
* <p>Uses the default AttributeFactory.
*
* @param userDictionary Optional: if non-null, user dictionary.
* @param discardPunctuation true if punctuation tokens should be dropped from the output.
* @param discardCompoundToken true if compound tokens should be dropped from the output when
* tokenization mode is not NORMAL.
* @param mode tokenization mode.
*/
public JapaneseTokenizer(
UserDictionary userDictionary,
boolean discardPunctuation,
boolean discardCompoundToken,
Mode mode) {
this(
DEFAULT_TOKEN_ATTRIBUTE_FACTORY,
userDictionary,
discardPunctuation,
discardCompoundToken,
mode);
}
/**
* Create a new JapaneseTokenizer using the system and unknown dictionaries shipped with Lucene.
*
* @param factory the AttributeFactory to use
* @param userDictionary Optional: if non-null, user dictionary.
* @param discardPunctuation true if punctuation tokens should be dropped from the output.
* @param mode tokenization mode.
*/
public JapaneseTokenizer(
AttributeFactory factory,
UserDictionary userDictionary,
boolean discardPunctuation,
Mode mode) {
this(
factory,
TokenInfoDictionary.getInstance(),
UnknownDictionary.getInstance(),
ConnectionCosts.getInstance(),
userDictionary,
discardPunctuation,
true,
mode);
}
/**
* Create a new JapaneseTokenizer using the system and unknown dictionaries shipped with Lucene.
*
* @param factory the AttributeFactory to use
* @param userDictionary Optional: if non-null, user dictionary.
* @param discardPunctuation true if punctuation tokens should be dropped from the output.
* @param discardCompoundToken true if compound tokens should be dropped from the output when
* tokenization mode is not NORMAL.
* @param mode tokenization mode.
*/
public JapaneseTokenizer(
AttributeFactory factory,
UserDictionary userDictionary,
boolean discardPunctuation,
boolean discardCompoundToken,
Mode mode) {
this(
factory,
TokenInfoDictionary.getInstance(),
UnknownDictionary.getInstance(),
ConnectionCosts.getInstance(),
userDictionary,
discardPunctuation,
discardCompoundToken,
mode);
}
/**
* Create a new JapaneseTokenizer, supplying a custom system dictionary and unknown dictionary.
* This constructor provides an entry point for users that want to construct custom language
* models that can be used as input to {@link
* org.apache.lucene.analysis.ja.dict.DictionaryBuilder}.
*
* @param factory the AttributeFactory to use
* @param systemDictionary a custom known token dictionary
* @param unkDictionary a custom unknown token dictionary
* @param connectionCosts custom token transition costs
* @param userDictionary Optional: if non-null, user dictionary.
* @param discardPunctuation true if punctuation tokens should be dropped from the output.
* @param discardCompoundToken true if compound tokens should be dropped from the output when
* tokenization mode is not NORMAL.
* @param mode tokenization mode.
* @lucene.experimental
*/
@IgnoreRandomChains(reason = "Parameters are too complex to be tested")
public JapaneseTokenizer(
AttributeFactory factory,
TokenInfoDictionary systemDictionary,
UnknownDictionary unkDictionary,
ConnectionCosts connectionCosts,
UserDictionary userDictionary,
boolean discardPunctuation,
boolean discardCompoundToken,
Mode mode) {
super(factory);
TokenInfoFST fst = systemDictionary.getFST();
FST.BytesReader fstReader = fst.getBytesReader();
TokenInfoFST userFST = null;
FST.BytesReader userFSTReader = null;
if (userDictionary != null) {
userFST = userDictionary.getFST();
userFSTReader = userFST.getBytesReader();
}
boolean searchMode;
boolean extendedMode;
boolean outputCompounds;
switch (mode) {
case SEARCH:
searchMode = true;
extendedMode = false;
outputCompounds = !discardCompoundToken;
break;
case EXTENDED:
searchMode = true;
extendedMode = true;
outputCompounds = !discardCompoundToken;
break;
case NORMAL:
default:
searchMode = false;
extendedMode = false;
outputCompounds = false;
break;
}
CharacterDefinition characterDefinition = unkDictionary.getCharacterDefinition();
this.viterbi =
new ViterbiNBest(
fst,
fstReader,
systemDictionary,
userFST,
userFSTReader,
userDictionary,
connectionCosts,
unkDictionary,
characterDefinition,
discardPunctuation,
searchMode,
extendedMode,
outputCompounds);
viterbi.resetBuffer(this.input);
viterbi.resetState();
}
/**
* Expert: set this to produce graphviz (dot) output of the Viterbi lattice.
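*
* <p>A minimal sketch, assuming (as in Lucene's test usage) that {@code GraphvizFormatter} is
* constructed from {@link ConnectionCosts} and that {@code finish()} returns the dot source:
*
* <pre class="prettyprint">
* GraphvizFormatter&lt;JaMorphData&gt; dot = new GraphvizFormatter&lt;&gt;(ConnectionCosts.getInstance());
* tokenizer.setGraphvizFormatter(dot);
* // consume the token stream, then:
* String dotSource = dot.finish();
* </pre>
*/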
public void setGraphvizFormatter(GraphvizFormatter<JaMorphData> dotOut) {
viterbi.setGraphvizFormatter(dotOut);
}
@Override
public void close() throws IOException {
super.close();
viterbi.resetBuffer(input);
}
@Override
public void reset() throws IOException {
super.reset();
viterbi.resetBuffer(input);
viterbi.resetState();
lastTokenPos = -1;
}
@Override
public void end() throws IOException {
super.end();
// Set final offset
int finalOffset = correctOffset(viterbi.getPos());
offsetAtt.setOffset(finalOffset, finalOffset);
}
@Override
public boolean incrementToken() throws IOException {
// forward() can return w/o producing any new
// tokens, when the tokens it had produced were entirely
// punctuation. So we loop here until we get a real
// token or we end:
while (viterbi.getPending().size() == 0) {
if (viterbi.isEnd()) {
return false;
}
// Push Viterbi forward some more:
viterbi.forward();
}
final Token token = viterbi.getPending().remove(viterbi.getPending().size() - 1);
int length = token.getLength();
clearAttributes();
assert length > 0;
// System.out.println("off=" + token.getOffset() + " len=" + length + " vs " +
// token.getSurfaceForm().length);
termAtt.copyBuffer(token.getSurfaceForm(), token.getOffset(), length);
offsetAtt.setOffset(correctOffset(token.getStartOffset()), correctOffset(token.getEndOffset()));
basicFormAtt.setToken(token);
posAtt.setToken(token);
readingAtt.setToken(token);
inflectionAtt.setToken(token);
if (token.getStartOffset() == lastTokenPos) {
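// Stacked token: it starts at the same offset as the previously emitted token
// (e.g. a compound emitted alongside its decompounded parts in SEARCH mode),
// so it gets a position increment of 0 and spans its parts via positionLength.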
posIncAtt.setPositionIncrement(0);
posLengthAtt.setPositionLength(token.getPositionLength());
} else if (viterbi.isOutputNBest()) {
// The position length is always calculated if outputNBest is true.
assert token.getStartOffset() > lastTokenPos;
posIncAtt.setPositionIncrement(1);
posLengthAtt.setPositionLength(token.getPositionLength());
} else {
assert token.getStartOffset() > lastTokenPos;
posIncAtt.setPositionIncrement(1);
posLengthAtt.setPositionLength(1);
}
if (VERBOSE) {
System.out.println(Thread.currentThread().getName() + ": incToken: return token=" + token);
}
lastTokenPos = token.getStartOffset();
return true;
}
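/**
* Runs this tokenizer over {@code inText} with an nBestCost of 1 and probes the Viterbi lattice
* for the smallest cost delta that would let {@code requiredToken} appear as an n-best
* candidate. Returns -1 if {@code requiredToken} does not occur in {@code inText} or if no such
* path exists.
*/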
private int probeDelta(String inText, String requiredToken) throws IOException {
int start = inText.indexOf(requiredToken);
if (start < 0) {
// -1 when no requiredToken.
return -1;
}
int delta = Integer.MAX_VALUE;
int saveNBestCost = viterbi.getNBestCost();
setReader(new StringReader(inText));
reset();
try {
setNBestCost(1);
int prevRootBase = -1;
while (incrementToken()) {
if (viterbi.getLatticeRootBase() != prevRootBase) {
prevRootBase = viterbi.getLatticeRootBase();
delta = Math.min(delta, viterbi.probeDelta(start, start + requiredToken.length()));
}
}
} finally {
// end() pairs with the reset() above
end();
// close() pairs with the setReader() above
close();
setNBestCost(saveNBestCost);
}
if (VERBOSE) {
System.out.printf("JapaneseTokenizer: delta = %d: %s-%s\n", delta, inText, requiredToken);
}
return delta == Integer.MAX_VALUE ? -1 : delta;
}
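/**
* Expert: calculates the smallest nBestCost at which each of the given examples would be output,
* and returns the maximum over all examples. {@code examples} is a '/'-separated list of
* TEXT-TOKEN pairs (for instance {@code "text1-token1/text2-token2"}), where each pair requests
* that TOKEN appear in the n-best output for TEXT.
*/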
public int calcNBestCost(String examples) {
int maxDelta = 0;
for (String example : examples.split("/")) {
if (!example.isEmpty()) {
String[] pair = example.split("-");
if (pair.length != 2) {
throw new RuntimeException("Unexpected example form: " + example + " (expected two '-')");
} else {
try {
maxDelta = Math.max(maxDelta, probeDelta(pair[0], pair[1]));
} catch (IOException e) {
throw new RuntimeException(
"Internal error calculating n-best cost from examples", e);
}
}
}
}
return maxDelta;
}
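/**
* Expert: sets the margin for n-best output. Paths whose cost is within {@code value} of the
* best path are also emitted; a value of 0 (the default) disables n-best output.
*/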
public void setNBestCost(int value) {
viterbi.setNBestCost(value);
}
}