org.apache.lucene.analysis.ja.JapaneseTokenizer
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.ja;

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ja.dict.CharacterDefinition;
import org.apache.lucene.analysis.ja.dict.ConnectionCosts;
import org.apache.lucene.analysis.ja.dict.JaMorphData;
import org.apache.lucene.analysis.ja.dict.TokenInfoDictionary;
import org.apache.lucene.analysis.ja.dict.TokenInfoFST;
import org.apache.lucene.analysis.ja.dict.UnknownDictionary;
import org.apache.lucene.analysis.ja.dict.UserDictionary;
import org.apache.lucene.analysis.ja.tokenattributes.*;
import org.apache.lucene.analysis.morph.GraphvizFormatter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.IgnoreRandomChains;
import org.apache.lucene.util.fst.FST;

/**
 * Tokenizer for Japanese that uses morphological analysis.
 *
 * <p>This tokenizer sets a number of additional attributes:
 *
 * <ul>
 *   <li>{@link BaseFormAttribute} containing base form for inflected adjectives and verbs.
 *   <li>{@link PartOfSpeechAttribute} containing part-of-speech.
 *   <li>{@link ReadingAttribute} containing reading and pronunciation.
 *   <li>{@link InflectionAttribute} containing additional part-of-speech information for inflected
 *       forms.
 * </ul>
 *
 * <p>This tokenizer uses a rolling Viterbi search to find the least cost segmentation (path) of
 * the incoming characters. For tokens that appear to be compound (> length 2 for all Kanji, or >
 * length 7 for non-Kanji), we see if there is a 2nd best segmentation of that token after applying
 * penalties to the long tokens. If so, and the Mode is {@link Mode#SEARCH}, we output the
 * alternate segmentation as well.
 */
public final class JapaneseTokenizer extends Tokenizer {

  /** Tokenization mode: this determines how the tokenizer handles compound and unknown words. */
  public enum Mode {
    /** Ordinary segmentation: no decomposition for compounds. */
    NORMAL,

    /**
     * Segmentation geared towards search: this includes a decompounding process for long nouns,
     * also including the full compound token as a synonym.
     */
    SEARCH,

    /**
     * Extended mode outputs unigrams for unknown words.
     *
     * @lucene.experimental
     */
    EXTENDED
  }

  /** Default tokenization mode. Currently this is {@link Mode#SEARCH}. */
  public static final Mode DEFAULT_MODE = Mode.SEARCH;

  private static final boolean VERBOSE = false;

  // Position of last token we returned; we use this to
  // figure out whether to set posIncr to 0 or 1:
  private int lastTokenPos;

  private final ViterbiNBest viterbi;

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final PositionIncrementAttribute posIncAtt =
      addAttribute(PositionIncrementAttribute.class);
  private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
  private final BaseFormAttribute basicFormAtt = addAttribute(BaseFormAttribute.class);
  private final PartOfSpeechAttribute posAtt = addAttribute(PartOfSpeechAttribute.class);
  private final ReadingAttribute readingAtt = addAttribute(ReadingAttribute.class);
  private final InflectionAttribute inflectionAtt = addAttribute(InflectionAttribute.class);

  /**
   * Create a new JapaneseTokenizer.
   *
   * <p>Uses the default AttributeFactory.
   *
   * @param userDictionary Optional: if non-null, user dictionary.
   * @param discardPunctuation true if punctuation tokens should be dropped from the output.
   * @param mode tokenization mode.
   */
  public JapaneseTokenizer(UserDictionary userDictionary, boolean discardPunctuation, Mode mode) {
    this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, userDictionary, discardPunctuation, true, mode);
  }

  /**
   * Create a new JapaneseTokenizer.
   *
   * <p>Uses the default AttributeFactory.
   *
   * @param userDictionary Optional: if non-null, user dictionary.
   * @param discardPunctuation true if punctuation tokens should be dropped from the output.
   * @param discardCompoundToken true if compound tokens should be dropped from the output when
   *     tokenization mode is not NORMAL.
   * @param mode tokenization mode.
   */
  public JapaneseTokenizer(
      UserDictionary userDictionary,
      boolean discardPunctuation,
      boolean discardCompoundToken,
      Mode mode) {
    this(
        DEFAULT_TOKEN_ATTRIBUTE_FACTORY,
        userDictionary,
        discardPunctuation,
        discardCompoundToken,
        mode);
  }

  /**
   * Create a new JapaneseTokenizer using the system and unknown dictionaries shipped with Lucene.
   *
   * @param factory the AttributeFactory to use
   * @param userDictionary Optional: if non-null, user dictionary.
   * @param discardPunctuation true if punctuation tokens should be dropped from the output.
   * @param mode tokenization mode.
   */
  public JapaneseTokenizer(
      AttributeFactory factory,
      UserDictionary userDictionary,
      boolean discardPunctuation,
      Mode mode) {
    this(
        factory,
        TokenInfoDictionary.getInstance(),
        UnknownDictionary.getInstance(),
        ConnectionCosts.getInstance(),
        userDictionary,
        discardPunctuation,
        true,
        mode);
  }

  /**
   * Create a new JapaneseTokenizer using the system and unknown dictionaries shipped with Lucene.
   *
   * @param factory the AttributeFactory to use
   * @param userDictionary Optional: if non-null, user dictionary.
   * @param discardPunctuation true if punctuation tokens should be dropped from the output.
   * @param discardCompoundToken true if compound tokens should be dropped from the output when
   *     tokenization mode is not NORMAL.
   * @param mode tokenization mode.
   */
  public JapaneseTokenizer(
      AttributeFactory factory,
      UserDictionary userDictionary,
      boolean discardPunctuation,
      boolean discardCompoundToken,
      Mode mode) {
    this(
        factory,
        TokenInfoDictionary.getInstance(),
        UnknownDictionary.getInstance(),
        ConnectionCosts.getInstance(),
        userDictionary,
        discardPunctuation,
        discardCompoundToken,
        mode);
  }

  /**
   * Create a new JapaneseTokenizer, supplying a custom system dictionary and unknown dictionary.
   * This constructor provides an entry point for users that want to construct custom language
   * models that can be used as input to {@link
   * org.apache.lucene.analysis.ja.dict.DictionaryBuilder}.
   *
   * @param factory the AttributeFactory to use
   * @param systemDictionary a custom known token dictionary
   * @param unkDictionary a custom unknown token dictionary
   * @param connectionCosts custom token transition costs
   * @param userDictionary Optional: if non-null, user dictionary.
   * @param discardPunctuation true if punctuation tokens should be dropped from the output.
   * @param discardCompoundToken true if compound tokens should be dropped from the output when
   *     tokenization mode is not NORMAL.
   * @param mode tokenization mode.
   * @lucene.experimental
   */
  @IgnoreRandomChains(reason = "Parameters are too complex to be tested")
  public JapaneseTokenizer(
      AttributeFactory factory,
      TokenInfoDictionary systemDictionary,
      UnknownDictionary unkDictionary,
      ConnectionCosts connectionCosts,
      UserDictionary userDictionary,
      boolean discardPunctuation,
      boolean discardCompoundToken,
      Mode mode) {
    super(factory);

    TokenInfoFST fst = systemDictionary.getFST();
    FST.BytesReader fstReader = fst.getBytesReader();
    TokenInfoFST userFST = null;
    FST.BytesReader userFSTReader = null;
    if (userDictionary != null) {
      userFST = userDictionary.getFST();
      userFSTReader = userFST.getBytesReader();
    }

    boolean searchMode;
    boolean extendedMode;
    boolean outputCompounds;
    switch (mode) {
      case SEARCH:
        searchMode = true;
        extendedMode = false;
        outputCompounds = !discardCompoundToken;
        break;
      case EXTENDED:
        searchMode = true;
        extendedMode = true;
        outputCompounds = !discardCompoundToken;
        break;
      case NORMAL:
      default:
        searchMode = false;
        extendedMode = false;
        outputCompounds = false;
        break;
    }

    CharacterDefinition characterDefinition = unkDictionary.getCharacterDefinition();
    this.viterbi =
        new ViterbiNBest(
            fst,
            fstReader,
            systemDictionary,
            userFST,
            userFSTReader,
            userDictionary,
            connectionCosts,
            unkDictionary,
            characterDefinition,
            discardPunctuation,
            searchMode,
            extendedMode,
            outputCompounds);
    viterbi.resetBuffer(this.input);
    viterbi.resetState();
  }

  /** Expert: set this to produce graphviz (dot) output of the Viterbi lattice */
  public void setGraphvizFormatter(GraphvizFormatter<JaMorphData> dotOut) {
    viterbi.setGraphvizFormatter(dotOut);
  }

  @Override
  public void close() throws IOException {
    super.close();
    viterbi.resetBuffer(input);
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    viterbi.resetBuffer(input);
    viterbi.resetState();
    lastTokenPos = -1;
  }

  @Override
  public void end() throws IOException {
    super.end();
    // Set final offset
    int finalOffset = correctOffset(viterbi.getPos());
    offsetAtt.setOffset(finalOffset, finalOffset);
  }

  @Override
  public boolean incrementToken() throws IOException {

    // forward() can return w/o producing any new
    // tokens, when the tokens it had produced were entirely
    // punctuation. So we loop here until we get a real
    // token or we end:
    while (viterbi.getPending().size() == 0) {
      if (viterbi.isEnd()) {
        return false;
      }

      // Push Viterbi forward some more:
      viterbi.forward();
    }

    final Token token = viterbi.getPending().remove(viterbi.getPending().size() - 1);

    int length = token.getLength();
    clearAttributes();
    assert length > 0;
    // System.out.println("off=" + token.getOffset() + " len=" + length + " vs " +
    // token.getSurfaceForm().length);
    termAtt.copyBuffer(token.getSurfaceForm(), token.getOffset(), length);
    offsetAtt.setOffset(correctOffset(token.getStartOffset()), correctOffset(token.getEndOffset()));
    basicFormAtt.setToken(token);
    posAtt.setToken(token);
    readingAtt.setToken(token);
    inflectionAtt.setToken(token);
    if (token.getStartOffset() == lastTokenPos) {
      posIncAtt.setPositionIncrement(0);
      posLengthAtt.setPositionLength(token.getPositionLength());
    } else if (viterbi.isOutputNBest()) {
      // The position length is always calculated if outputNBest is true.
      assert token.getStartOffset() > lastTokenPos;
      posIncAtt.setPositionIncrement(1);
      posLengthAtt.setPositionLength(token.getPositionLength());
    } else {
      assert token.getStartOffset() > lastTokenPos;
      posIncAtt.setPositionIncrement(1);
      posLengthAtt.setPositionLength(1);
    }
    if (VERBOSE) {
      System.out.println(Thread.currentThread().getName() + ": incToken: return token=" + token);
    }
    lastTokenPos = token.getStartOffset();
    return true;
  }

  private int probeDelta(String inText, String requiredToken) throws IOException {
    int start = inText.indexOf(requiredToken);
    if (start < 0) {
      // -1 when no requiredToken.
      return -1;
    }

    int delta = Integer.MAX_VALUE;
    int saveNBestCost = viterbi.getNBestCost();
    setReader(new StringReader(inText));
    reset();
    try {
      setNBestCost(1);
      int prevRootBase = -1;
      while (incrementToken()) {
        if (viterbi.getLatticeRootBase() != prevRootBase) {
          prevRootBase = viterbi.getLatticeRootBase();
          delta = Math.min(delta, viterbi.probeDelta(start, start + requiredToken.length()));
        }
      }
    } finally {
      // reset & end
      end();
      // setReader & close
      close();
      setNBestCost(saveNBestCost);
    }

    if (VERBOSE) {
      System.out.printf("JapaneseTokenizer: delta = %d: %s-%s\n", delta, inText, requiredToken);
    }
    return delta == Integer.MAX_VALUE ? -1 : delta;
  }

  public int calcNBestCost(String examples) {
    int maxDelta = 0;
    for (String example : examples.split("/")) {
      if (!example.isEmpty()) {
        String[] pair = example.split("-");
        if (pair.length != 2) {
          throw new RuntimeException("Unexpected example form: " + example + " (expected two '-')");
        } else {
          try {
            maxDelta = Math.max(maxDelta, probeDelta(pair[0], pair[1]));
          } catch (IOException e) {
            throw new RuntimeException(
                "Internal error calculating best costs from examples. Got ", e);
          }
        }
      }
    }
    return maxDelta;
  }

  public void setNBestCost(int value) {
    viterbi.setNBestCost(value);
  }
}
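For reference, here is a minimal usage sketch (not part of the Lucene source above). The demo class name and the sample sentence are arbitrary choices; the constructor, the attribute classes, and the setReader()/reset()/incrementToken()/end()/close() contract are exactly the API shown in the listing.

import java.io.StringReader;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
import org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute;
import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class JapaneseTokenizerDemo {
  public static void main(String[] args) throws Exception {
    // No user dictionary, drop punctuation, SEARCH mode (the default):
    // long compounds are decompounded and the full compound is also
    // emitted as a synonym (position increment 0).
    try (JapaneseTokenizer tokenizer = new JapaneseTokenizer(null, true, Mode.SEARCH)) {
      tokenizer.setReader(new StringReader("関西国際空港に行った"));
      CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
      OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
      BaseFormAttribute baseForm = tokenizer.addAttribute(BaseFormAttribute.class);
      PartOfSpeechAttribute pos = tokenizer.addAttribute(PartOfSpeechAttribute.class);

      tokenizer.reset();
      while (tokenizer.incrementToken()) {
        // getBaseForm() returns null unless the token is an inflected form.
        System.out.printf(
            "%s (%d-%d) base=%s pos=%s%n",
            term, offset.startOffset(), offset.endOffset(),
            baseForm.getBaseForm(), pos.getPartOfSpeech());
      }
      tokenizer.end();
    }
  }
}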


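The userDictionary constructor parameter accepts custom segmentation rules. A sketch under the assumption that UserDictionary.open(Reader) parses Kuromoji's CSV rule format (surface form, space-separated segmentation, space-separated readings, part-of-speech tag); the rule and input text here are illustrative:

import java.io.StringReader;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
import org.apache.lucene.analysis.ja.dict.UserDictionary;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class UserDictionaryDemo {
  public static void main(String[] args) throws Exception {
    // One CSV rule forcing "関西国際空港" to segment as 関西 / 国際 / 空港.
    String rules = "関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム名詞\n";
    UserDictionary userDict = UserDictionary.open(new StringReader(rules));

    try (JapaneseTokenizer tokenizer = new JapaneseTokenizer(userDict, true, Mode.NORMAL)) {
      tokenizer.setReader(new StringReader("関西国際空港へ"));
      CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
      tokenizer.reset();
      while (tokenizer.incrementToken()) {
        System.out.println(term); // expected: the user-dictionary segments, then へ
      }
      tokenizer.end();
    }
  }
}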


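The n-best methods at the bottom of the class are easiest to see with a concrete call. A hedged sketch (the example pair is illustrative only): calcNBestCost takes '/'-separated "inputText-requiredToken" examples, tokenizes each input, and via probeDelta measures how far the Viterbi cost budget must widen before the required token appears in an n-best path, returning the maximum delta. Feeding that value to setNBestCost makes incrementToken() also emit the alternate segmentations within that budget.

import org.apache.lucene.analysis.ja.JapaneseTokenizer;

public class NBestCostDemo {
  public static void main(String[] args) throws Exception {
    JapaneseTokenizer tokenizer =
        new JapaneseTokenizer(null, true, JapaneseTokenizer.DEFAULT_MODE);

    // "山積み" must be a substring of "鳩山積み", and each example must contain
    // exactly one '-' separating input text from the required token.
    int nBestCost = tokenizer.calcNBestCost("鳩山積み-山積み");
    System.out.println("nBestCost = " + nBestCost);

    // Emit alternate segmentations whose cost is within nBestCost of the best
    // path (a value of 0 leaves n-best output off). calcNBestCost resets and
    // closes its probe runs internally, so the tokenizer can now be used
    // normally via setReader()/reset()/incrementToken().
    tokenizer.setNBestCost(nBestCost);
    tokenizer.close();
  }
}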
