All Downloads are FREE. Search and download functionality uses the official Maven repository.

org.apache.lucene.analysis.ja.JapaneseTokenizer Maven / Gradle / Ivy

There is a newer version: 8.11.4
Show newest version
package org.apache.lucene.analysis.ja;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumMap;
import java.util.List;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ja.dict.CharacterDefinition;
import org.apache.lucene.analysis.ja.dict.ConnectionCosts;
import org.apache.lucene.analysis.ja.dict.Dictionary;
import org.apache.lucene.analysis.ja.dict.TokenInfoDictionary;
import org.apache.lucene.analysis.ja.dict.TokenInfoFST;
import org.apache.lucene.analysis.ja.dict.UnknownDictionary;
import org.apache.lucene.analysis.ja.dict.UserDictionary;
import org.apache.lucene.analysis.ja.tokenattributes.*;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.util.RollingCharBuffer;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.fst.FST;

// TODO: somehow factor out a reusable viterbi search here,
// so other decompounders/tokenizers can reuse...

/**
 * Tokenizer for Japanese that uses morphological analysis.
 * <p>
 * This tokenizer sets a number of additional attributes:
 * <ul>
 *   <li>{@link BaseFormAttribute} containing base form for inflected
 *       adjectives and verbs.</li>
 *   <li>{@link PartOfSpeechAttribute} containing part-of-speech.</li>
 *   <li>{@link ReadingAttribute} containing reading and pronunciation.</li>
 *   <li>{@link InflectionAttribute} containing additional part-of-speech
 *       information for inflected forms.</li>
 * </ul>
 * <p>
* This tokenizer uses a rolling Viterbi search to find the * least cost segmentation (path) of the incoming characters. * For tokens that appear to be compound (> length 2 for all * Kanji, or > length 7 for non-Kanji), we see if there is a * 2nd best segmentation of that token after applying * penalties to the long tokens. If so, and the Mode is * {@link Mode#SEARCH}, we output the alternate segmentation * as well. */ public final class JapaneseTokenizer extends Tokenizer { /** * Tokenization mode: this determines how the tokenizer handles * compound and unknown words. */ public static enum Mode { /** * Ordinary segmentation: no decomposition for compounds, */ NORMAL, /** * Segmentation geared towards search: this includes a * decompounding process for long nouns, also including * the full compound token as a synonym. */ SEARCH, /** * Extended mode outputs unigrams for unknown words. * @lucene.experimental */ EXTENDED } /** * Default tokenization mode. Currently this is {@link Mode#SEARCH}. */ public static final Mode DEFAULT_MODE = Mode.SEARCH; /** * Token type reflecting the original source of this token */ public enum Type { /** * Known words from the system dictionary. */ KNOWN, /** * Unknown words (heuristically segmented). */ UNKNOWN, /** * Known words from the user dictionary. 
*/ USER } private static final boolean VERBOSE = false; private static final int SEARCH_MODE_KANJI_LENGTH = 2; private static final int SEARCH_MODE_OTHER_LENGTH = 7; // Must be >= SEARCH_MODE_KANJI_LENGTH private static final int SEARCH_MODE_KANJI_PENALTY = 3000; private static final int SEARCH_MODE_OTHER_PENALTY = 1700; // For safety: private static final int MAX_UNKNOWN_WORD_LENGTH = 1024; private static final int MAX_BACKTRACE_GAP = 1024; private final EnumMap dictionaryMap = new EnumMap(Type.class); private final TokenInfoFST fst; private final TokenInfoDictionary dictionary; private final UnknownDictionary unkDictionary; private final ConnectionCosts costs; private final UserDictionary userDictionary; private final CharacterDefinition characterDefinition; private final FST.Arc arc = new FST.Arc(); private final FST.BytesReader fstReader; private final IntsRef wordIdRef = new IntsRef(); private final FST.BytesReader userFSTReader; private final TokenInfoFST userFST; private final RollingCharBuffer buffer = new RollingCharBuffer(); private final WrappedPositionArray positions = new WrappedPositionArray(); private final boolean discardPunctuation; private final boolean searchMode; private final boolean extendedMode; private final boolean outputCompounds; // Index of the last character of unknown word: private int unknownWordEndIndex = -1; // True once we've hit the EOF from the input reader: private boolean end; // Last absolute position we backtraced from: private int lastBackTracePos; // Position of last token we returned; we use this to // figure out whether to set posIncr to 0 or 1: private int lastTokenPos; // Next absolute position to process: private int pos; // Already parsed, but not yet passed to caller, tokens: private final List pending = new ArrayList(); private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); private final 
PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class); private final BaseFormAttribute basicFormAtt = addAttribute(BaseFormAttribute.class); private final PartOfSpeechAttribute posAtt = addAttribute(PartOfSpeechAttribute.class); private final ReadingAttribute readingAtt = addAttribute(ReadingAttribute.class); private final InflectionAttribute inflectionAtt = addAttribute(InflectionAttribute.class); /** * Create a new JapaneseTokenizer. *

* Uses the default AttributeFactory. * * @param input Reader containing text * @param userDictionary Optional: if non-null, user dictionary. * @param discardPunctuation true if punctuation tokens should be dropped from the output. * @param mode tokenization mode. */ public JapaneseTokenizer(Reader input, UserDictionary userDictionary, boolean discardPunctuation, Mode mode) { this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input, userDictionary, discardPunctuation, mode); } /** * Create a new JapaneseTokenizer. * * @param factory the AttributeFactory to use * @param input Reader containing text * @param userDictionary Optional: if non-null, user dictionary. * @param discardPunctuation true if punctuation tokens should be dropped from the output. * @param mode tokenization mode. */ public JapaneseTokenizer (AttributeFactory factory, Reader input, UserDictionary userDictionary, boolean discardPunctuation, Mode mode) { super(factory, input); dictionary = TokenInfoDictionary.getInstance(); fst = dictionary.getFST(); unkDictionary = UnknownDictionary.getInstance(); characterDefinition = unkDictionary.getCharacterDefinition(); this.userDictionary = userDictionary; costs = ConnectionCosts.getInstance(); fstReader = fst.getBytesReader(); if (userDictionary != null) { userFST = userDictionary.getFST(); userFSTReader = userFST.getBytesReader(); } else { userFST = null; userFSTReader = null; } this.discardPunctuation = discardPunctuation; switch(mode){ case SEARCH: searchMode = true; extendedMode = false; outputCompounds = true; break; case EXTENDED: searchMode = true; extendedMode = true; outputCompounds = false; break; default: searchMode = false; extendedMode = false; outputCompounds = false; break; } buffer.reset(this.input); resetState(); dictionaryMap.put(Type.KNOWN, dictionary); dictionaryMap.put(Type.UNKNOWN, unkDictionary); dictionaryMap.put(Type.USER, userDictionary); } private GraphvizFormatter dotOut; /** Expert: set this to produce graphviz (dot) output of * the 
Viterbi lattice */ public void setGraphvizFormatter(GraphvizFormatter dotOut) { this.dotOut = dotOut; } @Override public void close() throws IOException { super.close(); buffer.reset(input); } @Override public void reset() throws IOException { super.reset(); buffer.reset(input); resetState(); } private void resetState() { positions.reset(); unknownWordEndIndex = -1; pos = 0; end = false; lastBackTracePos = 0; lastTokenPos = -1; pending.clear(); // Add BOS: positions.get(0).add(0, 0, -1, -1, -1, Type.KNOWN); } @Override public void end() throws IOException { super.end(); // Set final offset int finalOffset = correctOffset(pos); offsetAtt.setOffset(finalOffset, finalOffset); } // Returns the added cost that a 2nd best segmentation is // allowed to have. Ie, if we see path with cost X, // ending in a compound word, and this method returns // threshold > 0, then we will also find the 2nd best // segmentation and if its path score is within this // threshold of X, we'll include it in the output: private int computeSecondBestThreshold(int pos, int length) throws IOException { // TODO: maybe we do something else here, instead of just // using the penalty...? 
EG we can be more aggressive on // when to also test for 2nd best path return computePenalty(pos, length); } private int computePenalty(int pos, int length) throws IOException { if (length > SEARCH_MODE_KANJI_LENGTH) { boolean allKanji = true; // check if node consists of only kanji final int endPos = pos + length; for (int pos2 = pos; pos2 < endPos; pos2++) { if (!characterDefinition.isKanji((char) buffer.get(pos2))) { allKanji = false; break; } } if (allKanji) { // Process only Kanji keywords return (length - SEARCH_MODE_KANJI_LENGTH) * SEARCH_MODE_KANJI_PENALTY; } else if (length > SEARCH_MODE_OTHER_LENGTH) { return (length - SEARCH_MODE_OTHER_LENGTH) * SEARCH_MODE_OTHER_PENALTY; } } return 0; } // Holds all back pointers arriving to this position: final static class Position { int pos; int count; // maybe single int array * 5? int[] costs = new int[8]; int[] lastRightID = new int[8]; int[] backPos = new int[8]; int[] backIndex = new int[8]; int[] backID = new int[8]; Type[] backType = new Type[8]; // Only used when finding 2nd best segmentation under a // too-long token: int forwardCount; int[] forwardPos = new int[8]; int[] forwardID = new int[8]; int[] forwardIndex = new int[8]; Type[] forwardType = new Type[8]; public void grow() { costs = ArrayUtil.grow(costs, 1+count); lastRightID = ArrayUtil.grow(lastRightID, 1+count); backPos = ArrayUtil.grow(backPos, 1+count); backIndex = ArrayUtil.grow(backIndex, 1+count); backID = ArrayUtil.grow(backID, 1+count); // NOTE: sneaky: grow separately because // ArrayUtil.grow will otherwise pick a different // length than the int[]s we just grew: final Type[] newBackType = new Type[backID.length]; System.arraycopy(backType, 0, newBackType, 0, backType.length); backType = newBackType; } public void growForward() { forwardPos = ArrayUtil.grow(forwardPos, 1+forwardCount); forwardID = ArrayUtil.grow(forwardID, 1+forwardCount); forwardIndex = ArrayUtil.grow(forwardIndex, 1+forwardCount); // NOTE: sneaky: grow separately because 
// ArrayUtil.grow will otherwise pick a different // length than the int[]s we just grew: final Type[] newForwardType = new Type[forwardPos.length]; System.arraycopy(forwardType, 0, newForwardType, 0, forwardType.length); forwardType = newForwardType; } public void add(int cost, int lastRightID, int backPos, int backIndex, int backID, Type backType) { // NOTE: this isn't quite a true Viterbi search, // because we should check if lastRightID is // already present here, and only update if the new // cost is less than the current cost, instead of // simply appending. However, that will likely hurt // performance (usually we add a lastRightID only once), // and it means we actually create the full graph // intersection instead of a "normal" Viterbi lattice: if (count == costs.length) { grow(); } this.costs[count] = cost; this.lastRightID[count] = lastRightID; this.backPos[count] = backPos; this.backIndex[count] = backIndex; this.backID[count] = backID; this.backType[count] = backType; count++; } public void addForward(int forwardPos, int forwardIndex, int forwardID, Type forwardType) { if (forwardCount == this.forwardID.length) { growForward(); } this.forwardPos[forwardCount] = forwardPos; this.forwardIndex[forwardCount] = forwardIndex; this.forwardID[forwardCount] = forwardID; this.forwardType[forwardCount] = forwardType; forwardCount++; } public void reset() { count = 0; // forwardCount naturally resets after it runs: assert forwardCount == 0: "pos=" + pos + " forwardCount=" + forwardCount; } } private void add(Dictionary dict, Position fromPosData, int endPos, int wordID, Type type, boolean addPenalty) throws IOException { final int wordCost = dict.getWordCost(wordID); final int leftID = dict.getLeftId(wordID); int leastCost = Integer.MAX_VALUE; int leastIDX = -1; assert fromPosData.count > 0; for(int idx=0;idx lastTokenPos; posIncAtt.setPositionIncrement(1); posLengthAtt.setPositionLength(1); } if (VERBOSE) { System.out.println(Thread.currentThread().getName() + ": 
incToken: return token=" + token); } lastTokenPos = token.getPosition(); return true; } // TODO: make generic'd version of this "circular array"? // It's a bit tricky because we do things to the Position // (eg, set .pos = N on reuse)... static final class WrappedPositionArray { private Position[] positions = new Position[8]; public WrappedPositionArray() { for(int i=0;i 0) { if (nextWrite == -1) { nextWrite = positions.length - 1; } positions[nextWrite--].reset(); count--; } nextWrite = 0; nextPos = 0; count = 0; } /** Get Position instance for this absolute position; * this is allowed to be arbitrarily far "in the * future" but cannot be before the last freeBefore. */ public Position get(int pos) { while(pos >= nextPos) { //System.out.println("count=" + count + " vs len=" + positions.length); if (count == positions.length) { Position[] newPositions = new Position[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; //System.out.println("grow positions " + newPositions.length); System.arraycopy(positions, nextWrite, newPositions, 0, positions.length-nextWrite); System.arraycopy(positions, 0, newPositions, positions.length-nextWrite, nextWrite); for(int i=positions.length;i= nextPos - count; } private int getIndex(int pos) { int index = nextWrite - (nextPos - pos); if (index < 0) { index += positions.length; } return index; } public void freeBefore(int pos) { final int toFree = count - (nextPos - pos); assert toFree >= 0; assert toFree <= count; int index = nextWrite - count; if (index < 0) { index += positions.length; } for(int i=0;i= MAX_BACKTRACE_GAP) { // Safety: if we've buffered too much, force a // backtrace now. We find the least-cost partial // path, across all paths, backtrace from it, and // then prune all others. Note that this, in // general, can produce the wrong result, if the // total best path did not in fact back trace // through this partial best path. But it's the // best we can do... (short of not having a // safety!). 
// First pass: find least cost partial path so far, // including ending at future positions: int leastIDX = -1; int leastCost = Integer.MAX_VALUE; Position leastPosData = null; for(int pos2=pos;pos2 posData.pos) { pos++; continue; } final char firstCharacter = (char) buffer.get(pos); if (!anyMatches || characterDefinition.isInvoke(firstCharacter)) { // Find unknown match: final int characterId = characterDefinition.getCharacterClass(firstCharacter); final boolean isPunct = isPunctuation(firstCharacter); // NOTE: copied from UnknownDictionary.lookup: int unknownWordLength; if (!characterDefinition.isGroup(firstCharacter)) { unknownWordLength = 1; } else { // Extract unknown word. Characters with the same character class are considered to be part of unknown word unknownWordLength = 1; for (int posAhead=pos+1;unknownWordLength 0) { final Position endPosData = positions.get(pos); int leastCost = Integer.MAX_VALUE; int leastIDX = -1; if (VERBOSE) { System.out.println(" end: " + endPosData.count + " nodes"); } for(int idx=0;idx lastBackTracePos) { //System.out.println("BT: back pos=" + pos + " bestIDX=" + bestIDX); final Position posData = positions.get(pos); assert bestIDX < posData.count; int backPos = posData.backPos[bestIDX]; assert backPos >= lastBackTracePos: "backPos=" + backPos + " vs lastBackTracePos=" + lastBackTracePos; int length = pos - backPos; Type backType = posData.backType[bestIDX]; int backID = posData.backID[bestIDX]; int nextBestIDX = posData.backIndex[bestIDX]; if (outputCompounds && searchMode && altToken == null && backType != Type.USER) { // In searchMode, if best path had picked a too-long // token, we use the "penalty" to compute the allowed // max cost of an alternate back-trace. If we find an // alternate back trace with cost below that // threshold, we pursue it instead (but also output // the long token). 
//System.out.println(" 2nd best backPos=" + backPos + " pos=" + pos); final int penalty = computeSecondBestThreshold(backPos, pos-backPos); if (penalty > 0) { if (VERBOSE) { System.out.println(" compound=" + new String(buffer.get(backPos, pos-backPos)) + " backPos=" + backPos + " pos=" + pos + " penalty=" + penalty + " cost=" + posData.costs[bestIDX] + " bestIDX=" + bestIDX + " lastLeftID=" + lastLeftWordID); } // Use the penalty to set maxCost on the 2nd best // segmentation: int maxCost = posData.costs[bestIDX] + penalty; if (lastLeftWordID != -1) { maxCost += costs.get(getDict(backType).getRightId(backID), lastLeftWordID); } // Now, prune all too-long tokens from the graph: pruneAndRescore(backPos, pos, posData.backIndex[bestIDX]); // Finally, find 2nd best back-trace and resume // backtrace there: int leastCost = Integer.MAX_VALUE; int leastIDX = -1; for(int idx=0;idx " + cost); } //System.out.println("penalty " + posData.backPos[idx] + " to " + pos); //cost += computePenalty(posData.backPos[idx], pos - posData.backPos[idx]); if (cost < leastCost) { //System.out.println(" ** "); leastCost = cost; leastIDX = idx; } } //System.out.println(" leastIDX=" + leastIDX); if (VERBOSE) { System.out.println(" afterPrune: " + posData.count + " arcs arriving; leastCost=" + leastCost + " vs threshold=" + maxCost + " lastLeftWordID=" + lastLeftWordID); } if (leastIDX != -1 && leastCost <= maxCost && posData.backPos[leastIDX] != backPos) { // We should have pruned the altToken from the graph: assert posData.backPos[leastIDX] != backPos; // Save the current compound token, to output when // this alternate path joins back: altToken = new Token(backID, fragment, backPos - lastBackTracePos, length, backType, backPos, getDict(backType)); // Redirect our backtrace to 2nd best: bestIDX = leastIDX; nextBestIDX = posData.backIndex[bestIDX]; backPos = posData.backPos[bestIDX]; length = pos - backPos; backType = posData.backType[bestIDX]; backID = posData.backID[bestIDX]; backCount = 0; 
//System.out.println(" do alt token!"); } else { // I think in theory it's possible there is no // 2nd best path, which is fine; in this case we // only output the compound token: //System.out.println(" no alt token! bestIDX=" + bestIDX); } } } final int offset = backPos - lastBackTracePos; assert offset >= 0; if (altToken != null && altToken.getPosition() >= backPos) { // We've backtraced to the position where the // compound token starts; add it now: // The pruning we did when we created the altToken // ensures that the back trace will align back with // the start of the altToken: assert altToken.getPosition() == backPos: altToken.getPosition() + " vs " + backPos; // NOTE: not quite right: the compound token may // have had all punctuation back traced so far, but // then the decompounded token at this position is // not punctuation. In this case backCount is 0, // but we should maybe add the altToken anyway...? if (backCount > 0) { backCount++; altToken.setPositionLength(backCount); if (VERBOSE) { System.out.println(" add altToken=" + altToken); } pending.add(altToken); } else { // This means alt token was all punct tokens: if (VERBOSE) { System.out.println(" discard all-punctuation altToken=" + altToken); } assert discardPunctuation; } altToken = null; } final Dictionary dict = getDict(backType); if (backType == Type.USER) { // Expand the phraseID we recorded into the actual // segmentation: final int[] wordIDAndLength = userDictionary.lookupSegmentation(backID); int wordID = wordIDAndLength[0]; int current = 0; for(int j=1; j < wordIDAndLength.length; j++) { final int len = wordIDAndLength[j]; //System.out.println(" add user: len=" + len); pending.add(new Token(wordID+j-1, fragment, current + offset, len, Type.USER, current + backPos, dict)); if (VERBOSE) { System.out.println(" add USER token=" + pending.get(pending.size()-1)); } current += len; } // Reverse the tokens we just added, because when we // serve them up from incrementToken we serve in // reverse: 
Collections.reverse(pending.subList(pending.size() - (wordIDAndLength.length - 1), pending.size())); backCount += wordIDAndLength.length-1; } else { if (extendedMode && backType == Type.UNKNOWN) { // In EXTENDED mode we convert unknown word into // unigrams: int unigramTokenCount = 0; for(int i=length-1;i>=0;i--) { int charLen = 1; if (i > 0 && Character.isLowSurrogate(fragment[offset+i])) { i--; charLen = 2; } //System.out.println(" extended tok offset=" //+ (offset + i)); if (!discardPunctuation || !isPunctuation(fragment[offset+i])) { pending.add(new Token(CharacterDefinition.NGRAM, fragment, offset + i, charLen, Type.UNKNOWN, backPos + i, unkDictionary)); unigramTokenCount++; } } backCount += unigramTokenCount; } else if (!discardPunctuation || length == 0 || !isPunctuation(fragment[offset])) { pending.add(new Token(backID, fragment, offset, length, backType, backPos, dict)); if (VERBOSE) { System.out.println(" add token=" + pending.get(pending.size()-1)); } backCount++; } else { if (VERBOSE) { System.out.println(" skip punctuation token=" + new String(fragment, offset, length)); } } } lastLeftWordID = dict.getLeftId(backID); pos = backPos; bestIDX = nextBestIDX; } lastBackTracePos = endPos; if (VERBOSE) { System.out.println(" freeBefore pos=" + endPos); } // Notify the circular buffers that we are done with // these positions: buffer.freeBefore(endPos); positions.freeBefore(endPos); } Dictionary getDict(Type type) { return dictionaryMap.get(type); } private static boolean isPunctuation(char ch) { switch(Character.getType(ch)) { case Character.SPACE_SEPARATOR: case Character.LINE_SEPARATOR: case Character.PARAGRAPH_SEPARATOR: case Character.CONTROL: case Character.FORMAT: case Character.DASH_PUNCTUATION: case Character.START_PUNCTUATION: case Character.END_PUNCTUATION: case Character.CONNECTOR_PUNCTUATION: case Character.OTHER_PUNCTUATION: case Character.MATH_SYMBOL: case Character.CURRENCY_SYMBOL: case Character.MODIFIER_SYMBOL: case Character.OTHER_SYMBOL: 
case Character.INITIAL_QUOTE_PUNCTUATION: case Character.FINAL_QUOTE_PUNCTUATION: return true; default: return false; } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy