org.apache.lucene.analysis.ja.JapaneseTokenizer Maven / Gradle / Ivy
Show all versions of lucene-analyzers-kuromoji Show documentation
package org.apache.lucene.analysis.ja;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumMap;
import java.util.List;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ja.dict.CharacterDefinition;
import org.apache.lucene.analysis.ja.dict.ConnectionCosts;
import org.apache.lucene.analysis.ja.dict.Dictionary;
import org.apache.lucene.analysis.ja.dict.TokenInfoDictionary;
import org.apache.lucene.analysis.ja.dict.TokenInfoFST;
import org.apache.lucene.analysis.ja.dict.UnknownDictionary;
import org.apache.lucene.analysis.ja.dict.UserDictionary;
import org.apache.lucene.analysis.ja.tokenattributes.*;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.util.RollingCharBuffer;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.fst.FST;
// TODO: somehow factor out a reusable viterbi search here,
// so other decompounders/tokenizers can reuse...
/**
* Tokenizer for Japanese that uses morphological analysis.
* <p>
* This tokenizer sets a number of additional attributes:
* <ul>
* <li>{@link BaseFormAttribute} containing base form for inflected
* adjectives and verbs.
* <li>{@link PartOfSpeechAttribute} containing part-of-speech.
* <li>{@link ReadingAttribute} containing reading and pronunciation.
* <li>{@link InflectionAttribute} containing additional part-of-speech
* information for inflected forms.
* </ul>
* <p>
* This tokenizer uses a rolling Viterbi search to find the
* least cost segmentation (path) of the incoming characters.
* For tokens that appear to be compound (> length 2 for all
* Kanji, or > length 7 for non-Kanji), we see if there is a
* 2nd best segmentation of that token after applying
* penalties to the long tokens. If so, and the Mode is
* {@link Mode#SEARCH}, we output the alternate segmentation
* as well.
*/
public final class JapaneseTokenizer extends Tokenizer {
/**
* Tokenization mode: this determines how the tokenizer handles
* compound and unknown words.
*/
public static enum Mode {
/**
* Ordinary segmentation: no decomposition for compounds,
*/
NORMAL,
/**
* Segmentation geared towards search: this includes a
* decompounding process for long nouns, also including
* the full compound token as a synonym.
*/
SEARCH,
/**
* Extended mode outputs unigrams for unknown words.
* @lucene.experimental
*/
EXTENDED
}
/**
* Default tokenization mode. Currently this is {@link Mode#SEARCH}.
*/
public static final Mode DEFAULT_MODE = Mode.SEARCH;
/**
* Token type reflecting the original source of this token
*/
public enum Type {
/**
* Known words from the system dictionary.
*/
KNOWN,
/**
* Unknown words (heuristically segmented).
*/
UNKNOWN,
/**
* Known words from the user dictionary.
*/
USER
}
// When true, debug tracing is printed to System.out:
private static final boolean VERBOSE = false;

// Tokens consisting only of Kanji are penalized once they are longer than this:
private static final int SEARCH_MODE_KANJI_LENGTH = 2;
// All other tokens are penalized once they are longer than this:
private static final int SEARCH_MODE_OTHER_LENGTH = 7; // Must be >= SEARCH_MODE_KANJI_LENGTH
// Penalty applied per char beyond the respective length limit:
private static final int SEARCH_MODE_KANJI_PENALTY = 3000;
private static final int SEARCH_MODE_OTHER_PENALTY = 1700;

// For safety:
private static final int MAX_UNKNOWN_WORD_LENGTH = 1024;
private static final int MAX_BACKTRACE_GAP = 1024;

// Maps each token Type to the Dictionary registered for it by the
// constructor; the USER entry is null when no user dictionary was given:
private final EnumMap<Type, Dictionary> dictionaryMap = new EnumMap<Type, Dictionary>(Type.class);

private final TokenInfoFST fst;
private final TokenInfoDictionary dictionary;
private final UnknownDictionary unkDictionary;
private final ConnectionCosts costs;
private final UserDictionary userDictionary;
private final CharacterDefinition characterDefinition;

private final FST.Arc<Long> arc = new FST.Arc<Long>();
private final FST.BytesReader fstReader;
private final IntsRef wordIdRef = new IntsRef();

// Both are null when no user dictionary was given:
private final FST.BytesReader userFSTReader;
private final TokenInfoFST userFST;

private final RollingCharBuffer buffer = new RollingCharBuffer();

private final WrappedPositionArray positions = new WrappedPositionArray();

// Flags derived from the constructor arguments (see the Mode switch there):
private final boolean discardPunctuation;
private final boolean searchMode;
private final boolean extendedMode;
private final boolean outputCompounds;

// Index of the last character of unknown word:
private int unknownWordEndIndex = -1;

// True once we've hit the EOF from the input reader:
private boolean end;

// Last absolute position we backtraced from:
private int lastBackTracePos;

// Position of last token we returned; we use this to
// figure out whether to set posIncr to 0 or 1:
private int lastTokenPos;

// Next absolute position to process:
private int pos;

// Already parsed, but not yet passed to caller, tokens:
private final List<Token> pending = new ArrayList<Token>();

private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
private final BaseFormAttribute basicFormAtt = addAttribute(BaseFormAttribute.class);
private final PartOfSpeechAttribute posAtt = addAttribute(PartOfSpeechAttribute.class);
private final ReadingAttribute readingAtt = addAttribute(ReadingAttribute.class);
private final InflectionAttribute inflectionAtt = addAttribute(InflectionAttribute.class);
/**
* Create a new JapaneseTokenizer that uses the default AttributeFactory.
* <p>
* Delegates to the five-argument constructor, passing
* {@code AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY} as the factory.
*
* @param input Reader containing text
* @param userDictionary Optional: if non-null, user dictionary.
* @param discardPunctuation true if punctuation tokens should be dropped from the output.
* @param mode tokenization mode.
*/
public JapaneseTokenizer(Reader input, UserDictionary userDictionary, boolean discardPunctuation, Mode mode) {
this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input, userDictionary, discardPunctuation, mode);
}
/**
 * Builds a tokenizer over {@code input} using an explicit attribute factory.
 * <p>
 * The system dictionary, the unknown-word dictionary and the connection
 * costs are obtained through their {@code getInstance()} accessors; the user
 * dictionary is optional. {@link Mode#SEARCH} turns on search mode with
 * compound output, {@link Mode#EXTENDED} turns on search mode and extended
 * mode without compound output, and any other mode turns all three off.
 *
 * @param factory the AttributeFactory to use
 * @param input Reader containing text
 * @param userDictionary Optional: if non-null, user dictionary.
 * @param discardPunctuation true if punctuation tokens should be dropped from the output.
 * @param mode tokenization mode.
 */
public JapaneseTokenizer(AttributeFactory factory, Reader input, UserDictionary userDictionary,
    boolean discardPunctuation, Mode mode) {
  super(factory, input);

  // Plain argument captures first.
  this.userDictionary = userDictionary;
  this.discardPunctuation = discardPunctuation;

  // System resources.
  dictionary = TokenInfoDictionary.getInstance();
  fst = dictionary.getFST();
  unkDictionary = UnknownDictionary.getInstance();
  characterDefinition = unkDictionary.getCharacterDefinition();
  costs = ConnectionCosts.getInstance();
  fstReader = fst.getBytesReader();

  // The user FST and its reader exist only when a user dictionary was supplied.
  if (userDictionary == null) {
    userFST = null;
    userFSTReader = null;
  } else {
    userFST = userDictionary.getFST();
    userFSTReader = userFST.getBytesReader();
  }

  // Translate the mode into the three behaviour flags.
  switch (mode) {
    case EXTENDED:
      searchMode = true;
      extendedMode = true;
      outputCompounds = false;
      break;
    case SEARCH:
      searchMode = true;
      extendedMode = false;
      outputCompounds = true;
      break;
    default:
      searchMode = false;
      extendedMode = false;
      outputCompounds = false;
      break;
  }

  buffer.reset(this.input);
  resetState();

  // Register the dictionary backing each token type (USER may map to null).
  dictionaryMap.put(Type.KNOWN, dictionary);
  dictionaryMap.put(Type.UNKNOWN, unkDictionary);
  dictionaryMap.put(Type.USER, userDictionary);
}
// Optional graphviz formatter; stays null until setGraphvizFormatter is called.
private GraphvizFormatter dotOut;
/** Expert: set this to produce graphviz (dot) output of
* the Viterbi lattice.
*
* @param dotOut formatter to store in this tokenizer's {@code dotOut} field
*/
public void setGraphvizFormatter(GraphvizFormatter dotOut) {
this.dotOut = dotOut;
}
/**
* Closes this tokenizer via {@code super.close()} and then resets the
* rolling character buffer against {@code input}.
*/
@Override
public void close() throws IOException {
super.close();
// Re-point the buffer at whatever input is current after the close.
buffer.reset(input);
}
/**
* Resets this tokenizer: calls {@code super.reset()}, resets the rolling
* character buffer against {@code input}, and then restores the parsing
* state through {@code resetState()}.
*/
@Override
public void reset() throws IOException {
super.reset();
buffer.reset(input);
resetState();
}
private void resetState() {
  positions.reset();

  // Scalar bookkeeping goes back to its start-of-stream values.
  pos = 0;
  end = false;
  lastTokenPos = -1;
  lastBackTracePos = 0;
  unknownWordEndIndex = -1;

  // Drop any tokens that were parsed but never handed to the caller.
  pending.clear();

  // Seed position 0 with the BOS entry.
  positions.get(0).add(0, 0, -1, -1, -1, Type.KNOWN);
}
@Override
public void end() throws IOException {
  super.end();
  // The final offset is the corrected current position, used as both start and end.
  final int corrected = correctOffset(pos);
  offsetAtt.setOffset(corrected, corrected);
}
/**
* Returns the added cost that a 2nd best segmentation is
* allowed to have. Ie, if we see path with cost X,
* ending in a compound word, and this method returns
* threshold greater than 0, then we will also find the 2nd best
* segmentation and if its path score is within this
* threshold of X, we'll include it in the output.
* <p>
* Currently the threshold is exactly the value of
* {@code computePenalty(pos, length)}.
*
* @param pos absolute start position of the token being examined
* @param length length of that token in chars
* @return the allowed added cost; 0 when the token is not penalized
*/
private int computeSecondBestThreshold(int pos, int length) throws IOException {
// TODO: maybe we do something else here, instead of just
// using the penalty...? EG we can be more aggressive on
// when to also test for 2nd best path
return computePenalty(pos, length);
}
/**
 * Computes the length penalty for the token spanning
 * {@code [pos, pos + length)} in the rolling buffer.
 * <p>
 * A token made up only of Kanji is penalized per char beyond
 * {@code SEARCH_MODE_KANJI_LENGTH}; any other token is penalized per char
 * beyond {@code SEARCH_MODE_OTHER_LENGTH}. Shorter tokens cost nothing.
 *
 * @param pos absolute start position of the token
 * @param length token length in chars
 * @return the penalty, or 0 when the token is within the length limits
 */
private int computePenalty(int pos, int length) throws IOException {
  // Nothing at or below the Kanji limit can be penalized (the other limit is >= it).
  if (length <= SEARCH_MODE_KANJI_LENGTH) {
    return 0;
  }

  // Advance while the chars are Kanji; stop at the first one that is not.
  final int limit = pos + length;
  int cursor = pos;
  while (cursor < limit && characterDefinition.isKanji((char) buffer.get(cursor))) {
    cursor++;
  }

  if (cursor == limit) {
    // Every char was Kanji.
    return (length - SEARCH_MODE_KANJI_LENGTH) * SEARCH_MODE_KANJI_PENALTY;
  }
  if (length > SEARCH_MODE_OTHER_LENGTH) {
    return (length - SEARCH_MODE_OTHER_LENGTH) * SEARCH_MODE_OTHER_PENALTY;
  }
  return 0;
}
/**
 * Holds all back pointers arriving to this position, stored as parallel
 * arrays indexed by arrival slot, plus a second set of parallel arrays of
 * forward pointers.
 */
static final class Position {

  int pos;

  // Number of back-pointer slots in use.
  int count;

  // maybe single int array * 5?
  int[] costs = new int[8];
  int[] lastRightID = new int[8];
  int[] backPos = new int[8];
  int[] backIndex = new int[8];
  int[] backID = new int[8];
  Type[] backType = new Type[8];

  // Only used when finding 2nd best segmentation under a
  // too-long token:
  int forwardCount;
  int[] forwardPos = new int[8];
  int[] forwardID = new int[8];
  int[] forwardIndex = new int[8];
  Type[] forwardType = new Type[8];

  /** Enlarges every back-pointer array so that slot {@code count} is writable. */
  public void grow() {
    final int needed = 1 + count;
    costs = ArrayUtil.grow(costs, needed);
    lastRightID = ArrayUtil.grow(lastRightID, needed);
    backPos = ArrayUtil.grow(backPos, needed);
    backIndex = ArrayUtil.grow(backIndex, needed);
    backID = ArrayUtil.grow(backID, needed);
    // The Type[] is sized from an int[] we just grew rather than through
    // ArrayUtil.grow, which would otherwise pick a different length:
    backType = Arrays.copyOf(backType, backID.length);
  }

  /** Enlarges every forward-pointer array so that slot {@code forwardCount} is writable. */
  public void growForward() {
    final int needed = 1 + forwardCount;
    forwardPos = ArrayUtil.grow(forwardPos, needed);
    forwardID = ArrayUtil.grow(forwardID, needed);
    forwardIndex = ArrayUtil.grow(forwardIndex, needed);
    // Same trick as in grow(): keep the Type[] the same length as the int[]s.
    forwardType = Arrays.copyOf(forwardType, forwardPos.length);
  }

  /** Appends one back pointer, growing the arrays first when they are full. */
  public void add(int cost, int lastRightID, int backPos, int backIndex, int backID, Type backType) {
    // NOTE: this isn't quite a true Viterbi search,
    // because we should check if lastRightID is
    // already present here, and only update if the new
    // cost is less than the current cost, instead of
    // simply appending.  However, that will likely hurt
    // performance (usually we add a lastRightID only once),
    // and it means we actually create the full graph
    // intersection instead of a "normal" Viterbi lattice:
    if (count == costs.length) {
      grow();
    }
    final int slot = count;
    this.costs[slot] = cost;
    this.lastRightID[slot] = lastRightID;
    this.backPos[slot] = backPos;
    this.backIndex[slot] = backIndex;
    this.backID[slot] = backID;
    this.backType[slot] = backType;
    count = slot + 1;
  }

  /** Appends one forward pointer, growing the arrays first when they are full. */
  public void addForward(int forwardPos, int forwardIndex, int forwardID, Type forwardType) {
    if (forwardCount == this.forwardID.length) {
      growForward();
    }
    final int slot = forwardCount;
    this.forwardPos[slot] = forwardPos;
    this.forwardIndex[slot] = forwardIndex;
    this.forwardID[slot] = forwardID;
    this.forwardType[slot] = forwardType;
    forwardCount = slot + 1;
  }

  /** Forgets all back pointers; the forward pointers must already be cleared. */
  public void reset() {
    count = 0;
    // forwardCount naturally resets after it runs:
    assert forwardCount == 0: "pos=" + pos + " forwardCount=" + forwardCount;
  }
}
private void add(Dictionary dict, Position fromPosData, int endPos, int wordID, Type type, boolean addPenalty) throws IOException {
final int wordCost = dict.getWordCost(wordID);
final int leftID = dict.getLeftId(wordID);
int leastCost = Integer.MAX_VALUE;
int leastIDX = -1;
assert fromPosData.count > 0;
for(int idx=0;idx lastTokenPos;
posIncAtt.setPositionIncrement(1);
posLengthAtt.setPositionLength(1);
}
if (VERBOSE) {
System.out.println(Thread.currentThread().getName() + ": incToken: return token=" + token);
}
lastTokenPos = token.getPosition();
return true;
}
// TODO: make generic'd version of this "circular array"?
// It's a bit tricky because we do things to the Position
// (eg, set .pos = N on reuse)...
static final class WrappedPositionArray {
private Position[] positions = new Position[8];
public WrappedPositionArray() {
for(int i=0;i 0) {
if (nextWrite == -1) {
nextWrite = positions.length - 1;
}
positions[nextWrite--].reset();
count--;
}
nextWrite = 0;
nextPos = 0;
count = 0;
}
/** Get Position instance for this absolute position;
* this is allowed to be arbitrarily far "in the
* future" but cannot be before the last freeBefore. */
public Position get(int pos) {
while(pos >= nextPos) {
//System.out.println("count=" + count + " vs len=" + positions.length);
if (count == positions.length) {
Position[] newPositions = new Position[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
//System.out.println("grow positions " + newPositions.length);
System.arraycopy(positions, nextWrite, newPositions, 0, positions.length-nextWrite);
System.arraycopy(positions, 0, newPositions, positions.length-nextWrite, nextWrite);
for(int i=positions.length;i= nextPos - count;
}
private int getIndex(int pos) {
  // Step back from the next write slot by the distance to nextPos,
  // wrapping around the start of the array when that goes negative.
  final int slot = nextWrite - (nextPos - pos);
  return slot < 0 ? slot + positions.length : slot;
}
public void freeBefore(int pos) {
final int toFree = count - (nextPos - pos);
assert toFree >= 0;
assert toFree <= count;
int index = nextWrite - count;
if (index < 0) {
index += positions.length;
}
for(int i=0;i= MAX_BACKTRACE_GAP) {
// Safety: if we've buffered too much, force a
// backtrace now. We find the least-cost partial
// path, across all paths, backtrace from it, and
// then prune all others. Note that this, in
// general, can produce the wrong result, if the
// total best path did not in fact back trace
// through this partial best path. But it's the
// best we can do... (short of not having a
// safety!).
// First pass: find least cost partial path so far,
// including ending at future positions:
int leastIDX = -1;
int leastCost = Integer.MAX_VALUE;
Position leastPosData = null;
for(int pos2=pos;pos2 posData.pos) {
pos++;
continue;
}
final char firstCharacter = (char) buffer.get(pos);
if (!anyMatches || characterDefinition.isInvoke(firstCharacter)) {
// Find unknown match:
final int characterId = characterDefinition.getCharacterClass(firstCharacter);
final boolean isPunct = isPunctuation(firstCharacter);
// NOTE: copied from UnknownDictionary.lookup:
int unknownWordLength;
if (!characterDefinition.isGroup(firstCharacter)) {
unknownWordLength = 1;
} else {
// Extract unknown word. Characters with the same character class are considered to be part of unknown word
unknownWordLength = 1;
for (int posAhead=pos+1;unknownWordLength 0) {
final Position endPosData = positions.get(pos);
int leastCost = Integer.MAX_VALUE;
int leastIDX = -1;
if (VERBOSE) {
System.out.println(" end: " + endPosData.count + " nodes");
}
for(int idx=0;idx lastBackTracePos) {
//System.out.println("BT: back pos=" + pos + " bestIDX=" + bestIDX);
final Position posData = positions.get(pos);
assert bestIDX < posData.count;
int backPos = posData.backPos[bestIDX];
assert backPos >= lastBackTracePos: "backPos=" + backPos + " vs lastBackTracePos=" + lastBackTracePos;
int length = pos - backPos;
Type backType = posData.backType[bestIDX];
int backID = posData.backID[bestIDX];
int nextBestIDX = posData.backIndex[bestIDX];
if (outputCompounds && searchMode && altToken == null && backType != Type.USER) {
// In searchMode, if best path had picked a too-long
// token, we use the "penalty" to compute the allowed
// max cost of an alternate back-trace. If we find an
// alternate back trace with cost below that
// threshold, we pursue it instead (but also output
// the long token).
//System.out.println(" 2nd best backPos=" + backPos + " pos=" + pos);
final int penalty = computeSecondBestThreshold(backPos, pos-backPos);
if (penalty > 0) {
if (VERBOSE) {
System.out.println(" compound=" + new String(buffer.get(backPos, pos-backPos)) + " backPos=" + backPos + " pos=" + pos + " penalty=" + penalty + " cost=" + posData.costs[bestIDX] + " bestIDX=" + bestIDX + " lastLeftID=" + lastLeftWordID);
}
// Use the penalty to set maxCost on the 2nd best
// segmentation:
int maxCost = posData.costs[bestIDX] + penalty;
if (lastLeftWordID != -1) {
maxCost += costs.get(getDict(backType).getRightId(backID), lastLeftWordID);
}
// Now, prune all too-long tokens from the graph:
pruneAndRescore(backPos, pos,
posData.backIndex[bestIDX]);
// Finally, find 2nd best back-trace and resume
// backtrace there:
int leastCost = Integer.MAX_VALUE;
int leastIDX = -1;
for(int idx=0;idx " + cost);
}
//System.out.println("penalty " + posData.backPos[idx] + " to " + pos);
//cost += computePenalty(posData.backPos[idx], pos - posData.backPos[idx]);
if (cost < leastCost) {
//System.out.println(" ** ");
leastCost = cost;
leastIDX = idx;
}
}
//System.out.println(" leastIDX=" + leastIDX);
if (VERBOSE) {
System.out.println(" afterPrune: " + posData.count + " arcs arriving; leastCost=" + leastCost + " vs threshold=" + maxCost + " lastLeftWordID=" + lastLeftWordID);
}
if (leastIDX != -1 && leastCost <= maxCost && posData.backPos[leastIDX] != backPos) {
// We should have pruned the altToken from the graph:
assert posData.backPos[leastIDX] != backPos;
// Save the current compound token, to output when
// this alternate path joins back:
altToken = new Token(backID,
fragment,
backPos - lastBackTracePos,
length,
backType,
backPos,
getDict(backType));
// Redirect our backtrace to 2nd best:
bestIDX = leastIDX;
nextBestIDX = posData.backIndex[bestIDX];
backPos = posData.backPos[bestIDX];
length = pos - backPos;
backType = posData.backType[bestIDX];
backID = posData.backID[bestIDX];
backCount = 0;
//System.out.println(" do alt token!");
} else {
// I think in theory it's possible there is no
// 2nd best path, which is fine; in this case we
// only output the compound token:
//System.out.println(" no alt token! bestIDX=" + bestIDX);
}
}
}
final int offset = backPos - lastBackTracePos;
assert offset >= 0;
if (altToken != null && altToken.getPosition() >= backPos) {
// We've backtraced to the position where the
// compound token starts; add it now:
// The pruning we did when we created the altToken
// ensures that the back trace will align back with
// the start of the altToken:
assert altToken.getPosition() == backPos: altToken.getPosition() + " vs " + backPos;
// NOTE: not quite right: the compound token may
// have had all punctuation back traced so far, but
// then the decompounded token at this position is
// not punctuation. In this case backCount is 0,
// but we should maybe add the altToken anyway...?
if (backCount > 0) {
backCount++;
altToken.setPositionLength(backCount);
if (VERBOSE) {
System.out.println(" add altToken=" + altToken);
}
pending.add(altToken);
} else {
// This means alt token was all punct tokens:
if (VERBOSE) {
System.out.println(" discard all-punctuation altToken=" + altToken);
}
assert discardPunctuation;
}
altToken = null;
}
final Dictionary dict = getDict(backType);
if (backType == Type.USER) {
// Expand the phraseID we recorded into the actual
// segmentation:
final int[] wordIDAndLength = userDictionary.lookupSegmentation(backID);
int wordID = wordIDAndLength[0];
int current = 0;
for(int j=1; j < wordIDAndLength.length; j++) {
final int len = wordIDAndLength[j];
//System.out.println(" add user: len=" + len);
pending.add(new Token(wordID+j-1,
fragment,
current + offset,
len,
Type.USER,
current + backPos,
dict));
if (VERBOSE) {
System.out.println(" add USER token=" + pending.get(pending.size()-1));
}
current += len;
}
// Reverse the tokens we just added, because when we
// serve them up from incrementToken we serve in
// reverse:
Collections.reverse(pending.subList(pending.size() - (wordIDAndLength.length - 1),
pending.size()));
backCount += wordIDAndLength.length-1;
} else {
if (extendedMode && backType == Type.UNKNOWN) {
// In EXTENDED mode we convert unknown word into
// unigrams:
int unigramTokenCount = 0;
for(int i=length-1;i>=0;i--) {
int charLen = 1;
if (i > 0 && Character.isLowSurrogate(fragment[offset+i])) {
i--;
charLen = 2;
}
//System.out.println(" extended tok offset="
//+ (offset + i));
if (!discardPunctuation || !isPunctuation(fragment[offset+i])) {
pending.add(new Token(CharacterDefinition.NGRAM,
fragment,
offset + i,
charLen,
Type.UNKNOWN,
backPos + i,
unkDictionary));
unigramTokenCount++;
}
}
backCount += unigramTokenCount;
} else if (!discardPunctuation || length == 0 || !isPunctuation(fragment[offset])) {
pending.add(new Token(backID,
fragment,
offset,
length,
backType,
backPos,
dict));
if (VERBOSE) {
System.out.println(" add token=" + pending.get(pending.size()-1));
}
backCount++;
} else {
if (VERBOSE) {
System.out.println(" skip punctuation token=" + new String(fragment, offset, length));
}
}
}
lastLeftWordID = dict.getLeftId(backID);
pos = backPos;
bestIDX = nextBestIDX;
}
lastBackTracePos = endPos;
if (VERBOSE) {
System.out.println(" freeBefore pos=" + endPos);
}
// Notify the circular buffers that we are done with
// these positions:
buffer.freeBefore(endPos);
positions.freeBefore(endPos);
}
/**
* Returns the dictionary registered in {@code dictionaryMap} for the given token type.
*
* @param type token type to look up
* @return the mapped dictionary. NOTE(review): the constructor stores
* {@code userDictionary} under {@code Type.USER} without a null check, so this
* looks like it can be null for USER when no user dictionary was given — confirm.
*/
Dictionary getDict(Type type) {
return dictionaryMap.get(type);
}
/**
 * Tells whether {@code ch} counts as punctuation for this tokenizer, judged
 * by its Unicode general category: separators, control and format chars,
 * every punctuation category and every symbol category.
 *
 * @param ch the char to classify
 * @return true when the category of {@code ch} is one of the above
 */
private static boolean isPunctuation(char ch) {
  final int category = Character.getType(ch);

  final boolean separatorOrControl =
      category == Character.SPACE_SEPARATOR
      || category == Character.LINE_SEPARATOR
      || category == Character.PARAGRAPH_SEPARATOR
      || category == Character.CONTROL
      || category == Character.FORMAT;

  final boolean punctuation =
      category == Character.DASH_PUNCTUATION
      || category == Character.START_PUNCTUATION
      || category == Character.END_PUNCTUATION
      || category == Character.CONNECTOR_PUNCTUATION
      || category == Character.OTHER_PUNCTUATION
      || category == Character.INITIAL_QUOTE_PUNCTUATION
      || category == Character.FINAL_QUOTE_PUNCTUATION;

  final boolean symbol =
      category == Character.MATH_SYMBOL
      || category == Character.CURRENCY_SYMBOL
      || category == Character.MODIFIER_SYMBOL
      || category == Character.OTHER_SYMBOL;

  return separatorOrControl || punctuation || symbol;
}
}