/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.List;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ja.dict.CharacterDefinition;
import org.apache.lucene.analysis.ja.dict.ConnectionCosts;
import org.apache.lucene.analysis.ja.dict.Dictionary;
import org.apache.lucene.analysis.ja.dict.TokenInfoDictionary;
import org.apache.lucene.analysis.ja.dict.TokenInfoFST;
import org.apache.lucene.analysis.ja.dict.UnknownDictionary;
import org.apache.lucene.analysis.ja.dict.UserDictionary;
import org.apache.lucene.analysis.ja.tokenattributes.*;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.util.RollingCharBuffer;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.IgnoreRandomChains;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.fst.FST;
// TODO: somehow factor out a reusable viterbi search here,
// so other decompounders/tokenizers can reuse...
/**
* Tokenizer for Japanese that uses morphological analysis.
*
* <p>This tokenizer sets a number of additional attributes:
*
* <ul>
*   <li>{@link BaseFormAttribute} containing base form for inflected adjectives and verbs.
*   <li>{@link PartOfSpeechAttribute} containing part-of-speech.
*   <li>{@link ReadingAttribute} containing reading and pronunciation.
*   <li>{@link InflectionAttribute} containing additional part-of-speech information for inflected
*       forms.
* </ul>
*
* <p>This tokenizer uses a rolling Viterbi search to find the least cost segmentation (path) of the
* incoming characters. For tokens that appear to be compound (> length 2 for all Kanji, or >
* length 7 for non-Kanji), we see if there is a 2nd best segmentation of that token after applying
* penalties to the long tokens. If so, and the Mode is {@link Mode#SEARCH}, we output the alternate
* segmentation as well.
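*
* <p>A minimal usage sketch (follows the standard {@link org.apache.lucene.analysis.TokenStream}
* consume cycle; error handling omitted):
*
* <pre>{@code
* try (JapaneseTokenizer tokenizer =
*     new JapaneseTokenizer(null, true, JapaneseTokenizer.DEFAULT_MODE)) {
*   tokenizer.setReader(new StringReader("今日はとても良い天気です。"));
*   CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
*   tokenizer.reset();
*   while (tokenizer.incrementToken()) {
*     System.out.println(term.toString());
*   }
*   tokenizer.end();
* }
* }</pre>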
*/
public final class JapaneseTokenizer extends Tokenizer {
/** Tokenization mode: this determines how the tokenizer handles compound and unknown words. */
public enum Mode {
/** Ordinary segmentation: no decomposition for compounds. */
NORMAL,
/**
* Segmentation geared towards search: this includes a decompounding process for long nouns,
* also including the full compound token as a synonym.
*/
SEARCH,
/**
* Extended mode outputs unigrams for unknown words.
*
* @lucene.experimental
*/
EXTENDED
}
/** Default tokenization mode. Currently this is {@link Mode#SEARCH}. */
public static final Mode DEFAULT_MODE = Mode.SEARCH;
/** Token type reflecting the original source of this token */
public enum Type {
/** Known words from the system dictionary. */
KNOWN,
/** Unknown words (heuristically segmented). */
UNKNOWN,
/** Known words from the user dictionary. */
USER
}
private static final boolean VERBOSE = false;
private static final int SEARCH_MODE_KANJI_LENGTH = 2;
private static final int SEARCH_MODE_OTHER_LENGTH = 7; // Must be >= SEARCH_MODE_KANJI_LENGTH
private static final int SEARCH_MODE_KANJI_PENALTY = 3000;
private static final int SEARCH_MODE_OTHER_PENALTY = 1700;
// For safety:
private static final int MAX_UNKNOWN_WORD_LENGTH = 1024;
private static final int MAX_BACKTRACE_GAP = 1024;
private final EnumMap<Type, Dictionary> dictionaryMap = new EnumMap<>(Type.class);
private final TokenInfoFST fst;
private final TokenInfoDictionary dictionary;
private final UnknownDictionary unkDictionary;
private final ConnectionCosts costs;
private final UserDictionary userDictionary;
private final CharacterDefinition characterDefinition;
private final FST.Arc<Long> arc = new FST.Arc<>();
private final FST.BytesReader fstReader;
private final IntsRef wordIdRef = new IntsRef();
private final FST.BytesReader userFSTReader;
private final TokenInfoFST userFST;
private final RollingCharBuffer buffer = new RollingCharBuffer();
private final WrappedPositionArray positions = new WrappedPositionArray();
private final boolean discardPunctuation;
private final boolean searchMode;
private final boolean extendedMode;
private final boolean outputCompounds;
private boolean outputNBest = false;
// Allowable cost difference for N-best output:
private int nBestCost = 0;
// True once we've hit the EOF from the input reader:
private boolean end;
// Last absolute position we backtraced from:
private int lastBackTracePos;
// Position of last token we returned; we use this to
// figure out whether to set posIncr to 0 or 1:
private int lastTokenPos;
// Next absolute position to process:
private int pos;
// Tokens already parsed but not yet passed to the caller:
private final List<Token> pending = new ArrayList<>();
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncAtt =
addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
private final BaseFormAttribute basicFormAtt = addAttribute(BaseFormAttribute.class);
private final PartOfSpeechAttribute posAtt = addAttribute(PartOfSpeechAttribute.class);
private final ReadingAttribute readingAtt = addAttribute(ReadingAttribute.class);
private final InflectionAttribute inflectionAtt = addAttribute(InflectionAttribute.class);
/**
* Create a new JapaneseTokenizer.
*
* <p>Uses the default AttributeFactory.
*
* @param userDictionary Optional: if non-null, user dictionary.
* @param discardPunctuation true if punctuation tokens should be dropped from the output.
* @param mode tokenization mode.
*/
public JapaneseTokenizer(UserDictionary userDictionary, boolean discardPunctuation, Mode mode) {
this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, userDictionary, discardPunctuation, true, mode);
}
/**
* Create a new JapaneseTokenizer.
*
* <p>Uses the default AttributeFactory.
*
* @param userDictionary Optional: if non-null, user dictionary.
* @param discardPunctuation true if punctuation tokens should be dropped from the output.
* @param discardCompoundToken true if compound tokens should be dropped from the output when
* tokenization mode is not NORMAL.
* @param mode tokenization mode.
*/
public JapaneseTokenizer(
UserDictionary userDictionary,
boolean discardPunctuation,
boolean discardCompoundToken,
Mode mode) {
this(
DEFAULT_TOKEN_ATTRIBUTE_FACTORY,
userDictionary,
discardPunctuation,
discardCompoundToken,
mode);
}
/**
* Create a new JapaneseTokenizer using the system and unknown dictionaries shipped with Lucene.
*
* @param factory the AttributeFactory to use
* @param userDictionary Optional: if non-null, user dictionary.
* @param discardPunctuation true if punctuation tokens should be dropped from the output.
* @param mode tokenization mode.
*/
public JapaneseTokenizer(
AttributeFactory factory,
UserDictionary userDictionary,
boolean discardPunctuation,
Mode mode) {
this(
factory,
TokenInfoDictionary.getInstance(),
UnknownDictionary.getInstance(),
ConnectionCosts.getInstance(),
userDictionary,
discardPunctuation,
true,
mode);
}
/**
* Create a new JapaneseTokenizer using the system and unknown dictionaries shipped with Lucene.
*
* @param factory the AttributeFactory to use
* @param userDictionary Optional: if non-null, user dictionary.
* @param discardPunctuation true if punctuation tokens should be dropped from the output.
* @param discardCompoundToken true if compound tokens should be dropped from the output when
* tokenization mode is not NORMAL.
* @param mode tokenization mode.
*/
public JapaneseTokenizer(
AttributeFactory factory,
UserDictionary userDictionary,
boolean discardPunctuation,
boolean discardCompoundToken,
Mode mode) {
this(
factory,
TokenInfoDictionary.getInstance(),
UnknownDictionary.getInstance(),
ConnectionCosts.getInstance(),
userDictionary,
discardPunctuation,
discardCompoundToken,
mode);
}
/**
* Create a new JapaneseTokenizer, supplying a custom system dictionary and unknown dictionary.
* This constructor provides an entry point for users that want to construct custom language
* models that can be used as input to {@link
* org.apache.lucene.analysis.ja.util.DictionaryBuilder}.
*
* @param factory the AttributeFactory to use
* @param systemDictionary a custom known token dictionary
* @param unkDictionary a custom unknown token dictionary
* @param connectionCosts custom token transition costs
* @param userDictionary Optional: if non-null, user dictionary.
* @param discardPunctuation true if punctuation tokens should be dropped from the output.
* @param discardCompoundToken true if compound tokens should be dropped from the output when
* tokenization mode is not NORMAL.
* @param mode tokenization mode.
* @lucene.experimental
*/
@IgnoreRandomChains(reason = "Parameters are too complex to be tested")
public JapaneseTokenizer(
AttributeFactory factory,
TokenInfoDictionary systemDictionary,
UnknownDictionary unkDictionary,
ConnectionCosts connectionCosts,
UserDictionary userDictionary,
boolean discardPunctuation,
boolean discardCompoundToken,
Mode mode) {
super(factory);
this.dictionary = systemDictionary;
this.fst = dictionary.getFST();
this.unkDictionary = unkDictionary;
this.characterDefinition = unkDictionary.getCharacterDefinition();
this.userDictionary = userDictionary;
this.costs = connectionCosts;
fstReader = fst.getBytesReader();
if (userDictionary != null) {
userFST = userDictionary.getFST();
userFSTReader = userFST.getBytesReader();
} else {
userFST = null;
userFSTReader = null;
}
this.discardPunctuation = discardPunctuation;
switch (mode) {
case SEARCH:
searchMode = true;
extendedMode = false;
outputCompounds = !discardCompoundToken;
break;
case EXTENDED:
searchMode = true;
extendedMode = true;
outputCompounds = !discardCompoundToken;
break;
case NORMAL:
default:
searchMode = false;
extendedMode = false;
outputCompounds = false;
break;
}
buffer.reset(this.input);
resetState();
dictionaryMap.put(Type.KNOWN, dictionary);
dictionaryMap.put(Type.UNKNOWN, unkDictionary);
dictionaryMap.put(Type.USER, userDictionary);
}
private GraphvizFormatter dotOut;
/** Expert: set this to produce graphviz (dot) output of the Viterbi lattice */
public void setGraphvizFormatter(GraphvizFormatter dotOut) {
this.dotOut = dotOut;
}
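// A minimal sketch of producing Graphviz output for the Viterbi lattice (assumes
// GraphvizFormatter's ConnectionCosts constructor and its finish() accessor for the
// accumulated dot text):
//
//   GraphvizFormatter gv = new GraphvizFormatter(ConnectionCosts.getInstance());
//   tokenizer.setGraphvizFormatter(gv);
//   ... consume the token stream ...
//   String dot = gv.finish();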
@Override
public void close() throws IOException {
super.close();
buffer.reset(input);
}
@Override
public void reset() throws IOException {
super.reset();
buffer.reset(input);
resetState();
}
private void resetState() {
positions.reset();
pos = 0;
end = false;
lastBackTracePos = 0;
lastTokenPos = -1;
pending.clear();
// Add BOS:
positions.get(0).add(0, 0, -1, -1, -1, Type.KNOWN);
}
@Override
public void end() throws IOException {
super.end();
// Set final offset
int finalOffset = correctOffset(pos);
offsetAtt.setOffset(finalOffset, finalOffset);
}
// Returns the added cost that a 2nd best segmentation is
// allowed to have. Ie, if we see path with cost X,
// ending in a compound word, and this method returns
// threshold > 0, then we will also find the 2nd best
// segmentation and if its path score is within this
// threshold of X, we'll include it in the output:
private int computeSecondBestThreshold(int pos, int length) throws IOException {
// TODO: maybe we do something else here, instead of just
// using the penalty...? EG we can be more aggressive on
// when to also test for 2nd best path
return computePenalty(pos, length);
}
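// For example, with the constants above an all-Kanji candidate of length 5 is penalized by
// (5 - 2) * 3000 = 9000, while a 9-character non-Kanji candidate is penalized by
// (9 - 7) * 1700 = 3400.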
private int computePenalty(int pos, int length) throws IOException {
if (length > SEARCH_MODE_KANJI_LENGTH) {
boolean allKanji = true;
// check if node consists of only kanji
final int endPos = pos + length;
for (int pos2 = pos; pos2 < endPos; pos2++) {
if (!characterDefinition.isKanji((char) buffer.get(pos2))) {
allKanji = false;
break;
}
}
if (allKanji) { // Process only Kanji keywords
return (length - SEARCH_MODE_KANJI_LENGTH) * SEARCH_MODE_KANJI_PENALTY;
} else if (length > SEARCH_MODE_OTHER_LENGTH) {
return (length - SEARCH_MODE_OTHER_LENGTH) * SEARCH_MODE_OTHER_PENALTY;
}
}
return 0;
}
// Holds all back pointers arriving to this position:
static final class Position {
int pos;
int count;
// maybe single int array * 5?
int[] costs = new int[8];
int[] lastRightID = new int[8];
int[] backPos = new int[8];
int[] backIndex = new int[8];
int[] backID = new int[8];
Type[] backType = new Type[8];
// Only used when finding 2nd best segmentation under a
// too-long token:
int forwardCount;
int[] forwardPos = new int[8];
int[] forwardID = new int[8];
int[] forwardIndex = new int[8];
Type[] forwardType = new Type[8];
public void grow() {
costs = ArrayUtil.grow(costs, 1 + count);
lastRightID = ArrayUtil.grow(lastRightID, 1 + count);
backPos = ArrayUtil.grow(backPos, 1 + count);
backIndex = ArrayUtil.grow(backIndex, 1 + count);
backID = ArrayUtil.grow(backID, 1 + count);
// NOTE: sneaky: grow separately because
// ArrayUtil.grow will otherwise pick a different
// length than the int[]s we just grew:
final Type[] newBackType = new Type[backID.length];
System.arraycopy(backType, 0, newBackType, 0, backType.length);
backType = newBackType;
}
public void growForward() {
forwardPos = ArrayUtil.grow(forwardPos, 1 + forwardCount);
forwardID = ArrayUtil.grow(forwardID, 1 + forwardCount);
forwardIndex = ArrayUtil.grow(forwardIndex, 1 + forwardCount);
// NOTE: sneaky: grow separately because
// ArrayUtil.grow will otherwise pick a different
// length than the int[]s we just grew:
final Type[] newForwardType = new Type[forwardPos.length];
System.arraycopy(forwardType, 0, newForwardType, 0, forwardType.length);
forwardType = newForwardType;
}
public void add(
int cost, int lastRightID, int backPos, int backIndex, int backID, Type backType) {
// NOTE: this isn't quite a true Viterbi search,
// because we should check if lastRightID is
// already present here, and only update if the new
// cost is less than the current cost, instead of
// simply appending. However, that will likely hurt
// performance (usually we add a lastRightID only once),
// and it means we actually create the full graph
// intersection instead of a "normal" Viterbi lattice:
if (count == costs.length) {
grow();
}
this.costs[count] = cost;
this.lastRightID[count] = lastRightID;
this.backPos[count] = backPos;
this.backIndex[count] = backIndex;
this.backID[count] = backID;
this.backType[count] = backType;
count++;
}
public void addForward(int forwardPos, int forwardIndex, int forwardID, Type forwardType) {
if (forwardCount == this.forwardID.length) {
growForward();
}
this.forwardPos[forwardCount] = forwardPos;
this.forwardIndex[forwardCount] = forwardIndex;
this.forwardID[forwardCount] = forwardID;
this.forwardType[forwardCount] = forwardType;
forwardCount++;
}
public void reset() {
count = 0;
// forwardCount naturally resets after it runs:
assert forwardCount == 0 : "pos=" + pos + " forwardCount=" + forwardCount;
}
}
private void add(
Dictionary dict, Position fromPosData, int endPos, int wordID, Type type, boolean addPenalty)
throws IOException {
final int wordCost = dict.getWordCost(wordID);
final int leftID = dict.getLeftId(wordID);
int leastCost = Integer.MAX_VALUE;
int leastIDX = -1;
assert fromPosData.count > 0;
for (int idx = 0; idx < fromPosData.count; idx++) {
// Cost is path cost so far, plus word cost (added at
// end of loop), plus bigram cost:
final int cost = fromPosData.costs[idx] + costs.get(fromPosData.lastRightID[idx], leftID);
if (VERBOSE) {
System.out.println(
" fromIDX="
+ idx
+ ": cost="
+ cost
+ " (prevCost="
+ fromPosData.costs[idx]
+ " wordCost="
+ wordCost
+ " bgCost="
+ costs.get(fromPosData.lastRightID[idx], leftID)
+ " leftID="
+ leftID
+ ")");
}
if (cost < leastCost) {
leastCost = cost;
leastIDX = idx;
if (VERBOSE) {
System.out.println(" **");
}
}
}
leastCost += wordCost;
if (VERBOSE) {
System.out.println(
" + cost="
+ leastCost
+ " wordID="
+ wordID
+ " leftID="
+ leftID
+ " leastIDX="
+ leastIDX
+ " toPos="
+ endPos
+ " toPos.idx="
+ positions.get(endPos).count);
}
if (addPenalty && type != Type.USER) {
final int penalty = computePenalty(fromPosData.pos, endPos - fromPosData.pos);
if (VERBOSE) {
if (penalty > 0) {
System.out.println(" + penalty=" + penalty + " cost=" + (leastCost + penalty));
}
}
leastCost += penalty;
}
// positions.get(endPos).add(leastCost, dict.getRightId(wordID), fromPosData.pos, leastIDX,
// wordID, type);
assert leftID == dict.getRightId(wordID);
positions.get(endPos).add(leastCost, leftID, fromPosData.pos, leastIDX, wordID, type);
}
@Override
public boolean incrementToken() throws IOException {
// parse() is able to return w/o producing any new
// tokens, when the tokens it had produced were entirely
// punctuation. So we loop here until we get a real
// token or we end:
while (pending.size() == 0) {
if (end) {
return false;
}
// Push Viterbi forward some more:
parse();
}
final Token token = pending.remove(pending.size() - 1);
int position = token.getPosition();
int length = token.getLength();
clearAttributes();
assert length > 0;
// System.out.println("off=" + token.getOffset() + " len=" + length + " vs " +
// token.getSurfaceForm().length);
termAtt.copyBuffer(token.getSurfaceForm(), token.getOffset(), length);
offsetAtt.setOffset(correctOffset(position), correctOffset(position + length));
basicFormAtt.setToken(token);
posAtt.setToken(token);
readingAtt.setToken(token);
inflectionAtt.setToken(token);
if (token.getPosition() == lastTokenPos) {
posIncAtt.setPositionIncrement(0);
posLengthAtt.setPositionLength(token.getPositionLength());
} else if (outputNBest) {
// The position length is always calculated if outputNBest is true.
assert token.getPosition() > lastTokenPos;
posIncAtt.setPositionIncrement(1);
posLengthAtt.setPositionLength(token.getPositionLength());
} else {
assert token.getPosition() > lastTokenPos;
posIncAtt.setPositionIncrement(1);
posLengthAtt.setPositionLength(1);
}
if (VERBOSE) {
System.out.println(Thread.currentThread().getName() + ": incToken: return token=" + token);
}
lastTokenPos = token.getPosition();
return true;
}
// TODO: make generic'd version of this "circular array"?
// It's a bit tricky because we do things to the Position
// (eg, set .pos = N on reuse)...
static final class WrappedPositionArray {
private Position[] positions = new Position[8];
public WrappedPositionArray() {
for (int i = 0; i < positions.length; i++) {
positions[i] = new Position();
}
}
// Next array index to write to in positions:
private int nextWrite;
// Next position to write:
private int nextPos;
// How many valid Position instances are held in the
// positions array:
private int count;
public void reset() {
nextWrite--;
while (count > 0) {
if (nextWrite == -1) {
nextWrite = positions.length - 1;
}
positions[nextWrite--].reset();
count--;
}
nextWrite = 0;
nextPos = 0;
count = 0;
}
/**
* Get Position instance for this absolute position; this is allowed to be arbitrarily far "in
* the future" but cannot be before the last freeBefore.
*/
public Position get(int pos) {
while (pos >= nextPos) {
// System.out.println("count=" + count + " vs len=" + positions.length);
if (count == positions.length) {
Position[] newPositions =
new Position[ArrayUtil.oversize(1 + count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
// System.out.println("grow positions " + newPositions.length);
System.arraycopy(positions, nextWrite, newPositions, 0, positions.length - nextWrite);
System.arraycopy(positions, 0, newPositions, positions.length - nextWrite, nextWrite);
for (int i = positions.length; i < newPositions.length; i++) {
newPositions[i] = new Position();
}
nextWrite = positions.length;
positions = newPositions;
}
if (nextWrite == positions.length) {
nextWrite = 0;
}
// Should have already been reset:
assert positions[nextWrite].count == 0;
positions[nextWrite++].pos = nextPos++;
count++;
}
assert inBounds(pos);
final int index = getIndex(pos);
assert positions[index].pos == pos;
return positions[index];
}
public int getNextPos() {
return nextPos;
}
// For assert:
private boolean inBounds(int pos) {
return pos < nextPos && pos >= nextPos - count;
}
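// Maps an absolute position to its slot in the circular array. For example, with
// positions.length == 8, nextPos == 10 and nextWrite == 2, pos == 7 maps to
// 2 - (10 - 7) = -1, which wraps around to index 7.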
private int getIndex(int pos) {
int index = nextWrite - (nextPos - pos);
if (index < 0) {
index += positions.length;
}
return index;
}
public void freeBefore(int pos) {
final int toFree = count - (nextPos - pos);
assert toFree >= 0;
assert toFree <= count;
int index = nextWrite - count;
if (index < 0) {
index += positions.length;
}
for (int i = 0; i < toFree; i++) {
if (index == positions.length) {
index = 0;
}
// System.out.println(" fb idx=" + index);
positions[index].reset();
index++;
}
count -= toFree;
}
}
/* Incrementally parse some more characters. This runs
* the viterbi search forwards "enough" so that we
* generate some more tokens. How much forward depends on
* the chars coming in, since some chars could cause
* longer-lasting ambiguity in the parsing. Once the
* ambiguity is resolved, then we back trace, produce
* the pending tokens, and return. */
private void parse() throws IOException {
if (VERBOSE) {
System.out.println("\nPARSE");
}
// Index of the last character of unknown word:
int unknownWordEndIndex = -1;
// Advances over each position (character):
while (true) {
if (buffer.get(pos) == -1) {
// End
break;
}
final Position posData = positions.get(pos);
final boolean isFrontier = positions.getNextPos() == pos + 1;
if (posData.count == 0) {
// No arcs arrive here; move to next position:
if (VERBOSE) {
System.out.println(" no arcs in; skip pos=" + pos);
}
pos++;
continue;
}
if (pos > lastBackTracePos && posData.count == 1 && isFrontier) {
// if (pos > lastBackTracePos && posData.count == 1 && isFrontier) {
// We are at a "frontier", and only one node is
// alive, so whatever the eventual best path is must
// come through this node. So we can safely commit
// to the prefix of the best path at this point:
if (outputNBest) {
backtraceNBest(posData, false);
}
backtrace(posData, 0);
if (outputNBest) {
fixupPendingList();
}
// Re-base cost so we don't risk int overflow:
posData.costs[0] = 0;
if (pending.size() != 0) {
return;
} else {
// This means the backtrace only produced
// punctuation tokens, so we must keep parsing.
}
}
if (pos - lastBackTracePos >= MAX_BACKTRACE_GAP) {
// Safety: if we've buffered too much, force a
// backtrace now. We find the least-cost partial
// path, across all paths, backtrace from it, and
// then prune all others. Note that this, in
// general, can produce the wrong result, if the
// total best path did not in fact back trace
// through this partial best path. But it's the
// best we can do... (short of not having a
// safety!).
// First pass: find least cost partial path so far,
// including ending at future positions:
int leastIDX = -1;
int leastCost = Integer.MAX_VALUE;
Position leastPosData = null;
for (int pos2 = pos; pos2 < positions.getNextPos(); pos2++) {
final Position posData2 = positions.get(pos2);
for (int idx = 0; idx < posData2.count; idx++) {
// System.out.println(" idx=" + idx + " cost=" + cost);
final int cost = posData2.costs[idx];
if (cost < leastCost) {
leastCost = cost;
leastIDX = idx;
leastPosData = posData2;
}
}
}
// We will always have at least one live path:
assert leastIDX != -1;
if (outputNBest) {
backtraceNBest(leastPosData, false);
}
// Second pass: prune all but the best path:
for (int pos2 = pos; pos2 < positions.getNextPos(); pos2++) {
final Position posData2 = positions.get(pos2);
if (posData2 != leastPosData) {
posData2.reset();
} else {
if (leastIDX != 0) {
posData2.costs[0] = posData2.costs[leastIDX];
posData2.lastRightID[0] = posData2.lastRightID[leastIDX];
posData2.backPos[0] = posData2.backPos[leastIDX];
posData2.backIndex[0] = posData2.backIndex[leastIDX];
posData2.backID[0] = posData2.backID[leastIDX];
posData2.backType[0] = posData2.backType[leastIDX];
}
posData2.count = 1;
}
}
backtrace(leastPosData, 0);
if (outputNBest) {
fixupPendingList();
}
// Re-base cost so we don't risk int overflow:
Arrays.fill(leastPosData.costs, 0, leastPosData.count, 0);
if (pos != leastPosData.pos) {
// We jumped into a future position:
assert pos < leastPosData.pos;
pos = leastPosData.pos;
}
if (pending.size() != 0) {
return;
} else {
// This means the backtrace only produced
// punctuation tokens, so we must keep parsing.
continue;
}
}
if (VERBOSE) {
System.out.println(
"\n extend @ pos="
+ pos
+ " char="
+ (char) buffer.get(pos)
+ " hex="
+ Integer.toHexString(buffer.get(pos)));
}
if (VERBOSE) {
System.out.println(" " + posData.count + " arcs in");
}
boolean anyMatches = false;
// First try user dict:
if (userFST != null) {
userFST.getFirstArc(arc);
int output = 0;
for (int posAhead = posData.pos; ; posAhead++) {
final int ch = buffer.get(posAhead);
if (ch == -1) {
break;
}
if (userFST.findTargetArc(ch, arc, arc, posAhead == posData.pos, userFSTReader) == null) {
break;
}
output += arc.output().intValue();
if (arc.isFinal()) {
if (VERBOSE) {
System.out.println(
" USER word "
+ new String(buffer.get(pos, posAhead - pos + 1))
+ " toPos="
+ (posAhead + 1));
}
add(
userDictionary,
posData,
posAhead + 1,
output + arc.nextFinalOutput().intValue(),
Type.USER,
false);
anyMatches = true;
}
}
}
// TODO: we can be more aggressive about user
// matches? if we are "under" a user match then don't
// extend KNOWN/UNKNOWN paths?
if (!anyMatches) {
// Next, try known dictionary matches
fst.getFirstArc(arc);
int output = 0;
for (int posAhead = posData.pos; ; posAhead++) {
final int ch = buffer.get(posAhead);
if (ch == -1) {
break;
}
// System.out.println(" match " + (char) ch + " posAhead=" + posAhead);
if (fst.findTargetArc(ch, arc, arc, posAhead == posData.pos, fstReader) == null) {
break;
}
output += arc.output().intValue();
// Optimization: for known words that are too-long
// (compound), we should pre-compute the 2nd
// best segmentation and store it in the
// dictionary instead of recomputing it each time a
// match is found.
if (arc.isFinal()) {
dictionary.lookupWordIds(output + arc.nextFinalOutput().intValue(), wordIdRef);
if (VERBOSE) {
System.out.println(
" KNOWN word "
+ new String(buffer.get(pos, posAhead - pos + 1))
+ " toPos="
+ (posAhead + 1)
+ " "
+ wordIdRef.length
+ " wordIDs");
}
for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
add(
dictionary,
posData,
posAhead + 1,
wordIdRef.ints[wordIdRef.offset + ofs],
Type.KNOWN,
false);
anyMatches = true;
}
}
}
}
// In normal mode, unknown words are not processed greedily.
if (!searchMode && unknownWordEndIndex > posData.pos) {
pos++;
continue;
}
final char firstCharacter = (char) buffer.get(pos);
if (!anyMatches || characterDefinition.isInvoke(firstCharacter)) {
// Find unknown match:
final int characterId = characterDefinition.getCharacterClass(firstCharacter);
final boolean isPunct = isPunctuation(firstCharacter);
// NOTE: copied from UnknownDictionary.lookup:
int unknownWordLength;
if (!characterDefinition.isGroup(firstCharacter)) {
unknownWordLength = 1;
} else {
// Extract unknown word. Characters with the same character class are considered to be
// part of the unknown word.
unknownWordLength = 1;
for (int posAhead = pos + 1; unknownWordLength < MAX_UNKNOWN_WORD_LENGTH; posAhead++) {
final int ch = buffer.get(posAhead);
if (ch == -1) {
break;
}
if (characterId == characterDefinition.getCharacterClass((char) ch)
&& isPunctuation((char) ch) == isPunct) {
unknownWordLength++;
} else {
break;
}
}
}
unkDictionary.lookupWordIds(
characterId, wordIdRef); // characters in input text are supposed to be the same
if (VERBOSE) {
System.out.println(
" UNKNOWN word len=" + unknownWordLength + " " + wordIdRef.length + " wordIDs");
}
for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
add(
unkDictionary,
posData,
posData.pos + unknownWordLength,
wordIdRef.ints[wordIdRef.offset + ofs],
Type.UNKNOWN,
false);
}
unknownWordEndIndex = posData.pos + unknownWordLength;
}
pos++;
}
end = true;
if (pos > 0) {
final Position endPosData = positions.get(pos);
int leastCost = Integer.MAX_VALUE;
int leastIDX = -1;
if (VERBOSE) {
System.out.println(" end: " + endPosData.count + " nodes");
}
for (int idx = 0; idx < endPosData.count; idx++) {
// Add EOS cost:
final int cost = endPosData.costs[idx] + costs.get(endPosData.lastRightID[idx], 0);
// System.out.println(" idx=" + idx + " cost=" + cost + " (pathCost=" +
// endPosData.costs[idx] + " bgCost=" + costs.get(endPosData.lastRightID[idx], 0) + ")
// backPos=" + endPosData.backPos[idx]);
if (cost < leastCost) {
leastCost = cost;
leastIDX = idx;
}
}
if (outputNBest) {
backtraceNBest(endPosData, true);
}
backtrace(endPosData, leastIDX);
if (outputNBest) {
fixupPendingList();
}
} else {
// No characters in the input string; return no tokens!
}
}
// Eliminates arcs from the lattice that are compound
// tokens (have a penalty) or are not congruent with the
// compound token we've matched (ie, span across the
// startPos). This should be fairly efficient, because we
// just keep the already intersected structure of the
// graph, eg we don't have to consult the FSTs again:
private void pruneAndRescore(int startPos, int endPos, int bestStartIDX) throws IOException {
if (VERBOSE) {
System.out.println(
" pruneAndRescore startPos="
+ startPos
+ " endPos="
+ endPos
+ " bestStartIDX="
+ bestStartIDX);
}
// First pass: walk backwards, building up the forward
// arcs and pruning inadmissible arcs:
for (int pos = endPos; pos > startPos; pos--) {
final Position posData = positions.get(pos);
if (VERBOSE) {
System.out.println(" back pos=" + pos);
}
for (int arcIDX = 0; arcIDX < posData.count; arcIDX++) {
final int backPos = posData.backPos[arcIDX];
if (backPos >= startPos) {
// Keep this arc:
// System.out.println(" keep backPos=" + backPos);
positions
.get(backPos)
.addForward(pos, arcIDX, posData.backID[arcIDX], posData.backType[arcIDX]);
} else {
if (VERBOSE) {
System.out.println(" prune");
}
}
}
if (pos != startPos) {
posData.count = 0;
}
}
// Second pass: walk forward, re-scoring:
for (int pos = startPos; pos < endPos; pos++) {
final Position posData = positions.get(pos);
if (VERBOSE) {
System.out.println(" forward pos=" + pos + " count=" + posData.forwardCount);
}
if (posData.count == 0) {
// No arcs arrive here...
if (VERBOSE) {
System.out.println(" skip");
}
posData.forwardCount = 0;
continue;
}
if (pos == startPos) {
// On the initial position, only consider the best
// path so we "force congruence": the
// sub-segmentation is "in context" of what the best
// path (compound token) had matched:
final int rightID;
if (startPos == 0) {
rightID = 0;
} else {
rightID =
getDict(posData.backType[bestStartIDX]).getRightId(posData.backID[bestStartIDX]);
}
final int pathCost = posData.costs[bestStartIDX];
for (int forwardArcIDX = 0; forwardArcIDX < posData.forwardCount; forwardArcIDX++) {
final Type forwardType = posData.forwardType[forwardArcIDX];
final Dictionary dict2 = getDict(forwardType);
final int wordID = posData.forwardID[forwardArcIDX];
final int toPos = posData.forwardPos[forwardArcIDX];
final int newCost =
pathCost
+ dict2.getWordCost(wordID)
+ costs.get(rightID, dict2.getLeftId(wordID))
+ computePenalty(pos, toPos - pos);
if (VERBOSE) {
System.out.println(
" + "
+ forwardType
+ " word "
+ new String(buffer.get(pos, toPos - pos))
+ " toPos="
+ toPos
+ " cost="
+ newCost
+ " penalty="
+ computePenalty(pos, toPos - pos)
+ " toPos.idx="
+ positions.get(toPos).count);
}
positions
.get(toPos)
.add(newCost, dict2.getRightId(wordID), pos, bestStartIDX, wordID, forwardType);
}
} else {
// On non-initial positions, we maximize score
// across all arriving lastRightIDs:
for (int forwardArcIDX = 0; forwardArcIDX < posData.forwardCount; forwardArcIDX++) {
final Type forwardType = posData.forwardType[forwardArcIDX];
final int toPos = posData.forwardPos[forwardArcIDX];
if (VERBOSE) {
System.out.println(
" + "
+ forwardType
+ " word "
+ new String(buffer.get(pos, toPos - pos))
+ " toPos="
+ toPos);
}
add(
getDict(forwardType),
posData,
toPos,
posData.forwardID[forwardArcIDX],
forwardType,
true);
}
}
posData.forwardCount = 0;
}
}
// yet another lattice data structure
private static final class Lattice {
char[] fragment;
EnumMap<Type, Dictionary> dictionaryMap;
boolean useEOS;
int rootCapacity = 0;
int rootSize = 0;
int rootBase = 0;
// Root pointers of the node chains (linked via nodeLeftChain) that share the same start offset.
int[] lRoot;
// Root pointers of the node chains (linked via nodeRightChain) that share the same end offset.
int[] rRoot;
int capacity = 0;
int nodeCount = 0;
// The arrays below hold per-node lattice data, indexed by node number.
Type[] nodeDicType;
int[] nodeWordID;
// nodeMark - -1:excluded, 0:unused, 1:bestpath, 2:2-best-path, ... N:N-best-path
int[] nodeMark;
int[] nodeLeftID;
int[] nodeRightID;
int[] nodeWordCost;
int[] nodeLeftCost;
int[] nodeRightCost;
// nodeLeftNode, nodeRightNode - left/right node numbers on the minimum cost path.
int[] nodeLeftNode;
int[] nodeRightNode;
// nodeLeft, nodeRight - start/end offset
int[] nodeLeft;
int[] nodeRight;
int[] nodeLeftChain;
int[] nodeRightChain;
private void setupRoot(int baseOffset, int lastOffset) {
assert baseOffset <= lastOffset;
int size = lastOffset - baseOffset + 1;
if (rootCapacity < size) {
int oversize = ArrayUtil.oversize(size, Integer.BYTES);
lRoot = new int[oversize];
rRoot = new int[oversize];
rootCapacity = oversize;
}
Arrays.fill(lRoot, 0, size, -1);
Arrays.fill(rRoot, 0, size, -1);
rootSize = size;
rootBase = baseOffset;
}
// Reserve at least N nodes.
private void reserve(int n) {
if (capacity < n) {
int oversize = ArrayUtil.oversize(n, Integer.BYTES);
nodeDicType = new Type[oversize];
nodeWordID = new int[oversize];
nodeMark = new int[oversize];
nodeLeftID = new int[oversize];
nodeRightID = new int[oversize];
nodeWordCost = new int[oversize];
nodeLeftCost = new int[oversize];
nodeRightCost = new int[oversize];
nodeLeftNode = new int[oversize];
nodeRightNode = new int[oversize];
nodeLeft = new int[oversize];
nodeRight = new int[oversize];
nodeLeftChain = new int[oversize];
nodeRightChain = new int[oversize];
capacity = oversize;
}
}
private void setupNodePool(int n) {
reserve(n);
nodeCount = 0;
if (VERBOSE) {
System.out.printf("DEBUG: setupNodePool: n = %d\n", n);
System.out.printf("DEBUG: setupNodePool: lattice.capacity = %d\n", capacity);
}
}
private int addNode(Type dicType, int wordID, int left, int right) {
if (VERBOSE) {
System.out.printf(
"DEBUG: addNode: dicType=%s, wordID=%d, left=%d, right=%d, str=%s\n",
dicType.toString(),
wordID,
left,
right,
left == -1 ? "BOS" : right == -1 ? "EOS" : new String(fragment, left, right - left));
}
assert nodeCount < capacity;
assert left == -1 || right == -1 || left < right;
assert left == -1 || (0 <= left && left < rootSize);
assert right == -1 || (0 <= right && right < rootSize);
int node = nodeCount++;
if (VERBOSE) {
System.out.printf("DEBUG: addNode: node=%d\n", node);
}
nodeDicType[node] = dicType;
nodeWordID[node] = wordID;
nodeMark[node] = 0;
if (wordID < 0) {
nodeWordCost[node] = 0;
nodeLeftCost[node] = 0;
nodeRightCost[node] = 0;
nodeLeftID[node] = 0;
nodeRightID[node] = 0;
} else {
Dictionary dic = dictionaryMap.get(dicType);
nodeWordCost[node] = dic.getWordCost(wordID);
nodeLeftID[node] = dic.getLeftId(wordID);
nodeRightID[node] = dic.getRightId(wordID);
}
if (VERBOSE) {
System.out.printf(
"DEBUG: addNode: wordCost=%d, leftID=%d, rightID=%d\n",
nodeWordCost[node], nodeLeftID[node], nodeRightID[node]);
}
nodeLeft[node] = left;
nodeRight[node] = right;
if (0 <= left) {
nodeLeftChain[node] = lRoot[left];
lRoot[left] = node;
} else {
nodeLeftChain[node] = -1;
}
if (0 <= right) {
nodeRightChain[node] = rRoot[right];
rRoot[right] = node;
} else {
nodeRightChain[node] = -1;
}
return node;
}
// Sum of positions.get(i).count in [beg, end) range.
// using stream:
// return IntStream.range(beg, end).map(i -> positions.get(i).count).sum();
private int positionCount(WrappedPositionArray positions, int beg, int end) {
int count = 0;
for (int i = beg; i < end; ++i) {
count += positions.get(i).count;
}
return count;
}
void setup(
char[] fragment,
EnumMap<Type, Dictionary> dictionaryMap,
WrappedPositionArray positions,
int prevOffset,
int endOffset,
boolean useEOS) {
assert positions.get(prevOffset).count == 1;
if (VERBOSE) {
System.out.printf("DEBUG: setup: prevOffset=%d, endOffset=%d\n", prevOffset, endOffset);
}
this.fragment = fragment;
this.dictionaryMap = dictionaryMap;
this.useEOS = useEOS;
// Initialize lRoot and rRoot.
setupRoot(prevOffset, endOffset);
// "+ 2" for first/last record.
setupNodePool(positionCount(positions, prevOffset + 1, endOffset + 1) + 2);
// substitute for BOS = 0
Position first = positions.get(prevOffset);
if (addNode(first.backType[0], first.backID[0], -1, 0) != 0) {
assert false;
}
// EOS = 1
if (addNode(Type.KNOWN, -1, endOffset - rootBase, -1) != 1) {
assert false;
}
for (int offset = endOffset; prevOffset < offset; --offset) {
int right = offset - rootBase;
// optimize: exclude disconnected nodes.
if (0 <= lRoot[right]) {
Position pos = positions.get(offset);
for (int i = 0; i < pos.count; ++i) {
addNode(pos.backType[i], pos.backID[i], pos.backPos[i] - rootBase, right);
}
}
}
}
// set mark = -1 for unreachable nodes.
void markUnreachable() {
for (int index = 1; index < rootSize - 1; ++index) {
if (rRoot[index] < 0) {
for (int node = lRoot[index]; 0 <= node; node = nodeLeftChain[node]) {
if (VERBOSE) {
System.out.printf("DEBUG: markUnreachable: node=%d\n", node);
}
nodeMark[node] = -1;
}
}
}
}
int connectionCost(ConnectionCosts costs, int left, int right) {
int leftID = nodeLeftID[right];
return ((leftID == 0 && !useEOS) ? 0 : costs.get(nodeRightID[left], leftID));
}
void calcLeftCost(ConnectionCosts costs) {
for (int index = 0; index < rootSize; ++index) {
for (int node = lRoot[index]; 0 <= node; node = nodeLeftChain[node]) {
if (0 <= nodeMark[node]) {
int leastNode = -1;
int leastCost = Integer.MAX_VALUE;
for (int leftNode = rRoot[index]; 0 <= leftNode; leftNode = nodeRightChain[leftNode]) {
if (0 <= nodeMark[leftNode]) {
int cost =
nodeLeftCost[leftNode]
+ nodeWordCost[leftNode]
+ connectionCost(costs, leftNode, node);
if (cost < leastCost) {
leastCost = cost;
leastNode = leftNode;
}
}
}
assert 0 <= leastNode;
nodeLeftNode[node] = leastNode;
nodeLeftCost[node] = leastCost;
if (VERBOSE) {
System.out.printf(
"DEBUG: calcLeftCost: node=%d, leftNode=%d, leftCost=%d\n",
node, nodeLeftNode[node], nodeLeftCost[node]);
}
}
}
}
}
void calcRightCost(ConnectionCosts costs) {
for (int index = rootSize - 1; 0 <= index; --index) {
for (int node = rRoot[index]; 0 <= node; node = nodeRightChain[node]) {
if (0 <= nodeMark[node]) {
int leastNode = -1;
int leastCost = Integer.MAX_VALUE;
for (int rightNode = lRoot[index];
0 <= rightNode;
rightNode = nodeLeftChain[rightNode]) {
if (0 <= nodeMark[rightNode]) {
int cost =
nodeRightCost[rightNode]
+ nodeWordCost[rightNode]
+ connectionCost(costs, node, rightNode);
if (cost < leastCost) {
leastCost = cost;
leastNode = rightNode;
}
}
}
assert 0 <= leastNode;
nodeRightNode[node] = leastNode;
nodeRightCost[node] = leastCost;
if (VERBOSE) {
System.out.printf(
"DEBUG: calcRightCost: node=%d, rightNode=%d, rightCost=%d\n",
node, nodeRightNode[node], nodeRightCost[node]);
}
}
}
}
}
// Mark all nodes that have the same text but different part-of-speech or reading.
void markSameSpanNode(int refNode, int value) {
int left = nodeLeft[refNode];
int right = nodeRight[refNode];
for (int node = lRoot[left]; 0 <= node; node = nodeLeftChain[node]) {
if (nodeRight[node] == right) {
nodeMark[node] = value;
}
}
}
List<Integer> bestPathNodeList() {
List<Integer> list = new ArrayList<>();
for (int node = nodeRightNode[0]; node != 1; node = nodeRightNode[node]) {
list.add(node);
markSameSpanNode(node, 1);
}
return list;
}
private int cost(int node) {
return nodeLeftCost[node] + nodeWordCost[node] + nodeRightCost[node];
}
List<Integer> nBestNodeList(int N) {
List<Integer> list = new ArrayList<>();
int leastCost = Integer.MAX_VALUE;
int leastLeft = -1;
int leastRight = -1;
for (int node = 2; node < nodeCount; ++node) {
if (nodeMark[node] == 0) {
int cost = cost(node);
if (cost < leastCost) {
leastCost = cost;
leastLeft = nodeLeft[node];
leastRight = nodeRight[node];
list.clear();
list.add(node);
} else if (cost == leastCost
&& (nodeLeft[node] != leastLeft || nodeRight[node] != leastRight)) {
list.add(node);
}
}
}
for (int node : list) {
markSameSpanNode(node, N);
}
return list;
}
int bestCost() {
return nodeLeftCost[1];
}
int probeDelta(int start, int end) {
int left = start - rootBase;
int right = end - rootBase;
if (left < 0 || rootSize < right) {
return Integer.MAX_VALUE;
}
int probedCost = Integer.MAX_VALUE;
for (int node = lRoot[left]; 0 <= node; node = nodeLeftChain[node]) {
if (nodeRight[node] == right) {
probedCost = Math.min(probedCost, cost(node));
}
}
return probedCost - bestCost();
}
void debugPrint() {
if (VERBOSE) {
for (int node = 0; node < nodeCount; ++node) {
System.out.printf(
"DEBUG NODE: node=%d, mark=%d, cost=%d, left=%d, right=%d\n",
node, nodeMark[node], cost(node), nodeLeft[node], nodeRight[node]);
}
}
}
}
private Lattice lattice = null;
private void registerNode(int node, char[] fragment) {
int left = lattice.nodeLeft[node];
int right = lattice.nodeRight[node];
Type type = lattice.nodeDicType[node];
if (!discardPunctuation || !isPunctuation(fragment[left])) {
if (type == Type.USER) {
// The code below is based on backtrace().
//
// Expand the phraseID we recorded into the actual segmentation:
final int[] wordIDAndLength = userDictionary.lookupSegmentation(lattice.nodeWordID[node]);
int wordID = wordIDAndLength[0];
pending.add(
new Token(
wordID,
fragment,
left,
right - left,
Type.USER,
lattice.rootBase + left,
userDictionary));
// Output compound
int current = 0;
for (int j = 1; j < wordIDAndLength.length; j++) {
final int len = wordIDAndLength[j];
if (len < right - left) {
pending.add(
new Token(
wordID + j - 1,
fragment,
current + left,
len,
Type.USER,
lattice.rootBase + current + left,
userDictionary));
}
current += len;
}
} else {
pending.add(
new Token(
lattice.nodeWordID[node],
fragment,
left,
right - left,
type,
lattice.rootBase + left,
getDict(type)));
}
}
}
// Sort pending tokens and set their position length values.
private void fixupPendingList() {
// Sort so that duplicate tokens become adjacent.
// A USER token should sort ahead of a normal one.
Collections.sort(
pending,
new Comparator<Token>() {
@Override
public int compare(Token a, Token b) {
int aOff = a.getOffset();
int bOff = b.getOffset();
if (aOff != bOff) {
return aOff - bOff;
}
int aLen = a.getLength();
int bLen = b.getLength();
if (aLen != bLen) {
return aLen - bLen;
}
// order of Type is KNOWN, UNKNOWN, USER,
// so we use reversed comparison here.
return b.getType().ordinal() - a.getType().ordinal();
}
});
// Remove duplicate tokens.
for (int i = 1; i < pending.size(); ++i) {
Token a = pending.get(i - 1);
Token b = pending.get(i);
if (a.getOffset() == b.getOffset() && a.getLength() == b.getLength()) {
pending.remove(i);
// It is important to decrement "i" here, because the next token may also need to be removed.
--i;
}
}
// offset=>position map
HashMap<Integer, Integer> map = new HashMap<>();
for (Token t : pending) {
map.put(t.getOffset(), 0);
map.put(t.getOffset() + t.getLength(), 0);
}
// Get a unique, sorted list of all token edge offsets.
Integer[] offsets = map.keySet().toArray(new Integer[0]);
Arrays.sort(offsets);
// Set each map value to its index in the sorted offsets, i.e. its N-th position from the beginning.
for (int i = 0; i < offsets.length; ++i) {
map.put(offsets[i], i);
}
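// For example, tokens spanning offsets [0,2), [2,5) and the compound [0,5) yield edge
// offsets {0, 2, 5}, mapped to positions {0, 1, 2}; the compound token then gets
// positionLength = map(5) - map(0) = 2 below.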
// Now we can assign each token's position length.
for (Token t : pending) {
t.setPositionLength(map.get(t.getOffset() + t.getLength()) - map.get(t.getOffset()));
}
// Reverse pending so it matches how incrementToken consumes it (last token first).
// As a possible speedup, the sort above could instead be done in reverse order
// at the start of this method.
Collections.reverse(pending);
}
private int probeDelta(String inText, String requiredToken) throws IOException {
int start = inText.indexOf(requiredToken);
if (start < 0) {
// -1 when no requiredToken.
return -1;
}
int delta = Integer.MAX_VALUE;
int saveNBestCost = nBestCost;
setReader(new StringReader(inText));
reset();
try {
setNBestCost(1);
int prevRootBase = -1;
while (incrementToken()) {
if (lattice.rootBase != prevRootBase) {
prevRootBase = lattice.rootBase;
delta = Math.min(delta, lattice.probeDelta(start, start + requiredToken.length()));
}
}
} finally {
// reset & end
end();
// setReader & close
close();
setNBestCost(saveNBestCost);
}
if (VERBOSE) {
System.out.printf("JapaneseTokenizer: delta = %d: %s-%s\n", delta, inText, requiredToken);
}
return delta == Integer.MAX_VALUE ? -1 : delta;
}
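/**
* Expert: compute an N-best cost from examples.
*
* <p>Each example has the form {@code text-token}; multiple examples are separated by {@code
* '/'}. For each example the tokenizer parses {@code text} and measures how far the lattice
* cost must be relaxed for {@code token} to appear in the N-best output; the maximum such
* delta over all examples is returned. A hypothetical call (the example string below is
* illustrative only):
*
* <pre>{@code
* int cost = tokenizer.calcNBestCost("東京スカイツリー-スカイツリー");
* tokenizer.setNBestCost(cost);
* }</pre>
*/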
public int calcNBestCost(String examples) {
int maxDelta = 0;
for (String example : examples.split("/")) {
if (!example.isEmpty()) {
String[] pair = example.split("-");
if (pair.length != 2) {
throw new RuntimeException("Unexpected example form: " + example + " (expected two '-')");
} else {
try {
maxDelta = Math.max(maxDelta, probeDelta(pair[0], pair[1]));
} catch (IOException e) {
throw new RuntimeException(
"Internal error calculating best costs from examples. Got ", e);
}
}
}
}
return maxDelta;
}
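/**
* Expert: set the allowable cost difference for N-best output. Alternate segmentations whose
* lattice cost is within this delta of the best path cost are also emitted; 0 (the default)
* disables N-best output.
*/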
public void setNBestCost(int value) {
nBestCost = value;
outputNBest = 0 < nBestCost;
}
private void backtraceNBest(final Position endPosData, final boolean useEOS) throws IOException {
if (lattice == null) {
lattice = new Lattice();
}
final int endPos = endPosData.pos;
char[] fragment = buffer.get(lastBackTracePos, endPos - lastBackTracePos);
lattice.setup(fragment, dictionaryMap, positions, lastBackTracePos, endPos, useEOS);
lattice.markUnreachable();
lattice.calcLeftCost(costs);
lattice.calcRightCost(costs);
int bestCost = lattice.bestCost();
if (VERBOSE) {
System.out.printf("DEBUG: 1-BEST COST: %d\n", bestCost);
}
for (int node : lattice.bestPathNodeList()) {
registerNode(node, fragment);
}
for (int n = 2; ; ++n) {
List<Integer> nbest = lattice.nBestNodeList(n);
if (nbest.isEmpty()) {
break;
}
int cost = lattice.cost(nbest.get(0));
if (VERBOSE) {
System.out.printf("DEBUG: %d-BEST COST: %d\n", n, cost);
}
if (bestCost + nBestCost < cost) {
break;
}
for (int node : nbest) {
registerNode(node, fragment);
}
}
if (VERBOSE) {
lattice.debugPrint();
}
}
// Backtrace from the provided position, back to the last
// time we back-traced, accumulating the resulting tokens to
// the pending list. The pending list is then in-reverse
// (last token should be returned first).
private void backtrace(final Position endPosData, final int fromIDX) throws IOException {
final int endPos = endPosData.pos;
/**
* LUCENE-10059: If the endPos is the same as lastBackTracePos, we don't want to backtrace to
* avoid an assertion error {@link RollingCharBuffer#get(int)} when it tries to generate an
* empty buffer
*/
if (endPos == lastBackTracePos) {
return;
}
if (VERBOSE) {
System.out.println(
"\n backtrace: endPos="
+ endPos
+ " pos="
+ pos
+ "; "
+ (pos - lastBackTracePos)
+ " characters; last="
+ lastBackTracePos
+ " cost="
+ endPosData.costs[fromIDX]);
}
final char[] fragment = buffer.get(lastBackTracePos, endPos - lastBackTracePos);
if (dotOut != null) {
dotOut.onBacktrace(this, positions, lastBackTracePos, endPosData, fromIDX, fragment, end);
}
int pos = endPos;
int bestIDX = fromIDX;
Token altToken = null;
// We trace backwards, so this will be the leftWordID of
// the token after the one we are now on:
int lastLeftWordID = -1;
int backCount = 0;
// TODO: sort of silly to make Token instances here; the
// back trace has all info needed to generate the
// token. So, we could just directly set the attrs,
// from the backtrace, in incrementToken w/o ever
// creating Token; we'd have to defer calling freeBefore
// until after the backtrace was fully "consumed" by
// incrementToken.
while (pos > lastBackTracePos) {
// System.out.println("BT: back pos=" + pos + " bestIDX=" + bestIDX);
final Position posData = positions.get(pos);
assert bestIDX < posData.count;
int backPos = posData.backPos[bestIDX];
assert backPos >= lastBackTracePos
: "backPos=" + backPos + " vs lastBackTracePos=" + lastBackTracePos;
int length = pos - backPos;
Type backType = posData.backType[bestIDX];
int backID = posData.backID[bestIDX];
int nextBestIDX = posData.backIndex[bestIDX];
if (searchMode && altToken == null && backType != Type.USER) {
// In searchMode, if best path had picked a too-long
// token, we use the "penalty" to compute the allowed
// max cost of an alternate back-trace. If we find an
// alternate back trace with cost below that
// threshold, we pursue it instead (but also output
// the long token).
// System.out.println(" 2nd best backPos=" + backPos + " pos=" + pos);
final int penalty = computeSecondBestThreshold(backPos, pos - backPos);
if (penalty > 0) {
if (VERBOSE) {
System.out.println(
" compound="
+ new String(buffer.get(backPos, pos - backPos))
+ " backPos="
+ backPos
+ " pos="
+ pos
+ " penalty="
+ penalty
+ " cost="
+ posData.costs[bestIDX]
+ " bestIDX="
+ bestIDX
+ " lastLeftID="
+ lastLeftWordID);
}
// Use the penalty to set maxCost on the 2nd best
// segmentation:
int maxCost = posData.costs[bestIDX] + penalty;
if (lastLeftWordID != -1) {
maxCost += costs.get(getDict(backType).getRightId(backID), lastLeftWordID);
}
// Now, prune all too-long tokens from the graph:
pruneAndRescore(backPos, pos, posData.backIndex[bestIDX]);
// Finally, find 2nd best back-trace and resume
// backtrace there:
int leastCost = Integer.MAX_VALUE;
int leastIDX = -1;
for (int idx = 0; idx < posData.count; idx++) {
int cost = posData.costs[idx];
// System.out.println(" idx=" + idx + " prevCost=" + cost);
if (lastLeftWordID != -1) {
cost +=
costs.get(
getDict(posData.backType[idx]).getRightId(posData.backID[idx]),
lastLeftWordID);
// System.out.println(" += bgCost=" +
// costs.get(getDict(posData.backType[idx]).getRightId(posData.backID[idx]),
// lastLeftWordID) + " -> " + cost);
}
// System.out.println("penalty " + posData.backPos[idx] + " to " + pos);
// cost += computePenalty(posData.backPos[idx], pos - posData.backPos[idx]);
if (cost < leastCost) {
// System.out.println(" ** ");
leastCost = cost;
leastIDX = idx;
}
}
// System.out.println(" leastIDX=" + leastIDX);
if (VERBOSE) {
System.out.println(
" afterPrune: "
+ posData.count
+ " arcs arriving; leastCost="
+ leastCost
+ " vs threshold="
+ maxCost
+ " lastLeftWordID="
+ lastLeftWordID);
}
if (leastIDX != -1 && leastCost <= maxCost && posData.backPos[leastIDX] != backPos) {
// We should have pruned the altToken from the graph:
assert posData.backPos[leastIDX] != backPos;
// Save the current compound token, to output when
// this alternate path joins back:
altToken =
new Token(
backID,
fragment,
backPos - lastBackTracePos,
length,
backType,
backPos,
getDict(backType));
// Redirect our backtrace to 2nd best:
bestIDX = leastIDX;
nextBestIDX = posData.backIndex[bestIDX];
backPos = posData.backPos[bestIDX];
length = pos - backPos;
backType = posData.backType[bestIDX];
backID = posData.backID[bestIDX];
backCount = 0;
// System.out.println(" do alt token!");
} else {
// I think in theory it's possible there is no
// 2nd best path, which is fine; in this case we
// only output the compound token:
// System.out.println(" no alt token! bestIDX=" + bestIDX);
}
}
}
final int offset = backPos - lastBackTracePos;
assert offset >= 0;
if (altToken != null && altToken.getPosition() >= backPos) {
if (outputCompounds) {
// We've backtraced to the position where the
// compound token starts; add it now:
// The pruning we did when we created the altToken
// ensures that the back trace will align back with
// the start of the altToken:
assert altToken.getPosition() == backPos : altToken.getPosition() + " vs " + backPos;
// NOTE: not quite right: the compound token may
// have had all punctuation back traced so far, but
// then the decompounded token at this position is
// not punctuation. In this case backCount is 0,
// but we should maybe add the altToken anyway...?
if (backCount > 0) {
backCount++;
altToken.setPositionLength(backCount);
if (VERBOSE) {
System.out.println(" add altToken=" + altToken);
}
pending.add(altToken);
} else {
// This means alt token was all punct tokens:
if (VERBOSE) {
System.out.println(" discard all-punctuation altToken=" + altToken);
}
assert discardPunctuation;
}
}
altToken = null;
}
final Dictionary dict = getDict(backType);
if (backType == Type.USER) {
// Expand the phraseID we recorded into the actual
// segmentation:
final int[] wordIDAndLength = userDictionary.lookupSegmentation(backID);
int wordID = wordIDAndLength[0];
int current = 0;
for (int j = 1; j < wordIDAndLength.length; j++) {
final int len = wordIDAndLength[j];
// System.out.println(" add user: len=" + len);
pending.add(
new Token(
wordID + j - 1,
fragment,
current + offset,
len,
Type.USER,
current + backPos,
dict));
if (VERBOSE) {
System.out.println(" add USER token=" + pending.get(pending.size() - 1));
}
current += len;
}
// Reverse the tokens we just added, because when we
// serve them up from incrementToken we serve in
// reverse:
Collections.reverse(
pending.subList(pending.size() - (wordIDAndLength.length - 1), pending.size()));
backCount += wordIDAndLength.length - 1;
} else {
if (extendedMode && backType == Type.UNKNOWN) {
// In EXTENDED mode we convert unknown word into
// unigrams:
int unigramTokenCount = 0;
for (int i = length - 1; i >= 0; i--) {
int charLen = 1;
if (i > 0 && Character.isLowSurrogate(fragment[offset + i])) {
i--;
charLen = 2;
}
// System.out.println(" extended tok offset="
// + (offset + i));
if (!discardPunctuation || !isPunctuation(fragment[offset + i])) {
pending.add(
new Token(
CharacterDefinition.NGRAM,
fragment,
offset + i,
charLen,
Type.UNKNOWN,
backPos + i,
unkDictionary));
unigramTokenCount++;
}
}
backCount += unigramTokenCount;
} else if (!discardPunctuation || length == 0 || !isPunctuation(fragment[offset])) {
pending.add(new Token(backID, fragment, offset, length, backType, backPos, dict));
if (VERBOSE) {
System.out.println(" add token=" + pending.get(pending.size() - 1));
}
backCount++;
} else {
if (VERBOSE) {
System.out.println(
" skip punctuation token=" + new String(fragment, offset, length));
}
}
}
lastLeftWordID = dict.getLeftId(backID);
pos = backPos;
bestIDX = nextBestIDX;
}
lastBackTracePos = endPos;
if (VERBOSE) {
System.out.println(" freeBefore pos=" + endPos);
}
// Notify the circular buffers that we are done with
// these positions:
buffer.freeBefore(endPos);
positions.freeBefore(endPos);
}
Dictionary getDict(Type type) {
return dictionaryMap.get(type);
}
private static boolean isPunctuation(char ch) {
switch (Character.getType(ch)) {
case Character.SPACE_SEPARATOR:
case Character.LINE_SEPARATOR:
case Character.PARAGRAPH_SEPARATOR:
case Character.CONTROL:
case Character.FORMAT:
case Character.DASH_PUNCTUATION:
case Character.START_PUNCTUATION:
case Character.END_PUNCTUATION:
case Character.CONNECTOR_PUNCTUATION:
case Character.OTHER_PUNCTUATION:
case Character.MATH_SYMBOL:
case Character.CURRENCY_SYMBOL:
case Character.MODIFIER_SYMBOL:
case Character.OTHER_SYMBOL:
case Character.INITIAL_QUOTE_PUNCTUATION:
case Character.FINAL_QUOTE_PUNCTUATION:
return true;
default:
return false;
}
}
}