All downloads are free. The search and download features use the official Maven repository.

org.apache.lucene.analysis.ja.JapaneseTokenizer Maven / Gradle / Ivy

There is a newer version: 8.11.3
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.ja;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.List;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ja.dict.CharacterDefinition;
import org.apache.lucene.analysis.ja.dict.ConnectionCosts;
import org.apache.lucene.analysis.ja.dict.Dictionary;
import org.apache.lucene.analysis.ja.dict.TokenInfoDictionary;
import org.apache.lucene.analysis.ja.dict.TokenInfoFST;
import org.apache.lucene.analysis.ja.dict.UnknownDictionary;
import org.apache.lucene.analysis.ja.dict.UserDictionary;
import org.apache.lucene.analysis.ja.tokenattributes.*;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.util.RollingCharBuffer;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.fst.FST;

// TODO: somehow factor out a reusable viterbi search here,
// so other decompounders/tokenizers can reuse...

/**
 * Tokenizer for Japanese that uses morphological analysis.
 * <p>
 * This tokenizer sets a number of additional attributes:
 * <ul>
 *   <li>{@link BaseFormAttribute} containing base form for inflected
 *       adjectives and verbs.
 *   <li>{@link PartOfSpeechAttribute} containing part-of-speech.
 *   <li>{@link ReadingAttribute} containing reading and pronunciation.
 *   <li>{@link InflectionAttribute} containing additional part-of-speech
 *       information for inflected forms.
 * </ul>
 * <p>
* This tokenizer uses a rolling Viterbi search to find the * least cost segmentation (path) of the incoming characters. * For tokens that appear to be compound (> length 2 for all * Kanji, or > length 7 for non-Kanji), we see if there is a * 2nd best segmentation of that token after applying * penalties to the long tokens. If so, and the Mode is * {@link Mode#SEARCH}, we output the alternate segmentation * as well. */ public final class JapaneseTokenizer extends Tokenizer { /** * Tokenization mode: this determines how the tokenizer handles * compound and unknown words. */ public static enum Mode { /** * Ordinary segmentation: no decomposition for compounds, */ NORMAL, /** * Segmentation geared towards search: this includes a * decompounding process for long nouns, also including * the full compound token as a synonym. */ SEARCH, /** * Extended mode outputs unigrams for unknown words. * @lucene.experimental */ EXTENDED } /** * Default tokenization mode. Currently this is {@link Mode#SEARCH}. */ public static final Mode DEFAULT_MODE = Mode.SEARCH; /** * Token type reflecting the original source of this token */ public enum Type { /** * Known words from the system dictionary. */ KNOWN, /** * Unknown words (heuristically segmented). */ UNKNOWN, /** * Known words from the user dictionary. 
*/ USER } private static final boolean VERBOSE = false; private static final int SEARCH_MODE_KANJI_LENGTH = 2; private static final int SEARCH_MODE_OTHER_LENGTH = 7; // Must be >= SEARCH_MODE_KANJI_LENGTH private static final int SEARCH_MODE_KANJI_PENALTY = 3000; private static final int SEARCH_MODE_OTHER_PENALTY = 1700; // For safety: private static final int MAX_UNKNOWN_WORD_LENGTH = 1024; private static final int MAX_BACKTRACE_GAP = 1024; private final EnumMap dictionaryMap = new EnumMap<>(Type.class); private final TokenInfoFST fst; private final TokenInfoDictionary dictionary; private final UnknownDictionary unkDictionary; private final ConnectionCosts costs; private final UserDictionary userDictionary; private final CharacterDefinition characterDefinition; private final FST.Arc arc = new FST.Arc<>(); private final FST.BytesReader fstReader; private final IntsRef wordIdRef = new IntsRef(); private final FST.BytesReader userFSTReader; private final TokenInfoFST userFST; private final RollingCharBuffer buffer = new RollingCharBuffer(); private final WrappedPositionArray positions = new WrappedPositionArray(); private final boolean discardPunctuation; private final boolean searchMode; private final boolean extendedMode; private final boolean outputCompounds; private boolean outputNBest = false; // Allowable cost difference for N-best output: private int nBestCost = 0; // True once we've hit the EOF from the input reader: private boolean end; // Last absolute position we backtraced from: private int lastBackTracePos; // Position of last token we returned; we use this to // figure out whether to set posIncr to 0 or 1: private int lastTokenPos; // Next absolute position to process: private int pos; // Already parsed, but not yet passed to caller, tokens: private final List pending = new ArrayList<>(); private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); private 
final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class); private final BaseFormAttribute basicFormAtt = addAttribute(BaseFormAttribute.class); private final PartOfSpeechAttribute posAtt = addAttribute(PartOfSpeechAttribute.class); private final ReadingAttribute readingAtt = addAttribute(ReadingAttribute.class); private final InflectionAttribute inflectionAtt = addAttribute(InflectionAttribute.class); /** * Create a new JapaneseTokenizer. *

* Uses the default AttributeFactory. * * @param userDictionary Optional: if non-null, user dictionary. * @param discardPunctuation true if punctuation tokens should be dropped from the output. * @param mode tokenization mode. */ public JapaneseTokenizer(UserDictionary userDictionary, boolean discardPunctuation, Mode mode) { this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, userDictionary, discardPunctuation, mode); } /** * Create a new JapaneseTokenizer. * * @param factory the AttributeFactory to use * @param userDictionary Optional: if non-null, user dictionary. * @param discardPunctuation true if punctuation tokens should be dropped from the output. * @param mode tokenization mode. */ public JapaneseTokenizer (AttributeFactory factory, UserDictionary userDictionary, boolean discardPunctuation, Mode mode) { super(factory); dictionary = TokenInfoDictionary.getInstance(); fst = dictionary.getFST(); unkDictionary = UnknownDictionary.getInstance(); characterDefinition = unkDictionary.getCharacterDefinition(); this.userDictionary = userDictionary; costs = ConnectionCosts.getInstance(); fstReader = fst.getBytesReader(); if (userDictionary != null) { userFST = userDictionary.getFST(); userFSTReader = userFST.getBytesReader(); } else { userFST = null; userFSTReader = null; } this.discardPunctuation = discardPunctuation; switch(mode){ case SEARCH: searchMode = true; extendedMode = false; outputCompounds = true; break; case EXTENDED: searchMode = true; extendedMode = true; outputCompounds = false; break; default: searchMode = false; extendedMode = false; outputCompounds = false; break; } buffer.reset(this.input); resetState(); dictionaryMap.put(Type.KNOWN, dictionary); dictionaryMap.put(Type.UNKNOWN, unkDictionary); dictionaryMap.put(Type.USER, userDictionary); } private GraphvizFormatter dotOut; /** Expert: set this to produce graphviz (dot) output of * the Viterbi lattice */ public void setGraphvizFormatter(GraphvizFormatter dotOut) { this.dotOut = dotOut; } @Override public void 
close() throws IOException { super.close(); buffer.reset(input); } @Override public void reset() throws IOException { super.reset(); buffer.reset(input); resetState(); } private void resetState() { positions.reset(); pos = 0; end = false; lastBackTracePos = 0; lastTokenPos = -1; pending.clear(); // Add BOS: positions.get(0).add(0, 0, -1, -1, -1, Type.KNOWN); } @Override public void end() throws IOException { super.end(); // Set final offset int finalOffset = correctOffset(pos); offsetAtt.setOffset(finalOffset, finalOffset); } // Returns the added cost that a 2nd best segmentation is // allowed to have. Ie, if we see path with cost X, // ending in a compound word, and this method returns // threshold > 0, then we will also find the 2nd best // segmentation and if its path score is within this // threshold of X, we'll include it in the output: private int computeSecondBestThreshold(int pos, int length) throws IOException { // TODO: maybe we do something else here, instead of just // using the penalty...? EG we can be more aggressive on // when to also test for 2nd best path return computePenalty(pos, length); } private int computePenalty(int pos, int length) throws IOException { if (length > SEARCH_MODE_KANJI_LENGTH) { boolean allKanji = true; // check if node consists of only kanji final int endPos = pos + length; for (int pos2 = pos; pos2 < endPos; pos2++) { if (!characterDefinition.isKanji((char) buffer.get(pos2))) { allKanji = false; break; } } if (allKanji) { // Process only Kanji keywords return (length - SEARCH_MODE_KANJI_LENGTH) * SEARCH_MODE_KANJI_PENALTY; } else if (length > SEARCH_MODE_OTHER_LENGTH) { return (length - SEARCH_MODE_OTHER_LENGTH) * SEARCH_MODE_OTHER_PENALTY; } } return 0; } // Holds all back pointers arriving to this position: final static class Position { int pos; int count; // maybe single int array * 5? 
int[] costs = new int[8]; int[] lastRightID = new int[8]; int[] backPos = new int[8]; int[] backIndex = new int[8]; int[] backID = new int[8]; Type[] backType = new Type[8]; // Only used when finding 2nd best segmentation under a // too-long token: int forwardCount; int[] forwardPos = new int[8]; int[] forwardID = new int[8]; int[] forwardIndex = new int[8]; Type[] forwardType = new Type[8]; public void grow() { costs = ArrayUtil.grow(costs, 1+count); lastRightID = ArrayUtil.grow(lastRightID, 1+count); backPos = ArrayUtil.grow(backPos, 1+count); backIndex = ArrayUtil.grow(backIndex, 1+count); backID = ArrayUtil.grow(backID, 1+count); // NOTE: sneaky: grow separately because // ArrayUtil.grow will otherwise pick a different // length than the int[]s we just grew: final Type[] newBackType = new Type[backID.length]; System.arraycopy(backType, 0, newBackType, 0, backType.length); backType = newBackType; } public void growForward() { forwardPos = ArrayUtil.grow(forwardPos, 1+forwardCount); forwardID = ArrayUtil.grow(forwardID, 1+forwardCount); forwardIndex = ArrayUtil.grow(forwardIndex, 1+forwardCount); // NOTE: sneaky: grow separately because // ArrayUtil.grow will otherwise pick a different // length than the int[]s we just grew: final Type[] newForwardType = new Type[forwardPos.length]; System.arraycopy(forwardType, 0, newForwardType, 0, forwardType.length); forwardType = newForwardType; } public void add(int cost, int lastRightID, int backPos, int backIndex, int backID, Type backType) { // NOTE: this isn't quite a true Viterbi search, // because we should check if lastRightID is // already present here, and only update if the new // cost is less than the current cost, instead of // simply appending. 
However, that will likely hurt // performance (usually we add a lastRightID only once), // and it means we actually create the full graph // intersection instead of a "normal" Viterbi lattice: if (count == costs.length) { grow(); } this.costs[count] = cost; this.lastRightID[count] = lastRightID; this.backPos[count] = backPos; this.backIndex[count] = backIndex; this.backID[count] = backID; this.backType[count] = backType; count++; } public void addForward(int forwardPos, int forwardIndex, int forwardID, Type forwardType) { if (forwardCount == this.forwardID.length) { growForward(); } this.forwardPos[forwardCount] = forwardPos; this.forwardIndex[forwardCount] = forwardIndex; this.forwardID[forwardCount] = forwardID; this.forwardType[forwardCount] = forwardType; forwardCount++; } public void reset() { count = 0; // forwardCount naturally resets after it runs: assert forwardCount == 0: "pos=" + pos + " forwardCount=" + forwardCount; } } private void add(Dictionary dict, Position fromPosData, int endPos, int wordID, Type type, boolean addPenalty) throws IOException { final int wordCost = dict.getWordCost(wordID); final int leftID = dict.getLeftId(wordID); int leastCost = Integer.MAX_VALUE; int leastIDX = -1; assert fromPosData.count > 0; for(int idx=0;idx lastTokenPos; posIncAtt.setPositionIncrement(1); posLengthAtt.setPositionLength(token.getPositionLength()); } else { assert token.getPosition() > lastTokenPos; posIncAtt.setPositionIncrement(1); posLengthAtt.setPositionLength(1); } if (VERBOSE) { System.out.println(Thread.currentThread().getName() + ": incToken: return token=" + token); } lastTokenPos = token.getPosition(); return true; } // TODO: make generic'd version of this "circular array"? // It's a bit tricky because we do things to the Position // (eg, set .pos = N on reuse)... 
static final class WrappedPositionArray { private Position[] positions = new Position[8]; public WrappedPositionArray() { for(int i=0;i 0) { if (nextWrite == -1) { nextWrite = positions.length - 1; } positions[nextWrite--].reset(); count--; } nextWrite = 0; nextPos = 0; count = 0; } /** Get Position instance for this absolute position; * this is allowed to be arbitrarily far "in the * future" but cannot be before the last freeBefore. */ public Position get(int pos) { while(pos >= nextPos) { //System.out.println("count=" + count + " vs len=" + positions.length); if (count == positions.length) { Position[] newPositions = new Position[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; //System.out.println("grow positions " + newPositions.length); System.arraycopy(positions, nextWrite, newPositions, 0, positions.length-nextWrite); System.arraycopy(positions, 0, newPositions, positions.length-nextWrite, nextWrite); for(int i=positions.length;i= nextPos - count; } private int getIndex(int pos) { int index = nextWrite - (nextPos - pos); if (index < 0) { index += positions.length; } return index; } public void freeBefore(int pos) { final int toFree = count - (nextPos - pos); assert toFree >= 0; assert toFree <= count; int index = nextWrite - count; if (index < 0) { index += positions.length; } for(int i=0;i= MAX_BACKTRACE_GAP) { // Safety: if we've buffered too much, force a // backtrace now. We find the least-cost partial // path, across all paths, backtrace from it, and // then prune all others. Note that this, in // general, can produce the wrong result, if the // total best path did not in fact back trace // through this partial best path. But it's the // best we can do... (short of not having a // safety!). 
// First pass: find least cost partial path so far, // including ending at future positions: int leastIDX = -1; int leastCost = Integer.MAX_VALUE; Position leastPosData = null; for(int pos2=pos;pos2 posData.pos) { pos++; continue; } final char firstCharacter = (char) buffer.get(pos); if (!anyMatches || characterDefinition.isInvoke(firstCharacter)) { // Find unknown match: final int characterId = characterDefinition.getCharacterClass(firstCharacter); final boolean isPunct = isPunctuation(firstCharacter); // NOTE: copied from UnknownDictionary.lookup: int unknownWordLength; if (!characterDefinition.isGroup(firstCharacter)) { unknownWordLength = 1; } else { // Extract unknown word. Characters with the same character class are considered to be part of unknown word unknownWordLength = 1; for (int posAhead=pos+1;unknownWordLength 0) { final Position endPosData = positions.get(pos); int leastCost = Integer.MAX_VALUE; int leastIDX = -1; if (VERBOSE) { System.out.println(" end: " + endPosData.count + " nodes"); } for(int idx=0;idx positions.get(i).count).sum(); private int positionCount(WrappedPositionArray positions, int beg, int end) { int count = 0; for (int i = beg; i < end; ++i) { count += positions.get(i).count; } return count; } void setup(char[] fragment, EnumMap dictionaryMap, WrappedPositionArray positions, int prevOffset, int endOffset, boolean useEOS) { assert positions.get(prevOffset).count == 1; if (VERBOSE) { System.out.printf("DEBUG: setup: prevOffset=%d, endOffset=%d\n", prevOffset, endOffset); } this.fragment = fragment; this.dictionaryMap = dictionaryMap; this.useEOS = useEOS; // Initialize lRoot and rRoot. setupRoot(prevOffset, endOffset); // "+ 2" for first/last record. 
setupNodePool(positionCount(positions, prevOffset + 1, endOffset + 1) + 2); // substitute for BOS = 0 Position first = positions.get(prevOffset); if (addNode(first.backType[0], first.backID[0], -1, 0) != 0) { assert false; } // EOS = 1 if (addNode(Type.KNOWN, -1, endOffset - rootBase, -1) != 1) { assert false; } for (int offset = endOffset; prevOffset < offset; --offset) { int right = offset - rootBase; // optimize: exclude disconnected nodes. if (0 <= lRoot[right]) { Position pos = positions.get(offset); for (int i = 0; i < pos.count; ++i) { addNode(pos.backType[i], pos.backID[i], pos.backPos[i] - rootBase, right); } } } } // set mark = -1 for unreachable nodes. void markUnreachable() { for (int index = 1; index < rootSize - 1; ++index) { if (rRoot[index] < 0) { for (int node = lRoot[index]; 0 <= node; node = nodeLeftChain[node]) { if (VERBOSE) { System.out.printf("DEBUG: markUnreachable: node=%d\n", node); } nodeMark[node] = -1; } } } } int connectionCost(ConnectionCosts costs, int left, int right) { int leftID = nodeLeftID[right]; return ((leftID == 0 && !useEOS) ? 
0 : costs.get(nodeRightID[left], leftID)); } void calcLeftCost(ConnectionCosts costs) { for (int index = 0; index < rootSize; ++index) { for (int node = lRoot[index]; 0 <= node; node = nodeLeftChain[node]) { if (0 <= nodeMark[node]) { int leastNode = -1; int leastCost = Integer.MAX_VALUE; for (int leftNode = rRoot[index]; 0 <= leftNode; leftNode = nodeRightChain[leftNode]) { if (0 <= nodeMark[leftNode]) { int cost = nodeLeftCost[leftNode] + nodeWordCost[leftNode] + connectionCost(costs, leftNode, node); if (cost < leastCost) { leastCost = cost; leastNode = leftNode; } } } assert 0 <= leastNode; nodeLeftNode[node] = leastNode; nodeLeftCost[node] = leastCost; if (VERBOSE) { System.out.printf("DEBUG: calcLeftCost: node=%d, leftNode=%d, leftCost=%d\n", node, nodeLeftNode[node], nodeLeftCost[node]); } } } } } void calcRightCost(ConnectionCosts costs) { for (int index = rootSize - 1; 0 <= index; --index) { for (int node = rRoot[index]; 0 <= node; node = nodeRightChain[node]) { if (0 <= nodeMark[node]) { int leastNode = -1; int leastCost = Integer.MAX_VALUE; for (int rightNode = lRoot[index]; 0 <= rightNode; rightNode = nodeLeftChain[rightNode]) { if (0 <= nodeMark[rightNode]) { int cost = nodeRightCost[rightNode] + nodeWordCost[rightNode] + connectionCost(costs, node, rightNode); if (cost < leastCost) { leastCost = cost; leastNode = rightNode; } } } assert 0 <= leastNode; nodeRightNode[node] = leastNode; nodeRightCost[node] = leastCost; if (VERBOSE) { System.out.printf("DEBUG: calcRightCost: node=%d, rightNode=%d, rightCost=%d\n", node, nodeRightNode[node], nodeRightCost[node]); } } } } } // Mark all nodes that have same text and different par-of-speech or reading. 
void markSameSpanNode(int refNode, int value) {
  // Walk every node that starts at the same left edge as refNode and
  // stamp the ones that also end at the same right edge, i.e. all nodes
  // covering exactly the same span of text.
  final int left = nodeLeft[refNode];
  final int right = nodeRight[refNode];
  for (int node = lRoot[left]; 0 <= node; node = nodeLeftChain[node]) {
    if (nodeRight[node] == right) {
      nodeMark[node] = value;
    }
  }
}

/**
 * Follows the {@code nodeRightNode} links from node 0 until node 1 is
 * reached and returns the nodes visited in between, in that order.
 * Every span on the path is marked with 1 via {@link #markSameSpanNode}.
 */
List<Integer> bestPathNodeList() {
  List<Integer> list = new ArrayList<>();
  for (int node = nodeRightNode[0]; node != 1; node = nodeRightNode[node]) {
    list.add(node);
    markSameSpanNode(node, 1);
  }
  return list;
}

/** Sum of the left cost, word cost and right cost recorded for {@code node}. */
private int cost(int node) {
  return nodeLeftCost[node] + nodeWordCost[node] + nodeRightCost[node];
}

/**
 * Collects the not-yet-marked nodes (mark 0, index 2 and up) that share
 * the lowest {@link #cost(int)}. A node tying with the current minimum is
 * only added when its span differs from the span of the node that set
 * that minimum. Every returned node's span is then marked with {@code N}.
 *
 * @param N mark value written to the spans of the returned nodes
 * @return the selected nodes; empty when no unmarked node is left
 */
List<Integer> nBestNodeList(int N) {
  List<Integer> list = new ArrayList<>();
  int leastCost = Integer.MAX_VALUE;
  int leastLeft = -1;
  int leastRight = -1;
  for (int node = 2; node < nodeCount; ++node) {
    if (nodeMark[node] == 0) {
      int cost = cost(node);
      if (cost < leastCost) {
        leastCost = cost;
        leastLeft = nodeLeft[node];
        leastRight = nodeRight[node];
        list.clear();
        list.add(node);
      } else if (cost == leastCost && (nodeLeft[node] != leastLeft || nodeRight[node] != leastRight)) {
        list.add(node);
      }
    }
  }
  for (int node : list) {
    markSameSpanNode(node, N);
  }
  return list;
}

/** Left cost accumulated at node 1. */
int bestCost() {
  return nodeLeftCost[1];
}

/**
 * Returns how much more expensive than {@link #bestCost()} the cheapest
 * node spanning exactly {@code [start, end)} is.
 *
 * @param start absolute start offset of the probed span
 * @param end   absolute end offset (exclusive) of the probed span
 * @return the cost difference, or {@code Integer.MAX_VALUE} when the span
 *         lies outside this lattice or no node covers it
 */
int probeDelta(int start, int end) {
  int left = start - rootBase;
  int right = end - rootBase;
  if (left < 0 || rootSize < right) {
    return Integer.MAX_VALUE;
  }
  int probedCost = Integer.MAX_VALUE;
  for (int node = lRoot[left]; 0 <= node; node = nodeLeftChain[node]) {
    if (nodeRight[node] == right) {
      probedCost = Math.min(probedCost, cost(node));
    }
  }
  if (probedCost == Integer.MAX_VALUE) {
    // No node covers the span. Return the sentinel untouched: subtracting
    // bestCost() from it would either overflow or yield a huge bogus delta
    // that the caller could no longer recognise as "not found".
    return Integer.MAX_VALUE;
  }
  return probedCost - bestCost();
}

/** Dumps every node to stdout; does nothing unless VERBOSE is enabled. */
void debugPrint() {
  if (VERBOSE) {
    for (int node = 0; node < nodeCount; ++node) {
      System.out.printf("DEBUG NODE: node=%d, mark=%d, cost=%d, left=%d, right=%d\n",
          node, nodeMark[node], cost(node), nodeLeft[node], nodeRight[node]);
    }
  }
}
} // end of the nested Lattice class

// Created on first use by backtraceNBest() and reused afterwards.
private Lattice lattice = null;

/**
 * Converts one lattice node into pending token(s). Nodes whose first
 * character is punctuation are dropped when discardPunctuation is set.
 *
 * @param node     index of the node inside {@code lattice}
 * @param fragment characters covered by the lattice; node offsets index into it
 */
private void registerNode(int node, char[] fragment) {
  int left = lattice.nodeLeft[node];
  int right = lattice.nodeRight[node];
  Type type = lattice.nodeDicType[node];
  if (!discardPunctuation || !isPunctuation(fragment[left])) {
    if (type == Type.USER) {
      // The code below is based on backtrace().
      //
      // Expand the phraseID we recorded into the actual segmentation:
      final int[] wordIDAndLength = userDictionary.lookupSegmentation(lattice.nodeWordID[node]);
      int wordID = wordIDAndLength[0];
      // Output the compound (the whole user-dictionary entry) first ...
      pending.add(new Token(wordID, fragment, left, right - left, Type.USER,
          lattice.rootBase + left, userDictionary));
      // ... then every segment that is shorter than the whole entry.
      int current = 0;
      for (int j = 1; j < wordIDAndLength.length; j++) {
        final int len = wordIDAndLength[j];
        if (len < right - left) {
          pending.add(new Token(wordID + j - 1, fragment, current + left, len, Type.USER,
              lattice.rootBase + current + left, userDictionary));
        }
        current += len;
      }
    } else {
      pending.add(new Token(lattice.nodeWordID[node], fragment, left, right - left, type,
          lattice.rootBase + left, getDict(type)));
    }
  }
}

/**
 * Sorts the pending tokens, drops tokens that duplicate an earlier one's
 * offset and length, assigns position lengths, and finally reverses the
 * list so it can be consumed from the end.
 */
private void fixupPendingList() {
  // Sort so that tokens with the same span become neighbours.
  // A USER token should come ahead of a normal one.
  Collections.sort(pending, new Comparator<Token>() {
    @Override
    public int compare(Token a, Token b) {
      int byOffset = Integer.compare(a.getOffset(), b.getOffset());
      if (byOffset != 0) {
        return byOffset;
      }
      int byLength = Integer.compare(a.getLength(), b.getLength());
      if (byLength != 0) {
        return byLength;
      }
      // Type is declared in the order KNOWN, UNKNOWN, USER,
      // so the comparison is reversed to put USER first.
      return b.getType().compareTo(a.getType());
    }
  });

  // Remove tokens covering the same span as their predecessor.
  for (int i = 1; i < pending.size(); ++i) {
    Token a = pending.get(i - 1);
    Token b = pending.get(i);
    if (a.getOffset() == b.getOffset() && a.getLength() == b.getLength()) {
      pending.remove(i);
      // It is important to decrement "i" here, because the next one may be removed too.
      --i;
    }
  }

  // offset => position map; first register every token edge as a key.
  HashMap<Integer, Integer> map = new HashMap<>();
  for (Token t : pending) {
    map.put(t.getOffset(), 0);
    map.put(t.getOffset() + t.getLength(), 0);
  }

  // Get the unique, sorted list of all token edge offsets.
  Integer[] offsets = map.keySet().toArray(new Integer[0]);
  Arrays.sort(offsets);

  // Fill in the values: each edge maps to its rank among all edges.
  for (int i = 0; i < offsets.length; ++i) {
    map.put(offsets[i], i);
  }

  // Position length = number of edges between a token's start and end.
  for (Token t : pending) {
    t.setPositionLength(map.get(t.getOffset() + t.getLength()) - map.get(t.getOffset()));
  }

  // Reverse PENDING to fit its usage.
  // To speed this up, one could sort in reversed order at the top of
  // this function instead.
  Collections.reverse(pending);
}

/**
 * Tokenizes {@code inText} with n-best output enabled and reports the
 * smallest cost delta, over all lattices built, of a node spanning the
 * first occurrence of {@code requiredToken}.
 *
 * @param inText        text to tokenize
 * @param requiredToken substring of {@code inText} that should come out as a token
 * @return the delta, or -1 when {@code requiredToken} does not occur in
 *         {@code inText} or no lattice reported a delta for it
 * @throws IOException if tokenizing fails
 */
private int probeDelta(String inText, String requiredToken) throws IOException {
  int start = inText.indexOf(requiredToken);
  if (start < 0) {
    // -1 when no requiredToken.
    return -1;
  }

  int delta = Integer.MAX_VALUE;
  int saveNBestCost = nBestCost;
  setReader(new StringReader(inText));
  reset();
  try {
    setNBestCost(1);
    int prevRootBase = -1;
    while (incrementToken()) {
      // Probe each lattice only once: rootBase changes when a new one is set up.
      if (lattice.rootBase != prevRootBase) {
        prevRootBase = lattice.rootBase;
        delta = Math.min(delta, lattice.probeDelta(start, start + requiredToken.length()));
      }
    }
  } finally {
    try {
      // reset & end
      end();
      // setReader & close
      close();
    } finally {
      // Restore the caller's setting even if end()/close() throws.
      setNBestCost(saveNBestCost);
    }
  }

  if (VERBOSE) {
    System.out.printf("JapaneseTokenizer: delta = %d: %s-%s\n", delta, inText, requiredToken);
  }
  return delta == Integer.MAX_VALUE ? -1 : delta;
}

/**
 * Computes an n-best cost from examples.
 *
 * @param examples '/'-separated list of examples, each of the form
 *                 {@code text-token}; empty entries are ignored
 * @return the largest delta found over all examples, never less than 0
 * @throws RuntimeException if an example does not split into exactly two
 *                          parts on '-', or if tokenizing it fails
 */
public int calcNBestCost(String examples) {
  int maxDelta = 0;
  for (String example : examples.split("/")) {
    if (!example.isEmpty()) {
      String[] pair = example.split("-");
      if (pair.length != 2) {
        throw new RuntimeException("Unexpected example form: " + example + " (expected two '-')");
      } else {
        try {
          maxDelta = Math.max(maxDelta, probeDelta(pair[0], pair[1]));
        } catch (IOException e) {
          throw new RuntimeException("Internal error calculating best costs from examples. Got ", e);
        }
      }
    }
  }
  return maxDelta;
}

/**
 * Sets the allowable cost difference for n-best output; n-best output is
 * switched on only for values greater than 0.
 */
public void setNBestCost(int value) {
  nBestCost = value;
  outputNBest = 0 < nBestCost;
}

/**
 * N-best variant of the backtrace: builds the lattice between the last
 * backtrace position and {@code endPosData.pos}, registers the best path,
 * then keeps registering next-best node groups while their cost stays
 * within {@code nBestCost} of the best cost.
 *
 * @param endPosData position to backtrace from
 * @param useEOS     passed through to the lattice setup
 */
private void backtraceNBest(final Position endPosData, final boolean useEOS) throws IOException {
  if (lattice == null) {
    lattice = new Lattice();
  }

  final int endPos = endPosData.pos;
  char[] fragment = buffer.get(lastBackTracePos, endPos - lastBackTracePos);
  lattice.setup(fragment, dictionaryMap, positions, lastBackTracePos, endPos, useEOS);
  lattice.markUnreachable();
  lattice.calcLeftCost(costs);
  lattice.calcRightCost(costs);

  int bestCost = lattice.bestCost();
  if (VERBOSE) {
    System.out.printf("DEBUG: 1-BEST COST: %d\n", bestCost);
  }
  for (int node : lattice.bestPathNodeList()) {
    registerNode(node, fragment);
  }

  for (int n = 2;; ++n) {
    List<Integer> nbest = lattice.nBestNodeList(n);
    if (nbest.isEmpty()) {
      break;
    }
    int cost = lattice.cost(nbest.get(0));
    if (VERBOSE) {
      System.out.printf("DEBUG: %d-BEST COST: %d\n", n, cost);
    }
    // Compare in long arithmetic: a large nBestCost must not overflow
    // the sum and end the loop prematurely.
    if ((long) bestCost + nBestCost < cost) {
      break;
    }
    for (int node : nbest) {
      registerNode(node, fragment);
    }
  }
  if (VERBOSE) {
    lattice.debugPrint();
  }
}

// Backtrace from the provided position, back to the last
// time we back-traced, accumulating the resulting tokens to
// the pending list.  The pending list is then in-reverse
// (last token should be returned first).
private void backtrace(final Position endPosData, final int fromIDX) throws IOException { final int endPos = endPosData.pos; if (VERBOSE) { System.out.println("\n backtrace: endPos=" + endPos + " pos=" + pos + "; " + (pos - lastBackTracePos) + " characters; last=" + lastBackTracePos + " cost=" + endPosData.costs[fromIDX]); } final char[] fragment = buffer.get(lastBackTracePos, endPos-lastBackTracePos); if (dotOut != null) { dotOut.onBacktrace(this, positions, lastBackTracePos, endPosData, fromIDX, fragment, end); } int pos = endPos; int bestIDX = fromIDX; Token altToken = null; // We trace backwards, so this will be the leftWordID of // the token after the one we are now on: int lastLeftWordID = -1; int backCount = 0; // TODO: sort of silly to make Token instances here; the // back trace has all info needed to generate the // token. So, we could just directly set the attrs, // from the backtrace, in incrementToken w/o ever // creating Token; we'd have to defer calling freeBefore // until after the backtrace was fully "consumed" by // incrementToken. while (pos > lastBackTracePos) { //System.out.println("BT: back pos=" + pos + " bestIDX=" + bestIDX); final Position posData = positions.get(pos); assert bestIDX < posData.count; int backPos = posData.backPos[bestIDX]; assert backPos >= lastBackTracePos: "backPos=" + backPos + " vs lastBackTracePos=" + lastBackTracePos; int length = pos - backPos; Type backType = posData.backType[bestIDX]; int backID = posData.backID[bestIDX]; int nextBestIDX = posData.backIndex[bestIDX]; if (outputCompounds && searchMode && altToken == null && backType != Type.USER) { // In searchMode, if best path had picked a too-long // token, we use the "penalty" to compute the allowed // max cost of an alternate back-trace. If we find an // alternate back trace with cost below that // threshold, we pursue it instead (but also output // the long token). 
//System.out.println(" 2nd best backPos=" + backPos + " pos=" + pos); final int penalty = computeSecondBestThreshold(backPos, pos-backPos); if (penalty > 0) { if (VERBOSE) { System.out.println(" compound=" + new String(buffer.get(backPos, pos-backPos)) + " backPos=" + backPos + " pos=" + pos + " penalty=" + penalty + " cost=" + posData.costs[bestIDX] + " bestIDX=" + bestIDX + " lastLeftID=" + lastLeftWordID); } // Use the penalty to set maxCost on the 2nd best // segmentation: int maxCost = posData.costs[bestIDX] + penalty; if (lastLeftWordID != -1) { maxCost += costs.get(getDict(backType).getRightId(backID), lastLeftWordID); } // Now, prune all too-long tokens from the graph: pruneAndRescore(backPos, pos, posData.backIndex[bestIDX]); // Finally, find 2nd best back-trace and resume // backtrace there: int leastCost = Integer.MAX_VALUE; int leastIDX = -1; for(int idx=0;idx " + cost); } //System.out.println("penalty " + posData.backPos[idx] + " to " + pos); //cost += computePenalty(posData.backPos[idx], pos - posData.backPos[idx]); if (cost < leastCost) { //System.out.println(" ** "); leastCost = cost; leastIDX = idx; } } //System.out.println(" leastIDX=" + leastIDX); if (VERBOSE) { System.out.println(" afterPrune: " + posData.count + " arcs arriving; leastCost=" + leastCost + " vs threshold=" + maxCost + " lastLeftWordID=" + lastLeftWordID); } if (leastIDX != -1 && leastCost <= maxCost && posData.backPos[leastIDX] != backPos) { // We should have pruned the altToken from the graph: assert posData.backPos[leastIDX] != backPos; // Save the current compound token, to output when // this alternate path joins back: altToken = new Token(backID, fragment, backPos - lastBackTracePos, length, backType, backPos, getDict(backType)); // Redirect our backtrace to 2nd best: bestIDX = leastIDX; nextBestIDX = posData.backIndex[bestIDX]; backPos = posData.backPos[bestIDX]; length = pos - backPos; backType = posData.backType[bestIDX]; backID = posData.backID[bestIDX]; backCount = 0; 
//System.out.println(" do alt token!"); } else { // I think in theory it's possible there is no // 2nd best path, which is fine; in this case we // only output the compound token: //System.out.println(" no alt token! bestIDX=" + bestIDX); } } } final int offset = backPos - lastBackTracePos; assert offset >= 0; if (altToken != null && altToken.getPosition() >= backPos) { // We've backtraced to the position where the // compound token starts; add it now: // The pruning we did when we created the altToken // ensures that the back trace will align back with // the start of the altToken: assert altToken.getPosition() == backPos: altToken.getPosition() + " vs " + backPos; // NOTE: not quite right: the compound token may // have had all punctuation back traced so far, but // then the decompounded token at this position is // not punctuation. In this case backCount is 0, // but we should maybe add the altToken anyway...? if (backCount > 0) { backCount++; altToken.setPositionLength(backCount); if (VERBOSE) { System.out.println(" add altToken=" + altToken); } pending.add(altToken); } else { // This means alt token was all punct tokens: if (VERBOSE) { System.out.println(" discard all-punctuation altToken=" + altToken); } assert discardPunctuation; } altToken = null; } final Dictionary dict = getDict(backType); if (backType == Type.USER) { // Expand the phraseID we recorded into the actual // segmentation: final int[] wordIDAndLength = userDictionary.lookupSegmentation(backID); int wordID = wordIDAndLength[0]; int current = 0; for(int j=1; j < wordIDAndLength.length; j++) { final int len = wordIDAndLength[j]; //System.out.println(" add user: len=" + len); pending.add(new Token(wordID+j-1, fragment, current + offset, len, Type.USER, current + backPos, dict)); if (VERBOSE) { System.out.println(" add USER token=" + pending.get(pending.size()-1)); } current += len; } // Reverse the tokens we just added, because when we // serve them up from incrementToken we serve in // reverse: 
Collections.reverse(pending.subList(pending.size() - (wordIDAndLength.length - 1), pending.size())); backCount += wordIDAndLength.length-1; } else { if (extendedMode && backType == Type.UNKNOWN) { // In EXTENDED mode we convert unknown word into // unigrams: int unigramTokenCount = 0; for(int i=length-1;i>=0;i--) { int charLen = 1; if (i > 0 && Character.isLowSurrogate(fragment[offset+i])) { i--; charLen = 2; } //System.out.println(" extended tok offset=" //+ (offset + i)); if (!discardPunctuation || !isPunctuation(fragment[offset+i])) { pending.add(new Token(CharacterDefinition.NGRAM, fragment, offset + i, charLen, Type.UNKNOWN, backPos + i, unkDictionary)); unigramTokenCount++; } } backCount += unigramTokenCount; } else if (!discardPunctuation || length == 0 || !isPunctuation(fragment[offset])) { pending.add(new Token(backID, fragment, offset, length, backType, backPos, dict)); if (VERBOSE) { System.out.println(" add token=" + pending.get(pending.size()-1)); } backCount++; } else { if (VERBOSE) { System.out.println(" skip punctuation token=" + new String(fragment, offset, length)); } } } lastLeftWordID = dict.getLeftId(backID); pos = backPos; bestIDX = nextBestIDX; } lastBackTracePos = endPos; if (VERBOSE) { System.out.println(" freeBefore pos=" + endPos); } // Notify the circular buffers that we are done with // these positions: buffer.freeBefore(endPos); positions.freeBefore(endPos); } Dictionary getDict(Type type) { return dictionaryMap.get(type); } private static boolean isPunctuation(char ch) { switch(Character.getType(ch)) { case Character.SPACE_SEPARATOR: case Character.LINE_SEPARATOR: case Character.PARAGRAPH_SEPARATOR: case Character.CONTROL: case Character.FORMAT: case Character.DASH_PUNCTUATION: case Character.START_PUNCTUATION: case Character.END_PUNCTUATION: case Character.CONNECTOR_PUNCTUATION: case Character.OTHER_PUNCTUATION: case Character.MATH_SYMBOL: case Character.CURRENCY_SYMBOL: case Character.MODIFIER_SYMBOL: case Character.OTHER_SYMBOL: 
case Character.INITIAL_QUOTE_PUNCTUATION: case Character.FINAL_QUOTE_PUNCTUATION: return true; default: return false; } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy