org.apache.lucene.analysis.ja.GraphvizFormatter Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-analysis-kuromoji Show documentation
Show all versions of lucene-analysis-kuromoji Show documentation
Apache Lucene (module: kuromoji)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Position;
import org.apache.lucene.analysis.ja.JapaneseTokenizer.WrappedPositionArray;
import org.apache.lucene.analysis.ja.dict.ConnectionCosts;
import org.apache.lucene.analysis.ja.dict.Dictionary;
// TODO: would be nice to show the 2nd best path in a different
// color...
/**
 * Outputs the dot (graphviz) string for the viterbi lattice.
 *
 * <p>The text is accumulated incrementally: the constructor writes the graph header and the
 * invisible {@code init} node, every {@code onBacktrace} call appends the nodes and arcs of one
 * backtraced fragment, and {@link #finish()} returns the accumulated text closed by the trailer.
 */
public class GraphvizFormatter {

  private static final String BOS_LABEL = "BOS";

  private static final String EOS_LABEL = "EOS";

  private static final String FONT_NAME = "Helvetica";

  private final ConnectionCosts costs;

  /**
   * Arcs of the current best path, keyed by the ID of the arc's source node and mapping to the ID
   * of its target node. Rebuilt on every backtrace.
   */
  private final Map<String, String> bestPathMap;

  /** Accumulates the dot output across all backtraces. */
  private final StringBuilder sb = new StringBuilder();

  /**
   * Creates a formatter and writes the graph header plus the invisible {@code init} node and its
   * {@code BOS} arc to node {@code 0.0}.
   *
   * @param costs connection costs used to compute the second cost component of every arc label
   */
  public GraphvizFormatter(ConnectionCosts costs) {
    this.costs = costs;
    this.bestPathMap = new HashMap<>();
    sb.append(formatHeader());
    sb.append(" init [style=invis]\n");
    sb.append(" init -> 0.0 [label=\"").append(BOS_LABEL).append("\"]\n");
  }

  /**
   * Returns the dot text accumulated so far, closed by the graph trailer.
   *
   * <p>The internal buffer is left untouched, so calling this method repeatedly returns the same
   * text instead of stacking several closing braces.
   *
   * @return the complete dot (graphviz) document
   */
  public String finish() {
    return sb.toString() + formatTrailer();
  }

  /**
   * Backtraces another incremental fragment: records the best path for the fragment and appends
   * the fragment's nodes and arcs to the output.
   *
   * @param tok tokenizer used to look up the dictionary for each arc
   * @param positions lattice positions
   * @param lastBackTracePos position at which the previous backtrace ended; nodes and arcs after
   *     this position up to {@code endPosData.pos} are emitted
   * @param endPosData position data of the last position of this fragment
   * @param fromIDX index, within {@code endPosData}, of the node the best path ends in
   * @param fragment characters of the fragment; offset 0 corresponds to {@code lastBackTracePos}
   * @param isEnd when {@code true}, also emits the invisible {@code fini} node and the {@code EOS}
   *     arc leading to it
   */
  void onBacktrace(
      JapaneseTokenizer tok,
      WrappedPositionArray positions,
      int lastBackTracePos,
      Position endPosData,
      int fromIDX,
      char[] fragment,
      boolean isEnd) {
    setBestPathMap(positions, lastBackTracePos, endPosData, fromIDX);
    sb.append(formatNodes(tok, positions, lastBackTracePos, endPosData, fragment));
    if (isEnd) {
      sb.append(" fini [style=invis]\n");
      sb.append(" ");
      sb.append(getNodeID(endPosData.pos, fromIDX));
      sb.append(" -> fini [label=\"").append(EOS_LABEL).append("\"]");
    }
  }

  /**
   * Records which arcs make up the best path by following the back pointers from {@code
   * endPosData}/{@code fromIDX} until {@code startPos} is reached.
   */
  private void setBestPathMap(
      WrappedPositionArray positions, int startPos, Position endPosData, int fromIDX) {
    bestPathMap.clear();

    int pos = endPosData.pos;
    int bestIDX = fromIDX;
    while (pos > startPos) {
      final Position posData = positions.get(pos);

      final int backPos = posData.backPos[bestIDX];
      final int backIDX = posData.backIndex[bestIDX];

      final String toNodeID = getNodeID(pos, bestIDX);
      final String fromNodeID = getNodeID(backPos, backIDX);

      // A path visits every node at most once, so neither endpoint may already be recorded.
      assert !bestPathMap.containsKey(fromNodeID);
      assert !bestPathMap.containsValue(toNodeID);
      bestPathMap.put(fromNodeID, toNodeID);

      pos = backPos;
      bestIDX = backIDX;
    }
  }

  /**
   * Formats the nodes and arcs of the positions in {@code (startPos, endPosData.pos]}.
   *
   * <p>Every node is labelled {@code "<pos>: <lastRightID>"}. Every arc is labelled with its
   * surface form, the word cost and the signed connection cost; arcs recorded in {@link
   * #bestPathMap} are additionally highlighted.
   *
   * @return the dot statements, one per line
   */
  private String formatNodes(
      JapaneseTokenizer tok,
      WrappedPositionArray positions,
      int startPos,
      Position endPosData,
      char[] fragment) {
    final StringBuilder out = new StringBuilder();

    // Output nodes
    for (int pos = startPos + 1; pos <= endPosData.pos; pos++) {
      final Position posData = positions.get(pos);
      for (int idx = 0; idx < posData.count; idx++) {
        out.append(" ");
        out.append(getNodeID(pos, idx));
        out.append(" [label=\"");
        out.append(pos);
        out.append(": ");
        out.append(posData.lastRightID[idx]);
        out.append("\"]\n");
      }
    }

    // Output arcs
    for (int pos = endPosData.pos; pos > startPos; pos--) {
      final Position posData = positions.get(pos);
      for (int idx = 0; idx < posData.count; idx++) {
        final int backPos = posData.backPos[idx];
        final int backIndex = posData.backIndex[idx];
        final Position backPosData = positions.get(backPos);

        final String toNodeID = getNodeID(pos, idx);
        final String fromNodeID = getNodeID(backPos, backIndex);

        out.append(" ");
        out.append(fromNodeID);
        out.append(" -> ");
        out.append(toNodeID);

        final String attrs;
        if (toNodeID.equals(bestPathMap.get(fromNodeID))) {
          // This arc is on best path
          attrs = " color=\"#40e050\" fontcolor=\"#40a050\" penwidth=3 fontsize=20";
        } else {
          attrs = "";
        }

        final Dictionary dict = tok.getDict(posData.backType[idx]);
        final int wordCost = dict.getWordCost(posData.backID[idx]);
        final int bgCost =
            costs.get(backPosData.lastRightID[backIndex], dict.getLeftId(posData.backID[idx]));

        // The fragment starts at startPos, so lattice positions are shifted by that amount.
        final String surfaceForm = new String(fragment, backPos - startPos, pos - backPos);

        out.append(" [label=\"");
        // Escape the text so that quotes/backslashes in the input cannot break the quoted label.
        out.append(escapeLabel(surfaceForm));
        out.append(' ');
        out.append(wordCost);
        if (bgCost >= 0) {
          // Negative costs already carry their own sign.
          out.append('+');
        }
        out.append(bgCost);
        out.append("\"");
        out.append(attrs);
        out.append("]\n");
      }
    }
    return out.toString();
  }

  /**
   * Escapes backslashes and double quotes so that {@code text} can be embedded in a double-quoted
   * dot label. Backslashes are handled first so the backslashes added for quotes are not doubled.
   */
  private static String escapeLabel(String text) {
    return text.replace("\\", "\\\\").replace("\"", "\\\"");
  }

  /** Returns the opening of the digraph together with the default graph, edge and node styles. */
  private String formatHeader() {
    final StringBuilder out = new StringBuilder();
    out.append("digraph viterbi {\n");
    out.append(
        " graph [ fontsize=30 labelloc=\"t\" label=\"\" splines=true overlap=false rankdir = \"LR\"];\n");
    // sb.append(" // A2 paper size\n");
    // sb.append(" size = \"34.4,16.5\";\n");
    // sb.append(" // try to fill paper\n");
    // sb.append(" ratio = fill;\n");
    out.append(" edge [ fontname=\"")
        .append(FONT_NAME)
        .append("\" fontcolor=\"red\" color=\"#606060\" ]\n");
    out.append(" node [ style=\"filled\" fillcolor=\"#e8e8f0\" shape=\"Mrecord\" fontname=\"")
        .append(FONT_NAME)
        .append("\" ]\n");
    return out.toString();
  }

  /** Returns the text that closes the digraph. */
  private String formatTrailer() {
    return "}";
  }

  /** Returns the dot node ID for the node at lattice position {@code pos} and index {@code idx}. */
  private String getNodeID(int pos, int idx) {
    return pos + "." + idx;
  }
}