// de.datexis.ner.eval.HTMLExport Maven / Gradle / Ivy
package de.datexis.ner.eval;
import de.datexis.common.Resource;
import de.datexis.model.Annotation;
import de.datexis.model.Dataset;
import de.datexis.model.Document;
import de.datexis.model.Sentence;
import de.datexis.model.Token;
import de.datexis.model.tag.BIO2Tag;
import de.datexis.model.tag.BIOESTag;
import de.datexis.model.tag.Tag;
import de.datexis.ner.MentionAnnotation;
import java.io.IOException;
import java.util.Arrays;
import java.util.Locale;
import org.apache.commons.io.FileUtils;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
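/**
* Exports Documents with MentionAnnotations as an HTML page: either a plain
* rendering of a single annotation source, or a token-by-token comparison of
* expected against predicted tags.
* @author sarnold
*/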
public class HTMLExport {
protected static final Logger log = LoggerFactory.getLogger(HTMLExport.class);
StringBuilder html;
/**
 * Exports all Documents of a Dataset, comparing GOLD annotations (expected)
 * against PRED annotations (predicted).
 * @param data the Dataset whose Documents are rendered
 * @param tagset the Tag class used for the token-level labels
 */
public HTMLExport(Dataset data, Class tagset) {
  this(
    data.getDocuments(),
    tagset,
    Annotation.Source.GOLD,
    Annotation.Source.PRED
  );
}
/**
 * Exports a single Document using one annotation source only. The same
 * source is passed as both expected and predicted, so the Document is
 * rendered without a comparison.
 * @param doc the Document to render
 * @param tagset the Tag class used for the token-level labels
 * @param source the annotation source to display
 */
public HTMLExport(Document doc, Class tagset, Annotation.Source source) {
  this(
    Arrays.asList(doc),
    tagset,
    source,
    source
  );
}
/**
 * Exports the given Documents, comparing GOLD annotations (expected)
 * against PRED annotations (predicted).
 * @param docs the Documents to render, in iteration order
 * @param tagset the Tag class used for the token-level labels
 */
public HTMLExport(Iterable<? extends Document> docs, Class tagset) {
  this(docs, tagset, Annotation.Source.GOLD, Annotation.Source.PRED);
}
/**
 * Exports the given Documents, comparing a caller-chosen expected source
 * against PRED annotations (predicted).
 * @param docs the Documents to render, in iteration order
 * @param tagset the Tag class used for the token-level labels
 * @param expected the annotation source treated as ground truth
 */
public HTMLExport(Iterable<? extends Document> docs, Class tagset, Annotation.Source expected) {
  this(docs, tagset, expected, Annotation.Source.PRED);
}
/**
 * Renders all given Documents into a single HTML page that can afterwards be
 * retrieved with {@link #getHTML()} or written with
 * {@link #saveHTML(Resource, String)}.
 * <p>
 * For every Document, tags of type {@code tagset} are created from the
 * annotations of a source if the Document reports them as not available.
 * If {@code expected} equals {@code predicted}, the Document is rendered
 * plainly; otherwise a comparison of both sources is rendered.
 * @param docs the Documents to render, in iteration order
 * @param tagset the Tag class used for the token-level labels
 * @param expected the annotation source treated as ground truth
 * @param predicted the annotation source treated as model output
 */
public HTMLExport(Iterable<? extends Document> docs, Class tagset, Annotation.Source expected, Annotation.Source predicted) {
  log.info("Exporting HTML...");
  html = new StringBuilder();
  // NOTE(review): appendHeader() and appendDocumentLabels() are protected and
  // therefore overridable; they run here before any subclass constructor body.
  appendHeader();
  for(Document doc : docs) {
    // the renderers read token-level tags, so derive them from annotations when missing
    if(!doc.isTagAvaliable(expected, tagset)) {
      MentionAnnotation.createTagsFromAnnotations(doc, expected, tagset);
    }
    if(!doc.isTagAvaliable(predicted, tagset)) {
      MentionAnnotation.createTagsFromAnnotations(doc, predicted, tagset);
    }
    appendDocumentLabels(doc, tagset, expected, predicted);
  }
  appendFooter();
}
/**
 * Returns everything rendered so far.
 * @return the current content of the internal buffer as a String
 */
public String getHTML() {
  final String page = html.toString();
  return page;
}
/**
 * Renders one Document and appends the result to the page buffer.
 * When both sources are equal the plain rendering is used, otherwise the
 * comparison rendering of expected against predicted.
 * @param doc the Document to render
 * @param target the Tag class used for the token-level labels
 * @param expected the annotation source treated as ground truth
 * @param predicted the annotation source treated as model output
 */
protected void appendDocumentLabels(Document doc, Class target, Annotation.Source expected, Annotation.Source predicted) {
  final boolean singleSource = expected.equals(predicted);
  final String rendered = singleSource
      ? annotateDocumentLabels(doc, target, expected)
      : annotateDocumentLabelsDiff(doc, target, expected, predicted);
  html.append(rendered);
}
/**
 * Renders the comparison view of a Document with the default sources:
 * GOLD as expected and PRED as predicted.
 * @param doc the Document to render
 * @param target the Tag class used for the token-level labels
 * @return the rendered fragment
 */
protected String annotateDocumentLabelsDiff(Document doc, Class target) {
  final Annotation.Source expectedSource = Annotation.Source.GOLD;
  final Annotation.Source predictedSource = Annotation.Source.PRED;
  return annotateDocumentLabelsDiff(doc, target, expectedSource, predictedSource);
}
/**
 * Renders one Document token by token, comparing the tag of each token from
 * the expected source with its tag from the predicted source.
 * <p>
 * Tokens are classified by whether each tag equals "O": false positive,
 * true positive (identical tags), boundary error (both non-"O" but
 * different), false negative, or true negative. Whitespace between tokens is
 * rebuilt from the token offsets, and a newline is appended after every
 * sentence.
 * <p>
 * NOTE(review): every markup literal in this method is an empty string, so
 * all five classes currently produce identical output. Presumably the HTML
 * tags were lost from this copy of the file; restore them from the upstream
 * source.
 * @param doc the Document to render
 * @param target the Tag class requested from each Token
 * @param expected the annotation source treated as ground truth
 * @param predicted the annotation source treated as model output
 * @return the rendered fragment
 */
protected String annotateDocumentLabelsDiff(Document doc, Class target, Annotation.Source expected, Annotation.Source predicted) {
// local buffer; shadows the field of the same name
StringBuilder html = new StringBuilder();
html.append("");
// offset up to which text has been emitted so far
int cursor = doc.getBegin();
for(Sentence s : doc.getSentences()) {
// only referenced by the disabled spacing heuristic below
String last = "";
for(Token t : s.getTokens()) {
// NOTE(review): both tags are dereferenced below without a null check;
// assumes getTag never returns null for non-empty tokens - TODO confirm
Tag gold = t.getTag(expected, target);
Tag pred = t.getTag(predicted, target);
// disabled alternative: spacing decided by punctuation lists instead of offsets
//if(!ELStringUtils.skipSpaceAfter.contains(last) && !ELStringUtils.skipSpaceBefore.contains(t.getText())) html.append(" ");
if(t.isEmpty()) continue;
if(cursor > t.getBegin()) {
// reset in case of wrong offsets
html.append(" ");
cursor = t.getBegin();
}
// emit one space per offset unit between the previous token and this one
while(cursor < t.getBegin()) {
html.append(" ");
cursor++;
}
cursor = t.getEnd();
if(!gold.getTag().equals("O") || !pred.getTag().equals("O")) {
// positive - labels
html.append("")
.append("");
// unused in this method
INDArray vector = pred.getVector();
if(gold.getTag().equals("O") && !pred.getTag().equals("O")) {
// false positive - blue
html.append("")
.append(t.getText())
.append("");
} else if(!gold.getTag().equals("O") && !pred.getTag().equals("O")) {
if(gold.getTag().equals(pred.getTag())) {
// true positive - green
html.append("")
.append(t.getText())
.append("");
} else {
// boundary error - yellow
html.append("")
.append(t.getText())
.append("");
}
} else if(!gold.getTag().equals("O") && pred.getTag().equals("O")) {
// false negative - red
html.append("")
.append(t.getText())
.append("");
}
html.append("")
.append("");
} else {
// true negative - white
html.append("")
.append(t.getText())
.append("");
}
last = t.getText();
}
// one output line per sentence
html.append("\n");
}
// NOTE(review): the following string literal is split across two source
// lines, which is not valid Java; presumably closing markup was lost here
// and the literal has to be restored from the upstream source.
html.append("
\n");
return html.toString();
}
/**
 * Renders the plain view of a Document with the default source PRED.
 * @param doc the Document to render
 * @param tagset the Tag class used for the token-level labels
 * @return the rendered fragment
 */
protected String annotateDocumentLabels(Document doc, Class tagset) {
  final Annotation.Source defaultSource = Annotation.Source.PRED;
  return annotateDocumentLabels(doc, tagset, defaultSource);
}
/**
 * Renders one Document token by token using the tags of a single source.
 * <p>
 * Tokens are split into those whose tag is not "O" (labelled) and those
 * whose tag is "O" (unlabelled). Whitespace between tokens is rebuilt from
 * the token offsets, and a newline is appended after every sentence.
 * <p>
 * NOTE(review): the markup literals in both branches are empty strings, so
 * labelled and unlabelled tokens currently produce identical output.
 * Presumably the HTML tags were lost from this copy of the file; restore
 * them from the upstream source.
 * @param doc the Document to render
 * @param tagset the Tag class requested from each Token
 * @param source the annotation source whose tags are displayed
 * @return the rendered fragment
 */
protected String annotateDocumentLabels(Document doc, Class tagset, Annotation.Source source) {
// local buffer; shadows the field of the same name
StringBuilder html = new StringBuilder();
html.append("");
// offset up to which text has been emitted so far
int cursor = doc.getBegin();
for(Sentence s : doc.getSentences()) {
for(Token t : s.getTokens()) {
// NOTE(review): dereferenced below without a null check; assumes getTag
// never returns null for non-empty tokens - TODO confirm
Tag pred = t.getTag(source, tagset);
if(t.isEmpty()) continue;
if(cursor > t.getBegin()) {
// reset in case of wrong offsets
html.append(" ");
cursor = t.getBegin();
}
// emit one space per offset unit between the previous token and this one
while(cursor < t.getBegin()) {
html.append(" ");
cursor++;
}
cursor = t.getEnd();
if(!pred.getTag().equals("O")) {
// labelled token; the vector is fetched but unused in this method
INDArray vector = pred.getVector();
html.append("");
html.append(t.getText());
html.append("");
} else {
// unlabelled token
html.append("");
html.append(t.getText());
html.append("");
}
}
// one output line per sentence
html.append("\n");
}
// NOTE(review): the following string literal is split across two source
// lines, which is not valid Java; presumably closing markup was lost here
// and the literal has to be restored from the upstream source.
html.append("
\n");
return html.toString();
}
/**
 * Appends the page preamble to the page buffer.
 * NOTE(review): the literals contain only whitespace; presumably the opening
 * HTML markup was lost from this copy of the file.
 */
protected void appendHeader() {
  html.append("\n\n\n \n")
      .append("\n")
      .append("\n\n");
}
/**
 * Appends the page trailer to the page buffer.
 * NOTE(review): the literal is a bare newline; presumably the closing HTML
 * markup was lost from this copy of the file.
 */
private void appendFooter() {
  final String trailer = "\n";
  html.append(trailer);
}
/**
 * Writes the rendered page to the file {@code <name>.html} below the given
 * path, encoded as UTF-8.
 * <p>
 * Writing is best-effort: an I/O failure is logged and not propagated.
 * @param path the directory Resource to write into
 * @param name the file name without the ".html" extension
 */
public void saveHTML(Resource path, String name) {
  Resource file = path.resolve(name + ".html");
  try {
    // explicit encoding: the two-argument overload uses the platform default
    // charset, which makes the written bytes depend on the machine
    FileUtils.writeStringToFile(file.toFile(), getHTML(), "UTF-8");
  } catch(IOException ex) {
    // pass the exception as well so the stack trace is not lost
    log.error("Could not write output: " + ex.toString(), ex);
  }
}
}