All downloads are free. The search and download functionality uses the official Maven repository.

de.datexis.ner.eval.HTMLExport Maven / Gradle / Ivy

package de.datexis.ner.eval;

import de.datexis.common.Resource;
import de.datexis.model.Annotation;
import de.datexis.model.Dataset;
import de.datexis.model.Document;
import de.datexis.model.Sentence;
import de.datexis.model.Token;
import de.datexis.model.tag.BIO2Tag;
import de.datexis.model.tag.BIOESTag;
import de.datexis.model.tag.Tag;
import de.datexis.ner.MentionAnnotation;
import java.io.IOException;
import java.util.Arrays;
import java.util.Locale;
import org.apache.commons.io.FileUtils;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Export a Document with MentionAnnotations as HTML
 * @author sarnold
 */
public class HTMLExport {
  
  /** Logger shared by all instances of this class. */
  protected static final Logger log = LoggerFactory.getLogger(HTMLExport.class);
  
  /**
   * Buffer holding the generated page. It is created by the constructor that takes both an
   * expected and a predicted source, and its content is returned by {@code getHTML()}.
   * Note that the {@code annotate...} methods declare a local variable with the same name,
   * which shadows this field inside those methods.
   */
  StringBuilder html;
  
  /**
   * Exports every document of a dataset, using {@code GOLD} as the expected source and
   * {@code PRED} as the predicted source.
   *
   * @param data   dataset whose documents are exported
   * @param tagset tag class that is looked up on the tokens
   */
  public HTMLExport(Dataset data, Class tagset) {
    this(
      data.getDocuments(),
      tagset,
      Annotation.Source.GOLD,
      Annotation.Source.PRED
    );
  }
  
  /**
   * Exports a single document. The same source is passed as both the expected and the
   * predicted side, so the document is rendered without a comparison.
   *
   * @param doc    the document to export
   * @param tagset tag class that is looked up on the tokens
   * @param source annotation source whose tags are rendered
   */
  public HTMLExport(Document doc, Class tagset, Annotation.Source source) {
    this(
      Arrays.asList(doc),
      tagset,
      source,
      source
    );
  }

  /**
   * Exports the given documents, using {@code GOLD} as the expected source and
   * {@code PRED} as the predicted source.
   *
   * @param docs   documents to export, in iteration order
   * @param tagset tag class that is looked up on the tokens
   */
  public HTMLExport(Iterable<? extends Document> docs, Class tagset) {
    // The element type is declared explicitly: a raw Iterable hides the fact that every
    // element is consumed as a Document further down the constructor chain.
    this(docs, tagset, Annotation.Source.GOLD, Annotation.Source.PRED);
  }
  
  /**
   * Exports the given documents, comparing the given expected source against {@code PRED}.
   *
   * @param docs     documents to export, in iteration order
   * @param tagset   tag class that is looked up on the tokens
   * @param expected annotation source treated as the reference side
   */
  public HTMLExport(Iterable<? extends Document> docs, Class tagset, Annotation.Source expected) {
    // The element type is declared explicitly: a raw Iterable hides the fact that every
    // element is consumed as a Document further down the constructor chain.
    this(docs, tagset, expected, Annotation.Source.PRED);
  }
  
  /**
   * Builds the complete page: header, one rendered fragment per document, footer.
   * For each document, tags for the expected and the predicted source are created from
   * its annotations first if the document reports them as not available.
   *
   * @param docs      documents to export, in iteration order
   * @param tagset    tag class that is looked up on the tokens
   *                  (NOTE(review): declared as a raw {@code Class}; the intended type
   *                  parameter is not visible here - confirm before tightening)
   * @param expected  annotation source treated as the reference side
   * @param predicted annotation source treated as the predicted side
   */
  public HTMLExport(Iterable<? extends Document> docs, Class tagset, Annotation.Source expected, Annotation.Source predicted) {
    log.info("Exporting HTML...");
    html = new StringBuilder();
    appendHeader();
    // docs must carry its element type: iterating a raw Iterable yields Object,
    // which cannot be bound to the Document loop variable.
    for(Document doc : docs) {
      if(!doc.isTagAvaliable(expected, tagset)) {
        MentionAnnotation.createTagsFromAnnotations(doc, expected, tagset);
      }
      if(!doc.isTagAvaliable(predicted, tagset)) {
        MentionAnnotation.createTagsFromAnnotations(doc, predicted, tagset);
      }
      appendDocumentLabels(doc, tagset, expected, predicted);
    }
    appendFooter();
  }

  /**
   * Returns the page generated so far.
   *
   * @return the current content of the internal buffer as a String
   */
  public String getHTML() {
    final String page = html.toString();
    return page;
  }
  
  /**
   * Appends the rendering of one document to the page buffer.
   * If both sources are equal, the plain rendering of that single source is used;
   * otherwise the comparison rendering of expected versus predicted is used.
   *
   * @param doc       document to render
   * @param target    tag class that is looked up on the tokens
   * @param expected  annotation source treated as the reference side
   * @param predicted annotation source treated as the predicted side
   */
  protected void appendDocumentLabels(Document doc, Class target, Annotation.Source expected, Annotation.Source predicted) {
    final boolean sameSource = expected.equals(predicted);
    final String fragment = sameSource
        ? annotateDocumentLabels(doc, target, expected)
        : annotateDocumentLabelsDiff(doc, target, expected, predicted);
    html.append(fragment);
  }
  
  /**
   * Renders the comparison view of a document with {@code GOLD} as the expected source
   * and {@code PRED} as the predicted source.
   *
   * @param doc    document to render
   * @param target tag class that is looked up on the tokens
   * @return the rendered fragment
   */
  protected String annotateDocumentLabelsDiff(Document doc, Class target) {
    final Annotation.Source reference = Annotation.Source.GOLD;
    final Annotation.Source prediction = Annotation.Source.PRED;
    return annotateDocumentLabelsDiff(doc, target, reference, prediction);
  }

  // NOTE(review): from here to the end of the class the text looks damaged by extraction.
  // Several methods (annotateDocumentLabelsDiff, both annotateDocumentLabels overloads,
  // appendHeader, appendFooter, saveHTML) share a few very long lines, some string literals
  // are split across lines, and most appended literals are empty or a single space -
  // presumably the original HTML markup was stripped. The "//" comments inside the long
  // lines would also comment out the statements that follow them on the same line.
  // The code is left untouched; restore it from the original source before relying on it.
  /**
   * Renders one document as a comparison of the tags of two annotation sources.
   * Walks all tokens of all sentences, skips empty tokens, emits filler for the gap between
   * the running cursor and the token begin, and then appends the token text wrapped
   * according to its case. The inline comments name the cases: false positive (blue),
   * true positive (green), boundary error (yellow), false negative (red) and
   * true negative (white). A tag counts as negative when its tag string equals "O".
   *
   * @param doc       document to render
   * @param target    tag class that is looked up on the tokens
   * @param expected  annotation source whose tag is read as "gold"
   * @param predicted annotation source whose tag is read as "pred"
   * @return the rendered fragment built in a local StringBuilder
   */
  protected String annotateDocumentLabelsDiff(Document doc, Class target, Annotation.Source expected, Annotation.Source predicted) {
    StringBuilder html = new StringBuilder();
    html.append("

"); int cursor = doc.getBegin(); for(Sentence s : doc.getSentences()) { String last = ""; for(Token t : s.getTokens()) { Tag gold = t.getTag(expected, target); Tag pred = t.getTag(predicted, target); //if(!ELStringUtils.skipSpaceAfter.contains(last) && !ELStringUtils.skipSpaceBefore.contains(t.getText())) html.append(" "); if(t.isEmpty()) continue; if(cursor > t.getBegin()) { // reset in case of wrong offsets html.append(" "); cursor = t.getBegin(); } while(cursor < t.getBegin()) { html.append(" "); cursor++; } cursor = t.getEnd(); if(!gold.getTag().equals("O") || !pred.getTag().equals("O")) { // positive - labels html.append("") .append(""); INDArray vector = pred.getVector(); if(gold.getTag().equals("O") && !pred.getTag().equals("O")) { // false positive - blue html.append("") .append(t.getText()) .append(""); } else if(!gold.getTag().equals("O") && !pred.getTag().equals("O")) { if(gold.getTag().equals(pred.getTag())) { // true positive - green html.append("") .append(t.getText()) .append(""); } else { // boundary error - yellow html.append("") .append(t.getText()) .append(""); } } else if(!gold.getTag().equals("O") && pred.getTag().equals("O")) { // false negative - red html.append("") .append(t.getText()) .append(""); } html.append("") .append(""); } else { // true negative - white html.append("") .append(t.getText()) .append(""); } last = t.getText(); } html.append("\n"); } html.append("

\n"); return html.toString(); } protected String annotateDocumentLabels(Document doc, Class tagset) { return annotateDocumentLabels(doc, tagset, Annotation.Source.PRED); } protected String annotateDocumentLabels(Document doc, Class tagset, Annotation.Source source) { StringBuilder html = new StringBuilder(); html.append("

"); int cursor = doc.getBegin(); for(Sentence s : doc.getSentences()) { for(Token t : s.getTokens()) { Tag pred = t.getTag(source, tagset); if(t.isEmpty()) continue; if(cursor > t.getBegin()) { // reset in case of wrong offsets html.append(" "); cursor = t.getBegin(); } while(cursor < t.getBegin()) { html.append(" "); cursor++; } cursor = t.getEnd(); if(!pred.getTag().equals("O")) { INDArray vector = pred.getVector(); html.append(""); html.append(t.getText()); html.append(""); } else { html.append(""); html.append(t.getText()); html.append(""); } } html.append("\n"); } html.append("

\n"); return html.toString(); } protected void appendHeader() { html.append("\n\n\n \n"); //html.append("\n"); html.append("\n"); html.append("\n\n"); } private void appendFooter() { html.append("\n"); } public void saveHTML(Resource path, String name) { Resource file = path.resolve(name + ".html"); try { FileUtils.writeStringToFile(file.toFile(), getHTML()); } catch(IOException ex) { log.error("Could not write output: " + ex.toString()); } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy