cc.mallet.extract.DocumentViewer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of jcore-mallet-2.0.9 Show documentation
MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.
The newest version!
/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */
package cc.mallet.extract;


import java.io.File;
import java.io.PrintWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import cc.mallet.types.Label;
import cc.mallet.types.LabelAlphabet;
import cc.mallet.util.ColorUtils;

/**
 * Diagnostic class that outputs HTML pages that allow you to view errors on a more
 *  global per-instance basis.
 *
 * Created: Mar 30, 2005
 *
 * @author "+name+": Extraction from Document");
    out.println ("");
    out.println ("");
    out.println ("");
    out.println ("");

    outputClassLegend (out, docExtr.getExtractedSpans ().getLabeledSpan (0).getLabel ().getLabelAlphabet ());
    outputRightWrongLegend (out);

    DualLabeledSpans spans = intersectSpans (docExtr);
    for (int i = 0; i < spans.size(); i++) {
      LabeledSpan predSpan = spans.get (i, 0);
      LabeledSpan trueSpan = spans.get (i, 1);

      Label predLabel = predSpan.getLabel ();
      Label trueLabel = trueSpan.getLabel ();

      boolean predNonBgrnd = !predSpan.isBackground ();
      boolean trueNonBgrnd = !trueSpan.isBackground ();
      boolean isBackground = !predNonBgrnd && !trueNonBgrnd;
      
      String spanClass = null;
      if (predNonBgrnd && trueNonBgrnd) {
        if (predLabel == trueLabel) {
          spanClass = "correct";
        } else {
          spanClass = "wrong";
        }
      } else if (predNonBgrnd) {
        spanClass = "pred";
      } else if (trueNonBgrnd) {
        spanClass = "true";
      }

      if (!isBackground) out.print ("");
      if (!isBackground) out.print ("");
      if (spanClass != null) { out.print (""); }

      String text = predSpan.getSpan ().getText ();
      text = text.replaceAll ("<", "<");
      text = text.replaceAll ("\n", "\n");
      out.print (text);

      if (spanClass != null) { out.print (""); }
      if (!isBackground) out.print ("");
      out.println ();
    }

    out.println ("");
  }

  private static void outputRightWrongLegend (PrintWriter out)
  {
    out.println ("
LEGEND
");
    out.println ("Correct
");
    out.println ("Wrong
");
    out.println ("False Negative (True field but predicted background)
");
    out.println ("False Positive (True background but predicted field)
");
    out.println ("");
  }
  private static void outputClassLegend (PrintWriter out, LabelAlphabet dict)
  {
    out.println ("");
    out.println ("LEGEND");
    String[] fields = determineFieldNames (dict);
    String[] colors = ColorUtils.rainbow (fields.length, (float) SATURATION, 1);
    for (int i = 0; i < fields.length; i++) {
      out.println (""+fields[i]+"
");
    }
    out.println ("");
  }

  /**
   * Collects the names of all labels in {@code dict} whose string form does
   * not start with {@code "B-"} or {@code "I-"}, in alphabet index order.
   *
   * @param dict label alphabet to scan
   * @return the retained label names; empty if every label was filtered out
   *         or the alphabet is empty
   */
  private static String[] determineFieldNames (LabelAlphabet dict)
  {
    // Typed list instead of a raw List, so no unchecked cast is needed on return.
    List<String> names = new ArrayList<String> ();
    for (int i = 0; i < dict.size (); i++) {
      String lname = dict.lookupLabel (i).toString ();
      if (!lname.startsWith ("B-") && !lname.startsWith ("I-")) {
        names.add (lname);
      }
    }
    return names.toArray (new String [names.size ()]);
  }

  /**
   * Walks the target spans and the extracted spans of {@code docExtr} in
   * parallel and, at every step, records the intersection of the current
   * extracted span with the current target span (and vice versa).  The two
   * resulting span lists have the same length and are returned together.
   *
   * @param docExtr extraction whose target and extracted spans are aligned
   * @return the pair of aligned span lists (predicted first, true second)
   */
  private static DualLabeledSpans intersectSpans (DocumentExtraction docExtr)
  {
    LabeledSpans goldSpans = docExtr.getTargetSpans ();
    LabeledSpans guessSpans = docExtr.getExtractedSpans ();

    // Both result lists are built over the extracted spans' document.
    LabeledSpans alignedGuess = new LabeledSpans (guessSpans.getDocument ());
    LabeledSpans alignedGold = new LabeledSpans (guessSpans.getDocument ());

    int g = 0;   // cursor into guessSpans
    int t = 0;   // cursor into goldSpans
    while (g < guessSpans.size () && t < goldSpans.size ()) {
      LabeledSpan guess = guessSpans.getLabeledSpan (g);
      LabeledSpan gold = goldSpans.getLabeledSpan (t);

      LabeledSpan guessPiece = (LabeledSpan) guess.intersection (gold);
      LabeledSpan goldPiece = (LabeledSpan) gold.intersection (guess);
      alignedGuess.add (guessPiece);
      alignedGold.add (goldPiece);

      // Advance whichever span ends first; when both end together, advance both.
      boolean guessEndsFirstOrTie = guess.getEndIdx () <= gold.getEndIdx ();
      if (guessEndsFirstOrTie) {
        g++;
      }
      boolean goldEndsFirstOrTie = gold.getEndIdx () <= guess.getEndIdx ();
      if (goldEndsFirstOrTie) {
        t++;
      }
    }

    assert (alignedGuess.size () == alignedGold.size ());

    return new DualLabeledSpans (alignedGuess, alignedGold);
  }

  /**
   * Writes {@code index.html} into {@code directory}, containing a heading
   * line followed by one line per document in {@code extraction} that shows
   * the document's name.
   *
   * @param directory  directory in which {@code index.html} is created
   * @param extraction extraction whose document names are listed
   * @throws IOException if the index file cannot be opened for writing
   */
  private static void outputIndex (File directory, Extraction extraction) throws IOException
  {
    PrintWriter out = new PrintWriter (new FileWriter (new File (directory, "index.html")));
    // Close the writer even if reading a document from the extraction throws,
    // so the underlying file handle is never leaked.
    try {
      out.println ("Extraction Results");
      for (int i = 0; i < extraction.getNumDocuments(); i++) {
        String name = extraction.getDocumentExtraction (i).getName ();
        out.println ("  "+name+"");
      }
      out.println ("");
    } finally {
      out.close ();
    }
  }
}