All downloads are free. Search and download functionality uses the official Maven repository.

cc.mallet.extract.DocumentViewer Maven / Gradle / Ivy

Go to download

MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.

The newest version!
/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */
package cc.mallet.extract;


import java.io.File;
import java.io.PrintWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import cc.mallet.types.Label;
import cc.mallet.types.LabelAlphabet;
import cc.mallet.util.ColorUtils;

/**
 * Diagnostic class that outputs HTML pages that allow you to view errors on a more
 *  global per-instance basis.
 *
 * Created: Mar 30, 2005
 *
 * @author "+name+": Extraction from Document");
    out.println ("");
    out.println ("");
    out.println ("");
    out.println ("");

    outputClassLegend (out, docExtr.getExtractedSpans ().getLabeledSpan (0).getLabel ().getLabelAlphabet ());
    outputRightWrongLegend (out);

    DualLabeledSpans spans = intersectSpans (docExtr);
    for (int i = 0; i < spans.size(); i++) {
      LabeledSpan predSpan = spans.get (i, 0);
      LabeledSpan trueSpan = spans.get (i, 1);

      Label predLabel = predSpan.getLabel ();
      Label trueLabel = trueSpan.getLabel ();

      boolean predNonBgrnd = !predSpan.isBackground ();
      boolean trueNonBgrnd = !trueSpan.isBackground ();
      boolean isBackground = !predNonBgrnd && !trueNonBgrnd;
      
      String spanClass = null;
      if (predNonBgrnd && trueNonBgrnd) {
        if (predLabel == trueLabel) {
          spanClass = "correct";
        } else {
          spanClass = "wrong";
        }
      } else if (predNonBgrnd) {
        spanClass = "pred";
      } else if (trueNonBgrnd) {
        spanClass = "true";
      }

      if (!isBackground) out.print ("");
      if (!isBackground) out.print ("");
      if (spanClass != null) { out.print (""); }

      String text = predSpan.getSpan ().getText ();
      text = text.replaceAll ("<", "<");
      text = text.replaceAll ("\n", "\n

"); out.print (text); if (spanClass != null) { out.print (""); } if (!isBackground) out.print (""); out.println (); } out.println (""); } private static void outputRightWrongLegend (PrintWriter out) { out.println ("

LEGEND
"); out.println ("Correct
"); out.println ("Wrong
"); out.println ("False Negative (True field but predicted background)
"); out.println ("False Positive (True background but predicted field)
"); out.println ("
"); } private static void outputClassLegend (PrintWriter out, LabelAlphabet dict) { out.println ("
"); out.println ("

LEGEND

"); String[] fields = determineFieldNames (dict); String[] colors = ColorUtils.rainbow (fields.length, (float) SATURATION, 1); for (int i = 0; i < fields.length; i++) { out.println (""+fields[i]+"
"); } out.println ("
"); } private static String[] determineFieldNames (LabelAlphabet dict) { List l = new ArrayList (); for (int i = 0; i < dict.size (); i++) { String lname = dict.lookupLabel (i).toString (); if (!lname.startsWith ("B-") && !lname.startsWith ("I-")) { l.add (lname); } } return (String[]) l.toArray (new String [l.size ()]); } private static DualLabeledSpans intersectSpans (DocumentExtraction docExtr) { int predIdx = 0; int trueIdx = 0; LabeledSpans trueSpans = docExtr.getTargetSpans (); LabeledSpans predSpans = docExtr.getExtractedSpans (); LabeledSpans retPredSpans = new LabeledSpans (predSpans.getDocument ()); LabeledSpans retTrueSpans = new LabeledSpans (predSpans.getDocument ()); while ((predIdx < predSpans.size()) && (trueIdx < trueSpans.size ())) { LabeledSpan predSpan = predSpans.getLabeledSpan (predIdx); LabeledSpan trueSpan = trueSpans.getLabeledSpan (trueIdx); LabeledSpan newPredSpan = (LabeledSpan) predSpan.intersection (trueSpan); LabeledSpan newTrueSpan = (LabeledSpan) trueSpan.intersection (predSpan); retPredSpans.add (newPredSpan); retTrueSpans.add (newTrueSpan); if (predSpan.getEndIdx () <= trueSpan.getEndIdx ()) { predIdx++; } if (trueSpan.getEndIdx () <= predSpan.getEndIdx ()) { trueIdx++; } } assert (retPredSpans.size() == retTrueSpans.size()); return new DualLabeledSpans (retPredSpans, retTrueSpans); } private static void outputIndex (File directory, Extraction extraction) throws IOException { PrintWriter out = new PrintWriter (new FileWriter (new File (directory, "index.html"))); out.println ("Extraction Results
    "); for (int i = 0; i < extraction.getNumDocuments(); i++) { String name = extraction.getDocumentExtraction (i).getName (); out.println ("
  1. "+name+"
  2. "); } out.println ("
"); out.close (); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy