cc.mallet.extract.test.TestPerDocumentF1Evaluator Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of mallet Show documentation
MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.
The newest version!
/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */
package cc.mallet.extract.test;

import junit.framework.Test;
import junit.framework.TestCase;
import junit.framework.TestSuite;

import java.io.ByteArrayOutputStream;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.io.OutputStreamWriter;

import cc.mallet.extract.*;
import cc.mallet.pipe.*;
import cc.mallet.pipe.iterator.ArrayIterator;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import cc.mallet.types.LabelAlphabet;
import cc.mallet.types.Sequence;
import cc.mallet.util.CharSequenceLexer;

/**
 * Created: Nov 18, 2004
 *
 * @author the big red fox did it",
    "it was done by the dog",
    "the cat ate the canary",
    "the hamburger was eaten by the kid",
    "the dog was eaten with zest",
    "four score and seven years ago"

  };

  // Reference ("true") documents for testPerDocEval; passed to
  // createExtractionFrom as trueStrings, paired index-by-index with testPred.
  // NOTE(review): these literals contain no SGML markup as seen here, yet they
  // are fed through SGML2TokenSequence and testAExpected reports "eater" and
  // "meal" fields -- the inline tags may have been stripped from this copy of
  // the file; confirm against the upstream source.
  private static String[] testTrue = {
    "the big red fox did it",
    "it was done by the dog",
    "the cat ate the canary",
    "the hamburger was eaten by the kid",
    "the dog was eaten with zest",
    "four score and seven years ago"
  };


  /**
   * Builds an {@link Extraction} from two parallel arrays of document strings.
   * Both arrays are sent through one shared pipe (SGML2TokenSequence, then
   * Target2LabelSequence, then PrintInputAndTarget).  For every index of the
   * predicted list, one DocumentExtraction named "TEST" + index is added, made
   * from the predicted instance's data and target plus the target of the
   * instance at the same index in the true list.
   *
   * @param predStrings documents supplying the tokenization and predicted labels
   * @param trueStrings documents supplying the reference labels; must have at
   *                    least as many entries as predStrings, since it is indexed
   *                    in step with it
   * @return the extraction holding one DocumentExtraction per predicted document
   */
  private Extraction createExtractionFrom (String[] predStrings, String[] trueStrings)
  {
    CharSequenceLexer lexer = new CharSequenceLexer (CharSequenceLexer.LEX_NONWHITESPACE_CLASSES);
    Pipe[] stages = {
      new SGML2TokenSequence (lexer, "O"),
      new Target2LabelSequence (),
      new PrintInputAndTarget (),
    };
    Pipe pipeline = new SerialPipes (stages);

    // Predictions go through the pipe before the reference documents, as in
    // the original ordering, so the shared alphabets are filled in the same order.
    InstanceList predicted = new InstanceList (pipeline);
    predicted.addThruPipe (new ArrayIterator (predStrings));

    InstanceList reference = new InstanceList (pipeline);
    reference.addThruPipe (new ArrayIterator (trueStrings));

    LabelAlphabet labels = (LabelAlphabet) pipeline.getTargetAlphabet ();
    Extraction result = new Extraction (null, labels);

    int numDocs = predicted.size ();
    for (int doc = 0; doc < numDocs; doc++) {
      Instance predInstance = predicted.get (doc);
      Instance trueInstance = reference.get (doc);
      result.addDocumentExtraction (new DocumentExtraction (
          "TEST" + doc,
          labels,
          (Tokenization) predInstance.getData (),
          (Sequence) predInstance.getTarget (),
          (Sequence) trueInstance.getTarget (),
          "O"));
    }

    return result;
  }

  // Exact text that testPerDocEval expects PerDocumentF1Evaluator to write when
  // called with the description "Testing": a title line, a tab-separated
  // Name/P/R/F1 header, one row per label, then the two OVERALL summary lines
  // and a trailing blank line.  Lines are terminated with "\n".
  private static final String testAExpected = "Testing per-document F1\nName\tP\tR\tF1\n" +
          "eater\t0.6667\t0.5\t0.5714\n" +
          "O\t0\t1\t0\n" +
          "meal\t0.25\t0.3333\t0.2857\n" +
          "OVERALL (micro-averaged) P=0.4286 R=0.4286 F1=0.4286\n" +
          "OVERALL (macro-averaged) F1=0.4286\n\n";

  /**
   * Runs {@link PerDocumentF1Evaluator} on the extraction built from
   * testPred / testTrue and checks that the report it writes equals
   * {@code testAExpected}.  Evaluator error output is sent to System.out.
   */
  public void testPerDocEval ()
  {
    Extraction extraction = createExtractionFrom (testPred, testTrue);
    PerDocumentF1Evaluator eval = new PerDocumentF1Evaluator ();
    eval.setErrorOutputStream (System.out);

    ByteArrayOutputStream out = new ByteArrayOutputStream ();
    PrintWriter writer = new PrintWriter (new OutputStreamWriter (out), true);
    eval.evaluate ("Testing", extraction, writer);

    // Auto-flush only triggers on println/printf/format.  Flush explicitly so
    // that anything still buffered inside the OutputStreamWriter reaches the
    // byte array before it is read, whatever the evaluator's last write was.
    writer.flush ();

    String output = out.toString ();
    assertEquals (testAExpected, output);
  }

  // Predicted documents for the per-field tests; passed as predStrings to
  // createExtractionFrom by testPerFieldEval and testToStdout.
  // NOTE(review): no SGML field markup is visible in these literals although
  // mpdExpected reports "title" and "author" fields -- tags may have been
  // stripped from this copy; confirm against the upstream source.
  private static final String[] mpdPred = {
    "Wizard of Oz by John Smith and Adam Felber",
    "Jisp Boo Fuzz by the estimable Rich Q. Doe and Frank Wilson",
    "Howdy Doody if you think this is Mr. nonsense don't you huh",
  };

  // Reference documents paired index-by-index with mpdPred; passed as
  // trueStrings to createExtractionFrom by testPerFieldEval and testToStdout.
  // NOTE(review): as seen here these differ from mpdPred only by whitespace,
  // which cannot account for the differing Pred/Target counts in mpdExpected --
  // field markup has presumably been lost from this copy; verify upstream.
  private static final String[] mpdTrue = {
    "Wizard of Oz by John Smith and Adam Felber",
    "Jisp Boo Fuzz by the estimable Rich Q. Doe and Frank Wilson",
    "Howdy Doody if you think this is Mr.  nonsense don't you huh",
  };

  // Exact report text expected from PerFieldF1Evaluator for the description
  // "Testing".  It has two tab-separated tables: segment counts
  // (Name/Correct/Pred/Target) followed by per-field scores (Name/P/R/F1) with
  // the two OVERALL summary lines.  Shared as the expected value by
  // testPerFieldEval, testPunctuationIgnoringEvaluator and testFieldCleaning.
  private static final String mpdExpected = "Testing SEGMENT counts\nName\tCorrect\tPred\tTarget\n" +
          "title\t2\t4\t5\n" +
          "O\t0\t0\t0\n" +
          "author\t3\t4\t5\n" +
          "\nTesting per-field F1\n" +
          "Name\tP\tR\tF1\n" +
          "title\t0.5\t0.4\t0.4444\n" +
          "O\t0\t1\t0\n" +
          "author\t0.75\t0.6\t0.6667\n" +
          "OVERALL (micro-averaged) P=0.625 R=0.5 F1=0.5556\n" +
          "OVERALL (macro-averaged) F1=0.5556\n\n";

  /**
   * Evaluates the mpdPred / mpdTrue extraction with a default
   * {@link PerFieldF1Evaluator}, capturing the report in memory, and checks
   * it against {@code mpdExpected}.
   */
  public void testPerFieldEval ()
  {
    Extraction extr = createExtractionFrom (mpdPred, mpdTrue);
    PerFieldF1Evaluator evaluator = new PerFieldF1Evaluator ();

    ByteArrayOutputStream captured = new ByteArrayOutputStream ();
    PrintStream sink = new PrintStream (captured);
    evaluator.evaluate ("Testing", extr, sink);

    String actual = captured.toString ();
    assertEquals (mpdExpected, actual);
  }

    /**
     * Calls the single-argument {@code evaluate} on the mpdPred / mpdTrue
     * extraction.  Nothing is asserted; a reminder is printed asking the
     * person running the test to check that a report appeared.
     */
    public void testToStdout ()
  {
    Extraction extr = createExtractionFrom (mpdPred, mpdTrue);
    new PerFieldF1Evaluator ().evaluate (extr);
    System.out.println ("*** Please verify that something was output above.");
  }

  // Predicted documents for the punctuation tests: the mpdPred texts with extra
  // punctuation ("," after "Oz", "!," after "Doody").  Passed as predStrings by
  // testPunctuationIgnoringEvaluator and testFieldCleaning.
  // NOTE(review): field markup appears to be missing from this copy, as with
  // mpdPred; confirm against the upstream source.
  private static final String[] punctPred = {
    "Wizard of Oz, by John Smith and Adam Felber",
    "Jisp Boo Fuzz by the estimable Rich Q. Doe and Frank Wilson",
    "Howdy Doody!, if you think this is Mr. nonsense don't you huh",
  };

  // Reference documents paired index-by-index with punctPred; passed as
  // trueStrings by testPunctuationIgnoringEvaluator and testFieldCleaning.
  // Both tests compare the resulting report against mpdExpected.
  // NOTE(review): field markup appears to be missing from this copy, as with
  // mpdTrue; confirm against the upstream source.
  private static final String[] punctTrue = {
    "Wizard of Oz, by John Smith and Adam Felber",
    "Jisp Boo Fuzz by the estimable Rich Q. Doe and Frank Wilson",
    "Howdy Doody!, if you think this is Mr.  nonsense don't you huh",
  };

  //xxx  Currently fails because grabbing the field span for Howdy Doody! grabs the </title> as
  //  well.  I think this is because getting the text subspan goes to the start of the next,
  //  rather than the end of the last.  It seems like that should be changed, but I'd need to
  //  think about the implications for Rexa before doing this.
  /**
   * Evaluates the punctPred / punctTrue extraction with a
   * {@link PerFieldF1Evaluator} whose comparator is a
   * {@link PunctuationIgnoringComparator}, and expects the same report as the
   * punctuation-free data ({@code mpdExpected}).  Evaluator error output goes
   * to System.out.
   */
  public void testPunctuationIgnoringEvaluator ()
  {
    Extraction extr = createExtractionFrom (punctPred, punctTrue);

    PerFieldF1Evaluator evaluator = new PerFieldF1Evaluator ();
    evaluator.setComparator (new PunctuationIgnoringComparator ());
    evaluator.setErrorOutputStream (System.out);

    ByteArrayOutputStream captured = new ByteArrayOutputStream ();
    PrintStream sink = new PrintStream (captured);
    evaluator.evaluate ("Testing", extr, sink);

    String actual = captured.toString ();
    assertEquals (mpdExpected, actual);
  }

  /**
   * Cleans the fields of the punctPred / punctTrue extraction with a
   * {@link RegexFieldCleaner} built from the pattern {@code <.*?>|,|!}, then
   * evaluates with a default {@link PerFieldF1Evaluator} and expects the
   * report to equal {@code mpdExpected}.
   */
  public void testFieldCleaning ()
  {
    Extraction extr = createExtractionFrom (punctPred, punctTrue);
    RegexFieldCleaner cleaner = new RegexFieldCleaner ("<.*?>|,|!");
    extr.cleanFields (cleaner);

    PerFieldF1Evaluator evaluator = new PerFieldF1Evaluator ();
    ByteArrayOutputStream captured = new ByteArrayOutputStream ();
    PrintStream sink = new PrintStream (captured);
    evaluator.evaluate ("Testing", extr, sink);

    String actual = captured.toString ();
    assertEquals (mpdExpected, actual);
  }

  /**
   * Command-line entry point.  With no arguments the suite returned by
   * {@code suite()} is run; otherwise each argument is passed to the
   * {@code TestPerDocumentF1Evaluator(String)} constructor and only those
   * tests are run, in the order given.
   *
   * @param args names handed to the test-case constructor; may be empty
   * @throws Throwable anything propagated from building or running the suite
   */
  public static void main (String[] args) throws Throwable
  {
    TestSuite toRun;
    if (args.length == 0) {
      toRun = (TestSuite) suite ();
    } else {
      toRun = new TestSuite ();
      for (String testName : args) {
        toRun.addTest (new TestPerDocumentF1Evaluator (testName));
      }
    }

    junit.textui.TestRunner.run (toRun);
  }

}
Related Artifacts