it.unipi.di.acube.batframework.utils.DumpData Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of bat-framework Show documentation
A framework to compare entity annotation systems.
The newest version!
/**
 * (C) Copyright 2012-2013 A-cube lab - Università di Pisa - Dipartimento di Informatica. 
 * BAT-Framework is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
 * BAT-Framework is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License along with BAT-Framework.  If not, see <http://www.gnu.org/licenses/>.
 */

package it.unipi.di.acube.batframework.utils;

import it.unipi.di.acube.batframework.data.Annotation;
import it.unipi.di.acube.batframework.data.ScoredAnnotation;
import it.unipi.di.acube.batframework.data.ScoredTag;
import it.unipi.di.acube.batframework.data.Tag;
import it.unipi.di.acube.batframework.metrics.MatchRelation;

import java.io.IOException;
import java.util.*;

/**
 * Utility methods to dump a dataset.
 */
public class DumpData {

	/**
	 * Dump an entire dataset.
	 * 
	 * @param texts
	 *            the instances of the dataset.
	 * @param gs
	 *            the gold standard (must be of the same size as {@code texts}).
	 * @param api
	 *            the API to Wikipedia (needed to print information about
	 *            annotations/tags).
	 * @param 
	 *            the type of this dataset's gold standard.
	 * @throws IOException
	 *             if something went wrong while querying the Wikipedia API.
	 */
	public static  void dumpDataset(List texts,
			List> gs, WikipediaInterface api) throws IOException {
		for (int i = 0; i < texts.size(); i++)
			dumpCompare(texts.get(i), gs.get(i), null, api);
	}

	/**
	 * Dump an entire output of a tagger for a dataset.
	 * 
	 * @param texts
	 *            the instances of the dataset.
	 * @param output
	 *            the output computed by a tagger (must be of the same size as
	 *            {@code texts}).
	 * @param api
	 *            the API to Wikipedia (needed to print information about
	 *            annotations/tags).
	 * @param 
	 *            the type of this output's gold standard.
	 * @throws IOException
	 *             if something went wrong while querying the Wikipedia API.
	 */
	public static  void dumpOutput(List texts,
			List> output, WikipediaInterface api)
			throws IOException {
		for (int i = 0; i < texts.size(); i++)
			dumpCompare(texts.get(i), null, output.get(i), api);
	}

	/**
	 * Dump, for each document of a dataset, the expected output (gold standard)
	 * and the actual output (found by an annotator).
	 * 
	 * @param texts
	 *            the instances of the dataset.
	 * @param expectedResult
	 *            the gold standard provided by a dataset, one for each instance
	 *            (must have the same size as {@code texts}).
	 * @param computedResult
	 *            the solution found by an annotator, one for each instance
	 *            (must have the same size as {@code texts}).
	 * @param api
	 *            the API to Wikipedia (needed to print information about
	 *            annotations/tags).
	 * @param 
	 *            the type of this list elements.
	 * @throws IOException
	 *             if something went wrong while querying the Wikipedia API.
	 */
	public static  void dumpCompareList(List texts,
			List> expectedResult, List> computedResult,
			WikipediaInterface api) throws IOException {
		dumpCompareList(texts, expectedResult, computedResult, api, true);

	}

	/**
	 * Dump, for each document of a dataset, the expected output (gold standard)
	 * and the actual output (found by an annotator).
	 * 
	 * @param texts
	 *            the instances of the dataset.
	 * @param expectedResult
	 *            the gold standard provided by a dataset, one for each instance
	 *            (must have the same size as {@code texts}).
	 * @param computedResult
	 *            the solution found by an annotator, one for each instance
	 *            (must have the same size as {@code texts}).
	 * @param api
	 *            the API to Wikipedia (needed to print information about
	 *            annotations/tags).
	 * @param printEmptyDocs
	 *            whether or not to print documents with an empty gold standard
	 *            and an empty solution.
	 * @param 
	 *            the type of these lists elements.
	 * @throws IOException
	 *             if something went wrong while querying the Wikipedia API.
	 */
	public static  void dumpCompareList(List texts,
			List> expectedResult, List> computedResult,
			WikipediaInterface api, boolean printEmptyDocs)
			throws IOException {
		dumpCompareList(texts, expectedResult, computedResult, api, printEmptyDocs, null);
	}
	
	/**
	 * Dump, for each document of a dataset, the expected output (gold standard)
	 * and the actual output (found by an annotator).
	 * 
	 * @param texts
	 *            the instances of the dataset.
	 * @param expectedResult
	 *            the gold standard provided by a dataset, one for each instance
	 *            (must have the same size as {@code texts}).
	 * @param computedResult
	 *            the solution found by an annotator, one for each instance
	 *            (must have the same size as {@code texts}).
	 * @param api
	 *            the API to Wikipedia (needed to print information about
	 *            annotations/tags).
	 * @param printEmptyDocs
	 *            whether or not to print documents with an empty gold standard
	 *            and an empty solution.
	 * @param mr
	 *            match relation used to dump annotations about TP/FP/FN
	 * @param 
	 *            the type of these lists elements.
	 * @throws IOException
	 *             if something went wrong while querying the Wikipedia API.
	 */
	public static  void dumpCompareList(List texts,
			List> expectedResult, List> computedResult,
			WikipediaInterface api, boolean printEmptyDocs, MatchRelation mr)
			throws IOException {
		for (int i = 0; i < texts.size(); i++) {
			if (printEmptyDocs
					|| (!printEmptyDocs && (!expectedResult.get(i).isEmpty() || !computedResult
							.get(i).isEmpty()))) {
				DumpData.dumpCompareMatch(texts.get(i), expectedResult.get(i),
						computedResult.get(i), mr, api);
				System.out.println();
			}
		}

	}

	/**
	 * Dumps the text, the annotations provided by the gold standard and those
	 * found by a tagger for a single document.
	 * 
	 * @param text
	 *            the document.
	 * @param expectedResult
	 *            the expected results provided by a dataset (if {@code null},
	 *            it is not printed).
	 * @param computedResult
	 *            the results found by an annotator (if {@code null}, it is not
	 *            printed).
	 * @param api
	 *            the API to Wikipedia (needed to print information about
	 *            annotations/tags).
	 * @param mr
	 *            a match relation to compare the results.
	 * @param 
	 *            the type of result.
	 * @throws IOException
	 *             if something went wrong while querying the Wikipedia API.
	 */
	public static  void dumpCompareMatch(String text,
			HashSet expectedResult, HashSet computedResult,
			MatchRelation mr, WikipediaInterface api) throws IOException {
		System.out.println("Text: " + text);
		if (expectedResult != null) {
			System.out.println();
			System.out.println("Gold standard: ");
			for (T a : expectedResult) {
				String note = "";
				if (mr != null) {
					note = "FN";
					for (T t : computedResult)
						if (mr.match(t, a))
							note = "";
				}
				printAnnotation(text, a, api, note);
			}
		}
		if (computedResult != null) {
			System.out.println();
			System.out.println("System output: ");
			List list = new Vector();
			for (T t : computedResult)
				list.add(t);
			Collections.sort(list);
			for (T a : list) {
				String note = "";
				if (mr != null) {
					note = "FP";
					for (T t : expectedResult)
						if (mr.match(a, t))
							note = "TP";
				}
				printAnnotation(text, a, api, note);
			}
		}
	}

	public static  void dumpCompare(String text,
			HashSet expectedResult, HashSet computedResult,
			WikipediaInterface api) throws IOException {
		dumpCompareMatch(text, expectedResult, computedResult, null, api);
	}

	private static  void printAnnotation(String text, T a,
			WikipediaInterface api, String note) throws IOException {
		if (a instanceof ScoredAnnotation)
			System.out.printf("\t%s: %s -> %s (wid=%d) (score=%.3f)%n", note,
					text.substring(((ScoredAnnotation) a).getPosition(),
							((Annotation) a).getPosition()
									+ ((ScoredAnnotation) a).getLength()), api
							.getTitlebyId(a.getConcept()), a.getConcept(),
					((ScoredAnnotation) a).getScore());
		else if (a instanceof Annotation)
			System.out.printf(
					"\t%s: %s (%d, %d) -> %s (%d)%n",
					note,
					text.substring(
							((Annotation) a).getPosition(),
							((Annotation) a).getPosition()
									+ ((Annotation) a).getLength()),
					((Annotation) a).getPosition(),
					((Annotation) a).getPosition()
							+ ((Annotation) a).getLength(), api.getTitlebyId(a
							.getConcept()), a.getConcept());
		else if (a instanceof ScoredTag)
			System.out.printf("\t%s: %s (wid=%d) (score=%.3f)%n", note,
					api.getTitlebyId(a.getConcept()), a.getConcept(),
					((ScoredTag) a).getScore());
		else if (a instanceof Tag)
			System.out.printf("\t%s: %s (%d)", note,
					api.getTitlebyId(a.getConcept()), a.getConcept());

	}

}