/**
 *  Copyright 2014 Marco Cornolti
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package it.unipi.di.acube.batframework.datasetPlugins;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.lang.invoke.MethodHandles;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Vector;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPathExpressionException;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.DOMException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import it.unipi.di.acube.batframework.data.Annotation;
import it.unipi.di.acube.batframework.data.Mention;
import it.unipi.di.acube.batframework.data.Tag;
import it.unipi.di.acube.batframework.problems.A2WDataset;
import it.unipi.di.acube.batframework.utils.ProblemReduction;
import it.unipi.di.acube.batframework.utils.WikipediaInterface;

public class GERDAQDataset implements A2WDataset {
	private final static Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
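	// Parallel per-query structures: queries.get(i), tags.get(i) and
	// annotations.get(i) all describe the same query instance.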
	private List<String> queries = new Vector<String>();
	private List<HashSet<Tag>> tags = new Vector<HashSet<Tag>>();
	private List<HashSet<Annotation>> annotations = new Vector<HashSet<Annotation>>();
	private String name = null;

	public GERDAQDataset(String xmlFile, WikipediaInterface api, String nameSuffix) throws FileNotFoundException {
		this(new FileInputStream(new File(xmlFile)), api, nameSuffix);
	}

	public GERDAQDataset(InputStream stream, WikipediaInterface api, String nameSuffix) {
		this.name = "GERDAQ-" + nameSuffix;
		DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
		DocumentBuilder dBuilder;
		Document doc;
		try {
			dBuilder = dbFactory.newDocumentBuilder();
			doc = dBuilder.parse(stream);
		} catch (SAXException | IOException | ParserConfigurationException e) {
			throw new RuntimeException(e);
		}
		doc.getDocumentElement().normalize();

		List<HashMap<Mention, Vector<String>>> queryMenToTitles = new Vector<>();
		List<HashMap<Mention, Vector<Integer>>> queryMenToWids = new Vector<>();
		List<String> titlesToPrefetch = new Vector<>();
		List<Integer> widsToPrefetch = new Vector<>();
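		/*
		 * Illustrative sketch of the expected XML layout, reconstructed from
		 * the parsing logic below (element and attribute names are those the
		 * code looks for; the values are made up):
		 *
		 *   <instance>when did the <annotation rank_0_title="Apollo_11"
		 *       rank_0_id="1461">moon landing</annotation> happen</instance>
		 *
		 * Element children of <instance> must be <annotation> nodes whose text
		 * content is the mention's surface form; rank_h_title / rank_h_id
		 * attributes carry the ranked candidate Wikipedia titles and IDs.
		 * Plain text nodes are un-annotated query text.
		 */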
		NodeList nList = doc.getElementsByTagName("instance");
		for (int i = 0; i < nList.getLength(); i++) {
			HashMap<Mention, Vector<String>> mentionToTitles = new HashMap<>();
			HashMap<Mention, Vector<Integer>> mentionToWids = new HashMap<>();
			String query = "";
			Node instanceNode = nList.item(i);
			Element eElement = (Element) instanceNode;
			NodeList instElemList = eElement.getChildNodes();
			for (int j = 0; j < instElemList.getLength(); j++) {
				Node instElemNode = instElemList.item(j);
				if (instElemNode.getNodeType() == Node.ELEMENT_NODE) {
					if (!instElemNode.getNodeName().equals("annotation"))
						throw new RuntimeException(
								"Found a child element that is not an annotation.");
					int pos = query.length();
					query += instElemNode.getTextContent();
					int len = query.length() - pos;
					Mention men = new Mention(pos, len);
					mentionToTitles.put(men, new Vector<String>());
					mentionToWids.put(men, new Vector<Integer>());
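					// Collect the ranked candidate titles (rank_0_title,
					// rank_1_title, ...) and IDs attached to this annotation.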
					NamedNodeMap attrs = instElemNode.getAttributes();
					int h = 0;
					Node n;
					while ((n = attrs.getNamedItem(String.format(
							"rank_%d_title", h))) != null) {
						String title = n.getTextContent();
						titlesToPrefetch.add(title);
						mentionToTitles.get(men).add(title);
						h++;
					}
					h = 0;
					while ((n = attrs.getNamedItem(String.format(
							"rank_%d_id", h))) != null) {
						int wid = Integer.parseInt(n.getTextContent());
						widsToPrefetch.add(wid);
						mentionToWids.get(men).add(wid);
						h++;
					}
				} else if (instElemNode.getNodeType() == Node.TEXT_NODE)
					query += instElemNode.getTextContent();
			}
			queries.add(query);
			queryMenToTitles.add(mentionToTitles);
			queryMenToWids.add(mentionToWids);
		}

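		// Prefetch all referenced titles and IDs in one batch before they are
		// resolved individually below.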
		try {
			api.prefetchTitles(titlesToPrefetch);
			api.prefetchWids(widsToPrefetch);
		} catch (XPathExpressionException | IOException
				| ParserConfigurationException | SAXException e) {
			throw new RuntimeException(e);
		}
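		// Resolve each mention's top-ranked (rank-0) candidate against the
		// current Wikipedia snapshot: follow redirects, fall back from a
		// dangling ID to the stored title, and discard the annotation if
		// neither resolves.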
		try {
			for (int i = 0; i < queryMenToTitles.size(); i++) {
				HashSet<Tag> qTags = new HashSet<Tag>();
				HashSet<Annotation> qAnns = new HashSet<Annotation>();
				HashMap<Mention, Vector<String>> menToTitles = queryMenToTitles.get(i);
				HashMap<Mention, Vector<Integer>> menToWids = queryMenToWids.get(i);

				for (Mention m : menToTitles.keySet()) {
					int wid = menToWids.get(m).get(0);
					String title = menToTitles.get(m).get(0);
					String resolvedTitle = api.getTitlebyId(wid);
					int resolvedId = api.getIdByTitle(title);
					if (resolvedId != -1)
						resolvedId = api.dereference(resolvedId);

					if (resolvedTitle != null) {
						if (api.isRedirect(wid)) {
							LOG.warn("In dataset {}: Wikipedia ID {} is a redirect to {}.", this.getName(), wid, api.dereference(wid));
							wid = api.dereference(wid);
						}
						qAnns.add(new Annotation(m.getPosition(), m.getLength(), wid));
						if (!resolvedTitle.equals(title))
							LOG.warn("In dataset {}: The title associated with Wikipedia ID {} is not {} anymore, now it is {}.", this.getName(), wid, title, resolvedTitle);
					}
					else if (resolvedId != -1) {
						LOG.warn("In dataset {}: Wikipedia ID {} does not exist anymore. Falling back to resolving title {}, which led to Wikipedia ID {}.", this.getName(), wid, title, resolvedId);
						qAnns.add(new Annotation(m.getPosition(), m.getLength(), resolvedId));
					}
					else {
						LOG.error("In dataset {}: Wikipedia ID {} does not exist anymore and nor does title {}. Discarding annotation.",  this.getName(), wid, title);
					}
				}

				for (Vector<String> menTitles : queryMenToTitles.get(i).values()) {
					for (String title : menTitles) {
						int id = api.getIdByTitle(title);
						if (id == -1)
							LOG.error("In dataset {}: Could not find Wikipedia title: {}.", this.getName(), title);
						else
							qTags.add(new Tag(id));
					}
				}
				annotations.add(qAnns);
				tags.add(qTags);
			}
		} catch (DOMException | IOException e) {
			throw new RuntimeException(e);
		}

		if (queries.size() != tags.size() || tags.size() != annotations.size())
			throw new RuntimeException("Parsing error: query, tag and annotation lists have mismatching sizes.");
	}

	@Override
	public int getSize() {
		return queries.size();
	}

	@Override
	public String getName() {
		return name;
	}

	@Override
	public List<String> getTextInstanceList() {
		return queries;
	}

	@Override
	public int getTagsCount() {
		int count = 0;
		for (HashSet<Tag> tagSet : tags)
			count += tagSet.size();
		return count;
	}

	@Override
	public List<HashSet<Tag>> getC2WGoldStandardList() {
		return ProblemReduction.A2WToC2WList(annotations);
	}

	@Override
	public List<HashSet<Mention>> getMentionsInstanceList() {
		return ProblemReduction.A2WToD2WMentionsInstance(this
				.getA2WGoldStandardList());
	}

	@Override
	public List<HashSet<Annotation>> getD2WGoldStandardList() {
		return annotations;
	}

	@Override
	public List<HashSet<Annotation>> getA2WGoldStandardList() {
		return annotations;
	}

}
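/*
 * Usage sketch (the file name below is hypothetical, and any concrete
 * WikipediaInterface implementation will do, e.g. the framework's
 * WikipediaApiInterface):
 *
 *   WikipediaInterface api = ...;
 *   A2WDataset ds = new GERDAQDataset("gerdaq_devel.xml", api, "devel");
 *   System.out.printf("%s: %d queries, %d tags%n",
 *           ds.getName(), ds.getSize(), ds.getTagsCount());
 */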