justhalf.nlp.reader.acereader.ACEDocument Maven / Gradle / Ivy

Go to download
package justhalf.nlp.reader.acereader;

import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import javax.xml.parsers.DocumentBuilder;

import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.DOMException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.util.XMLUtils;
import justhalf.nlp.reader.acereader.ACERelation.ACERelationType;

/**
 * Represents an ACE document
 * 
 * The data structure defined here is based on the specifications found here:
 * 
 * https://www.ldc.upenn.edu/collaborations/past-projects/ace/annotation-tasks-and-specifications
 * 
 * This class can represent documents from either ACE 2004 or ACE 2005, as marked by the flag
 * {@link #versionIsACE2004}, which is detected automatically from the VERSION attribute of the
 * source_file tag in the APF file.
 * 
 * The flag {@link #textInLowercase} marks whether the original source text is all in lowercase,
 * as is the case for the Fisher transcript corpus in ACE 2004.
 * 
 * There are various lists of canonical entities, grouped by class:
 * 
 * {@link #entities}: for named entities
 * {@link #relations}: for relations
 * {@link #events}: for events (only in ACE 2005)
 * {@link #timexes}: for time expression (only in ACE 2005)
 * {@link #values}: for other values (only in ACE 2005)
 * 
 * 
 * And also the associated mentions:
 * 
 * {@link #entityMentions}: for named entities
 * {@link #relationMentions}: for relations
 * {@link #eventMentions}: for events (only in ACE 2005)
 * {@link #timexMentions}: for time expression (only in ACE 2005)
 * {@link #valueMentions}: for other values (only in ACE 2005)
 * 
 * 
 * The canonical entities typically contain a list of mentions (except {@link ACERelationType#METONYMY}
 * relations), which will actually mark the relevant spans in the text.
 * 
 * Maps of entity IDs and entity mention IDs to the corresponding objects are
 * available as {@link #objectsById} and {@link #objectMentionsById}.
 * 
 * 
 * The {@link #uri} stores the filename as given in the URI attribute in source_file tag
 * in the APF file.
 * 
 * The text (the relevant annotated texts) and full text (everything in the document) are available as
 * {@link #text} and {@link #fullText}.
 * 
 * @author Aldrian Obaja ([email protected])
 *
 */
public class ACEDocument implements Serializable{
	
	/** Debug switch: when true, fixSpan prints every mention whose unescaped text still contains an ampersand. */
	private static final boolean CHECK_ESCAPED_ENTITIES = false;
	/** Debug switch: when true, fixSpan prints mentions whose span was moved by more than the length of their text. */
	private static final boolean CHECK_OFFSET_TEXT = false;
	/** Debug switch: when true, fixSpan prints non-timex mentions whose span cannot be read from the text. */
	private static final boolean CHECK_OOB_MENTIONS = false;
	/** When true, a mention whose text cannot be located is queued for removal instead of raising an exception. */
	private static final boolean REMOVE_OOB_MENTIONS = true;
	/** Debug switch: when true, the stream constructor also runs the SGM text through the parser from XMLUtils. */
	private static final boolean TEST_STRICT_PARSING = false;
	private static final long serialVersionUID = -4698300709681532759L;

	/** The annotated text: the BODY content when metadata is excluded, otherwise the same as {@link #fullText}. */
	public String text;
	/** The unescaped text content of the whole SGM document. */
	public String fullText;
	/** Index of {@link #text} within {@link #fullText}; getSpan subtracts it from the APF character offsets. */
	public int offset;
	
	/** Value of the URI attribute of the SOURCE_FILE element in the APF document. */
	public String uri;
	/** True when the VERSION attribute of the SOURCE_FILE element is "4.0". */
	public boolean versionIsACE2004;
	/** True when {@link #text} is equal to its own lowercased form. */
	public boolean textInLowercase;
	// The collections below are created empty by the stream constructor before the APF document is processed.
	// NOTE(review): their generic type parameters appear to be missing in this copy of the source
	// (see the declaration of objectMentionsById) — restore them from the original file.
	public List entities;
	public List entityMentions;
	public List values;
	public List valueMentions;
	public List timexes;
	public List timexMentions;
	public List relations;
	public List relationMentions;
	public List events;
	public List eventMentions;
	public Map objectsById;
	public Map> objectMentionsById;
	
	/**
	 * Reads an ACE document from the given SGM file without excluding the metadata.
	 * Delegates to the two-argument constructor with excludeMetadata set to false.
	 * 
	 * @param sgmFilename location of the source SGM file
	 * @throws IOException
	 * @throws SAXException
	 */
	public ACEDocument(String sgmFilename) throws IOException, SAXException {
		this(sgmFilename, false);
	}
	
	public ACEDocument(String sgmFilename, boolean excludeMetadata) throws IOException, SAXException {
		this(sgmFilename, sgmFilename.replace(".sgm", ".apf.xml"), excludeMetadata);
	}
	
	/**
	 * Reads an ACE document from the given SGM and APF locations.
	 * Both names are opened through IOUtils.getInputStreamFromURLOrClasspathOrFileSystem and the
	 * resulting streams are handed to the stream-based constructor.
	 * 
	 * @param sgmFilename location of the source SGM file
	 * @param apfFilename location of the APF annotation file
	 * @param excludeMetadata passed on unchanged to the stream-based constructor
	 * @throws IOException
	 * @throws SAXException
	 */
	public ACEDocument(String sgmFilename, String apfFilename, boolean excludeMetadata) throws IOException, SAXException {
		this(IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(sgmFilename),
			 IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(apfFilename),
			 excludeMetadata);
	}
	
	/**
	 * Read an ACE document from the given source sgmStream and annotations apfStream.
	 * 
	 * Both streams are read as UTF-8. The text fields are taken from the SGM source, then the
	 * metadata, entities, values, timexes, relations and events are extracted from the APF
	 * document, in that order.
	 * 
	 * @param sgmStream stream over the source SGM text
	 * @param apfStream stream over the APF annotations
	 * @param excludeMetadata if true, {@link #text} holds only the content of the BODY element,
	 *        otherwise it is the same as {@link #fullText}
	 * @throws IOException
	 * @throws SAXException if parsing fails, or if excludeMetadata is true and the SGM source
	 *         has no BODY element
	 */
	public ACEDocument(InputStream sgmStream, InputStream apfStream, boolean excludeMetadata) throws IOException, SAXException{
		DOMParser parser = new DOMParser();
		String sgmText = IOUtils.slurpInputStream(sgmStream, "UTF-8");
		// BODY is renamed before parsing, presumably so that the HTML parser does not treat it
		// as the HTML body element
		sgmText = sgmText.replaceAll("<(/)?BODY>", "<$1BODY_TEXT>");
		parser.parse(new InputSource(new StringReader(sgmText)));
		Document sgm = parser.getDocument();
		if(TEST_STRICT_PARSING){
			// Parse again with the XML parser; only the exceptions it may throw are of interest
			DocumentBuilder docBuilder = XMLUtils.getXmlParser();
			docBuilder.parse(new InputSource(new StringReader(sgmText)));
		}
		this.fullText = unescape(sgm.getDocumentElement().getTextContent());
		if(excludeMetadata){
			// This should supposedly be TEXT tag, but some annotations are present even outside the TEXT tag
			Node body = sgm.getElementsByTagName("BODY_TEXT").item(0);
			if(body == null){
				throw new SAXException("No BODY element found in the SGM source");
			}
			this.text = unescape(body.getTextContent());
		} else {
			this.text = this.fullText;
		}
		// Locale.ROOT keeps the lowercase check independent of the default locale of the JVM
		this.textInLowercase = this.text.equals(this.text.toLowerCase(Locale.ROOT));
		this.offset = fullText.indexOf(text);
		
		this.entities = new ArrayList<>();
		this.entityMentions = new ArrayList<>();
		this.values = new ArrayList<>();
		this.valueMentions = new ArrayList<>();
		this.timexes = new ArrayList<>();
		this.timexMentions = new ArrayList<>();
		this.relations = new ArrayList<>();
		this.relationMentions = new ArrayList<>();
		this.events = new ArrayList<>();
		this.eventMentions = new ArrayList<>();
		
		this.objectsById = new HashMap<>();
		this.objectMentionsById = new HashMap<>();
		String apfText = IOUtils.slurpInputStream(apfStream, "UTF-8");
		// head is renamed for the same reason as BODY above; getMention looks up HEAD_EXTENT
		apfText = apfText.replaceAll("<(/)?head>", "<$1head_extent>");
		parser = new DOMParser();
		parser.parse(new InputSource(new StringReader(apfText)));
		Document apf = parser.getDocument();
		setMetadata(apf);
		extractEntities(apf);
		extractValues(apf);
		extractTimexes(apf);
		extractRelations(apf);
		extractEvents(apf);
	}
	
	/**
	 * Reads the document-level metadata from the first SOURCE_FILE element of the APF document
	 * and stores it in {@link #versionIsACE2004} and {@link #uri}.
	 * 
	 * @param apf the parsed APF document
	 */
	private void setMetadata(Document apf){
		NamedNodeMap sourceAttributes = apf.getElementsByTagName("SOURCE_FILE").item(0).getAttributes();
		String version = getAttribute(sourceAttributes, "VERSION");
		// ACE 2005 doesn't have version, so compare constant-first to tolerate a missing value
		this.versionIsACE2004 = "4.0".equals(version);
		this.uri = getAttribute(sourceAttributes, "URI");
	}
	
	/**
	 * Builds a span from the START and END attributes of a CHARSEQ node, shifted by
	 * {@link #offset} so that it indexes into {@link #text}.
	 * 
	 * @param charseq node carrying the START and END attributes
	 * @return the shifted span
	 */
	private Span getSpan(Node charseq){
		NamedNodeMap charseqAttributes = charseq.getAttributes();
		int rawStart = Integer.parseInt(getAttribute(charseqAttributes, "START"));
		// One is added to END, presumably because the annotation stores an inclusive end
		int rawEnd = Integer.parseInt(getAttribute(charseqAttributes, "END")) + 1;
		return new Span(rawStart - this.offset, rawEnd - this.offset);
	}

	// NOTE(review): the loop header two lines below is cut off in this copy of the source. The rest
	// of extractEntities and the signature of the following method (which takes a list named
	// "mentions") are not visible here; the comments below describe only the visible remainder.
	private void extractEntities(Document apf) throws NumberFormatException, DOMException {
		NodeList entities = apf.getElementsByTagName("ENTITY");
		for(int i=0; i> mentions){
		// Shift found for the previous mention; handed to fixSpan as the starting hint for the next one
		int lastDiff = 0;
		List> toBeRemoved = new ArrayList>();
		for(ACEObjectMention mention: mentions){
			if(mention instanceof ACEEntityMention){
				// The head span is fixed as well, but its returned shift is deliberately not kept
				fixSpan(lastDiff, toBeRemoved, mention, ((ACEEntityMention)mention).headSpan, ((ACEEntityMention)mention).headText);
			}
			lastDiff = fixSpan(lastDiff, toBeRemoved, mention, mention.span, mention.text);
		}
		// Drop the mentions fixSpan could not locate, both from the list and from their owner
		for(ACEObjectMention mention: toBeRemoved){
			mentions.remove(mention);
			if(mention instanceof ACEEntityMention){
				((ACEEntityMention)mention).entity.mentions.remove(mention);
			} else if(mention instanceof ACERelationMention){
				((ACERelationMention)mention).relation.mentions.remove(mention);
			} else if(mention instanceof ACETimexMention){
				((ACETimexMention)mention).timex.mentions.remove(mention);
			}
		}
		if(toBeRemoved.size() > 0){
			System.out.println("Removed "+toBeRemoved.size()+" out-of-bounds mentions from "+uri);
		}
	}

	/**
	 * Makes the given span agree with the text the annotation says it covers.
	 * 
	 * If the characters of {@link #text} under the span differ from the unescaped annotated text,
	 * the annotated text is searched backwards from (span.start - lastDiff) and the span is moved
	 * onto the occurrence found. If there is no such occurrence, the mention is added to
	 * toBeRemoved when REMOVE_OOB_MENTIONS is set, otherwise a RuntimeException is thrown.
	 * 
	 * @param lastDiff shift applied to a previously fixed span, used as the search hint
	 * @param toBeRemoved receives the mention when its text cannot be located
	 * @param mention the mention owning the span (used for reporting and removal)
	 * @param span the span to check; modified in place when it has to be moved
	 * @param text the text recorded for the span in the annotation
	 * @return the new shift (old start minus new start) when the span was moved, otherwise lastDiff
	 * @throws RuntimeException when the text cannot be located and REMOVE_OOB_MENTIONS is false
	 */
	// NOTE(review): span may be null here for an entity mention without a head (getMention passes
	// a null headSpan in that case), which would raise a NullPointerException below — confirm.
	private int fixSpan(int lastDiff, List> toBeRemoved, ACEObjectMention mention,
			Span span, String text) throws RuntimeException {
		String originalText = text;
		String unescapedOriginalText = unescape(originalText);
		String actualText = null;
		try{
			actualText = span.getText(this.text);
		} catch (StringIndexOutOfBoundsException e){
			// Span lies outside the text: treat it as covering nothing so the search below kicks in
			actualText = "";
			if(CHECK_OOB_MENTIONS){
				if(!(mention instanceof ACETimexMention)){
					System.out.printf("%-45s[%d,%d]: %s\n", mention.getFullID(), span.start, span.end, unescapedOriginalText.replace("\n", " "));
				}
			}
		}
		if(CHECK_ESCAPED_ENTITIES){
			if(unescapedOriginalText.contains("&")){
				System.out.println(unescapedOriginalText.contains(";")+" "+unescapedOriginalText.replace("\n", " "));
			}
		}
		if(!actualText.equals(unescapedOriginalText)){
			// Search backwards, starting from where the previous shift suggests the text should be
			int index = this.text.lastIndexOf(unescapedOriginalText, Math.min(this.text.length(), span.start-lastDiff));
			if(index == -1){
				if(REMOVE_OOB_MENTIONS){
					toBeRemoved.add(mention);
					return lastDiff;
				}
				System.err.println("Cannot find "+unescapedOriginalText+" in "+this.text);
				throw new RuntimeException();
			}
			// Move the span onto the occurrence found and remember how far it was shifted
			int diff = span.start - index;
			span.start = index;
			span.end = index+unescapedOriginalText.length();
			lastDiff = diff;
			if(CHECK_OFFSET_TEXT){
				// Report only shifts larger than the mention itself, with ten characters of context on each side
				if(diff > unescapedOriginalText.length()){
					System.out.printf("%-45s[%4d->%4d]: %s_%s_%s\n", mention.getFullID(), index+diff, index,
							this.text.substring(Math.max(0, span.start-10), span.start).replace("\n", " "),
							unescapedOriginalText.replace("\n", " "),
							this.text.substring(span.end, Math.min(this.text.length(), span.end+10)).replace("\n", " "));
				}
			}
		}
		return lastDiff;
	}
	
	/**
	 * Creates the entity mention described by an APF mention node.
	 * 
	 * The ID, TYPE, LDCTYPE and LDCATR attributes are read from the node, the extent comes from
	 * the first CHARSEQ under EXTENT, and the head from the first CHARSEQ under HEAD_EXTENT.
	 * When no head is present, the head span is null and the head text is empty.
	 * 
	 * @param entityMention the APF node of the mention
	 * @param aceEntity the entity the mention belongs to
	 * @return the newly created mention
	 */
	private ACEEntityMention getMention(Node entityMention, ACEEntity aceEntity){
		NamedNodeMap attrs = entityMention.getAttributes();
		String id = getAttribute(attrs, "ID");
		String type = getAttribute(attrs, "TYPE");
		String ldcType = getAttribute(attrs, "LDCTYPE");
		String ldcAtr = getAttribute(attrs, "LDCATR");
		Element mentionElement = (Element)entityMention;

		// All entities in ACE are contiguous
		Element extentElement = (Element)mentionElement.getElementsByTagName("EXTENT").item(0);
		Node extentChars = extentElement.getElementsByTagName("CHARSEQ").item(0);
		Span extentSpan = getSpan(extentChars);
		String extentText = extentChars.getTextContent();

		Span headSpan = null;
		String headText = "";
		Node headElement = mentionElement.getElementsByTagName("HEAD_EXTENT").item(0);
		if(headElement != null){
			Node headChars = ((Element)headElement).getElementsByTagName("CHARSEQ").item(0);
			if(headChars != null){
				headSpan = getSpan(headChars);
				headText = headChars.getTextContent();
			}
		}

		return new ACEEntityMention(id, type, ldcType, ldcAtr, aceEntity,
									extentSpan, headSpan, extentText, headText, SpanLabel.get(aceEntity.type.name()));
	}
	
	// Begins by collecting the VALUE elements of the APF document.
	// NOTE(review): the loop header inside this method is cut off in this copy of the source, and
	// the remainder of the line belongs to a later statement that is only partly visible.
	private void extractValues(Document apf){
		NodeList values = apf.getElementsByTagName("VALUE");
		for(int i=0; i found in the mention: "+mention.getTextContent());
	}
	
	// Begins by collecting the argument elements of a relation mention; the tag name depends on
	// whether the document is ACE 2004 (REL_MENTION_ARG) or not (RELATION_MENTION_ARGUMENT).
	// NOTE(review): the loop header below is cut off in this copy of the source. The statements
	// after it build an event mention and appear to belong to a different method.
	private void getRelationMentionArguments(Node relationMention, ACERelation aceRelation,
											 ACEEntityMention[] _entityMentions, ACETimexMention[] _timestamp, String[] _timestampType){
		NodeList relationMentionArgs = ((Element)relationMention).getElementsByTagName(versionIsACE2004 ? "REL_MENTION_ARG" : "RELATION_MENTION_ARGUMENT");
		for(int i=0; i[] args = getEventMentionArguments(eventMention, aceEvent);
		NamedNodeMap attributes = eventMention.getAttributes();
		String id = getAttribute(attributes, "ID");
		// Span and text of the mention extent
		Node charseq = getMentionCharseq(eventMention, "EXTENT");
		Span span = getSpan(charseq);
		String text = charseq.getTextContent();
		// Span and text of the LDC_SCOPE part
		Node scopeCharseq = getMentionCharseq(eventMention, "LDC_SCOPE");
		Span scopeSpan = getSpan(scopeCharseq);
		String scopeText = scopeCharseq.getTextContent();
		// Span and text of the ANCHOR part
		Node anchorCharseq = getMentionCharseq(eventMention, "ANCHOR");
		Span anchorSpan = getSpan(anchorCharseq);
		String anchorText = anchorCharseq.getTextContent();
		return new ACEEventMention(id, span, text, aceEvent, scopeSpan, scopeText, anchorSpan, anchorText, args);
	}
	
	// Begins by collecting the EVENT_MENTION_ARGUMENT elements of an event mention and allocating
	// one result slot per element.
	// NOTE(review): the loop header below is cut off in this copy of the source. The statements
	// after it print the mentions of a document and appear to belong to a different method.
	private ACEObjectMention[] getEventMentionArguments(Node eventMention, ACEEvent aceEvent){
		NodeList eventMentionArgs = ((Element)eventMention).getElementsByTagName("EVENT_MENTION_ARGUMENT");
		ACEObjectMention[] result = new ACEObjectMention[eventMentionArgs.getLength()];
		for(int i=0; i> mentions){
		for(ACEObjectMention mention: mentions){
			try{
				System.out.println(mention.toString(doc.text));
			} catch (RuntimeException e){
				// Dump the document state for the failing mention before rethrowing
				System.out.println("===TEXT===");
				System.out.println(doc.text);
				System.out.println("===FULL TEXT===");
				System.out.println(doc.fullText);
				System.out.println("===SGM===");
				System.out.println(doc.uri);
				System.out.println("===TEXT LENGTH===");
				System.out.println(doc.text.length());
				System.out.println("===OFFSET===");
				System.out.println(doc.offset);
				System.out.println("===MENTION===");
				System.out.println(mention.text);
				System.out.println(mention.span);
				throw e;
			}
		}
	}
	
	/**
	 * Decodes the character entities {@code &amp;}, {@code &lt;} and {@code &gt;} in the given
	 * string, ignoring case. No other entity is decoded.
	 * 
	 * The ampersand entity is decoded first, so a doubly escaped sequence such as
	 * {@code &amp;lt;} ends up fully decoded.
	 * 
	 * @param xml the text to decode
	 * @return the text with the three entities replaced by the characters they stand for
	 */
	public static String unescape(String xml){
		String result = xml.replaceAll("(?i)&amp;", "&");
		result = result.replaceAll("(?i)&lt;", "<");
		result = result.replaceAll("(?i)&gt;", ">");
		return result;
	}
}