All Downloads are FREE. Search and download functionalities are using the official Maven repository.

justhalf.nlp.reader.acereader.ACEDocument Maven / Gradle / Ivy

package justhalf.nlp.reader.acereader;

import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.xml.parsers.DocumentBuilder;

import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.DOMException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.util.XMLUtils;
import justhalf.nlp.reader.acereader.ACERelation.ACERelationType;

/**
 * 

Represents an ACE document

* *

The data structures defined here are based on the specifications found at: https://www.ldc.upenn.edu/collaborations/past-projects/ace/annotation-tasks-and-specifications

* *

This class can represent documents from either ACE 2004 or ACE 2005, as marked by the flag {@link #versionIsACE2004}, which is detected automatically from the VERSION attribute of the source_file tag in the APF file (the value "4.0" marks ACE 2004).

* *

The flag {@link #textInLowercase} marks whether the original source text is entirely in lowercase, as is the case for the Fisher transcript corpus in ACE 2004.

* * There are various lists of canonical entities, grouped by class: *
    *
  • {@link #entities}: for named entities
  • *
  • {@link #relations}: for relations
  • *
  • {@link #events}: for events (only in ACE 2005)
  • *
  • {@link #timexes}: for time expressions (only in ACE 2005)
  • *
  • {@link #values}: for other values (only in ACE 2005)
  • *
* * And also the associated mentions: *
    *
  • {@link #entityMentions}: for named entities
  • *
  • {@link #relationMentions}: for relations
  • *
  • {@link #eventMentions}: for events (only in ACE 2005)
  • *
  • {@link #timexMentions}: for time expressions (only in ACE 2005)
  • *
  • {@link #valueMentions}: for other values (only in ACE 2005)
  • *
* *

The canonical entities typically contain a list of mentions (except {@link ACERelationType#METONYMY} relations), which mark the relevant spans in the text. Maps of entity IDs and entity mention IDs to the corresponding objects are available as {@link #objectsById} and {@link #objectMentionsById}.

* *

The {@link #uri} field stores the filename as given in the URI attribute of the source_file tag in the APF file.

* *

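The text (the relevant annotated text) and the full text (everything in the document) are available as {@link #text} and {@link #fullText}. When metadata is excluded, {@link #text} holds only the content of the BODY element of the SGM file; otherwise it is identical to {@link #fullText}. {@link #offset} is the position of {@link #text} within {@link #fullText}.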

* * @author Aldrian Obaja ([email protected]) * */ public class ACEDocument implements Serializable{ private static final boolean CHECK_ESCAPED_ENTITIES = false; private static final boolean CHECK_OFFSET_TEXT = false; private static final boolean CHECK_OOB_MENTIONS = false; private static final boolean REMOVE_OOB_MENTIONS = true; private static final boolean TEST_STRICT_PARSING = false; private static final long serialVersionUID = -4698300709681532759L; public String text; public String fullText; public int offset; public String uri; public boolean versionIsACE2004; public boolean textInLowercase; public List entities; public List entityMentions; public List values; public List valueMentions; public List timexes; public List timexMentions; public List relations; public List relationMentions; public List events; public List eventMentions; public Map objectsById; public Map> objectMentionsById; public ACEDocument(String sgmFilename) throws IOException, SAXException { this(sgmFilename, false); } public ACEDocument(String sgmFilename, boolean excludeMetadata) throws IOException, SAXException { this(sgmFilename, sgmFilename.replace(".sgm", ".apf.xml"), excludeMetadata); } public ACEDocument(String sgmFilename, String apfFilename, boolean excludeMetadata) throws IOException, SAXException { this(IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(sgmFilename), IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(apfFilename), excludeMetadata); } /** * Read an ACE document from the given source sgmStream and annotations apfStream.
* * @param sgmStream * @param apfStream * @param excludeMetadata * @throws IOException * @throws SAXException */ public ACEDocument(InputStream sgmStream, InputStream apfStream, boolean excludeMetadata) throws IOException, SAXException{ DOMParser parser = new DOMParser(); String sgmText = IOUtils.slurpInputStream(sgmStream, "UTF-8"); sgmText = sgmText.replaceAll("<(/)?BODY>", "<$1BODY_TEXT>"); parser.parse(new InputSource(new StringReader(sgmText))); Document sgm = parser.getDocument(); if(TEST_STRICT_PARSING){ DocumentBuilder docBuilder = XMLUtils.getXmlParser(); docBuilder.parse(new InputSource(new StringReader(sgmText))); } this.fullText = unescape(sgm.getDocumentElement().getTextContent()); if(excludeMetadata){ // This should supposedly be TEXT tag, but some annotations are present even outside the TEXT tag this.text = unescape(sgm.getElementsByTagName("BODY_TEXT").item(0).getTextContent()); } else { this.text = this.fullText; } this.textInLowercase = this.text.equals(this.text.toLowerCase()); this.offset = fullText.indexOf(text); this.entities = new ArrayList(); this.entityMentions = new ArrayList(); this.values = new ArrayList(); this.valueMentions = new ArrayList(); this.timexes = new ArrayList(); this.timexMentions = new ArrayList(); this.relations = new ArrayList(); this.relationMentions = new ArrayList(); this.events = new ArrayList(); this.eventMentions = new ArrayList(); this.objectsById = new HashMap(); this.objectMentionsById = new HashMap>(); String apfText = IOUtils.slurpInputStream(apfStream, "UTF-8"); apfText = apfText.replaceAll("<(/)?head>", "<$1head_extent>"); parser = new DOMParser(); parser.parse(new InputSource(new StringReader(apfText))); Document apf = parser.getDocument(); setMetadata(apf); extractEntities(apf); extractValues(apf); extractTimexes(apf); extractRelations(apf); extractEvents(apf); } private void setMetadata(Document apf){ NamedNodeMap sourceAttributes = apf.getElementsByTagName("SOURCE_FILE").item(0).getAttributes(); String 
version = getAttribute(sourceAttributes, "VERSION"); this.versionIsACE2004 = version.equals("4.0"); // ACE 2005 doesn't have version this.uri = getAttribute(sourceAttributes, "URI"); } private Span getSpan(Node charseq){ NamedNodeMap attributes = charseq.getAttributes(); int start = Integer.parseInt(getAttribute(attributes, "START")); int end = Integer.parseInt(getAttribute(attributes, "END"))+1; start -= this.offset; end -= this.offset; return new Span(start, end); } private void extractEntities(Document apf) throws NumberFormatException, DOMException { NodeList entities = apf.getElementsByTagName("ENTITY"); for(int i=0; i> mentions){ int lastDiff = 0; List> toBeRemoved = new ArrayList>(); for(ACEObjectMention mention: mentions){ if(mention instanceof ACEEntityMention){ fixSpan(lastDiff, toBeRemoved, mention, ((ACEEntityMention)mention).headSpan, ((ACEEntityMention)mention).headText); } lastDiff = fixSpan(lastDiff, toBeRemoved, mention, mention.span, mention.text); } for(ACEObjectMention mention: toBeRemoved){ mentions.remove(mention); if(mention instanceof ACEEntityMention){ ((ACEEntityMention)mention).entity.mentions.remove(mention); } else if(mention instanceof ACERelationMention){ ((ACERelationMention)mention).relation.mentions.remove(mention); } else if(mention instanceof ACETimexMention){ ((ACETimexMention)mention).timex.mentions.remove(mention); } } if(toBeRemoved.size() > 0){ System.out.println("Removed "+toBeRemoved.size()+" out-of-bounds mentions from "+uri); } } private int fixSpan(int lastDiff, List> toBeRemoved, ACEObjectMention mention, Span span, String text) throws RuntimeException { String originalText = text; String unescapedOriginalText = unescape(originalText); String actualText = null; try{ actualText = span.getText(this.text); } catch (StringIndexOutOfBoundsException e){ actualText = ""; if(CHECK_OOB_MENTIONS){ if(!(mention instanceof ACETimexMention)){ System.out.printf("%-45s[%d,%d]: %s\n", mention.getFullID(), span.start, span.end, 
unescapedOriginalText.replace("\n", " ")); } } } if(CHECK_ESCAPED_ENTITIES){ if(unescapedOriginalText.contains("&")){ System.out.println(unescapedOriginalText.contains(";")+" "+unescapedOriginalText.replace("\n", " ")); } } if(!actualText.equals(unescapedOriginalText)){ int index = this.text.lastIndexOf(unescapedOriginalText, Math.min(this.text.length(), span.start-lastDiff)); if(index == -1){ if(REMOVE_OOB_MENTIONS){ toBeRemoved.add(mention); return lastDiff; } System.err.println("Cannot find "+unescapedOriginalText+" in "+this.text); throw new RuntimeException(); } int diff = span.start - index; span.start = index; span.end = index+unescapedOriginalText.length(); lastDiff = diff; if(CHECK_OFFSET_TEXT){ if(diff > unescapedOriginalText.length()){ System.out.printf("%-45s[%4d->%4d]: %s_%s_%s\n", mention.getFullID(), index+diff, index, this.text.substring(Math.max(0, span.start-10), span.start).replace("\n", " "), unescapedOriginalText.replace("\n", " "), this.text.substring(span.end, Math.min(this.text.length(), span.end+10)).replace("\n", " ")); } } } return lastDiff; } private ACEEntityMention getMention(Node entityMention, ACEEntity aceEntity){ NamedNodeMap mentionAttributes = entityMention.getAttributes(); String mentionId = getAttribute(mentionAttributes, "ID"); String mentionType = getAttribute(mentionAttributes, "TYPE"); String ldcMentionType = getAttribute(mentionAttributes, "LDCTYPE"); String ldcAttr = getAttribute(mentionAttributes, "LDCATR"); Node extent = ((Element)entityMention).getElementsByTagName("EXTENT").item(0); Node extentCharseq = ((Element)extent).getElementsByTagName("CHARSEQ").item(0); // All entities in ACE are contiguous Span span = getSpan(extentCharseq); String aceText = extentCharseq.getTextContent(); Node head = ((Element)entityMention).getElementsByTagName("HEAD_EXTENT").item(0); Node headCharseq = head == null ? null : ((Element)head).getElementsByTagName("CHARSEQ").item(0); Span headSpan = headCharseq == null ? 
null : getSpan(headCharseq); String aceHeadText = headCharseq == null ? "" : headCharseq.getTextContent(); ACEEntityMention mention = new ACEEntityMention(mentionId, mentionType, ldcMentionType, ldcAttr, aceEntity, span, headSpan, aceText, aceHeadText, SpanLabel.get(aceEntity.type.name())); return mention; } private void extractValues(Document apf){ NodeList values = apf.getElementsByTagName("VALUE"); for(int i=0; i found in the mention: "+mention.getTextContent()); } private void getRelationMentionArguments(Node relationMention, ACERelation aceRelation, ACEEntityMention[] _entityMentions, ACETimexMention[] _timestamp, String[] _timestampType){ NodeList relationMentionArgs = ((Element)relationMention).getElementsByTagName(versionIsACE2004 ? "REL_MENTION_ARG" : "RELATION_MENTION_ARGUMENT"); for(int i=0; i[] args = getEventMentionArguments(eventMention, aceEvent); NamedNodeMap attributes = eventMention.getAttributes(); String id = getAttribute(attributes, "ID"); Node charseq = getMentionCharseq(eventMention, "EXTENT"); Span span = getSpan(charseq); String text = charseq.getTextContent(); Node scopeCharseq = getMentionCharseq(eventMention, "LDC_SCOPE"); Span scopeSpan = getSpan(scopeCharseq); String scopeText = scopeCharseq.getTextContent(); Node anchorCharseq = getMentionCharseq(eventMention, "ANCHOR"); Span anchorSpan = getSpan(anchorCharseq); String anchorText = anchorCharseq.getTextContent(); return new ACEEventMention(id, span, text, aceEvent, scopeSpan, scopeText, anchorSpan, anchorText, args); } private ACEObjectMention[] getEventMentionArguments(Node eventMention, ACEEvent aceEvent){ NodeList eventMentionArgs = ((Element)eventMention).getElementsByTagName("EVENT_MENTION_ARGUMENT"); ACEObjectMention[] result = new ACEObjectMention[eventMentionArgs.getLength()]; for(int i=0; i> mentions){ for(ACEObjectMention mention: mentions){ try{ System.out.println(mention.toString(doc.text)); } catch (RuntimeException e){ System.out.println("===TEXT==="); 
System.out.println(doc.text); System.out.println("===FULL TEXT==="); System.out.println(doc.fullText); System.out.println("===SGM==="); System.out.println(doc.uri); System.out.println("===TEXT LENGTH==="); System.out.println(doc.text.length()); System.out.println("===OFFSET==="); System.out.println(doc.offset); System.out.println("===MENTION==="); System.out.println(mention.text); System.out.println(mention.span); throw e; } } } public static String unescape(String xml){ String result = xml.replaceAll("(?i)&", "&"); result = result.replaceAll("(?i)<", "<"); result = result.replaceAll("(?i)>", ">"); // result = result.replaceAll("(?i)<", "<"); // result = result.replaceAll("(?i)<", "<"); return result; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy