// justhalf.nlp.reader.acereader.ACEDocument Maven / Gradle / Ivy
package justhalf.nlp.reader.acereader;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.xml.parsers.DocumentBuilder;
import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.DOMException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.util.XMLUtils;
import justhalf.nlp.reader.acereader.ACERelation.ACERelationType;
/**
* Represents an ACE document
*
* The data structure defined here is based on the specifications found here:
*
* https://www.ldc.upenn.edu/collaborations/past-projects/ace/annotation-tasks-and-specifications
*
* This class can represent documents from either ACE 2004 or ACE 2005, as marked by the flag
* {@link #versionIsACE2004}, which is detected automatically from the VERSION attribute of the
* source_file tag in the APF file.
*
* The flag {@link #textInLowercase} marks whether the original source text is all in lowercase,
* as is the case for the Fisher transcript corpus in ACE 2004.
*
* There are various lists of canonical entities, grouped by class:
*
* - {@link #entities}: for named entities
* - {@link #relations}: for relations
* - {@link #events}: for events (only in ACE 2005)
* - {@link #timexes}: for time expression (only in ACE 2005)
* - {@link #values}: for other values (only in ACE 2005)
*
*
* And also the associated mentions:
*
* - {@link #entityMentions}: for named entities
* - {@link #relationMentions}: for relations
* - {@link #eventMentions}: for events (only in ACE 2005)
* - {@link #timexMentions}: for time expression (only in ACE 2005)
* - {@link #valueMentions}: for other values (only in ACE 2005)
*
*
* The canonical entities typically contain a list of mentions (except {@link ACERelationType#METONYMY}
* relations), which will actually mark the relevant spans in the text.
*
* Maps of entity IDs and entity mention IDs to the corresponding objects are
* available as {@link #objectsById} and {@link #objectMentionsById}.
*
*
* The {@link #uri} stores the filename as given in the URI attribute in source_file tag
* in the APF file.
*
* The text (the relevant annotated texts) and full text (everything in the document) are available as
* {@link #text} and {@link #fullText}.
*
* @author Aldrian Obaja ([email protected])
*
*/
public class ACEDocument implements Serializable{
// Debug switch: when true, fixSpan prints mention texts that still contain '&' after unescaping.
private static final boolean CHECK_ESCAPED_ENTITIES = false;
// Debug switch: when true, fixSpan prints surrounding context for spans that were shifted by
// more than the length of the mention text.
private static final boolean CHECK_OFFSET_TEXT = false;
// Debug switch: when true, fixSpan prints non-timex mentions whose span falls outside the text.
private static final boolean CHECK_OOB_MENTIONS = false;
// When true, a mention whose text cannot be located in the document is queued for removal
// instead of making fixSpan throw a RuntimeException.
private static final boolean REMOVE_OOB_MENTIONS = true;
// When true, the constructor additionally parses the SGM text with XMLUtils.getXmlParser().
private static final boolean TEST_STRICT_PARSING = false;
private static final long serialVersionUID = -4698300709681532759L;
// The text that spans are relative to: the BODY content when metadata is excluded,
// otherwise the same as fullText.
public String text;
// The unescaped text content of the whole SGM document.
public String fullText;
// Index of text within fullText; subtracted from the APF character offsets in getSpan.
public int offset;
// The URI attribute of the source_file tag in the APF file.
public String uri;
// True when the VERSION attribute of the source_file tag is "4.0".
public boolean versionIsACE2004;
// True when text is equal to its own lowercased form.
public boolean textInLowercase;
// Canonical objects and their mentions, grouped by class.
public List entities;
public List entityMentions;
public List values;
public List valueMentions;
public List timexes;
public List timexMentions;
public List relations;
public List relationMentions;
public List events;
public List eventMentions;
// Lookup tables from IDs to objects and to mentions.
public Map objectsById;
// NOTE(review): the generic type arguments of this declaration (and of the raw List/Map
// declarations above) appear to have been stripped from this copy of the file — restore
// them from the upstream source.
public Map> objectMentionsById;
/**
* Reads an ACE document from the given SGM file, keeping the metadata text.
*
* Delegates to {@link #ACEDocument(String, boolean)} with {@code excludeMetadata} set to
* {@code false}.
*
* @param sgmFilename the name of the source SGM file
* @throws IOException as declared by the delegated constructor
* @throws SAXException as declared by the delegated constructor
*/
public ACEDocument(String sgmFilename) throws IOException, SAXException {
this(sgmFilename, false);
}
public ACEDocument(String sgmFilename, boolean excludeMetadata) throws IOException, SAXException {
this(sgmFilename, sgmFilename.replace(".sgm", ".apf.xml"), excludeMetadata);
}
/**
* Reads an ACE document from the given SGM file and APF annotation file.
*
* Both names are resolved to input streams with
* {@code IOUtils.getInputStreamFromURLOrClasspathOrFileSystem} and handed to
* {@link #ACEDocument(InputStream, InputStream, boolean)}.
*
* @param sgmFilename the name of the source SGM file
* @param apfFilename the name of the APF annotation file
* @param excludeMetadata passed through unchanged to the delegated constructor
* @throws IOException as declared by the delegated constructor
* @throws SAXException as declared by the delegated constructor
*/
public ACEDocument(String sgmFilename, String apfFilename, boolean excludeMetadata) throws IOException, SAXException {
this(IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(sgmFilename),
IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(apfFilename),
excludeMetadata);
}
/**
* Read an ACE document from the given source sgmStream and annotations apfStream.
*
* The SGM text is parsed first to obtain {@link #fullText}, {@link #text} and
* {@link #offset}; the APF text is parsed afterwards and its metadata, entities, values,
* timexes, relations and events are extracted, in that order.
*
* @param sgmStream stream of the source SGM document, read as UTF-8
* @param apfStream stream of the APF annotation document, read as UTF-8
* @param excludeMetadata if true, {@link #text} is the content of the first BODY element of
*        the SGM document; otherwise it is the same as {@link #fullText}
* @throws IOException declared for the stream reading and parsing calls
* @throws SAXException declared for the parsing calls
*/
public ACEDocument(InputStream sgmStream, InputStream apfStream, boolean excludeMetadata) throws IOException, SAXException{
DOMParser parser = new DOMParser();
String sgmText = IOUtils.slurpInputStream(sgmStream, "UTF-8");
// Rename the BODY tags to BODY_TEXT before parsing.
// NOTE(review): presumably so the HTML parser does not treat BODY as the HTML body element — confirm.
sgmText = sgmText.replaceAll("<(/)?BODY>", "<$1BODY_TEXT>");
parser.parse(new InputSource(new StringReader(sgmText)));
Document sgm = parser.getDocument();
if(TEST_STRICT_PARSING){
// Optional check: parse the same text again with the parser from XMLUtils; the result is discarded.
DocumentBuilder docBuilder = XMLUtils.getXmlParser();
docBuilder.parse(new InputSource(new StringReader(sgmText)));
}
this.fullText = unescape(sgm.getDocumentElement().getTextContent());
if(excludeMetadata){
// This should supposedly be TEXT tag, but some annotations are present even outside the TEXT tag
this.text = unescape(sgm.getElementsByTagName("BODY_TEXT").item(0).getTextContent());
} else {
this.text = this.fullText;
}
this.textInLowercase = this.text.equals(this.text.toLowerCase());
// Position of the annotated text inside the full text; getSpan subtracts it from APF offsets.
this.offset = fullText.indexOf(text);
this.entities = new ArrayList();
this.entityMentions = new ArrayList();
this.values = new ArrayList();
this.valueMentions = new ArrayList();
this.timexes = new ArrayList();
this.timexMentions = new ArrayList();
this.relations = new ArrayList();
this.relationMentions = new ArrayList();
this.events = new ArrayList();
this.eventMentions = new ArrayList();
this.objectsById = new HashMap();
// NOTE(review): the generic type arguments on the next line appear to have been stripped
// from this copy of the file — restore them from the upstream source.
this.objectMentionsById = new HashMap>();
String apfText = IOUtils.slurpInputStream(apfStream, "UTF-8");
// Rename the head tags to head_extent before parsing; getMention looks the element up as "HEAD_EXTENT".
// NOTE(review): presumably needed because the HTML parser treats head specially — confirm.
apfText = apfText.replaceAll("<(/)?head>", "<$1head_extent>");
parser = new DOMParser();
parser.parse(new InputSource(new StringReader(apfText)));
Document apf = parser.getDocument();
setMetadata(apf);
extractEntities(apf);
extractValues(apf);
extractTimexes(apf);
extractRelations(apf);
extractEvents(apf);
}
/**
 * Reads the document-level metadata from the first SOURCE_FILE element of the APF
 * document and stores it in {@link #versionIsACE2004} and {@link #uri}.
 *
 * @param apf the parsed APF document
 */
private void setMetadata(Document apf){
    NamedNodeMap sourceAttributes = apf.getElementsByTagName("SOURCE_FILE").item(0).getAttributes();
    // ACE 2005 doesn't have version, so the looked-up value may be absent. The
    // constant-first comparison keeps a null result from raising a NullPointerException
    // (what getAttribute returns for a missing attribute is not visible from here).
    this.versionIsACE2004 = "4.0".equals(getAttribute(sourceAttributes, "VERSION"));
    this.uri = getAttribute(sourceAttributes, "URI");
}
/**
 * Builds a {@code Span} from the START and END attributes of a charseq node, shifted so
 * that it is relative to {@link #text} rather than to the full document text.
 *
 * @param charseq the node carrying the START and END attributes
 * @return the span with {@link #offset} subtracted from both bounds
 */
private Span getSpan(Node charseq){
    final NamedNodeMap attrs = charseq.getAttributes();
    final int begin = Integer.parseInt(getAttribute(attrs, "START")) - this.offset;
    // One is added to END before shifting, turning the (apparently inclusive) END
    // into an exclusive bound.
    final int endExclusive = Integer.parseInt(getAttribute(attrs, "END")) + 1 - this.offset;
    return new Span(begin, endExclusive);
}
/**
* Collects the ENTITY elements of the APF document.
*
* NOTE(review): this copy of the file is truncated inside the loop header below. The rest
* of this method and the header of the following method are missing, so the statements after
* the truncated line belong to a different routine that takes a list of mentions. Restore
* both from the upstream source before relying on this code.
*/
private void extractEntities(Document apf) throws NumberFormatException, DOMException {
NodeList entities = apf.getElementsByTagName("ENTITY");
for(int i=0; i> mentions){
// From here on: body of the mention-fixing routine (its header is missing, see note above).
// lastDiff carries the shift found for the previous mention's full span into the next search.
int lastDiff = 0;
List> toBeRemoved = new ArrayList>();
for(ACEObjectMention> mention: mentions){
if(mention instanceof ACEEntityMention){
// The head span is fixed as well, but the shift it returns is not kept.
fixSpan(lastDiff, toBeRemoved, mention, ((ACEEntityMention)mention).headSpan, ((ACEEntityMention)mention).headText);
}
lastDiff = fixSpan(lastDiff, toBeRemoved, mention, mention.span, mention.text);
}
// Drop the mentions that fixSpan could not locate, both from the given list and from the
// mention list of the owning entity, relation or timex. Other mention types are only
// removed from the given list.
for(ACEObjectMention> mention: toBeRemoved){
mentions.remove(mention);
if(mention instanceof ACEEntityMention){
((ACEEntityMention)mention).entity.mentions.remove(mention);
} else if(mention instanceof ACERelationMention){
((ACERelationMention)mention).relation.mentions.remove(mention);
} else if(mention instanceof ACETimexMention){
((ACETimexMention)mention).timex.mentions.remove(mention);
}
}
if(toBeRemoved.size() > 0){
System.out.println("Removed "+toBeRemoved.size()+" out-of-bounds mentions from "+uri);
}
}
/**
* Checks that the given span selects the given (unescaped) mention text in {@link #text}
* and, if it does not, moves the span onto an occurrence of that text.
*
* The occurrence is searched backwards, starting from the span start minus the shift applied
* to the previous mention (capped at the text length). When nothing is found the mention is
* added to toBeRemoved if REMOVE_OOB_MENTIONS is set; otherwise a RuntimeException is thrown.
*
* NOTE(review): the generic type arguments in the signature appear to have been stripped
* from this copy of the file — restore them from the upstream source.
*
* @param lastDiff the shift (old start minus new start) applied to the previous mention
* @param toBeRemoved receives the mention when its text cannot be located
* @param mention the mention that owns the span; used for reporting and removal
* @param span the span to verify; its start and end are overwritten when it is relocated
* @param text the mention text as given in the annotation, still escaped
* @return the shift applied to this span, or lastDiff unchanged when the span was already
*         correct or the mention was queued for removal
* @throws RuntimeException if the text cannot be located and REMOVE_OOB_MENTIONS is false
*/
private int fixSpan(int lastDiff, List> toBeRemoved, ACEObjectMention> mention,
Span span, String text) throws RuntimeException {
String originalText = text;
String unescapedOriginalText = unescape(originalText);
String actualText = null;
try{
actualText = span.getText(this.text);
} catch (StringIndexOutOfBoundsException e){
// The span lies outside the text: treat it as a mismatch and fall through to the search below.
actualText = "";
if(CHECK_OOB_MENTIONS){
if(!(mention instanceof ACETimexMention)){
System.out.printf("%-45s[%d,%d]: %s\n", mention.getFullID(), span.start, span.end, unescapedOriginalText.replace("\n", " "));
}
}
}
if(CHECK_ESCAPED_ENTITIES){
if(unescapedOriginalText.contains("&")){
System.out.println(unescapedOriginalText.contains(";")+" "+unescapedOriginalText.replace("\n", " "));
}
}
if(!actualText.equals(unescapedOriginalText)){
// Search backwards from the expected position, corrected by the previous mention's shift.
int index = this.text.lastIndexOf(unescapedOriginalText, Math.min(this.text.length(), span.start-lastDiff));
if(index == -1){
if(REMOVE_OOB_MENTIONS){
toBeRemoved.add(mention);
return lastDiff;
}
System.err.println("Cannot find "+unescapedOriginalText+" in "+this.text);
throw new RuntimeException();
}
int diff = span.start - index;
span.start = index;
span.end = index+unescapedOriginalText.length();
lastDiff = diff;
if(CHECK_OFFSET_TEXT){
// Only report shifts larger than the mention itself, with 10 characters of context on each side.
if(diff > unescapedOriginalText.length()){
System.out.printf("%-45s[%4d->%4d]: %s_%s_%s\n", mention.getFullID(), index+diff, index,
this.text.substring(Math.max(0, span.start-10), span.start).replace("\n", " "),
unescapedOriginalText.replace("\n", " "),
this.text.substring(span.end, Math.min(this.text.length(), span.end+10)).replace("\n", " "));
}
}
}
return lastDiff;
}
/**
 * Builds an {@code ACEEntityMention} from an entity mention node.
 *
 * The ID, TYPE, LDCTYPE and LDCATR attributes are read from the node, the full extent from
 * the CHARSEQ of its first EXTENT element, and the head from the CHARSEQ of its first
 * HEAD_EXTENT element. When there is no head, the head span is {@code null} and the head
 * text is the empty string.
 *
 * @param entityMention the entity mention node
 * @param aceEntity the entity the mention belongs to; its type name selects the span label
 * @return the newly created mention
 */
private ACEEntityMention getMention(Node entityMention, ACEEntity aceEntity){
    final NamedNodeMap attrs = entityMention.getAttributes();
    final String id = getAttribute(attrs, "ID");
    final String type = getAttribute(attrs, "TYPE");
    final String ldcType = getAttribute(attrs, "LDCTYPE");
    final String ldcAtr = getAttribute(attrs, "LDCATR");

    final Element mentionElement = (Element) entityMention;

    // Full extent. All entities in ACE are contiguous, so a single CHARSEQ suffices.
    final Element extentElement = (Element) mentionElement.getElementsByTagName("EXTENT").item(0);
    final Node extentCharseq = extentElement.getElementsByTagName("CHARSEQ").item(0);
    final Span extentSpan = getSpan(extentCharseq);
    final String extentText = extentCharseq.getTextContent();

    // Head: optional, so fall back to a null span and an empty text.
    Span headSpan = null;
    String headText = "";
    final Node headNode = mentionElement.getElementsByTagName("HEAD_EXTENT").item(0);
    if (headNode != null) {
        final Node headCharseq = ((Element) headNode).getElementsByTagName("CHARSEQ").item(0);
        if (headCharseq != null) {
            headSpan = getSpan(headCharseq);
            headText = headCharseq.getTextContent();
        }
    }

    return new ACEEntityMention(id, type, ldcType, ldcAtr, aceEntity,
            extentSpan, headSpan, extentText, headText, SpanLabel.get(aceEntity.type.name()));
}
/**
* Collects the VALUE elements of the APF document.
*
* NOTE(review): this copy of the file is truncated inside the loop header below. The rest
* of this method and one or more following definitions are missing, and the remainder of
* the truncated line is the tail of a message built elsewhere. Restore from the upstream
* source before relying on this code.
*/
private void extractValues(Document apf){
NodeList values = apf.getElementsByTagName("VALUE");
for(int i=0; i found in the mention: "+mention.getTextContent());
}
/**
* Looks up the argument elements of a relation mention. The element name depends on the
* corpus version: REL_MENTION_ARG for ACE 2004, RELATION_MENTION_ARGUMENT otherwise.
*
* NOTE(review): this copy of the file is truncated inside the loop header below. The rest
* of this method and the header of the following method are missing, so the statements after
* the truncated line belong to a different routine that builds an ACEEventMention from
* eventMention and aceEvent. Restore both from the upstream source.
* NOTE(review): the array parameters look like output parameters filled by the missing loop
* body — confirm against the upstream source.
*/
private void getRelationMentionArguments(Node relationMention, ACERelation aceRelation,
ACEEntityMention[] _entityMentions, ACETimexMention[] _timestamp, String[] _timestampType){
NodeList relationMentionArgs = ((Element)relationMention).getElementsByTagName(versionIsACE2004 ? "REL_MENTION_ARG" : "RELATION_MENTION_ARGUMENT");
for(int i=0; i[] args = getEventMentionArguments(eventMention, aceEvent);
// From here on: body of the event-mention builder (its header is missing, see note above).
NamedNodeMap attributes = eventMention.getAttributes();
String id = getAttribute(attributes, "ID");
// Full extent of the event mention.
Node charseq = getMentionCharseq(eventMention, "EXTENT");
Span span = getSpan(charseq);
String text = charseq.getTextContent();
// Scope of the event mention, taken from the LDC_SCOPE element.
Node scopeCharseq = getMentionCharseq(eventMention, "LDC_SCOPE");
Span scopeSpan = getSpan(scopeCharseq);
String scopeText = scopeCharseq.getTextContent();
// Anchor of the event mention, taken from the ANCHOR element.
Node anchorCharseq = getMentionCharseq(eventMention, "ANCHOR");
Span anchorSpan = getSpan(anchorCharseq);
String anchorText = anchorCharseq.getTextContent();
return new ACEEventMention(id, span, text, aceEvent, scopeSpan, scopeText, anchorSpan, anchorText, args);
}
/**
* Looks up the EVENT_MENTION_ARGUMENT elements of an event mention and allocates one result
* slot per element.
*
* NOTE(review): this copy of the file is truncated inside the loop header below, and the
* generic type arguments have been stripped. The rest of this method and the header of the
* following method are missing, so the statements after the truncated line belong to a
* different routine that prints every mention of a document ("doc"). Restore both from the
* upstream source.
*/
private ACEObjectMention>[] getEventMentionArguments(Node eventMention, ACEEvent aceEvent){
NodeList eventMentionArgs = ((Element)eventMention).getElementsByTagName("EVENT_MENTION_ARGUMENT");
ACEObjectMention>[] result = new ACEObjectMention>[eventMentionArgs.getLength()];
for(int i=0; i> mentions){
// From here on: body of the mention-printing routine (its header is missing, see note above).
for(ACEObjectMention> mention: mentions){
try{
System.out.println(mention.toString(doc.text));
} catch (RuntimeException e){
// Dump the document state that is needed to diagnose the failing mention, then rethrow.
System.out.println("===TEXT===");
System.out.println(doc.text);
System.out.println("===FULL TEXT===");
System.out.println(doc.fullText);
System.out.println("===SGM===");
System.out.println(doc.uri);
System.out.println("===TEXT LENGTH===");
System.out.println(doc.text.length());
System.out.println("===OFFSET===");
System.out.println(doc.offset);
System.out.println("===MENTION===");
System.out.println(mention.text);
System.out.println(mention.span);
throw e;
}
}
}
/**
 * Decodes the XML character entities {@code &amp;}, {@code &lt;} and {@code &gt;}
 * (matched case-insensitively) into the characters they stand for.
 *
 * The entities are decoded in that order. Because {@code &amp;} is handled first, a
 * doubly-escaped sequence such as {@code &amp;lt;} ends up fully decoded to {@code <}.
 *
 * @param xml the text to decode; must not be {@code null}
 * @return the text with the three entities replaced
 */
public static String unescape(String xml){
    // The patterns must spell out the entities: matching the bare characters
    // "&", "<" and ">" would replace each character with itself and decode nothing.
    String result = xml.replaceAll("(?i)&amp;", "&");
    result = result.replaceAll("(?i)&lt;", "<");
    result = result.replaceAll("(?i)&gt;", ">");
    return result;
}
}