de.unihd.dbs.uima.consumer.tempeval3writer.TempEval3Writer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of heideltime Show documentation
Show all versions of heideltime Show documentation
HeidelTime is a multilingual cross-domain temporal tagger that extracts temporal expressions from documents and normalizes them according to the TIMEX3 annotation standard.
package de.unihd.dbs.uima.consumer.tempeval3writer;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.HashSet;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.collection.CasConsumer_ImplBase;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceProcessException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger;
import de.unihd.dbs.uima.types.heideltime.Dct;
import de.unihd.dbs.uima.types.heideltime.Timex3;
public class TempEval3Writer extends CasConsumer_ImplBase {
private Class> component = this.getClass();
private static final String PARAM_OUTPUTDIR = "OutputDir";
// counter for outputting documents. gets increased in case there is no DCT/filename info
private static volatile Integer outCount = 0;
private File mOutputDir;
/**
 * Reads the output directory from the {@code OutputDir} configuration parameter, creates it
 * if it does not exist yet and verifies that it is a writable directory.
 *
 * @throws ResourceInitializationException if the parameter is missing or not a string, or if
 *         the directory cannot be created, is not a directory, or is not writable
 */
@Override
public void initialize() throws ResourceInitializationException {
    Object configuredDir = getConfigParameterValue(PARAM_OUTPUTDIR);
    // a missing parameter used to surface as a NullPointerException from new File(null)
    if (!(configuredDir instanceof String)) {
        String msg = "Parameter " + PARAM_OUTPUTDIR + " is missing or not a string.";
        Logger.printError(component, msg);
        throw new ResourceInitializationException(new IllegalArgumentException(msg));
    }
    mOutputDir = new File((String) configuredDir);

    if (!mOutputDir.exists()) {
        if (!mOutputDir.mkdirs()) {
            String msg = "Couldn't create non-existant folder " + mOutputDir.getAbsolutePath();
            Logger.printError(component, msg);
            throw new ResourceInitializationException(new IOException(msg));
        }
    }

    // an existing regular file passes exists() but can never hold output documents
    if (!mOutputDir.isDirectory()) {
        String msg = "Path " + mOutputDir.getAbsolutePath() + " is not a folder.";
        Logger.printError(component, msg);
        throw new ResourceInitializationException(new IOException(msg));
    }

    if (!mOutputDir.canWrite()) {
        String msg = "Folder " + mOutputDir.getAbsolutePath() + " is not writable.";
        Logger.printError(component, msg);
        throw new ResourceInitializationException(new IOException(msg));
    }
}
/**
 * Converts one CAS into a TimeML DOM document and writes it to the output directory.
 * <p>
 * The output name is taken from the file name stored in the first Dct annotation. If that
 * lookup fails for any reason, a generated name based on the shared document counter is
 * used instead.
 * <p>
 * NOTE(review): the generated fallback name already ends in ".tml" and
 * writeTimeMLDocument appends ".tml" once more -- confirm whether that is intended.
 *
 * @param aCAS the CAS to serialize
 * @throws ResourceProcessException if no JCas can be obtained from the CAS
 */
@Override
public void processCas(CAS aCAS) throws ResourceProcessException {
    final JCas view;
    try {
        view = aCAS.getJCas();
    } catch (CASException casFailure) {
        throw new ResourceProcessException(casFailure);
    }

    // look up the document creation time; it also carries the source file name
    Dct creationTime = null;
    String docName;
    try {
        creationTime = (Dct) view.getAnnotationIndex(Dct.type).iterator().next();
        docName = new File(creationTime.getFilename()).getName();
    } catch (Exception lookupFailure) {
        // best effort: no usable DCT/filename, so fall back to a numbered name
        docName = "doc_" + TempEval3Writer.getOutCount() + ".tml";
    }

    // build the XML tree, then serialize it
    final Document timeMl = buildTimeMLDocument(view, creationTime, docName);
    writeTimeMLDocument(timeMl, docName);
}
/**
 * Creates a DOM Document filled with all of the timex3s that are in the jcas.
 * <p>
 * The document consists of a TimeML root element containing a DOCID element, an optional
 * DCT element and a TEXT element in which the document text is interleaved with TIMEX3
 * elements. Annotations whose class is a subtype of Timex3 are ignored. Of two overlapping
 * timexes only one is written: the one with the longer value string, or the later one when
 * the value strings are equally long.
 *
 * @param jcas JCas supplying the document text and the Timex3 annotations
 * @param dct the document's DCT; may be null, in which case no DCT element is written
 * @param filename text written into the DOCID element
 * @return the populated DOM document, never null
 * @throws IllegalStateException if no XML document builder can be instantiated
 */
@SuppressWarnings("rawtypes")
private Document buildTimeMLDocument(JCas jcas, Dct dct, String filename) {
    Document doc = createEmptyDocument();

    // create the TimeML root element; setAttributeNS expects (namespace URI, qualified name)
    Element rootEl = doc.createElement("TimeML");
    rootEl.setAttributeNS("http://www.w3.org/2000/xmlns/", "xmlns:xsi",
            "http://www.w3.org/2001/XMLSchema-instance");
    rootEl.setAttributeNS("http://www.w3.org/2001/XMLSchema-instance", "xsi:noNamespaceSchemaLocation",
            "http://timeml.org/timeMLdocs/TimeML_1.2.1.xsd");
    doc.appendChild(rootEl);

    // create DOCID tag
    Element docidEl = doc.createElement("DOCID");
    docidEl.appendChild(doc.createTextNode(filename));
    rootEl.appendChild(docidEl);

    // create DCT tag
    if (dct != null) {
        Element dctEl = doc.createElement("DCT");
        Element timexForDCT = doc.createElement("TIMEX3");
        timexForDCT.setAttribute("tid", "t0");
        timexForDCT.setAttribute("type", "DATE");
        timexForDCT.setAttribute("value", dct.getValue());
        timexForDCT.setAttribute("temporalFunction", "false");
        timexForDCT.setAttribute("functionInDocument", "CREATION_TIME");
        timexForDCT.appendChild(doc.createTextNode(dct.getValue()));
        dctEl.appendChild(timexForDCT);
        rootEl.appendChild(dctEl);
    }

    // create and fill the TEXT tag
    Element textEl = doc.createElement("TEXT");
    rootEl.appendChild(textEl);

    String docText = jcas.getDocumentText();
    if (docText == null) {
        // a CAS without text still yields a well-formed (empty) TEXT element
        docText = "";
    }

    FSIterator it = jcas.getAnnotationIndex(Timex3.type).iterator();

    // if there are no timexes, just add one text node as a child
    if (!it.hasNext()) {
        textEl.appendChild(doc.createTextNode(docText));
        return doc;
    }

    // first pass: find overlapping timexes, ex: [early (friday] morning)
    HashSet<Timex3> timexesToSkip = new HashSet<Timex3>();
    Timex3 prevT = null;
    while (it.hasNext()) {
        Timex3 thisT = (Timex3) it.next();
        if (!isPlainTimex(thisT)) {
            continue;
        }

        if (prevT != null && prevT.getEnd() > thisT.getBegin()) {
            Timex3 removedT; // only for debug message
            // assuming longer value string means better granularity
            if (timexValueLength(prevT) > timexValueLength(thisT)) {
                timexesToSkip.add(thisT);
                removedT = thisT;
                /* prevT stays the same. */
            } else {
                timexesToSkip.add(prevT);
                removedT = prevT;
                prevT = thisT; // this iteration's prevT was removed; setting for new iteration
            }

            Logger.printError(component, "Two overlapping Timexes have been discovered:" + System.getProperty("line.separator")
                    + "Timex A: " + prevT.getCoveredText() + " [\"" + prevT.getTimexValue() + "\" / " + prevT.getBegin() + ":" + prevT.getEnd() + "]"
                    + System.getProperty("line.separator")
                    + "Timex B: " + removedT.getCoveredText() + " [\"" + removedT.getTimexValue() + "\" / " + removedT.getBegin() + ":" + removedT.getEnd() + "]"
                    + " [removed]" + System.getProperty("line.separator")
                    + "The writer chose, for granularity: " + prevT.getCoveredText() + System.getProperty("line.separator")
                    + "This usually happens with an incomplete ruleset. Please consider adding "
                    + "a new rule that covers the entire expression.");
        } else { // no overlap found? set current timex as next iteration's previous timex
            prevT = thisT;
        }
    }

    it.moveToFirst(); // reset iterator for another loop

    // second pass: write the DOM tree, alternating plain text and TIMEX3 elements
    int offset = 0;
    while (it.hasNext()) {
        Timex3 t = (Timex3) it.next();
        if (!isPlainTimex(t) || timexesToSkip.contains(t)) {
            continue;
        }

        if (t.getBegin() > offset) {
            // add a textnode that contains the text before the timex
            textEl.appendChild(doc.createTextNode(docText.substring(offset, t.getBegin())));
        }

        textEl.appendChild(createTimexElement(doc, t));

        // set cursor to the end of the timex
        offset = t.getEnd();
    }

    // append the rest of the document text
    if (offset < docText.length()) {
        textEl.appendChild(doc.createTextNode(docText.substring(offset)));
    }

    return doc;
}

/** Creates an empty, namespace-aware DOM document; fails fast instead of returning null. */
private Document createEmptyDocument() {
    try {
        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        dbf.setNamespaceAware(true);
        DocumentBuilder db = dbf.newDocumentBuilder();
        return db.newDocument();
    } catch (ParserConfigurationException e) {
        Logger.printError(component, "XML Builder could not be instantiated");
        throw new IllegalStateException("XML Builder could not be instantiated", e);
    }
}

/** Builds the TIMEX3 element for one timex: required attributes, non-empty optional ones, covered text. */
private static Element createTimexElement(Document doc, Timex3 t) {
    Element newTimex = doc.createElement("TIMEX3");

    // set its required attributes
    newTimex.setAttribute("tid", t.getTimexId());
    newTimex.setAttribute("type", t.getTimexType());
    newTimex.setAttribute("value", t.getTimexValue());

    // set its optional attributes
    setAttributeIfPresent(newTimex, "mod", t.getTimexMod());
    setAttributeIfPresent(newTimex, "quant", t.getTimexQuant());
    setAttributeIfPresent(newTimex, "freq", t.getTimexFreq());

    // set text
    newTimex.appendChild(doc.createTextNode(t.getCoveredText()));
    return newTimex;
}

/** Sets the attribute only when the value is neither null nor the empty string. */
private static void setAttributeIfPresent(Element el, String name, String value) {
    if (value != null && !value.equals("")) {
        el.setAttribute(name, value);
    }
}

/** Returns true for exact Timex3 instances; types that inherit from Timex3 are disregarded. */
private static boolean isPlainTimex(Timex3 t) {
    return Timex3.class.equals(t.getClass());
}

/** Length of the timex value string, treating a missing value as length 0. */
private static int timexValueLength(Timex3 t) {
    String value = t.getTimexValue();
    return value == null ? 0 : value.length();
}
/**
 * writes a populated DOM xml(timeml) document to a given directory/file
 * <p>
 * The file is always encoded as UTF-8, matching the encoding the transformer declares in
 * the XML prolog. Failures are logged and not propagated to the caller.
 *
 * @param xmlDoc xml dom object
 * @param filename name of the file that gets appended to the set output path; ".tml" is
 *        appended to it
 */
private void writeTimeMLDocument(Document xmlDoc, String filename) {
    // create output file handle
    File outFile = new File(mOutputDir, filename+".tml");
    BufferedWriter bw = null;

    try {
        // explicit UTF-8 instead of FileWriter's platform default charset
        bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outFile), "UTF-8"));

        // prepare the transformer to convert from the xml doc to output text
        Transformer transformer = TransformerFactory.newInstance().newTransformer();
        // no indentation: the transformer must not add whitespace of its own
        transformer.setOutputProperty(OutputKeys.INDENT, "no");
        transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");

        DOMSource source = new DOMSource(xmlDoc);
        StreamResult result = new StreamResult(bw);

        // transform
        transformer.transform(source, result);
    } catch (IOException e) { // something went wrong with the bufferedwriter
        e.printStackTrace();
        Logger.printError(component, "File "+outFile.getAbsolutePath()+" could not be written.");
    } catch (TransformerException e) { // the transformer malfunctioned (call optimus prime)
        e.printStackTrace();
        Logger.printError(component, "XML transformer could not be properly initialized.");
    } finally { // clean up for the bufferedwriter
        // bw is still null when opening the file failed; closing it would throw an NPE
        if (bw != null) {
            try {
                bw.close();
            } catch (IOException e) {
                e.printStackTrace();
                Logger.printError(component, "File "+outFile.getAbsolutePath()+" could not be closed.");
            }
        }
    }
}
/**
 * Hands out the current value of the shared document counter and then advances it by one.
 * Synchronized on the class, so the read and the increment happen under one lock.
 *
 * @return the counter value before the increment
 */
public static synchronized Integer getOutCount() {
    final Integer current = outCount;
    outCount = current + 1;
    return current;
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy