All Downloads are FREE. Search and download functionalities are using the official Maven repository.

de.unihd.dbs.uima.consumer.tempeval3writer.TempEval3Writer Maven / Gradle / Ivy

Go to download

HeidelTime is a multilingual cross-domain temporal tagger that extracts temporal expressions from documents and normalizes them according to the TIMEX3 annotation standard.

There is a newer version: 2.2.1
Show newest version
package de.unihd.dbs.uima.consumer.tempeval3writer;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashSet;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.collection.CasConsumer_ImplBase;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceProcessException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;

import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger;
import de.unihd.dbs.uima.types.heideltime.Dct;
import de.unihd.dbs.uima.types.heideltime.Timex3;

public class TempEval3Writer extends CasConsumer_ImplBase {
	private Class component = this.getClass();

	private static final String PARAM_OUTPUTDIR = "OutputDir";
	
	// counter for outputting documents. gets increased in case there is no DCT/filename info 
	private static volatile Integer outCount = 0;

	private File mOutputDir;

	public void initialize() throws ResourceInitializationException {
		mOutputDir = new File((String) getConfigParameterValue(PARAM_OUTPUTDIR));
		
		if (!mOutputDir.exists()) {
			if(!mOutputDir.mkdirs()) {
				Logger.printError(component, "Couldn't create non-existant folder "+mOutputDir.getAbsolutePath());
				throw new ResourceInitializationException();
			}
		}
		
		if(!mOutputDir.canWrite()) {
			Logger.printError(component, "Folder "+mOutputDir.getAbsolutePath()+" is not writable.");
			throw new ResourceInitializationException();
		}
	}
	
	public void processCas(CAS aCAS) throws ResourceProcessException {
		JCas jcas;
		try {
			jcas = aCAS.getJCas();
		} catch (CASException e) {
			throw new ResourceProcessException(e);
		}
		
		// get the DCT
		Dct dct = null;
		String filename = null;
		try {
			dct = (Dct) jcas.getAnnotationIndex(Dct.type).iterator().next();
			filename = new File(dct.getFilename()).getName();
		} catch(Exception e) {
			filename = "doc_" + TempEval3Writer.getOutCount() + ".tml";
		}

		// assemble an XML document
		Document xmlDoc = buildTimeMLDocument(jcas, dct, filename);
		
		// write the document to file
		writeTimeMLDocument(xmlDoc, filename);
	}

	/**
	 * Creates a DOM Document filled with all of the timex3s that are in the jcas.
	 * @param jcas
	 * @param dct the document's DCT
	 * @return
	 */
	@SuppressWarnings("rawtypes")
	private Document buildTimeMLDocument(JCas jcas, Dct dct, String filename) {
		DocumentBuilderFactory dbf = null;
		DocumentBuilder db = null;
		Document doc = null;
		
		try {
			dbf = DocumentBuilderFactory.newInstance();
			dbf.setNamespaceAware(true);
			db = dbf.newDocumentBuilder();
			doc = db.newDocument();
		} catch (ParserConfigurationException e) {
			e.printStackTrace();
			Logger.printError(component, "XML Builder could not be instantiated");
		}
		
		// create the TimeML root element
		Element rootEl = doc.createElement("TimeML");
		rootEl.setAttributeNS("xmlns", "xsi", "http://www.w3.org/2001/XMLSchema-instance");
		rootEl.setAttributeNS("xsi", "noNamespaceSchemaLocation", "http://timeml.org/timeMLdocs/TimeML_1.2.1.xsd");
		doc.appendChild(rootEl);
		
		// create DOCID tag
		Element docidEl = doc.createElement("DOCID");
		docidEl.appendChild(doc.createTextNode(filename));
		rootEl.appendChild(docidEl);
		
		// create DCT tag
		if(dct != null) {
			Element dctEl = doc.createElement("DCT");
	
			Element timexForDCT = doc.createElement("TIMEX3");
			timexForDCT.setAttribute("tid", "t0");
			timexForDCT.setAttribute("type", "DATE");
			timexForDCT.setAttribute("value", dct.getValue());
			timexForDCT.setAttribute("temporalFunction", "false");
			timexForDCT.setAttribute("functionInDocument", "CREATION_TIME");
			timexForDCT.appendChild(doc.createTextNode(dct.getValue()));
			
			dctEl.appendChild(timexForDCT); // NEW		
			rootEl.appendChild(dctEl);
		}
		
		// create and fill the TEXT tag
		Integer offset = 0;
		Element textEl = doc.createElement("TEXT");
		rootEl.appendChild(textEl);
		
		FSIterator it = jcas.getAnnotationIndex(Timex3.type).iterator();
		// if there are no timexes, just add one text node as a child. otherwise, iterate through timexes
		String docText = jcas.getDocumentText();
		if(!it.hasNext()) {
			textEl.appendChild(doc.createTextNode(docText));
		} else {
			HashSet timexesToSkip = new HashSet();
			Timex3 prevT = null;
			Timex3 thisT = null;
			// iterate over timexes to find overlaps
			while(it.hasNext()) {
				thisT = (Timex3) it.next();
				
				if((Class) thisT.getClass() != (Class) Timex3.class) // disregard types that inherit from Timex3
					continue;
				
				// check for whether this and the previous timex overlap. ex: [early (friday] morning)
				if(prevT != null && prevT.getEnd() > thisT.getBegin()) {
					
					Timex3 removedT = null; // only for debug message
					// assuming longer value string means better granularity
					if(prevT.getTimexValue().length() > thisT.getTimexValue().length()) {
						timexesToSkip.add(thisT);
						removedT = thisT;
						/* prevT stays the same. */
					} else {
						timexesToSkip.add(prevT);
						removedT = prevT;
						prevT = thisT; // this iteration's prevT was removed; setting for new iteration 
					}
					
					Logger.printError(component, "Two overlapping Timexes have been discovered:" + System.getProperty("line.separator")
							+ "Timex A: " + prevT.getCoveredText() + " [\"" + prevT.getTimexValue() + "\" / " + prevT.getBegin() + ":" + prevT.getEnd() + "]" 
							+ System.getProperty("line.separator")
							+ "Timex B: " + removedT.getCoveredText() + " [\"" + removedT.getTimexValue() + "\" / " + removedT.getBegin() + ":" + removedT.getEnd() + "]" 
							+ " [removed]" + System.getProperty("line.separator")
							+ "The writer chose, for granularity: " + prevT.getCoveredText() + System.getProperty("line.separator")
							+ "This usually happens with an incomplete ruleset. Please consider adding "
							+ "a new rule that covers the entire expression.");
				} else { // no overlap found? set current timex as next iteration's previous timex
					prevT = thisT;
				}
			}
			
			it.moveToFirst(); // reset iterator for another loop
			// iterate over timexes to write the DOM tree.
			while(it.hasNext()) {
				Timex3 t = (Timex3) it.next();
				if((Class) t.getClass() != (Class) Timex3.class) // disregard types that inherit from Timex3
					continue;
				
				// if this timex was marked to be removed, skip it entirely and go to the next one.
				if(timexesToSkip.contains(t))
					continue;
				
				if(t.getBegin() > offset) { 
					// add a textnode that contains the text before the timex
					textEl.appendChild(doc.createTextNode(docText.substring(offset, t.getBegin())));
				}
				
				// create the TIMEX3 element
				Element newTimex = doc.createElement("TIMEX3");
				
				// set its required attributes
				newTimex.setAttribute("tid", t.getTimexId());
				newTimex.setAttribute("type", t.getTimexType());
				newTimex.setAttribute("value", t.getTimexValue());
				
				// set its optional attributes
				if(!t.getTimexMod().equals(""))
					newTimex.setAttribute("mod", t.getTimexMod());
				if(!t.getTimexQuant().equals(""))
					newTimex.setAttribute("quant", t.getTimexQuant());
				if(!t.getTimexFreq().equals(""))
					newTimex.setAttribute("freq", t.getTimexFreq());
				
				// set text
				newTimex.appendChild(doc.createTextNode(t.getCoveredText()));
				
				// add the timex tag to the text tag
				textEl.appendChild(newTimex);
				
				// set cursor to the end of the timex
				offset = t.getEnd();
			}
			
			// append the rest of the document text
			if(offset < docText.length())
				textEl.appendChild(doc.createTextNode(docText.substring(offset)));
		}
		
		return doc;
	}

	/**
	 * writes a populated DOM xml(timeml) document to a given directory/file 
	 * @param xmlDoc xml dom object
	 * @param filename name of the file that gets appended to the set output path
	 */
	private void writeTimeMLDocument(Document xmlDoc, String filename) {
		// create output file handle
		File outFile = new File(mOutputDir, filename+".tml"); 
		
		BufferedWriter bw = null;
		try {
			// create a buffered writer for the output file
			bw = new BufferedWriter(new FileWriter(outFile));
			
			// prepare the transformer to convert from the xml doc to output text
			Transformer transformer = TransformerFactory.newInstance().newTransformer();
			// some pretty printing
			transformer.setOutputProperty(OutputKeys.INDENT, "no");
			DOMSource source = new DOMSource(xmlDoc);
			StreamResult result = new StreamResult(bw);
			
			// transform
			transformer.transform(source, result);
		} catch (IOException e) { // something went wrong with the bufferedwriter
			e.printStackTrace();
			Logger.printError(component, "File "+outFile.getAbsolutePath()+" could not be written.");
		} catch (TransformerException e) { // the transformer malfunctioned (call optimus prime)
			e.printStackTrace();
			Logger.printError(component, "XML transformer could not be properly initialized.");
		} finally { // clean up for the bufferedwriter
			try {
				bw.close();
			} catch(IOException e) {
				e.printStackTrace();
				Logger.printError(component, "File "+outFile.getAbsolutePath()+" could not be closed.");
			}
		}
	}

	public static synchronized Integer getOutCount() {
		return outCount++;
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy