de.unihd.dbs.uima.consumer.eventi2014writer.Eventi2014Writer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of heideltime Show documentation
Show all versions of heideltime Show documentation
HeidelTime is a multilingual cross-domain temporal tagger that extracts temporal expressions from documents and normalizes them according to the TIMEX3 annotation standard.
/*
* Eventi2014Writer.java
*
* Copyright (c) 2014, Database Research Group, Institute of Computer Science, Heidelberg University.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the GNU General Public License.
*
* author: Jannik Strötgen
* email: [email protected]
*
* The Eventi2014 Writer writes Eventi-style output.
* For details, see http://dbs.ifi.uni-heidelberg.de/heideltime
*/
package de.unihd.dbs.uima.consumer.eventi2014writer;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.collection.CasConsumer_ImplBase;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceProcessException;
import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger;
import de.unihd.dbs.uima.types.heideltime.Dct;
import de.unihd.dbs.uima.types.heideltime.Timex3;
import de.unihd.dbs.uima.types.heideltime.Timex3Interval;
import de.unihd.dbs.uima.types.heideltime.Token;
public class Eventi2014Writer extends CasConsumer_ImplBase {
private Class> component = this.getClass();
private static final String PARAM_OUTPUTDIR = "OutputDir";
// counter for outputting documents. gets increased in case there is no DCT/filename info
private static volatile Integer outCount = 0;
private File mOutputDir;
/**
 * Resolves the output directory from the {@code OutputDir} configuration parameter,
 * creates it if it does not exist yet and verifies that it can be written to.
 *
 * @throws ResourceInitializationException if the parameter is missing or not a string,
 *         or if the directory cannot be created, is not a directory, or is not writable
 */
@Override
public void initialize() throws ResourceInitializationException {
    // a missing (null) or mistyped parameter would otherwise surface as an
    // undeclared NullPointerException/ClassCastException from the File constructor/cast
    Object configuredDir = getConfigParameterValue(PARAM_OUTPUTDIR);
    if (!(configuredDir instanceof String)) {
        Logger.printError(component, "Parameter "+PARAM_OUTPUTDIR+" is missing or not a string.");
        throw new ResourceInitializationException();
    }
    mOutputDir = new File((String) configuredDir);

    // mkdirs() also returns false when someone else created the directory in the
    // meantime, so only treat it as a failure if the directory still is not there
    if (!mOutputDir.exists() && !mOutputDir.mkdirs() && !mOutputDir.isDirectory()) {
        Logger.printError(component, "Couldn't create non-existant folder "+mOutputDir.getAbsolutePath());
        throw new ResourceInitializationException();
    }
    // an existing regular file at that path cannot hold the output files
    if (!mOutputDir.isDirectory()) {
        Logger.printError(component, "Path "+mOutputDir.getAbsolutePath()+" is not a folder.");
        throw new ResourceInitializationException();
    }
    if (!mOutputDir.canWrite()) {
        Logger.printError(component, "Folder "+mOutputDir.getAbsolutePath()+" is not writable.");
        throw new ResourceInitializationException();
    }
}
/**
 * Serializes the annotations of one CAS into a single text document and hands it
 * to {@code writeDocument}.
 *
 * The document is assembled in this order: a first line, one line per token
 * (with filler lines for token ids missing from the index), one entry per
 * Timex3 annotation that is not a Timex3Interval, the document creation time tag,
 * one line per Timex3 with a non-empty emptyValue, one line per Timex3Interval
 * with a non-empty emptyValue, and a closing section.
 *
 * The output file name and the DCT tag are taken from the Dct annotation's filename
 * feature, split at "---". If that fails for any reason, the file name falls back to
 * "doc_" plus a running counter and the DCT tag stays empty.
 *
 * NOTE(review): apart from "TOKENSTRING", the template literals below do not contain
 * the placeholder tokens that are substituted into them (FILENAME, TOKENID, MARKABLEID, ...),
 * so most replace calls are no-ops on the text as it stands. The markup of these
 * templates looks like it has been lost - confirm against the upstream source.
 *
 * @param aCAS the CAS to serialize
 * @throws ResourceProcessException if no JCas view can be obtained from the CAS
 */
public void processCas(CAS aCAS) throws ResourceProcessException {
    JCas jcas;
    try {
        jcas = aCAS.getJCas();
    } catch (CASException e) {
        throw new ResourceProcessException(e);
    }

    // the document is built from many small pieces; a builder avoids re-copying
    // the whole text on every append
    StringBuilder fullDocument = new StringBuilder();

    // get the DCT; its filename feature is expected to look like "<filename>---<dct tag>"
    String filename = null;
    String dctTag = "";
    try {
        Dct dct = (Dct) jcas.getAnnotationIndex(Dct.type).iterator().next();
        String[] parts = dct.getFilename().split("---");
        filename = parts[0];
        dctTag = parts[1];
    } catch (Exception e) {
        // best effort: no DCT annotation or unexpected filename format -> generated name
        e.printStackTrace();
        filename = "doc_" + Eventi2014Writer.getOutCount();
    }

    // create the document according to the formatting requirements of EVENTI 2014
    // first line:
    String firstLine = "\n";
    // literal replace: replaceAll would interpret '$' and '\' in the file name
    // as regex replacement syntax
    fullDocument.append(firstLine.replace("FILENAME", filename));

    // get the tokens and add them to fullDocument
    FSIterator itToken = jcas.getAnnotationIndex(Token.type).iterator();
    int oldTokNum = 0;
    int oldTokID = 0;
    while (itToken.hasNext()) {
        Token t = (Token) itToken.next();
        // the token's filename feature carries sentence and token number: "...---<sent>---<tok>"
        String[] parts = t.getFilename().split("---");
        String sentNum = parts[1];
        String tokNum = parts[2];

        // emit filler lines (empty token string) for every token id skipped
        // between the previous token and this one
        while (oldTokID < (t.getTokenId() - 1)) {
            oldTokNum++;
            oldTokID++;
            String fillerLine = "TOKENSTRING \n";
            fillerLine = fillerLine.replace("TOKENID", oldTokID+"");
            fillerLine = fillerLine.replace("SENTENCEID", sentNum);
            fillerLine = fillerLine.replace("TOKENNUMBER", oldTokNum+"");
            fillerLine = fillerLine.replace("TOKENSTRING", "");
            fullDocument.append(fillerLine);
        }

        // the token text is substituted last so that text containing a
        // placeholder name is not touched by the other replacements
        String tokenLine = "TOKENSTRING \n";
        tokenLine = tokenLine.replace("TOKENID", t.getTokenId()+"");
        tokenLine = tokenLine.replace("SENTENCEID", sentNum);
        tokenLine = tokenLine.replace("TOKENNUMBER", tokNum);
        tokenLine = tokenLine.replace("TOKENSTRING", t.getCoveredText());
        oldTokNum = Integer.parseInt(tokNum);
        oldTokID = t.getTokenId();
        fullDocument.append(tokenLine);
    }

    // add opening markable tag
    fullDocument.append("\n\n\n");

    // collection for timexes which have an emptyValue attribute
    HashMap<Timex3, Integer> emptyValueTimexes = new HashMap<Timex3, Integer>();
    // association for HeidelTime-internal Timex3 IDs -> markable_ids
    HashMap<String, String> idTranslation = new HashMap<String, String>();

    // get the timex3s and add them to fullDocument
    int markableCounter = 1;
    FSIterator itTimex = jcas.getAnnotationIndex(Timex3.type).iterator();
    while (itTimex.hasNext()) {
        Timex3 t = (Timex3) itTimex.next();
        // intervals are handled separately further down
        if (t instanceof Timex3Interval) {
            continue;
        }
        if (t.getEmptyValue() != null && !t.getEmptyValue().equals("")) {
            emptyValueTimexes.put(t, markableCounter);
        }

        String open = "\n";
        String tokenInfoSingle = "\n";
        String close = " \n";

        // set the attributes of the TIMEX3 annotations
        // NOTE(review): String.replace throws a NullPointerException for a null
        // replacement - verify these features are always set on Timex3 annotations
        open = open.replace("MARKABLEID", markableCounter+"");
        open = open.replace("MODSTRING", t.getTimexMod());
        open = open.replace("QUANTSTRING", t.getTimexQuant());
        open = open.replace("FREQSTRING", t.getTimexFreq());
        open = open.replace("VALUESTRING", t.getTimexValue());
        open = open.replace("TYPESTRING", t.getTimexType());
        fullDocument.append(open);

        // get the ids of the tokens which are involved, i.e. lie completely
        // inside the span of the timex
        FSIterator tokenIt = jcas.getAnnotationIndex(Token.type).iterator();
        while (tokenIt.hasNext()) {
            Token tok = (Token) tokenIt.next();
            if ((tok.getBegin() >= t.getBegin()) && (tok.getEnd() <= t.getEnd())) {
                int tokID = tok.getTokenId();
                String line = tokenInfoSingle;
                line = line.replace("TOKENID", tokID+"");
                fullDocument.append(line);
            }
        }
        fullDocument.append(close);

        idTranslation.put(t.getTimexId(), markableCounter+"");
        markableCounter++;
    }

    // add document creation time tag; the value of its m_id attribute is
    // overwritten with the next free markable id
    Pattern p = Pattern.compile("m_id=\"([^\"]*)\"");
    Matcher m = p.matcher(dctTag);
    if (m.find()) {
        dctTag = dctTag.substring(0, m.start(1)) + (markableCounter++) + dctTag.substring(m.end(1), dctTag.length());
    }
    fullDocument.append(dctTag).append("\n");

    // add empty tags
    // NOTE(review): entry is not referenced by the template as it stands
    for (Entry<Timex3, Integer> entry : emptyValueTimexes.entrySet()) {
        String open = " \n";
        fullDocument.append(open);
    }

    // add empty tags from timex3intervals
    FSIterator tx3intIt = jcas.getAnnotationIndex(Timex3Interval.type).iterator();
    while (tx3intIt.hasNext()) {
        Timex3Interval tx3i = (Timex3Interval) tx3intIt.next();
        if (tx3i.getEmptyValue() != null && !tx3i.getEmptyValue().equals("")) {
            // markable ids of the interval's begin/end timexes
            // NOTE(review): looked up but not referenced by the template as it stands
            String beginMarkable = idTranslation.get(tx3i.getBeginTimex());
            String endMarkable = idTranslation.get(tx3i.getEndTimex());
            fullDocument.append(" \n");
        }
    }

    // add closing tag for markables
    fullDocument.append(" \n\n \n ");

    writeDocument(fullDocument.toString(), filename);
}
/**
 * Writes the given document text to the file {@code <filename>.xml} inside the
 * configured output directory. I/O failures are logged and not propagated.
 *
 * @param fullDocument complete text of the document to write
 * @param filename name of the file (without the ".xml" extension) that gets appended to the set output path
 */
private void writeDocument(String fullDocument, String filename) {
    // create output file handle
    File outFile = new File(mOutputDir, filename+".xml");
    BufferedWriter bw = null;
    try {
        // create a buffered writer for the output file
        // NOTE(review): FileWriter encodes with the platform default charset -
        // confirm this matches the encoding the output format declares
        bw = new BufferedWriter(new FileWriter(outFile));
        bw.append(fullDocument);
    } catch (IOException e) { // something went wrong with the bufferedwriter
        e.printStackTrace();
        Logger.printError(component, "File "+outFile.getAbsolutePath()+" could not be written.");
    } finally { // clean up for the bufferedwriter
        // bw is still null when opening the file failed; closing it
        // unconditionally would throw a NullPointerException out of this method
        if (bw != null) {
            try {
                bw.close();
            } catch (IOException e) {
                e.printStackTrace();
                Logger.printError(component, "File "+outFile.getAbsolutePath()+" could not be closed.");
            }
        }
    }
}
/**
 * Hands out the current value of the shared document counter and advances
 * the counter by one for the next caller.
 *
 * @return the counter value before it was advanced
 */
public static synchronized Integer getOutCount() {
    final Integer issued = outCount;
    outCount = issued + 1;
    return issued;
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy