de.unihd.dbs.uima.consumer.eventi2014writer.Eventi2014Writer Maven / Gradle / Ivy
/*
* Eventi2014Writer.java
*
* Copyright (c) 2014, Database Research Group, Institute of Computer Science, Heidelberg University.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the GNU General Public License.
*
* author: Jannik Strötgen
* email: [email protected]
*
* The Eventi2014 Writer writes Eventi-style output.
* For details, see http://dbs.ifi.uni-heidelberg.de/heideltime
*/
package de.unihd.dbs.uima.consumer.eventi2014writer;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.collection.CasConsumer_ImplBase;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceProcessException;
import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger;
import de.unihd.dbs.uima.types.heideltime.Dct;
import de.unihd.dbs.uima.types.heideltime.Timex3;
import de.unihd.dbs.uima.types.heideltime.Timex3Interval;
import de.unihd.dbs.uima.types.heideltime.Token;
public class Eventi2014Writer extends CasConsumer_ImplBase {
private Class> component = this.getClass();
private static final String PARAM_OUTPUTDIR = "OutputDir";
// counter for outputting documents. gets increased in case there is no DCT/filename info
private static volatile Integer outCount = 0;
private File mOutputDir;
public void initialize() throws ResourceInitializationException {
mOutputDir = new File((String) getConfigParameterValue(PARAM_OUTPUTDIR));
if (!mOutputDir.exists()) {
if(!mOutputDir.mkdirs()) {
Logger.printError(component, "Couldn't create non-existant folder "+mOutputDir.getAbsolutePath());
throw new ResourceInitializationException();
}
}
if(!mOutputDir.canWrite()) {
Logger.printError(component, "Folder "+mOutputDir.getAbsolutePath()+" is not writable.");
throw new ResourceInitializationException();
}
}
public void processCas(CAS aCAS) throws ResourceProcessException {
JCas jcas;
try {
jcas = aCAS.getJCas();
} catch (CASException e) {
throw new ResourceProcessException(e);
}
// prepare everything for document
String fullDocument = "";
// get the DCT
Dct dct = null;
String filename = null;
String dctTag = "";
try {
dct = (Dct) jcas.getAnnotationIndex(Dct.type).iterator().next();
String[] parts = dct.getFilename().split("---");
filename = parts[0];
dctTag = parts[1];
} catch(Exception e) {
e.printStackTrace();
filename = "doc_" + Eventi2014Writer.getOutCount();
}
// create the document according to the formatting requirements of EVENTI 2014
// first line:
String firstLine = "\n";
fullDocument = firstLine.replaceAll("FILENAME", filename);
// get the tokens and add them to fullDocument
FSIterator itToken = jcas.getAnnotationIndex(Token.type).iterator();
int oldTokNum = 0;
int oldTokID = 0;
while (itToken.hasNext()){
Token t = (Token) itToken.next();
String[] parts = t.getFilename().split("---");
String sentNum = parts[1];
String tokNum = parts[2];
while (oldTokID < (t.getTokenId() - 1)){
oldTokNum++;
oldTokID++;
String tokenLine = "TOKENSTRING \n";
tokenLine = tokenLine.replace("TOKENID", oldTokID+"");
tokenLine = tokenLine.replace("SENTENCEID", sentNum);
tokenLine = tokenLine.replace("TOKENNUMBER", oldTokNum+"");
tokenLine = tokenLine.replace("TOKENSTRING", "");
fullDocument = fullDocument + tokenLine;
}
String tokenLine = "TOKENSTRING \n";
tokenLine = tokenLine.replace("TOKENID", t.getTokenId()+"");
tokenLine = tokenLine.replace("SENTENCEID", sentNum);
tokenLine = tokenLine.replace("TOKENNUMBER", tokNum);
tokenLine = tokenLine.replace("TOKENSTRING", t.getCoveredText());
oldTokNum = Integer.parseInt(tokNum);
oldTokID = t.getTokenId();
// System.err.println("TOKEN FOUND....-->" + t.getCoveredText() + "<--");
fullDocument = fullDocument + tokenLine;
}
// add opening markable tag
fullDocument = fullDocument + "\n\n\n";
// collection for timexes which have an emptyValue attribute
HashMap emptyValueTimexes = new HashMap();
// association for HeidelTime-internal Timex3 IDs -> markable_ids
HashMap idTranslation = new HashMap();
// get the timex3s and add them to fullDocument
int markableCounter = 1;
FSIterator itTimex = jcas.getAnnotationIndex(Timex3.type).iterator();
while (itTimex.hasNext()){
Timex3 t = (Timex3) itTimex.next();
if(t instanceof Timex3Interval) continue;
if(t.getEmptyValue() != null && !t.getEmptyValue().equals(""))
emptyValueTimexes.put(t, markableCounter);
// full tag - probably not required
// String open = "";
String open = "\n";
String tokenInfoSingle = "\n";
String close = " \n";
// set the attributes of the TIMEX3 annotations
open = open.replace("MARKABLEID", markableCounter+"");
open = open.replace("MODSTRING", t.getTimexMod());
open = open.replace("QUANTSTRING", t.getTimexQuant());
open = open.replace("FREQSTRING", t.getTimexFreq());
open = open.replace("VALUESTRING", t.getTimexValue());
open = open.replace("TYPESTRING", t.getTimexType());
fullDocument = fullDocument + open;
// get the ids of the tokens which are involved
FSIterator tokenIt = jcas.getAnnotationIndex(Token.type).iterator();
while (tokenIt.hasNext())
{
Token tok = (Token) tokenIt.next();
if ((tok.getBegin() >= t.getBegin()) && (tok.getEnd() <= t.getEnd())){
int tokID = tok.getTokenId();
String line = tokenInfoSingle;
line = line.replace("TOKENID", tokID+"");
fullDocument = fullDocument + line;
}
}
fullDocument = fullDocument + close;
idTranslation.put(t.getTimexId(), markableCounter+"");
markableCounter++;
}
// add document creation time tag
Pattern p = Pattern.compile("m_id=\"([^\"]*)\"");
Matcher m = p.matcher(dctTag);
if(m.find())
dctTag = dctTag.substring(0, m.start(1)) + (markableCounter++) + dctTag.substring(m.end(1), dctTag.length());
fullDocument = fullDocument + dctTag + "\n";
// add empty tags
for(Entry entry : emptyValueTimexes.entrySet()) {
String open = " \n";
fullDocument = fullDocument + open;
}
// add empty tags from timex3intervals
FSIterator tx3intIt = jcas.getAnnotationIndex(Timex3Interval.type).iterator();
while(tx3intIt.hasNext()) {
Timex3Interval tx3i = (Timex3Interval) tx3intIt.next();
if(tx3i.getEmptyValue() != null && !tx3i.getEmptyValue().equals("")) {
String beginMarkable = idTranslation.get(tx3i.getBeginTimex());
String endMarkable = idTranslation.get(tx3i.getEndTimex());
fullDocument += " \n";
}
}
// add closing tag for markables
fullDocument += " \n\n \n ";
writeDocument(fullDocument, filename);
}
/**
* writes a populated DOM xml(timeml) document to a given directory/file
* @param xmlDoc xml dom object
* @param filename name of the file that gets appended to the set output path
*/
private void writeDocument(String fullDocument, String filename) {
// create output file handle
File outFile = new File(mOutputDir, filename+".xml");
BufferedWriter bw = null;
try {
// create a buffered writer for the output file
bw = new BufferedWriter(new FileWriter(outFile));
bw.append(fullDocument);
} catch (IOException e) { // something went wrong with the bufferedwriter
e.printStackTrace();
Logger.printError(component, "File "+outFile.getAbsolutePath()+" could not be written.");
} finally { // clean up for the bufferedwriter
try {
bw.close();
} catch(IOException e) {
e.printStackTrace();
Logger.printError(component, "File "+outFile.getAbsolutePath()+" could not be closed.");
}
}
}
public static synchronized Integer getOutCount() {
return outCount++;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy