de.unihd.dbs.uima.consumer.aceternwriter.ACETernWriter Maven / Gradle / Ivy
/*
* ACETernWriter.java
*
* Copyright (c) 2011, Database Research Group, Institute of Computer Science, University of Heidelberg.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the GNU General Public License.
*
* author: Jannik Strötgen
* email: [email protected]
*
* ACE Tern Writer creates files according to the ACE Tern style.
* For details, see http://dbs.ifi.uni-heidelberg.de/heideltime
*/
package de.unihd.dbs.uima.consumer.aceternwriter;
import java.util.List;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.regex.MatchResult;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.cas.FSIndex;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.collection.CasConsumer_ImplBase;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceProcessException;
import de.unihd.dbs.uima.types.heideltime.Timex3;
import de.unihd.dbs.uima.types.heideltime.SourceDocInfo;
/**
*
* @author jstroetgen
*
*/
public class ACETernWriter extends CasConsumer_ImplBase {
public static final String PARAM_OUTPUTDIR = "OutputDir";
public static final String PARAM_CONVERTTIMEX3TO2 = "ConvertTimex3To2";
private File mOutputDir;
private int mDocNum;
private Boolean convertTimex3To2 = true;
private boolean printDetails = false;
/**
* initialize
*/
public void initialize() throws ResourceInitializationException {
mDocNum = 0;
convertTimex3To2 = (Boolean) getConfigParameterValue(PARAM_CONVERTTIMEX3TO2);
mOutputDir = new File((String) getConfigParameterValue(PARAM_OUTPUTDIR));
if (!mOutputDir.exists()) {
mOutputDir.mkdirs();
}
}
/**
* process
*/
public void processCas(CAS aCAS) throws ResourceProcessException {
JCas jcas;
try {
jcas = aCAS.getJCas();
} catch (CASException e) {
throw new ResourceProcessException(e);
}
printTimexAnnotationsInline(jcas);
}
public void printTimexAnnotationsInline(JCas jcas){
// retrieve the filename of the input file from the CAS
FSIterator it = jcas.getAnnotationIndex(SourceDocInfo.type).iterator();
File outFile = null;
File inFile = null;
if (it.hasNext()) {
SourceDocInfo fileLoc = (SourceDocInfo) it.next();
try {
inFile = new File(new URL(fileLoc.getUri()).getPath());
String outFileName = inFile.getName();
if (fileLoc.getOffsetInSource() > 0) {
outFileName += ("_" + fileLoc.getOffsetInSource());
}
outFileName += ".xmi";
outFile = new File(mOutputDir, outFileName);
} catch (MalformedURLException e1) {
// invalid URL, use default processing below
}
}
if (outFile == null) {
outFile = new File(mOutputDir, "doc" + mDocNum++);
}
// what has to be printed?
String toprint = "";
// document text
String doctext = jcas.getDocumentText();
int startposition = 0;
int endposition = doctext.length();
boolean anyTimex = false;
// get timex index
FSIndex indexTimex = jcas.getAnnotationIndex(Timex3.type);
FSIterator iterTimex = indexTimex.iterator();
while (iterTimex.hasNext()){
anyTimex = true;
Timex3 t = (Timex3) iterTimex.next();
endposition = t.getBegin();
if (endposition < startposition){
if (printDetails == true){
System.err.println("[Tern2004Writer] Overlapping expressions... ignoring: "+t.getCoveredText());
}
}else{
// CHANGES DUE TO TIMEX2 not equal TIMEX3
String timexvalue = t.getTimexValue();
if(convertTimex3To2) {
timexvalue = translatetimex3timex2(timexvalue);
if (t.getTimexType().equals("SET")){
timexvalue = translatetimex3timex2set(timexvalue);
}
}
toprint = toprint + doctext.substring(startposition, endposition); // text from begin or last timex to begin of new timex
toprint = toprint + ""; // timex opening tag
toprint = toprint + t.getCoveredText(); // timex text
toprint = toprint + " ";
startposition = t.getEnd();
}
}
if (anyTimex == true){
// text from last timex to end
toprint = toprint + doctext.substring(startposition);
}
if (anyTimex == false){
// whole document text
toprint = toprint + doctext;
}
// print toprint text
try {
BufferedWriter bf = new BufferedWriter(new FileWriter(outFile));
bf.append(toprint);
bf.close();
} catch (IOException e1) {
e1.printStackTrace();
}
}
public String translatetimex3timex2(String timexvalue){
// change decades
String decade = "(.*\\d\\d\\d)X";
if (timexvalue.matches(decade)){
for (MatchResult m : findMatches(Pattern.compile(decade), timexvalue)){
timexvalue = m.group(1);
}
}
// change century
String century = "(.*\\d\\d)XX";
if (timexvalue.matches(century)){
for (MatchResult m : findMatches(Pattern.compile(century), timexvalue)){
timexvalue = m.group(1);
}
}
return timexvalue;
}
public String translatetimex3timex2set(String timexvalue){
// change year
String year = "(P(\\d)+Y)";
if (timexvalue.matches(year)){
timexvalue = "XXXX";
}
// change month
String month = "(P(\\d)+M)";
if (timexvalue.matches(month)){
timexvalue = "XXXX-XX";
}
// change day
String day = "(P(\\d)+D)";
if (timexvalue.matches(day)){
timexvalue = "XXXX-XX-XX";
}
// change hour
String hour = "(PT(\\d)+H)";
if (timexvalue.matches(hour)){
timexvalue = "XXXX-XX-XXTXX";
}
// change minute
String minute = "(PT(\\d)+M)";
if (timexvalue.matches(minute)){
timexvalue = "XXXX-XX-XXTXX:XX";
}
return timexvalue;
}
/**
* Find all the matches of a pattern in a charSequence and return the
* results as list.
*
* @param pattern
* @param s
* @return
*/
public static Iterable findMatches(Pattern pattern,
CharSequence s) {
List results = new ArrayList();
for (Matcher m = pattern.matcher(s); m.find();)
results.add(m.toMatchResult());
return results;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy