de.unihd.dbs.uima.reader.aceternreader.ACETernReader Maven / Gradle / Ivy
/*
* ACETernReader.java
*
* Copyright (c) 2011, Database Research Group, Institute of Computer Science, University of Heidelberg.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the GNU General Public License.
*
* author: Jannik Strötgen
* email: [email protected]
*
* ACE Tern Reader reads temporally annotated corpora that are in the ACE Tern style.
* For details, see http://dbs.ifi.uni-heidelberg.de/heideltime
*/
package de.unihd.dbs.uima.reader.aceternreader;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.MatchResult;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.collection.CollectionReader_ImplBase;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceConfigurationException;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.FileUtils;
import org.apache.uima.util.Level;
import org.apache.uima.util.Logger;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;
import de.unihd.dbs.uima.types.heideltime.Dct;
import de.unihd.dbs.uima.types.heideltime.SourceDocInfo;
/**
* CollectionReader for ACE Tern Data
*/
public class ACETernReader extends CollectionReader_ImplBase {
/** UIMA logger; assigned from the UIMA context when the reader is initialized. */
private static Logger logger = null;
/** Prefix used in the diagnostic messages this reader prints to standard error. */
private static final String compontent_id = "de.unihd.dbs.uima.reader.aceternreader";
/**
* Public document counter, initialised to 0.
* NOTE(review): no code in this class appears to update it; getNumberOfDocuments()
* reports the size of the collected file list instead - confirm whether this field is still needed.
*/
public Integer numberOfDocuments = 0;
/**
* Name of the configuration parameter that holds the path of the input directory.
*/
public static final String PARAM_INPUTDIR = "InputDirectory";
/**
* Name of the configuration parameter that switches document creation time annotation on or off.
*/
public static final String PARAM_DCT = "AnnotateCreationTime";
/**
* Whether a document creation time annotation is created for each document; defaults to false.
*/
public Boolean annotateDCT = false;
/**
* Files (not subdirectories) found in the input directory, in the order they were listed.
* Typed as a list of File so that elements no longer need unchecked handling.
*/
private ArrayList<File> mFiles;
/**
* Index into the file list of the next file to be read.
*/
private int currentIndex;
/**
 * Reads the reader configuration and collects the files to process.
 *
 * <p>The {@code AnnotateCreationTime} flag is stored in {@code annotateDCT}; a value of
 * {@code null} is treated as {@code false}. Every entry directly inside the configured
 * {@code InputDirectory} that is not itself a directory is queued for reading;
 * subdirectories are not descended into.
 *
 * @throws ResourceInitializationException if the input directory parameter is absent, the
 *         path does not exist or is not a directory, or its contents cannot be listed
 * @see org.apache.uima.collection.CollectionReader_ImplBase#initialize()
 */
public void initialize() throws ResourceInitializationException {
    logger = getUimaContext().getLogger();
    logger.log(Level.INFO, "initialize() - Initializing ACETern-Reader...");

    // Guard against a null parameter value so that later unboxing of annotateDCT cannot fail.
    Boolean configuredDct = (Boolean) getConfigParameterValue(PARAM_DCT);
    annotateDCT = configuredDct != null && configuredDct.booleanValue();

    // Fail with a descriptive configuration error instead of a NullPointerException on trim().
    String inputDirParam = (String) getConfigParameterValue(PARAM_INPUTDIR);
    if (inputDirParam == null) {
        throw new ResourceInitializationException(
                ResourceInitializationException.CONFIG_SETTING_ABSENT,
                new Object[] { PARAM_INPUTDIR });
    }
    File directory = new File(inputDirParam.trim());
    currentIndex = 0;

    // if input directory does not exist or is not a directory, throw exception
    // (isDirectory() is already false for a path that does not exist)
    if (!directory.isDirectory()) {
        throw new ResourceInitializationException(ResourceConfigurationException.DIRECTORY_NOT_FOUND,
                new Object[] { PARAM_INPUTDIR, this.getMetaData().getName(), directory.getPath() });
    }

    // listFiles() returns null when the directory cannot be read (I/O error, permissions).
    File[] entries = directory.listFiles();
    if (entries == null) {
        throw new ResourceInitializationException(
                new IOException("Cannot list contents of input directory " + directory.getPath()));
    }

    // get list of files (without subdirectories) in the specified directory
    ArrayList<File> regularFiles = new ArrayList<File>(entries.length);
    for (File entry : entries) {
        if (!entry.isDirectory()) {
            regularFiles.add(entry);
        }
    }
    mFiles = regularFiles;
}
/**
 * Reports whether at least one collected file has not been handed out yet.
 *
 * @return {@code true} while the current index is below the number of collected files
 * @see org.apache.uima.collection.CollectionReader#hasNext()
 */
public boolean hasNext() {
    final int remaining = mFiles.size() - currentIndex;
    return remaining > 0;
}
/**
 * Loads the next collected file into the given CAS.
 *
 * <p>The file content becomes the document text, a {@code SourceDocInfo} annotation carrying
 * the file's URI is added to the indexes, and, when {@code annotateDCT} is {@code true},
 * {@code setDCT} is called with the raw file content to add a document creation time.
 * A progress dot is printed to standard error for every call.
 *
 * @param aCAS the CAS to populate
 * @throws IOException if the file cannot be read
 * @throws CollectionException if no JCas can be obtained from {@code aCAS}
 * @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS)
 */
public void getNext(CAS aCAS) throws IOException, CollectionException {
    System.err.print(".");

    JCas jcas;
    try {
        jcas = aCAS.getJCas();
    } catch (CASException e) {
        throw new CollectionException(e);
    }

    // pick the next file and advance the cursor
    File file = (File) mFiles.get(currentIndex++);
    logger.log(Level.INFO, "getNext(CAS) - Reading file " + file.getName());

    String xml = FileUtils.file2String(file);

    // put document into CAS
    // NOTE(review): as written, the pattern consists of the DOTALL flag only, so it matches
    // empty strings and leaves the text unchanged; part of the pattern may have been lost -
    // confirm against the upstream source before relying on any tag stripping here.
    String text = xml.replaceAll("(?s)", "");
    jcas.setDocumentText(text);

    // Keep Source document information
    SourceDocInfo srcDocInfo = new SourceDocInfo(jcas);
    URL url = file.getAbsoluteFile().toURI().toURL();
    String uri = url.toString();
    srcDocInfo.setUri(uri);
    srcDocInfo.addToIndexes();

    // annotateDCT is a public Boolean and may be null; Boolean.TRUE.equals avoids a
    // NullPointerException from unboxing and treats null as "do not annotate".
    if (Boolean.TRUE.equals(annotateDCT)) {
        setDCT(xml, jcas, uri);
    }
}
/**
* Tries to determine the document creation time (DCT) from the raw document content and,
* when one is found, adds a Dct annotation (begin 0, end 1, timex id "dct") to the CAS indexes.
*
* The document is first searched with a combined pattern built from the nine date formats
* declared below; the last match is used. Its value is normalised to YYYY-MM-DD and, for
* formats that carry a time, to YYYY-MM-DDThh:mm:ss; the time form is preferred as Dct value.
* If the combined pattern does not match at all, a sequence of fallback patterns is tried.
* When nothing can be determined a message is printed to standard error and no Dct is added.
*
* @param xml raw document content that is searched for date expressions
* @param jcas JCas in which the Dct annotation is created
* @param filename value stored on the Dct annotation via setFilename (getNext passes the document URI)
*/
@SuppressWarnings("unused")
public void setDCT(String xml, JCas jcas, String filename){
// SET DOCUMENT CREATION TIME!!!!
// possible tags for DCT:
// DATETIME (all WikiWar documents) with the following format 2009-12-20T17:00:00
// DATE_TIME (Tern 2004) with the following format "10/17/2000 18:46:13.59" "10/17/2000 18:41:01.17" "11/04/2000 9:14:43.41" "2000-10-01 20:56:35"
// DATE (Tern 2004) with the following format "07/15/2000" "1996-02-13" "1997-03-09 10:50:59"
// WITHOUT DATE ARE THE ACE TERN 2004 training files: chtb_171.eng.sgm, 172, 174, 179, 183,
// DATETIME (ACE 2005 training) with the following formats additionally: 20041221-20:24:00, 20030422
String datetimetag = null;
// possible date formats
// Each format is wrapped in leading and trailing "(.*?)" groups, so the date parts start at group 2.
String dateformat1 = "(.*?)(\\d\\d\\d\\d)-(\\d\\d)-(\\d\\d)(T| )(\\d\\d):(\\d\\d):(\\d\\d)(.*?)"; // 2009-12-20T17:00:00 or 2000-10-01 20:56:35
String dateformat2 = "(.*?)(\\d\\d\\d\\d)-(\\d\\d)-(\\d\\d)(T| )(\\d):(\\d\\d):(\\d\\d)(.*?)"; // 2009-12-20T7:00:00 or 2000-10-01 9:56:35
String dateformat3 = "(.*?)(\\d\\d)/(\\d\\d)/(\\d\\d\\d\\d) (\\d\\d):(\\d\\d):(\\d\\d)\\.(\\d\\d)(.*?)"; // 10/17/2000 18:46:13.59
String dateformat4 = "(.*?)(\\d\\d)/(\\d\\d)/(\\d\\d\\d\\d) (\\d):(\\d\\d):(\\d\\d)\\.(\\d\\d)(.*?)"; // 10/17/2000 1:46:13.59
String dateformat5 = "(.*?)(\\d\\d\\d\\d)-(\\d\\d)-(\\d\\d)(.*?)"; // 1996-02-13
String dateformat6 = "(.*?)(\\d\\d)/(\\d\\d)/(\\d\\d\\d\\d)(.*?)"; // 07/15/2000
String dateformat7 = "(.*?)(January|February|March|April|May|June|July|August|September|October|November|December) ([\\d]?[\\d]),? (\\d\\d\\d\\d)(.*?)";
String dateformat8 = "(.*?)(\\d\\d\\d\\d)(\\d\\d)(\\d\\d)-(\\d\\d):(\\d\\d):(\\d\\d)(.*?)"; // 20041221-20:24:00
String dateformat9 = "(.*?)(\\d\\d\\d\\d)(\\d\\d)(\\d\\d)(.*?)"; // 20030422
// Search the whole document with the combined pattern; datetimetag ends up holding group 2
// (the alternation of all formats) of the LAST match.
// NOTE(review): the leading "(|||)" and trailing "( |||)" groups consist of empty alternatives
// (plus a single space in the trailing one) as written; tag names may have been lost from
// this pattern - confirm against the upstream source.
for (MatchResult m : findMatches(Pattern.compile("(|||)(("+dateformat1+
")|("+dateformat2+
")|("+dateformat3+
")|("+dateformat4+
")|("+dateformat5+
")|("+dateformat6+
")|("+dateformat7+
")|("+dateformat8+
")|("+dateformat9+")( |||))"), xml)){
datetimetag = m.group(2);
}
// Normalised results: date_value is YYYY-MM-DD, time_value additionally carries the time of day.
String time_value = null;
String date_value = null;
if (!(datetimetag == null)){
// A date expression was found: normalise it with the first format that matches it entirely.
if (datetimetag.matches(dateformat1)){
for (MatchResult m : findMatches(Pattern.compile(dateformat1), datetimetag)){
date_value = m.group(2)+"-"+m.group(3)+"-"+m.group(4);
time_value = m.group(2)+"-"+m.group(3)+"-"+m.group(4)+"T"+m.group(6)+":"+m.group(7)+":"+m.group(8);
}
}
else if (datetimetag.matches(dateformat2)){
// single-digit hour: padded with a leading zero ("T0")
for (MatchResult m : findMatches(Pattern.compile(dateformat2), datetimetag)){
date_value = m.group(2)+"-"+m.group(3)+"-"+m.group(4);
time_value = m.group(2)+"-"+m.group(3)+"-"+m.group(4)+"T0"+m.group(6)+":"+m.group(7)+":"+m.group(8);
}
}
else if (datetimetag.matches(dateformat3)){
// slash-separated date: the year is group 4, so the groups are reordered to year-first
for (MatchResult m : findMatches(Pattern.compile(dateformat3), datetimetag)){
date_value = m.group(4)+"-"+m.group(2)+"-"+m.group(3);
time_value = m.group(4)+"-"+m.group(2)+"-"+m.group(3)+"T"+m.group(5)+":"+m.group(6)+":"+m.group(7)+"."+m.group(8);
}
}
else if (datetimetag.matches(dateformat4)){
// as dateformat3, but with a single-digit hour that is zero-padded
for (MatchResult m : findMatches(Pattern.compile(dateformat4), datetimetag)){
date_value = m.group(4)+"-"+m.group(2)+"-"+m.group(3);
time_value = m.group(4)+"-"+m.group(2)+"-"+m.group(3)+"T0"+m.group(5)+":"+m.group(6)+":"+m.group(7)+"."+m.group(8);
}
}
else if (datetimetag.matches(dateformat5)){
// date only, already in year-month-day order
for (MatchResult m : findMatches(Pattern.compile(dateformat5), datetimetag)){
date_value = m.group(2)+"-"+m.group(3)+"-"+m.group(4);
}
}
else if (datetimetag.matches(dateformat6)){
// slash-separated date only, reordered to year-first
for (MatchResult m : findMatches(Pattern.compile(dateformat6), datetimetag)){
date_value = m.group(4)+"-"+m.group(2)+"-"+m.group(3);
}
}
else if (datetimetag.matches(dateformat7)){
// written-out English month name; month and day are converted to two-digit numbers
for (MatchResult m : findMatches(Pattern.compile(dateformat7), datetimetag)){
String year = m.group(4);
String month = normMonth(m.group(2));
String day = normDay(m.group(3));
date_value = year+"-"+month+"-"+day;
}
}
else if (datetimetag.matches(dateformat8)){
// compact YYYYMMDD-hh:mm:ss form
for (MatchResult m : findMatches(Pattern.compile(dateformat8), datetimetag)){
date_value = m.group(2)+"-"+m.group(3)+"-"+m.group(4);
time_value = m.group(2)+"-"+m.group(3)+"-"+m.group(4)+"T"+m.group(5)+":"+m.group(6)+":"+m.group(7);
}
}
else if (datetimetag.matches(dateformat9)){
// compact YYYYMMDD form
for (MatchResult m : findMatches(Pattern.compile(dateformat9), datetimetag)){
date_value = m.group(2)+"-"+m.group(3)+"-"+m.group(4);
}
}
else{
// no format matched: date_value stays null, so no Dct annotation is created below
System.err.println();
System.err.println("["+compontent_id+"] cannot set dct with datetimetag: "+datetimetag);
}
if (!(date_value == null)){
// Create the Dct annotation; it is anchored at offsets 0-1 of the document text.
Dct dct = new Dct(jcas);
dct.setBegin(0);
dct.setEnd(1);
dct.setFilename(filename);
dct.setTimexId("dct");
// Prefer the value that includes the time of day.
if (!(time_value == null)){
dct.setValue(time_value);
// System.err.println("["+compontent_id+"] set dct to: "+time_value);
}else if (!(date_value == null)){
// always true here, because the enclosing if already checked date_value
dct.setValue(date_value);
// System.err.println("["+compontent_id+"] set dct to: "+date_value);
}
else{
// not reachable given the enclosing date_value check
System.err.println();
System.err.println("["+compontent_id+"] something wrong with setting DCT of : "+datetimetag);
}
dct.addToIndexes();
}
}
else{
// The combined pattern found nothing: try a sequence of fallbacks, each one only if
// date_value is still null. date_value is necessarily null when this branch is entered.
if (date_value == null){
// System.err.println("Checking for further formats of DCT...");
// Fallback 1: year from a "DATE: <Month> <day>, <year>" line, month/day from an
// abbreviated month followed by a day number and a space.
String refYear = "";
String refMonth = "";
String refDay = "";
for (MatchResult m1 : findMatches(Pattern.compile("DATE:[\\s]+("+dateformat7+")"),xml)){
String referenceDate = m1.group(1);
if (referenceDate.matches(dateformat7)){
for (MatchResult mr : findMatches(Pattern.compile(dateformat7), referenceDate)){
refYear = mr.group(4);
refMonth = normMonth(mr.group(2));
refDay = normDay(mr.group(3));
}
}
}
// NOTE(review): refYear is still "" here if no "DATE:" line matched, in which case the
// resulting date_value starts with "-" - confirm whether that is intended.
for (MatchResult m : findMatches(Pattern.compile(""
+"(Jan\\.|Feb\\.|Mar\\.|Apr\\.|May\\.|Jun\\.|Jul\\.|Aug\\.|Sep\\.|Oct\\.|Nov\\.|Dec\\.|"
+ "JAN\\.|FEB\\.|MAR\\.|APR\\.|MAY\\.|JUN\\.|JUL\\.|AUG\\.|SEP\\.|OCT\\.|NOV\\.|DEC\\.)[\\s]+([\\d]?[\\d])"
+" "), xml)){
String exactMonth = m.group(1);
String exactDay = m.group(2);
date_value = refYear+"-"+normMonth(exactMonth)+"-"+normDay(exactDay);
}
}
if (date_value == null){
// Fallback 2: eight consecutive digits read as YYYYMMDD, followed later on the line by a space.
for (MatchResult m : findMatches(Pattern.compile(""
+".*?(\\d\\d\\d\\d)(\\d\\d)(\\d\\d).*?"
+" "), xml)){
String exactYear = m.group(1);
String exactMonth = m.group(2);
String exactDay = m.group(3);
date_value = exactYear+"-"+exactMonth+"-"+exactDay;
}
}
if (date_value == null){
// Fallback 3: year from an eight-digit number, month/day from a written-out month name.
// NOTE(review): the reference pattern below is textually identical to the one used in
// fallback 2, which found nothing if execution gets here, so refYear would stay "" -
// part of the pattern may have been lost; confirm against the upstream source.
String refYear = "";
String refMonth = "";
String refDay = "";
for (MatchResult m : findMatches(Pattern.compile(".*?(\\d\\d\\d\\d)(\\d\\d)(\\d\\d).*? "),xml)){
refYear = m.group(1);
refMonth = normMonth(m.group(2));
refDay = normDay(m.group(3));
}
if (!(refYear.matches(""))){
for (MatchResult m : findMatches(Pattern.compile(".*?"
+"(January|February|March|April|May|June|July|August|September|October|November|December) ([\\d]?[\\d]).*?"+
" "), xml)){
String exactMonth = normMonth(m.group(1));
String exactDay = normDay(m.group(2));
date_value = refYear+"-"+exactMonth+"-"+exactDay;
}
}
}
if (date_value == null){
// Fallback 4: year from a "Publish Date: xx/xx/yy" line (two-digit year prefixed with "19"),
// month/day from an abbreviated month followed by a day number.
String refYear = "";
String refMonth = "";
String refDay = "";
for (MatchResult m : findMatches(Pattern.compile("Publish Date:[\\s]+(\\d\\d)/(\\d\\d)/(\\d\\d)"),xml)){
refYear = "19"+m.group(3);
refMonth = normMonth(m.group(1));
refDay = normDay(m.group(2));
}
if (!(refYear.matches(""))){
for (MatchResult m : findMatches(Pattern.compile(".*?"
+"(Jan\\.|Feb\\.|Mar\\.|Apr\\.|May\\.|Jun\\.|Jul\\.|Aug\\.|Sep\\.|Oct\\.|Nov\\.|Dec\\.|"
+ "JAN\\.|FEB\\.|MAR\\.|APR\\.|MAY\\.|JUN\\.|JUL\\.|AUG\\.|SEP\\.|OCT\\.|NOV\\.|DEC\\.)[\\s]+([\\d]?[\\d]).*?"+
" "), xml)){
String exactMonth = normMonth(m.group(1));
String exactDay = normDay(m.group(2));
date_value = refYear+"-"+exactMonth+"-"+exactDay;
}
}
}
// Document Creation Time style of EVALITA I-CAB corpus (Italian corpus)
// example:
if (date_value == null){
// Fallback 5. datetimetag is null on entry to this branch, so datetimetag.matches(...)
// throws a NullPointerException unless the loop assigns it; the catch below swallows that.
// NOTE(review): the pattern "()" declares a single group, so m.group(2) would fail with an
// IndexOutOfBoundsException that is not caught here; the pattern text was probably lost -
// confirm against the upstream source.
try {
for (MatchResult m : findMatches(Pattern.compile("()"), xml)){
datetimetag = m.group(2);
}
if (datetimetag.matches(dateformat9)){
for (MatchResult m : findMatches(Pattern.compile(dateformat9), datetimetag)){
date_value = m.group(2)+"-"+m.group(3)+"-"+m.group(4);
}
} else {
System.err.println();
System.err.println("["+compontent_id+"] cannot set dct with datetimetag: "+datetimetag);
}
} catch(NullPointerException e) { } // nothing to see here, carry on
}
if (date_value == null){
// every fallback failed: report it and leave the document without a Dct annotation
System.err.println();
System.err.println("["+compontent_id+"] Cannot set Document Creation Time - no datetimetag found in "+filename+"!");
}
else{
// Create the Dct annotation from the date found by one of the fallbacks (date only, no time).
Dct dct = new Dct(jcas);
dct.setBegin(0);
dct.setEnd(1);
dct.setFilename(filename);
dct.setTimexId("dct");
dct.setValue(date_value);
dct.addToIndexes();
}
}
}
/**
 * Pads a single-digit day of month ("1" to "9") with a leading zero.
 *
 * @param day day of month as text
 * @return the two-character form for "1" to "9"; any other input is returned unchanged
 */
public String normDay(String day){
    // Only the single digits 1-9 are rewritten; "0", two-digit values and anything
    // else pass through untouched.
    if (day.matches("[1-9]")) {
        return "0" + day;
    }
    return day;
}
/**
 * Converts an English month name or abbreviation into its two-digit number.
 *
 * <p>The comparison is case-insensitive and only looks at the first three letters,
 * so "January", "Jan." and "JAN." all yield "01".
 *
 * @param month month name, abbreviation, or any other text
 * @return "01" to "12" for a recognised month; otherwise the input unchanged
 */
public String normMonth(String month){
    final String[] prefixes = {
        "jan", "feb", "mar", "apr", "may", "jun",
        "jul", "aug", "sep", "oct", "nov", "dec"
    };
    final String lowered = month.toLowerCase();
    for (int idx = 0; idx < prefixes.length; idx++) {
        if (lowered.startsWith(prefixes[idx])) {
            int number = idx + 1;
            return (number < 10 ? "0" : "") + number;
        }
    }
    // not a month name: hand the input back as it came in
    return month;
}
/**
* Intentionally empty: this reader keeps only a logger, counters, flags and a list of
* File objects in its fields, so there is nothing to release here.
*
* @see org.apache.uima.collection.base_cpm.BaseCollectionReader#close()
*/
public void close() throws IOException {
}
/**
 * Reports how far the reader has advanced through the collected files.
 *
 * @return a one-element array whose entry counts, in entities, the current file index
 *         against the total number of collected files
 * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#getProgress()
 */
public Progress[] getProgress() {
    final Progress filesDone = new ProgressImpl(currentIndex, mFiles.size(), Progress.ENTITIES);
    return new Progress[] { filesDone };
}
/**
 * Returns how many files were collected from the input directory, i.e. the total number
 * of documents this reader hands out. This method is not part of the general collection
 * reader interface.
 *
 * @return the number of documents in the collection
 */
public int getNumberOfDocuments() {
    final int total = mFiles.size();
    return total;
}
/**
 * Finds all non-overlapping matches of a pattern in a character sequence and returns
 * them as a list, in the order they occur.
 *
 * <p>The return type is parameterised with {@link MatchResult} so that callers can iterate
 * with {@code for (MatchResult m : findMatches(...))}; with a raw {@code Iterable} that
 * loop does not compile.
 *
 * @param pattern the compiled pattern to search for
 * @param s the text to search in
 * @return an immutable snapshot of each match (empty if there is none)
 */
public static Iterable<MatchResult> findMatches(Pattern pattern,
        CharSequence s) {
    List<MatchResult> results = new ArrayList<MatchResult>();
    Matcher matcher = pattern.matcher(s);
    while (matcher.find()) {
        // toMatchResult() snapshots the current match, so later find() calls do not affect it
        results.add(matcher.toMatchResult());
    }
    return results;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy