de.unihd.dbs.uima.reader.eventi2014reader.Eventi2014Reader Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of heideltime Show documentation
Show all versions of heideltime Show documentation
HeidelTime is a multilingual cross-domain temporal tagger that extracts temporal expressions from documents and normalizes them according to the TIMEX3 annotation standard.
/*
* Eventi2014Reader.java
*
* Copyright (c) 2014, Database Research Group, Institute of Computer Science, Heidelberg University.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the GNU General Public License.
*
* author: Jannik Strötgen
* email: [email protected]
*
* The Eventi2014 Reader reads Eventi corpora.
* For details, see http://dbs.ifi.uni-heidelberg.de/heideltime
*/
package de.unihd.dbs.uima.reader.eventi2014reader;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Queue;
import java.util.regex.MatchResult;
import java.util.regex.Pattern;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.collection.CollectionReader_ImplBase;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.FileUtils;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;
import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger;
import de.unihd.dbs.uima.annotator.heideltime.utilities.Toolbox;
import de.unihd.dbs.uima.types.heideltime.Dct;
import de.unihd.dbs.uima.types.heideltime.Sentence;
import de.unihd.dbs.uima.types.heideltime.Token;
/**
* CollectionReader for Eventi 2014 corpus data
*/
public class Eventi2014Reader extends CollectionReader_ImplBase {
private Class> component = this.getClass();
// uima descriptor parameter name
private String PARAM_INPUTDIR = "InputDirectory";
private Integer numberOfDocuments = 0;
// For improving the formatting of the documentText
// -> to not have a space between all the tokens
// HashSet containing tokens in front of which no white space is added
private HashSet hsNoSpaceBefore = new HashSet();
private HashSet hsNoSpaceBehind = new HashSet();
private Queue files = new LinkedList();
/**
 * Reads the input directory from the descriptor, fills the two punctuation
 * sets used for white space handling and collects the files to be read.
 *
 * @throws ResourceInitializationException if the input directory parameter is
 *         missing or not a String, or if populateFileList rejects the directory
 */
public void initialize() throws ResourceInitializationException {
    Object configured = getConfigParameterValue(PARAM_INPUTDIR);
    // A missing (null) or wrongly typed parameter used to surface as a bare
    // NullPointerException / ClassCastException; report it as a configuration error.
    if (!(configured instanceof String)) {
        throw new ResourceInitializationException(new IllegalArgumentException(
                "Configuration parameter \"" + PARAM_INPUTDIR
                + "\" must be set to the path of the input directory."));
    }
    String dirPath = ((String) configured).trim();

    // tokens that are attached directly to the preceding token
    hsNoSpaceBefore.add(".");
    hsNoSpaceBefore.add(",");
    hsNoSpaceBefore.add(":");
    hsNoSpaceBefore.add(";");
    hsNoSpaceBefore.add("?");
    hsNoSpaceBefore.add("!");
    hsNoSpaceBefore.add(")");
    // tokens that are attached directly to the following token
    hsNoSpaceBehind.add("(");

    populateFileList(dirPath);
}
/**
 * Fills the given CAS with the content of the next queued file.
 *
 * @param aCAS the CAS to populate; its JCas view is obtained and handed to fillJCas
 * @throws IOException propagated from fillJCas
 * @throws CollectionException if the JCas cannot be obtained from the CAS, or
 *         propagated from fillJCas
 */
public void getNext(CAS aCAS) throws IOException, CollectionException {
    final JCas target;
    try {
        target = aCAS.getJCas();
    } catch (CASException casProblem) {
        // wrap so that callers only have to deal with the declared exception type
        throw new CollectionException(casProblem);
    }

    fillJCas(target);

    // one dot on stderr per processed file as a minimal progress indicator
    System.err.print(".");
}
/**
 * Reads the next queued file and fills the given JCas with its text and annotations.
 *
 * The file is taken from the head of the files queue, loaded as one string and
 * processed line by line. Token lines contribute to the reconstructed document
 * text and produce Token annotations; a Sentence annotation is added whenever a
 * token with number 0 starts a new sentence, and once more after the last line.
 * If a document creation time was found, a Dct annotation spanning the whole
 * text is added as well.
 *
 * NOTE(review): the three line.startsWith(...) statements inside the loop look
 * truncated in this copy of the file (string literals, the paConstraint pattern
 * definitions and the opening braces seem to be missing). Restore them from the
 * upstream source before compiling.
 *
 * @param jcas the JCas that receives the document text and the annotations
 * @throws IOException declared for reading the file; presumably raised by FileUtils.file2String - confirm
 * @throws CollectionException declared, but no statement visible in this method throws it directly
 */
private void fillJCas(JCas jcas) throws IOException, CollectionException {
// grab a file to process
// NOTE(review): Queue.poll() returns null when the queue is empty and there is no
// guard here, so callers are expected to check hasNext() first
File f = files.poll();
// document text, rebuilt token by token
String text = "";
String xml = FileUtils.file2String(f);
String[] lines = xml.split("\n");
// group 1 of the DCT match; stored together with the file name on the Dct annotation
String fullDctTag = "";
// group 2 of the DCT match; stored as the value of the Dct annotation
String dct = "";
String filename = "";
// previously appended token, needed for the hsNoSpaceBehind check
String lastTok = "";
// character offsets of the sentence currently being collected;
// sentEnd stays -1 until the first token has been added
int sentBegin = 0;
int sentEnd = -1;
for (String line : lines) {
// get document name
// NOTE(review): statement appears truncated in this copy (see method comment)
if (line.startsWith("");
for (MatchResult mr : Toolbox.findMatches(paConstraint,line)) {
filename = mr.group(1);
}
}
// handle the tokens
// NOTE(review): statement appears truncated in this copy (see method comment)
if (line.startsWith("(.*?) ");
for (MatchResult mr : Toolbox.findMatches(paConstraint,line)) {
// group 4: token text; groups 1-3: token id, sentence number, token number
String token = mr.group(4);
// System.err.println("INPUT: -->" + token + "<--");
int tokID = Integer.parseInt(mr.group(1));
int sentNum = Integer.parseInt(mr.group(2));
int tokNum = Integer.parseInt(mr.group(3));
// prepare token annotation
int tokBegin;
int tokEnd;
// very first token of the document (no text collected so far)
if (text.equals("")){
tokBegin = 0;
tokEnd = token.length();
text = token;
lastTok = token;
}
else{
// tokens without space before the tokens
if (hsNoSpaceBefore.contains(token)){
tokBegin = text.length();
tokEnd = tokBegin + token.length();
text = text + token;
lastTok = token;
}
// // empty tokens
// else if (token.equals("")){
// tokBegin = text.length();
// tokEnd = tokBegin + token.length();
// text = text + token;
// lastTok = token;
// }
else{
// previous token is NOT in hsNoSpaceBehind: separate the tokens by one space
if (!(hsNoSpaceBehind.contains(lastTok))){
tokBegin = text.length()+ 1;
text = text + " " + token;
}
// previous token is in hsNoSpaceBehind: append the token directly
else{
tokBegin = text.length();
text = text + token;
}
tokEnd = tokBegin + token.length();
lastTok = token;
}
}
// check for new sentences
if (tokNum == 0){
if (sentEnd >= 0){
// add sentence annotation, once a new sentence starts
addSentenceAnnotation(jcas, sentBegin, sentEnd, filename);
}
sentBegin = tokBegin;
}
// add the token annotation
addTokenAnnotation(jcas, tokBegin, tokEnd, tokID, filename, sentNum, tokNum);
// the current sentence now reaches up to the end of this token
sentEnd = tokEnd;
}
}
// get the document creation time
// NOTE(review): statement appears truncated in this copy (see method comment)
if (line.startsWith(")");
for (MatchResult mr : Toolbox.findMatches(paConstraint,line)) {
fullDctTag = mr.group(1);
dct = mr.group(2);
System.err.println("DCT: " + dct);
}
}
}
// add the very last sentence annotation
// NOTE(review): if no token line matched, sentEnd is still -1 here and a
// sentence with begin 0 and end -1 is added - confirm whether that is intended
addSentenceAnnotation(jcas, sentBegin, sentEnd, filename);
jcas.setDocumentText(text);
// add DCT to jcas
if (!(dct.equals(""))){
Dct dctAnnotation = new Dct(jcas);
// the Dct annotation covers the complete document text
dctAnnotation.setBegin(0);
dctAnnotation.setEnd(text.length());
dctAnnotation.setFilename(filename + "---" + fullDctTag);
dctAnnotation.setValue(dct);
dctAnnotation.addToIndexes();
}
}
/**
 * Creates a Sentence annotation and adds it to the indexes of the JCas.
 *
 * @param jcas     the JCas the annotation is created in
 * @param begin    value stored as the begin offset of the annotation
 * @param end      value stored as the end offset of the annotation
 * @param filename value stored in the filename feature of the annotation
 */
public void addSentenceAnnotation(JCas jcas, int begin, int end, String filename){
    final Sentence sentence = new Sentence(jcas);
    sentence.setFilename(filename);
    sentence.setBegin(begin);
    sentence.setEnd(end);
    sentence.addToIndexes();
}
/**
 * Creates a Token annotation and adds it to the indexes of the JCas.
 *
 * @param jcas     the JCas the annotation is created in
 * @param begin    value stored as the begin offset of the annotation
 * @param end      value stored as the end offset of the annotation
 * @param tokID    value stored as the token id
 * @param filename document name; combined with sentNum and tokNum into the filename feature
 * @param sentNum  sentence number, second part of the filename feature
 * @param tokNum   token number, third part of the filename feature
 */
public void addTokenAnnotation(JCas jcas, int begin, int end, int tokID, String filename, int sentNum, int tokNum){
    // the filename feature carries "<filename>---<sentNum>---<tokNum>"
    final String locator = filename + "---" + sentNum + "---" + tokNum;

    final Token token = new Token(jcas);
    token.setTokenId(tokID);
    token.setFilename(locator);
    token.setBegin(begin);
    token.setEnd(end);
    token.addToIndexes();
}
/**
 * Tells whether at least one file is still waiting in the queue.
 *
 * @return true while the files queue is not empty
 */
public boolean hasNext() throws IOException, CollectionException {
    return !files.isEmpty();
}
/**
 * Reports how many of the files found at initialization have been taken
 * from the queue so far.
 *
 * @return a one-element array holding the progress measured in Progress.ENTITIES
 */
public Progress[] getProgress() {
    final int total = numberOfDocuments;
    final int consumed = total - files.size();
    final Progress current = new ProgressImpl(consumed, total, Progress.ENTITIES);
    return new Progress[] { current };
}
/**
 * Discards all files that are still queued, so that hasNext() returns false afterwards.
 */
public void close() throws IOException {
    // nothing else is released here; only the pending work list is emptied
    this.files.clear();
}
/**
 * Queues every readable regular file found directly inside the given directory
 * and records how many files were queued.
 *
 * Entries that do not exist, are not regular files or cannot be read are
 * skipped and reported through the Logger.
 *
 * @param dirPath path of the directory to scan
 * @throws ResourceInitializationException if the path does not denote an existing
 *         directory or if the directory content cannot be listed
 */
private void populateFileList(String dirPath) throws ResourceInitializationException {
    File dir = new File(dirPath);

    // check if the given directory path is valid
    if (!dir.exists() || !dir.isDirectory()) {
        throw new ResourceInitializationException(new IOException(
                "Input directory \"" + dir.getAbsolutePath()
                + "\" does not exist or is not a directory."));
    }

    // File.listFiles() returns null when an I/O error occurs (e.g. missing
    // permissions); fail with a clear message instead of a NullPointerException.
    File[] entries = dir.listFiles();
    if (entries == null) {
        throw new ResourceInitializationException(new IOException(
                "Content of input directory \"" + dir.getAbsolutePath()
                + "\" could not be listed."));
    }

    // check for existence and readability; add handle to the list
    for (File f : entries) {
        if (!f.exists() || !f.isFile() || !f.canRead()) {
            Logger.printDetail(component, "File \""+f.getAbsolutePath()+"\" was ignored because it either didn't exist, wasn't a file or wasn't readable.");
        } else {
            files.add(f);
        }
    }

    numberOfDocuments = files.size();
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy