
de.unihd.dbs.uima.reader.eventi2014reader.Eventi2014Reader


HeidelTime is a multilingual, cross-domain temporal tagger that extracts temporal expressions from documents and normalizes them according to the TIMEX3 annotation standard.
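For example, a date expression normalized to the TimeML standard carries its value in a TIMEX3 tag (illustrative values):

<TIMEX3 tid="t1" type="DATE" value="2014-10-25">October 25, 2014</TIMEX3>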

/*
 * Eventi2014Reader.java
 * 
 * Copyright (c) 2014, Database Research Group, Institute of Computer Science, Heidelberg University. 
 * All rights reserved. This program and the accompanying materials 
 * are made available under the terms of the GNU General Public License.
 * 
 * author: Jannik Strötgen
 * email:  [email protected]
 * 
 * The Eventi2014 Reader reads Eventi corpora.
 * For details, see http://dbs.ifi.uni-heidelberg.de/heideltime
 */

package de.unihd.dbs.uima.reader.eventi2014reader;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Queue;
import java.util.regex.MatchResult;
import java.util.regex.Pattern;

import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.collection.CollectionReader_ImplBase;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.FileUtils;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;

import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger;
import de.unihd.dbs.uima.annotator.heideltime.utilities.Toolbox;
import de.unihd.dbs.uima.types.heideltime.Dct;
import de.unihd.dbs.uima.types.heideltime.Sentence;
import de.unihd.dbs.uima.types.heideltime.Token;

/**
 * CollectionReader for EVENTI 2014 corpus data
 */
public class Eventi2014Reader extends CollectionReader_ImplBase {
	private Class<?> component = this.getClass();
	
	// uima descriptor parameter name
	private static final String PARAM_INPUTDIR = "InputDirectory";
	
	private int numberOfDocuments = 0;
	
	// For improving the formatting of the documentText 
	// -> to not have a space between all the tokens
	// HashSet containing tokens in front of which no white space is added
	private HashSet<String> hsNoSpaceBefore = new HashSet<String>();
	private HashSet<String> hsNoSpaceBehind = new HashSet<String>();
	
	private Queue<File> files = new LinkedList<File>();
	
	public void initialize() throws ResourceInitializationException {
		String dirPath = (String) getConfigParameterValue(PARAM_INPUTDIR);
		dirPath = dirPath.trim();
		
		hsNoSpaceBefore.add(".");
		hsNoSpaceBefore.add(",");
		hsNoSpaceBefore.add(":");
		hsNoSpaceBefore.add(";");
		hsNoSpaceBefore.add("?");
		hsNoSpaceBefore.add("!");
		hsNoSpaceBefore.add(")");
		
		hsNoSpaceBehind.add("(");
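		// e.g., the token sequence ["(", "1984", ")", ","] is rendered as
		// "(1984)," in the document text instead of "( 1984 ) ,"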
		
		populateFileList(dirPath);
	}

	public void getNext(CAS aCAS) throws IOException, CollectionException {
		JCas jcas;
		
		try {
			jcas = aCAS.getJCas();
		} catch (CASException e) {
			throw new CollectionException(e);
		}

		fillJCas(jcas);
		
		// give an indicator that a file has been processed
		System.err.print(".");

		
		/*TODO:DEBUGGING
		FSIterator fsi = jcas.getAnnotationIndex(Token.type).iterator();
		while(fsi.hasNext())
			System.err.println("token: " + ((Token)fsi.next()).getTokenId());
		*/
	}

	private void fillJCas(JCas jcas) throws IOException, CollectionException {
		// grab a file to process
		File f = files.poll();
		
	    String text = "";   
	    String xml = FileUtils.file2String(f);
	    
	    String[] lines = xml.split("\n");
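	    // Each input file is expected to hold CAT-style XML with one element
	    // per line, roughly of this shape (layout and attributes assumed):
	    //   <Document doc_name="example.xml">
	    //   <token t_id="1" sentence="0" number="0">Oggi</token>
	    //   <token t_id="2" sentence="0" number="1">piove</token>
	    //   <TIMEX3 m_id="1" value="2014-04-01" ... />
	    //   </Document>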
	    
	    String fullDctTag = "";
	    String dct = "";
	    String filename = "";
	    String lastTok = "";
	    int sentBegin = 0;
	    int sentEnd  = -1;
	    
	    for (String line : lines) {
	    	
	    	// get document name
			if (line.startsWith("<Document")){
				Pattern paConstraint = Pattern.compile("<Document doc_name=\"(.*?)\">");
				for (MatchResult mr : Toolbox.findMatches(paConstraint,line)) {
					filename = mr.group(1);
				}
			}
			
			// handle the tokens
			if (line.startsWith("<token")){
				Pattern paConstraint = Pattern.compile("<token t_id=\"(.*?)\" sentence=\"(.*?)\" number=\"(.*?)\">(.*?)</token>");
				for (MatchResult mr : Toolbox.findMatches(paConstraint,line)) {

					String token   = mr.group(4); 
//					System.err.println("INPUT: -->" + token + "<--");
					int tokID   = Integer.parseInt(mr.group(1));
					int sentNum = Integer.parseInt(mr.group(2));
					int tokNum  = Integer.parseInt(mr.group(3));
					
					// prepare token annotation
					int tokBegin;
					int tokEnd;
					
					// first token in sentence
					if (text.equals("")){
						tokBegin = 0;
						tokEnd   = token.length();
						text  = token;
						lastTok = token;
					}
					else{
						// tokens without space before the tokens
						if (hsNoSpaceBefore.contains(token)){
							tokBegin = text.length();
							tokEnd   = tokBegin + token.length();
							text  = text + token;
							lastTok = token;
						}
//						// empty tokens
//						else if (token.equals("")){
//							tokBegin = text.length();
//							tokEnd   = tokBegin + token.length();
//							text  = text + token;
//							lastTok = token;
//						}
						else{
							// tokens without space behind the tokens
							if (!(hsNoSpaceBehind.contains(lastTok))){
								tokBegin = text.length()+ 1;
								text  = text + " " + token;
							}
							// all other tokens
							else{
								tokBegin = text.length();
								text = text + token;
							}
							tokEnd   = tokBegin + token.length();
							lastTok = token;
						}
					}
					// check for new sentences
					if (tokNum == 0){
						if (sentEnd >= 0){
							// add sentence annotation, once a new sentence starts
							addSentenceAnnotation(jcas, sentBegin, sentEnd, filename);
						}
						sentBegin = tokBegin;
					}
					// add the token annotation
					addTokenAnnotation(jcas, tokBegin, tokEnd, tokID, filename, sentNum, tokNum);
					sentEnd = tokEnd;
				}
			}
			
			// get the document creation time
			if (line.startsWith("<TIMEX3")){
				// group(1) captures the whole TIMEX3 tag, group(2) its value attribute
				Pattern paConstraint = Pattern.compile("(<TIMEX3 .*?value=\"(.*?)\".*?/>)");
				for (MatchResult mr : Toolbox.findMatches(paConstraint,line)) {
					fullDctTag = mr.group(1); 
					dct = mr.group(2);
					System.err.println("DCT: " + dct);
				}
			}
	    }
	    // add the very last sentence annotation (if any tokens were read)
	    if (sentEnd >= 0){
	    	addSentenceAnnotation(jcas, sentBegin, sentEnd, filename);
	    }
	    jcas.setDocumentText(text);
	    
	    // add DCT to jcas
	    if (!(dct.equals(""))){
		    Dct dctAnnotation  = new Dct(jcas);
		    dctAnnotation.setBegin(0);
		    dctAnnotation.setEnd(text.length());
		    dctAnnotation.setFilename(filename + "---" + fullDctTag);
		    dctAnnotation.setValue(dct);
		    dctAnnotation.addToIndexes();
	    }
		
	}
	
	public void addSentenceAnnotation(JCas jcas, int begin, int end, String filename){
		Sentence sentAnnotation = new Sentence(jcas);
		sentAnnotation.setBegin(begin);
		sentAnnotation.setEnd(end);
		sentAnnotation.setFilename(filename);
		sentAnnotation.addToIndexes();
	}
	
	public void addTokenAnnotation(JCas jcas, int begin, int end, int tokID, String filename, int sentNum, int tokNum){
		Token tokenAnnotation = new Token(jcas);
		tokenAnnotation.setBegin(begin);
		tokenAnnotation.setEnd(end);
		tokenAnnotation.setTokenId(tokID);
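		// sentence and token numbers are packed into the filename feature
		// (presumably so they can be recovered when writing evaluation output)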
		tokenAnnotation.setFilename(filename + "---" + sentNum + "---" + tokNum);
		tokenAnnotation.addToIndexes();
	}

	public boolean hasNext() throws IOException, CollectionException {
	    return files.size() > 0;
	}
	
	public Progress[] getProgress() {
		return new Progress[] { new ProgressImpl(numberOfDocuments - files.size(), numberOfDocuments, Progress.ENTITIES) };
	}
	
	public void close() throws IOException {
		files.clear();
	}

	private void populateFileList(String dirPath) throws ResourceInitializationException {
		ArrayList<File> myFiles = new ArrayList<File>();
		File dir = new File(dirPath);
		
		// check if the given directory path is valid
		if(!dir.exists() || !dir.isDirectory())
			throw new ResourceInitializationException();
		else
			myFiles.addAll(Arrays.asList(dir.listFiles()));
		
		// check for existence and readability; add handle to the list
		for(File f : myFiles) {
			if(!f.exists() || !f.isFile() || !f.canRead()) {
				Logger.printDetail(component, "File \""+f.getAbsolutePath()+"\" was ignored because it either didn't exist, wasn't a file or wasn't readable.");
			} else {
				files.add(f);
			}
		}
		
		numberOfDocuments = files.size();
	}
	
	
	
}







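The reader is a standard UIMA CollectionReader, so it can be driven with the plain UIMA framework API. A minimal sketch, assuming a descriptor at desc/reader/Eventi2014Reader.xml and a corpus directory (both paths are hypothetical):

import java.util.Collections;

import org.apache.uima.UIMAFramework;
import org.apache.uima.cas.CAS;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.collection.CollectionReaderDescription;
import org.apache.uima.util.CasCreationUtils;
import org.apache.uima.util.XMLInputSource;

public class RunEventi2014Reader {
	public static void main(String[] args) throws Exception {
		// parse the reader's XML descriptor (path is an assumption)
		CollectionReaderDescription desc = UIMAFramework.getXMLParser()
				.parseCollectionReaderDescription(new XMLInputSource("desc/reader/Eventi2014Reader.xml"));
		// point the reader at the corpus directory via its single parameter
		desc.getMetaData().getConfigurationParameterSettings()
				.setParameterValue("InputDirectory", "/path/to/eventi2014");
		CollectionReader reader = UIMAFramework.produceCollectionReader(desc);

		// create a CAS based on the reader's metadata and iterate over the corpus
		CAS cas = CasCreationUtils.createCas(Collections.singletonList(desc.getCollectionReaderMetaData()));
		while (reader.hasNext()) {
			reader.getNext(cas);
			// ... run HeidelTime or other analysis engines on the CAS here ...
			cas.reset();
		}
		reader.close();
	}
}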