de.unihd.dbs.uima.reader.tempeval2reader.Tempeval2Reader Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of heideltime Show documentation
HeidelTime is a multilingual cross-domain temporal tagger that extracts temporal expressions from documents and normalizes them according to the TIMEX3 annotation standard.
There is a newer version: 2.2.1
Show newest version
/*
 * TempEval2Reader.java
 * 
 * Copyright (c) 2011, Database Research Group, Institute of Computer Science, University of Heidelberg. 
 * All rights reserved. This program and the accompanying materials 
 * are made available under the terms of the GNU General Public License.
 * 
 * author: Jannik Strötgen
 * email:  [email protected]
 * 
 * The TempEval2 Reader reads TempEval-2 corpora.
 * For details, see http://dbs.ifi.uni-heidelberg.de/heideltime
 */

package de.unihd.dbs.uima.reader.tempeval2reader;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.collection.CollectionReader_ImplBase;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Level;
import org.apache.uima.util.Logger;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;

import de.unihd.dbs.uima.types.heideltime.Dct;
import de.unihd.dbs.uima.types.heideltime.Sentence;
import de.unihd.dbs.uima.types.heideltime.Token;

/**
 * CollectionReader for TempEval Data 
 */
public class Tempeval2Reader extends CollectionReader_ImplBase {
	/**
	 * Logger for this class
	 */
	private static Logger logger = null;
	
	/**
	 * ComponentId
	 */
	private static final String compontent_id = "de.unihd.dbs.uima.reader.tempeval2reader";
	
	/**
	 * Parameter for files in the input directory
	 */
	public static final String FILE_BASE_SEGMENTATION  = "base-segmentation.tab";
	public static final String FILE_DCT                = "dct.tab";
	
	/**
	 * Needed information to create cas objects for all "documents"
	 */
	public Integer numberOfDocuments = 0;
	
	/**
	 * HashMap for all tokens of a document
	 */
	public HashMap hmToken = new HashMap();
	
	/**
	 * HashMap for all document creation times
	 */
	public HashMap hmDct = new HashMap();
	
	
  /**
   * Name of configuration parameter that must be set to the path of a directory
   * containing input files.
   */
	  public static final String PARAM_INPUTDIR  = "InputDirectory";
	  public static final String PARAM_CHARSET  = "Charset";
	  public static final String PARAM_USE_SPACES = "UseSpacesAsSeparators";

  /**
   * List containing all filenames of "documents"
   */
  private List filenames = new ArrayList();

  
  /**
   * Current file number
   */
  private int currentIndex;

  /**
   * Parentheses are given as "-LRB-" ... reset to "(" ...
   */
  
  Boolean resettingParentheses = true;
  
  /**
   * Check the TempEval counting beginning if "0" or "1"
   */
	int newTokSentNumber = 0;
	
  /**
   * Charset to use for reading in files
   */
  Charset charset = null;
	
  /**
   * Charset to use for reading in files
   */
  Boolean USE_SPACES = true;

  /**
   * 
   */
  public void initialize() throws ResourceInitializationException {
	  String charsetText = (String) getConfigParameterValue(PARAM_CHARSET);
	  if(charsetText == null || charsetText.equals(""))
		  charsetText = "UTF-8";
	  try {
		  charset = Charset.forName(charsetText);
	  } catch(Exception e) {
		  System.err.println("["+compontent_id+"] Charset " + charsetText + " was not available to be used.");
		  throw new ResourceInitializationException();
	  }

	  Boolean useSpaces = (Boolean) getConfigParameterValue(PARAM_USE_SPACES);
	  if(useSpaces != false) {
		  USE_SPACES = true;
	  } else {
		  USE_SPACES = false;
	  }
	   
	  // save doc names to list
	  List inputFiles = getFilesFromInputDirectory();
	  
	  // get total document number and put all doc names into list "filenames"
	  numberOfDocuments = getNumberOfDocuments(inputFiles);
	  System.err.println("["+compontent_id+"] number of documents: "+numberOfDocuments);
  }
  
  
  public void getNext(CAS cas) throws IOException, CollectionException {
	  
	  // create jcas  
	  JCas jcas;
	  try {
		  jcas = cas.getJCas();
	  } catch (CASException e) {
		  throw new CollectionException(e);
	  }
	  
	  // clear HashMaps for new document
	  hmToken.clear();
	  hmDct.clear();
	  
	  // get current doc name 
	  String docname = filenames.get(currentIndex++);
	  
	  // save doc names to list
	  List inputFiles = getFilesFromInputDirectory();
	  
	  // set documentText, sentences, tokens from file
	  setTextSentencesTokens(docname, inputFiles, jcas);
	  
	  // set document creation time (dct)
	  setDocumentCreationTime(docname, inputFiles, jcas);
  }

  /**
   * @see org.apache.uima.collection.CollectionReader#hasNext()
   */
  public boolean hasNext() throws IOException, CollectionException {
    return currentIndex < numberOfDocuments;
  }

  /**
   * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#getProgress()
   */
  public Progress[] getProgress() {
    return new Progress[] { new ProgressImpl(currentIndex, numberOfDocuments, Progress.ENTITIES) };
  }

  /**
   * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#close()
   */
  public void close() throws IOException {
  }

  
  
  public void setDocumentCreationTime(String docname, List inputFiles, JCas jcas) throws IOException{
	  String documentCreationTime = "";
	  
	  String directory = (String) getConfigParameterValue(PARAM_INPUTDIR);
	  String filename = directory+"/"+FILE_DCT;
	  for (File file : inputFiles) {
		if (file.getAbsolutePath().equals(filename)){
			try {
				String line;
				BufferedReader bf = new BufferedReader(new InputStreamReader(new FileInputStream(file), charset));
				while ((line = bf.readLine()) != null){
					String[] parts = line.split("(\t)+");
					String fileId  = parts[0];
					if (fileId.equals(docname)){
						documentCreationTime = parts[1];
						Dct dct = new Dct(jcas);
						String text = jcas.getDocumentText();
						dct.setBegin(0);
						dct.setEnd(text.length());
						dct.setFilename(fileId);
						dct.setValue(documentCreationTime);
						dct.setTimexId("t0");
						dct.addToIndexes();
						
						// add dct to HashMap
						hmDct.put("t0", dct);
					}
				}
				bf.close();
			}
			catch (IOException e){
				throw new IOException(e);
			}
			
		}
	  }
  }
  
  public void setTextSentencesTokens(String docname, List inputFiles, JCas jcas) throws IOException{
	  String text        = "";
	  String sentString  = "";
	  Integer positionCounter = 0;
	  Integer sentId = -1;
	  Integer lastSentId = -1;
	  
	  String directory = (String) getConfigParameterValue(PARAM_INPUTDIR);
	  String filename = directory+"/"+FILE_BASE_SEGMENTATION;
	  for (File file : inputFiles) {
		if (file.getAbsolutePath().equals(filename)){
			try {
				String line;
				BufferedReader bf = new BufferedReader(new InputStreamReader(new FileInputStream(file), charset));
				Boolean lastSentProcessed  = false;
				Boolean firstSentProcessed = false;
				String fileId = "";
				Boolean veryFirstLine = true;
				while ((line = bf.readLine()) != null){
					
					// Check if TempEval counting starts with "0" or "1"
					if (veryFirstLine){
						String[] parts = line.split("\t");
						newTokSentNumber = Integer.parseInt(parts[1]);
					}
					veryFirstLine = false;
					
					String[] parts = line.split("\t");
					fileId  = parts[0];
					sentId = Integer.parseInt(parts[1]);
					Integer tokId  = Integer.parseInt(parts[2]);

					// Check for "empty tokens" (Italian corpus)
					String tokenString = "";
					if (!(parts.length < 4)){
						tokenString = parts[3];	
					}
					
					if (resettingParentheses == true){
						tokenString = resetParentheses(tokenString);
					}
					
					if (fileId.equals(docname)){
						
						// First Sentence, first Token
						if ((sentId == newTokSentNumber) && (tokId == newTokSentNumber)){
							firstSentProcessed = true;
							text = tokenString;
							sentString  = tokenString;
							positionCounter = addTokenAnnotation(tokenString, fileId, sentId, tokId, positionCounter, jcas);
						}
						
						// new Sentence, first Token
						else if ((tokId == newTokSentNumber) || (lastSentId != sentId)){
							positionCounter = addSentenceAnnotation(sentString, fileId, sentId-1, positionCounter, jcas);
							if(!USE_SPACES) // in chinese, there are no spaces 
								text = text + tokenString;
							else
								text = text + " " + tokenString;
							sentString  = tokenString;
							positionCounter = addTokenAnnotation(tokenString, fileId, sentId, tokId, positionCounter, jcas);
						}
						
						// within any sentence
						else{
							if(!USE_SPACES) { // in chinese, there are no spaces 
								text = text + tokenString;
								sentString = sentString + tokenString;
							} else {
								text = text + " " + tokenString;
								sentString = sentString + " " + tokenString;
							}
							positionCounter = addTokenAnnotation(tokenString, fileId, sentId, tokId, positionCounter, jcas);
						}
					}
					else{
						if ((firstSentProcessed) && (!(lastSentProcessed))){
							positionCounter = addSentenceAnnotation(sentString, docname, lastSentId, positionCounter, jcas);
							lastSentProcessed = true;
						}
					}
					lastSentId = sentId;
				}
				if (fileId.equals(docname)){
					positionCounter = addSentenceAnnotation(sentString, docname, lastSentId, positionCounter, jcas);
				}
				
				bf.close();
			}catch (IOException e){
				throw new IOException(e);
			}
		}
	  }
	  jcas.setDocumentText(text);	  
  }
  
  public String resetParentheses(String tokenString){
	  if (tokenString.equals("-LRB-")){
		  tokenString = tokenString.replace("-LRB-", "(");
	  }
	  else if (tokenString.equals("-RRB-")){
		  tokenString = tokenString.replace("-RRB-", ")");
	  }
	  else if (tokenString.equals("-LSB-")){
		  tokenString = tokenString.replace("-LSB-","[");
	  }
	  else if (tokenString.equals("-RSB-")){
		  tokenString = tokenString.replace("-RSB-","]");
	  }
	  else if (tokenString.equals("-LCB-")){
		  tokenString = tokenString.replace("-LCB-","{");
	  }
	  else if (tokenString.equals("-RCB-")){
		  tokenString = tokenString.replace("-RCB-","}");
	  }
	  // ITALIAN TEMPEVAL CORPUS PROBLEMS
	  else if (tokenString.endsWith("a'")){
		  tokenString = tokenString.replaceFirst("a'", "à");
	  }
	  else if (tokenString.endsWith("i'")){
		  tokenString = tokenString.replaceFirst("i'", "ì");
	  }
	  else if (tokenString.endsWith("e'")){
		  tokenString = tokenString.replaceFirst("e'", "è");
	  }
	  else if (tokenString.endsWith("u'")){
		  tokenString = tokenString.replaceFirst("u'", "ù");
	  }
	  else if (tokenString.endsWith("o'")){
		  tokenString = tokenString.replaceFirst("o'", "ò");
	  }
	  return tokenString;
  }
  
  public Integer addSentenceAnnotation(String sentenceString, String fileId, Integer sentId, Integer positionCounter, JCas jcas){
	  Sentence sentence = new Sentence(jcas);
	  Integer begin = positionCounter - sentenceString.length();
	  sentence.setFilename(fileId);
	  sentence.setSentenceId(sentId);
	  sentence.setBegin(begin);
	  sentence.setEnd(positionCounter);
	  sentence.addToIndexes();
	  return positionCounter;
  }
  
  
  
  /**
   * Add token annotation to jcas
   * @param tokenString
   * @param fileId
   * @param tokId
   * @param positionCounter
   * @param jcas
   * @return
   */
  public Integer addTokenAnnotation(String tokenString, String fileId, Integer sentId, Integer tokId, Integer positionCounter, JCas jcas){
		Token token = new Token(jcas);
		if (!((sentId == newTokSentNumber) && (tokId == newTokSentNumber))){
			if(USE_SPACES) // in chinese, there are no spaces, so the +1 correction is unnecessary
				positionCounter = positionCounter + 1;
		}
		token.setBegin(positionCounter);
		positionCounter = positionCounter + tokenString.length();
		token.setEnd(positionCounter);
		token.setTokenId(tokId);
		token.setSentId(sentId);
		token.setFilename(fileId);
		token.addToIndexes();
		
		String id = fileId+"_"+sentId+"_"+tokId;
		hmToken.put(id, token);
		
		return positionCounter;
  }
  
  /**
   * count the number of different "documents" and save doc names in filenames
   * @param inputFiles
   * @return
   * @throws ResourceInitializationException
   */
  private Integer getNumberOfDocuments(List inputFiles) throws ResourceInitializationException{
	  String directory = (String) getConfigParameterValue(PARAM_INPUTDIR);
	  String filename = directory+"/"+FILE_BASE_SEGMENTATION;
	  for (File file : inputFiles) {
		if (file.getAbsolutePath().equals(filename)){
			try {
				String line;
				BufferedReader bf = new BufferedReader(new InputStreamReader(new FileInputStream(file), charset));
				while ((line = bf.readLine()) != null){
					String docName = (line.split("\t"))[0];
					if (!(filenames.contains(docName))){
						filenames.add(docName);
					}
				}
				bf.close();
			} catch (IOException e) {
				throw new ResourceInitializationException(e);
			}
		}
	  }	  
	  int  docCounter = filenames.size();
	  return docCounter;
  }
  
  
  private List getFilesFromInputDirectory() {
	  // get directory and save 
	  File directory = new File(((String) getConfigParameterValue(PARAM_INPUTDIR)).trim());
	  List documentFiles = new ArrayList();
	  
	  
	  // if input directory does not exist or is not a directory, throw exception
	  if (!directory.exists() || !directory.isDirectory()) {
		  logger.log(Level.WARNING, "getFilesFromInputDirectory() " + directory
				  + " does not exist. Client has to set configuration parameter '" + PARAM_INPUTDIR + "'.");
		  return null;
	  }

	  // get list of files (not subdirectories) in the specified directory
	  File[] dirFiles = directory.listFiles();
	  for (int i = 0; i < dirFiles.length; i++) {
		  if (!dirFiles[i].isDirectory()) {
			  documentFiles.add(dirFiles[i]);
		  }
	  }
	  return documentFiles;
  }
  
}