// de.unihd.dbs.uima.reader.aceternreader.ACETernReader Maven / Gradle / Ivy

// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of heideltime Show documentation
// HeidelTime is a multilingual cross-domain temporal tagger that extracts temporal expressions from documents and normalizes them according to the TIMEX3 annotation standard.
// There is a newer version: 2.2.1
// Show newest version
/*
 * ACETernReader.java
 * 
 * Copyright (c) 2011, Database Research Group, Institute of Computer Science, University of Heidelberg. 
 * All rights reserved. This program and the accompanying materials 
 * are made available under the terms of the GNU General Public License.
 * 
 * author: Jannik Strötgen
 * email:  [email protected]
 * 
 * ACE Tern Reader reads temporal annotated corpora that are in the ACE Tern style.
 * For details, see http://dbs.ifi.uni-heidelberg.de/heideltime
 */

package de.unihd.dbs.uima.reader.aceternreader;

import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.MatchResult;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.collection.CollectionReader_ImplBase;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceConfigurationException;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.FileUtils;
import org.apache.uima.util.Level;
import org.apache.uima.util.Logger;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;

import de.unihd.dbs.uima.types.heideltime.Dct;
import de.unihd.dbs.uima.types.heideltime.SourceDocInfo;



/**
 * CollectionReader for ACE Tern Data 
 */
public class ACETernReader extends CollectionReader_ImplBase {
	
	private static Logger logger = null;
	
	private static final String compontent_id = "de.unihd.dbs.uima.reader.aceternreader";
	
	/**
	 * Needed information to create cas objects for all "documents"
	 */
	public Integer numberOfDocuments = 0;
  
	/**
	 * Parameter information
	 */
	public static final String PARAM_INPUTDIR = "InputDirectory";
	public static final String PARAM_DCT      = "AnnotateCreationTime";
	public Boolean annotateDCT = false; 

	/**
	 * List containing all filenames of "documents"
	 */
	private ArrayList mFiles;
    
	/**
	 * Current file number
	 */
	private int currentIndex;

  

	
	/**
	 * @see org.apache.uima.collection.CollectionReader_ImplBase#initialize()
	 */
	public void initialize() throws ResourceInitializationException {
		
		logger = getUimaContext().getLogger();
		logger.log(Level.INFO, "initialize() - Initializing ACETern-Reader...");
		
		annotateDCT = (Boolean) getConfigParameterValue(PARAM_DCT);
		
		File directory = new File(((String) getConfigParameterValue(PARAM_INPUTDIR)).trim());
		currentIndex = 0;
		
		// if input directory does not exist or is not a directory, throw exception
		if (!directory.exists() || !directory.isDirectory()) {
			throw new ResourceInitializationException(ResourceConfigurationException.DIRECTORY_NOT_FOUND,
					new Object[] { PARAM_INPUTDIR, this.getMetaData().getName(), directory.getPath() });
		}

		// get list of files (without subdirectories) in the specified directory
		mFiles = new ArrayList();
		File[] files = directory.listFiles();
		for (int i = 0; i < files.length; i++) {
			if (!files[i].isDirectory()) {
				mFiles.add(files[i]);
			}
		}
	}

	
	/**
	 * @see org.apache.uima.collection.CollectionReader#hasNext()
	 */
	public boolean hasNext() {
		return currentIndex < mFiles.size();
	}

	/**
	 * @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS)
	 */
	public void getNext(CAS aCAS) throws IOException, CollectionException {
		System.err.print(".");
		JCas jcas;
		try {
			jcas = aCAS.getJCas();
		} catch (CASException e) {
			throw new CollectionException(e);
		}

		// open input stream to file
	    File file = (File) mFiles.get(currentIndex++);
		logger.log(Level.INFO, "getNext(CAS) - Reading file " + file.getName());
	    

	    String text = "";   
	    String xml = FileUtils.file2String(file);
	    text = xml;

	    // put document into CAS
	    text = text.replaceAll("(?s)", "");
	    jcas.setDocumentText(text);


	    // Keep Source document information
	    SourceDocInfo srcDocInfo = new SourceDocInfo(jcas);
	    URL url = file.getAbsoluteFile().toURI().toURL();
		srcDocInfo.setUri(url.toString());
	    srcDocInfo.addToIndexes();
	    

	    // Get document creation time if necessary
		if (annotateDCT){
			/*
			 * if DCT shall be set, set it now
			 */

			setDCT(xml, jcas, url.toString());
		}
	}

	@SuppressWarnings("unused")
	public void setDCT(String xml, JCas jcas, String filename){
		
		// SET DOCUMENT CREATION TIME!!!!
		// possible tags for DCT:
		// DATETIME (all WikiWar documents) with the following format 2009-12-20T17:00:00
		// DATE_TIME (Tern 2004) with the following format "10/17/2000 18:46:13.59" "10/17/2000 18:41:01.17" "11/04/2000 9:14:43.41" "2000-10-01 20:56:35"
		// DATE (Tern 2004) with the following format "07/15/2000" "1996-02-13" "1997-03-09 10:50:59" 
		// WITHOUT DATE ARE THE ACE TERN 2004 training files: chtb_171.eng.sgm, 172, 174, 179, 183, 
		// DATETIME (ACE 2005 training) with the following formats additionally: 20041221-20:24:00, 20030422

		String datetimetag = null;
		// possible date formats
		String dateformat1 = "(.*?)(\\d\\d\\d\\d)-(\\d\\d)-(\\d\\d)(T| )(\\d\\d):(\\d\\d):(\\d\\d)(.*?)"; // 2009-12-20T17:00:00 or 2000-10-01 20:56:35
		String dateformat2 = "(.*?)(\\d\\d\\d\\d)-(\\d\\d)-(\\d\\d)(T| )(\\d):(\\d\\d):(\\d\\d)(.*?)"; // 2009-12-20T7:00:00 or 2000-10-01 9:56:35
		String dateformat3 = "(.*?)(\\d\\d)/(\\d\\d)/(\\d\\d\\d\\d) (\\d\\d):(\\d\\d):(\\d\\d)\\.(\\d\\d)(.*?)"; // 10/17/2000 18:46:13.59
		String dateformat4 = "(.*?)(\\d\\d)/(\\d\\d)/(\\d\\d\\d\\d) (\\d):(\\d\\d):(\\d\\d)\\.(\\d\\d)(.*?)"; // 10/17/2000 1:46:13.59
		String dateformat5 = "(.*?)(\\d\\d\\d\\d)-(\\d\\d)-(\\d\\d)(.*?)"; // 1996-02-13
		String dateformat6 = "(.*?)(\\d\\d)/(\\d\\d)/(\\d\\d\\d\\d)(.*?)"; // 07/15/2000
		String dateformat7 = "(.*?)(January|February|March|April|May|June|July|August|September|October|November|December) ([\\d]?[\\d]),? (\\d\\d\\d\\d)(.*?)";
		String dateformat8 = "(.*?)(\\d\\d\\d\\d)(\\d\\d)(\\d\\d)-(\\d\\d):(\\d\\d):(\\d\\d)(.*?)"; // 20041221-20:24:00
		String dateformat9 = "(.*?)(\\d\\d\\d\\d)(\\d\\d)(\\d\\d)(.*?)"; // 20030422
		for (MatchResult m : findMatches(Pattern.compile("(|||)(("+dateformat1+
																						")|("+dateformat2+
																						")|("+dateformat3+
																						")|("+dateformat4+
																						")|("+dateformat5+
																						")|("+dateformat6+
																						")|("+dateformat7+
																						")|("+dateformat8+
																						")|("+dateformat9+")(|||))"), xml)){
			datetimetag = m.group(2);
		}
		
		
		String time_value = null;
		String date_value = null;
		if (!(datetimetag == null)){
			if (datetimetag.matches(dateformat1)){
				for (MatchResult m : findMatches(Pattern.compile(dateformat1), datetimetag)){
					date_value = m.group(2)+"-"+m.group(3)+"-"+m.group(4);
					time_value = m.group(2)+"-"+m.group(3)+"-"+m.group(4)+"T"+m.group(6)+":"+m.group(7)+":"+m.group(8);
				}
			}
			else if (datetimetag.matches(dateformat2)){
				for (MatchResult m : findMatches(Pattern.compile(dateformat2), datetimetag)){
					date_value = m.group(2)+"-"+m.group(3)+"-"+m.group(4);
					time_value = m.group(2)+"-"+m.group(3)+"-"+m.group(4)+"T0"+m.group(6)+":"+m.group(7)+":"+m.group(8);
				}
			}
			else if (datetimetag.matches(dateformat3)){
				for (MatchResult m : findMatches(Pattern.compile(dateformat3), datetimetag)){
					date_value = m.group(4)+"-"+m.group(2)+"-"+m.group(3);
					time_value = m.group(4)+"-"+m.group(2)+"-"+m.group(3)+"T"+m.group(5)+":"+m.group(6)+":"+m.group(7)+"."+m.group(8);
				}
			}
			else if (datetimetag.matches(dateformat4)){
				for (MatchResult m : findMatches(Pattern.compile(dateformat4), datetimetag)){
					date_value = m.group(4)+"-"+m.group(2)+"-"+m.group(3);
					time_value = m.group(4)+"-"+m.group(2)+"-"+m.group(3)+"T0"+m.group(5)+":"+m.group(6)+":"+m.group(7)+"."+m.group(8);
				}
			}
			else if (datetimetag.matches(dateformat5)){
				for (MatchResult m : findMatches(Pattern.compile(dateformat5), datetimetag)){
					date_value = m.group(2)+"-"+m.group(3)+"-"+m.group(4);
				}
			}
			else if (datetimetag.matches(dateformat6)){
				for (MatchResult m : findMatches(Pattern.compile(dateformat6), datetimetag)){
					date_value = m.group(4)+"-"+m.group(2)+"-"+m.group(3);
				}
			}
			else if (datetimetag.matches(dateformat7)){
				for (MatchResult m : findMatches(Pattern.compile(dateformat7), datetimetag)){
					String year  = m.group(4);
					String month = normMonth(m.group(2));
					String day   = normDay(m.group(3));
					date_value = year+"-"+month+"-"+day;
				}
			}
			else if (datetimetag.matches(dateformat8)){
				for (MatchResult m : findMatches(Pattern.compile(dateformat8), datetimetag)){
					date_value = m.group(2)+"-"+m.group(3)+"-"+m.group(4);
					time_value = m.group(2)+"-"+m.group(3)+"-"+m.group(4)+"T"+m.group(5)+":"+m.group(6)+":"+m.group(7);
				}
			}
			else if (datetimetag.matches(dateformat9)){
				for (MatchResult m : findMatches(Pattern.compile(dateformat9), datetimetag)){
					date_value = m.group(2)+"-"+m.group(3)+"-"+m.group(4);
				}
			}
			else{
				System.err.println();
				System.err.println("["+compontent_id+"] cannot set dct with datetimetag: "+datetimetag);
			}
			if (!(date_value == null)){
				Dct dct = new Dct(jcas);
				dct.setBegin(0);
				dct.setEnd(1);
				dct.setFilename(filename);
				dct.setTimexId("dct");
				if (!(time_value == null)){
					dct.setValue(time_value);
//					System.err.println("["+compontent_id+"] set dct to: "+time_value);
				}else if (!(date_value == null)){
					dct.setValue(date_value);
//					System.err.println("["+compontent_id+"] set dct to: "+date_value);
				}
				else{
					System.err.println();
					System.err.println("["+compontent_id+"] something wrong with setting DCT of : "+datetimetag);
				}
				dct.addToIndexes();
			}
		}
		else{
			if (date_value == null){
//				System.err.println("Checking for further formats of DCT...");
				String refYear  = "";
				String refMonth = "";
				String refDay   = "";
				for (MatchResult m1 : findMatches(Pattern.compile("DATE:[\\s]+("+dateformat7+")"),xml)){
					String referenceDate = m1.group(1);
					if (referenceDate.matches(dateformat7)){
						for (MatchResult mr : findMatches(Pattern.compile(dateformat7), referenceDate)){
							refYear  = mr.group(4);
							refMonth = normMonth(mr.group(2));
							refDay   = normDay(mr.group(3));
						}
					}
				}
				for (MatchResult m : findMatches(Pattern.compile(""
						+"(Jan\\.|Feb\\.|Mar\\.|Apr\\.|May\\.|Jun\\.|Jul\\.|Aug\\.|Sep\\.|Oct\\.|Nov\\.|Dec\\.|"
						+ "JAN\\.|FEB\\.|MAR\\.|APR\\.|MAY\\.|JUN\\.|JUL\\.|AUG\\.|SEP\\.|OCT\\.|NOV\\.|DEC\\.)[\\s]+([\\d]?[\\d])"
						+""), xml)){
					String exactMonth = m.group(1);
					String exactDay   = m.group(2);
					date_value = refYear+"-"+normMonth(exactMonth)+"-"+normDay(exactDay);
				}
			}
			if (date_value == null){
				for (MatchResult m : findMatches(Pattern.compile(""
						+".*?(\\d\\d\\d\\d)(\\d\\d)(\\d\\d).*?"
						+""), xml)){
					String exactYear  = m.group(1);
					String exactMonth = m.group(2);
					String exactDay   = m.group(3);
					date_value = exactYear+"-"+exactMonth+"-"+exactDay;
				}
			}
			if (date_value == null){
				String refYear  = "";
				String refMonth = "";
				String refDay   = "";
				for (MatchResult m : findMatches(Pattern.compile(".*?(\\d\\d\\d\\d)(\\d\\d)(\\d\\d).*?"),xml)){
					refYear  = m.group(1);
					refMonth = normMonth(m.group(2));
					refDay   = normDay(m.group(3));
				}
				if (!(refYear.matches(""))){
					for (MatchResult m : findMatches(Pattern.compile(".*?"
							+"(January|February|March|April|May|June|July|August|September|October|November|December) ([\\d]?[\\d]).*?"+
							""), xml)){
						String exactMonth = normMonth(m.group(1));
						String exactDay   = normDay(m.group(2));
						date_value = refYear+"-"+exactMonth+"-"+exactDay;
					}
				}
			}
			if (date_value == null){
				String refYear  = "";
				String refMonth = "";
				String refDay   = "";				
					for (MatchResult m : findMatches(Pattern.compile("Publish Date:[\\s]+(\\d\\d)/(\\d\\d)/(\\d\\d)"),xml)){
						refYear  = "19"+m.group(3);
						refMonth = normMonth(m.group(1));
						refDay   = normDay(m.group(2));
					}
					if (!(refYear.matches(""))){
						for (MatchResult m : findMatches(Pattern.compile(".*?"
								+"(Jan\\.|Feb\\.|Mar\\.|Apr\\.|May\\.|Jun\\.|Jul\\.|Aug\\.|Sep\\.|Oct\\.|Nov\\.|Dec\\.|"
								+ "JAN\\.|FEB\\.|MAR\\.|APR\\.|MAY\\.|JUN\\.|JUL\\.|AUG\\.|SEP\\.|OCT\\.|NOV\\.|DEC\\.)[\\s]+([\\d]?[\\d]).*?"+
								""), xml)){
							String exactMonth = normMonth(m.group(1));
							String exactDay   = normDay(m.group(2));
							date_value = refYear+"-"+exactMonth+"-"+exactDay;
						}
					}
					
			}
			// Document Creation Time style of EVALITA I-CAB corpus (Italian corpus)
			// example: 
			if (date_value == null){
				try {
					for (MatchResult m : findMatches(Pattern.compile("()"), xml)){
						datetimetag = m.group(2);
					}
					if (datetimetag.matches(dateformat9)){
						for (MatchResult m : findMatches(Pattern.compile(dateformat9), datetimetag)){
							date_value = m.group(2)+"-"+m.group(3)+"-"+m.group(4);
						}
					} else {
						System.err.println();
						System.err.println("["+compontent_id+"] cannot set dct with datetimetag: "+datetimetag);
					}
				} catch(NullPointerException e) { } // nothing to see here, carry on
			}
			if (date_value == null){
				System.err.println();
				System.err.println("["+compontent_id+"] Cannot set Document Creation Time - no datetimetag found in "+filename+"!");
			}
			else{
				Dct dct = new Dct(jcas);
				dct.setBegin(0);
				dct.setEnd(1);
				dct.setFilename(filename);
				dct.setTimexId("dct");
				dct.setValue(date_value);
				dct.addToIndexes();
			}
		}
	}
	
	public String normDay(String day){
		if (!(day.matches("\\d\\d"))){
			if (day.equals("1")){
				day = "01";
			}
			else if (day.equals("2")){
				day = "02";
			}
			else if (day.equals("3")){
				day = "03";
			}
			else if (day.equals("4")){
				day = "04";
			}
			else if (day.equals("5")){
				day = "05";
			}
			else if (day.equals("6")){
				day = "06";
			}
			else if (day.equals("7")){
				day = "07";
			}
			else if (day.equals("8")){
				day = "08";
			}
			else if (day.equals("9")){
				day = "09";
			}
		}
		
		return day;
	}
	
	public String normMonth(String month){
		if (month.toLowerCase().startsWith("jan")){
			month = "01";
		}
		else if (month.toLowerCase().startsWith("feb")){
			month = "02";
		}
		else if (month.toLowerCase().startsWith("mar")){
			month = "03";
		}
		else if (month.toLowerCase().startsWith("apr")){
			month = "04";
		}
		else if (month.toLowerCase().startsWith("may")){
			month = "05";
		}
		else if (month.toLowerCase().startsWith("jun")){
			month = "06";
		}
		else if (month.toLowerCase().startsWith("jul")){
			month = "07";
		}
		else if (month.toLowerCase().startsWith("aug")){
			month = "08";
		}
		else if (month.toLowerCase().startsWith("sep")){
			month = "09";
		}
		else if (month.toLowerCase().startsWith("oct")){
			month = "10";
		}
		else if (month.toLowerCase().startsWith("nov")){
			month = "11";
		}
		else if (month.toLowerCase().startsWith("dec")){
			month = "12";
		}
		return month;
	}
	
	/**
	 * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#close()
	 */
	public void close() throws IOException {
	}

	/**
	 * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#getProgress()
	 */
	public Progress[] getProgress() {
		return new Progress[] { new ProgressImpl(currentIndex, mFiles.size(), Progress.ENTITIES) };
	}

	/**
	 * Gets the total number of documents that will be returned by this collection reader. This is not
	 * part of the general collection reader interface.
	 * 
	 * @return the number of documents in the collection
	 */
	public int getNumberOfDocuments() {
		return mFiles.size();
	}
  
	/**
	 * Find all the matches of a pattern in a charSequence and return the
	 * results as list.
	 * 
	 * @param pattern
	 * @param s
	 * @return
	 */
	public static Iterable findMatches(Pattern pattern,
			CharSequence s) {
		List results = new ArrayList();

		for (Matcher m = pattern.matcher(s); m.find();)
			results.add(m.toMatchResult());

		return results;
	}
}