All Downloads are FREE. Search and download functionalities are using the official Maven repository.

de.unihd.dbs.uima.consumer.aceternwriter.ACETernWriter Maven / Gradle / Ivy

Go to download

HeidelTime is a multilingual cross-domain temporal tagger that extracts temporal expressions from documents and normalizes them according to the TIMEX3 annotation standard.

There is a newer version: 2.2.1
Show newest version
/*
 * ACETernWriter.java
 * 
 * Copyright (c) 2011, Database Research Group, Institute of Computer Science, University of Heidelberg. 
 * All rights reserved. This program and the accompanying materials 
 * are made available under the terms of the GNU General Public License.
 * 
 * author: Jannik Strötgen
 * email:  [email protected]
 * 
 * ACE Tern Writer creates files according to the ACE Tern style.
 * For details, see http://dbs.ifi.uni-heidelberg.de/heideltime
 */
package de.unihd.dbs.uima.consumer.aceternwriter;

import java.util.List;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.regex.MatchResult;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.cas.FSIndex;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.collection.CasConsumer_ImplBase;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceProcessException;

import de.unihd.dbs.uima.types.heideltime.Timex3;
import de.unihd.dbs.uima.types.heideltime.SourceDocInfo;


/**
 * 
 * @author jstroetgen
 *
 */
public class ACETernWriter extends CasConsumer_ImplBase {

	public static final String PARAM_OUTPUTDIR = "OutputDir";
	public static final String PARAM_CONVERTTIMEX3TO2 = "ConvertTimex3To2";

	private File mOutputDir;
 
	private int mDocNum;
	
	private Boolean convertTimex3To2 = true;
	
	private boolean printDetails = false;

	/**
	 * initialize
	 */
	public void initialize() throws ResourceInitializationException {
    
		mDocNum = 0;
		convertTimex3To2 = (Boolean) getConfigParameterValue(PARAM_CONVERTTIMEX3TO2);
		mOutputDir = new File((String) getConfigParameterValue(PARAM_OUTPUTDIR));
		if (!mOutputDir.exists()) {
			mOutputDir.mkdirs();
		} 
	}


	/**
	 * process
	 */
	public void processCas(CAS aCAS) throws ResourceProcessException {

		JCas jcas;
		try {
			jcas = aCAS.getJCas();
		} catch (CASException e) {
			throw new ResourceProcessException(e);
		}

		printTimexAnnotationsInline(jcas);
	}
	
	
	
	
	public void printTimexAnnotationsInline(JCas jcas){
		// retrieve the filename of the input file from the CAS
	    FSIterator it = jcas.getAnnotationIndex(SourceDocInfo.type).iterator();
	    File outFile = null;
	    File inFile = null;
	    if (it.hasNext()) {
	      SourceDocInfo fileLoc = (SourceDocInfo) it.next();
	      try {
	        inFile = new File(new URL(fileLoc.getUri()).getPath());
	        String outFileName = inFile.getName();
	        if (fileLoc.getOffsetInSource() > 0) {
	          outFileName += ("_" + fileLoc.getOffsetInSource());
	        }
	        outFileName += ".xmi";
	        outFile = new File(mOutputDir, outFileName);
	      } catch (MalformedURLException e1) {
	        // invalid URL, use default processing below
	      }
	    }
	    if (outFile == null) {
	      outFile = new File(mOutputDir, "doc" + mDocNum++);
	    }
	    
	    // what has to be printed?
	    String toprint = "";
	    
	    // document text
	    String doctext    = jcas.getDocumentText();
	    int startposition = 0;
	    int endposition   = doctext.length();
	    boolean anyTimex  = false;
	    
		// get timex index
		FSIndex indexTimex   = jcas.getAnnotationIndex(Timex3.type);
		FSIterator iterTimex = indexTimex.iterator();
		
		
		while (iterTimex.hasNext()){
			anyTimex = true;
			Timex3 t = (Timex3) iterTimex.next();
			endposition = t.getBegin();
			if (endposition < startposition){
				if (printDetails == true){
					System.err.println("[Tern2004Writer] Overlapping expressions... ignoring: "+t.getCoveredText());
				}
			}else{
				// CHANGES DUE TO TIMEX2 not equal TIMEX3
				String timexvalue = t.getTimexValue();
				if(convertTimex3To2) {
					timexvalue = translatetimex3timex2(timexvalue);
					if (t.getTimexType().equals("SET")){
						timexvalue = translatetimex3timex2set(timexvalue); 
					}
				}
				toprint = toprint + doctext.substring(startposition, endposition); // text from begin or last timex to begin of new timex
				toprint = toprint + "";         // timex opening tag
				toprint = toprint + t.getCoveredText();                            // timex text
				toprint = toprint + "";
				startposition = t.getEnd();
			}
		}
		if (anyTimex == true){
            // text from last timex to end
			toprint = toprint + doctext.substring(startposition);
		}
		if (anyTimex == false){
			// whole document text
			toprint = toprint + doctext;
		}
		
		// print toprint text
		try {
			BufferedWriter bf = new BufferedWriter(new FileWriter(outFile));
			bf.append(toprint);
			bf.close();
		} catch (IOException e1) {
			e1.printStackTrace();
		}
	}
	
	public String translatetimex3timex2(String timexvalue){
		
		// change decades
		String decade = "(.*\\d\\d\\d)X";
		if (timexvalue.matches(decade)){
			for (MatchResult m : findMatches(Pattern.compile(decade), timexvalue)){
				timexvalue = m.group(1);
			}
		}
		// change century
		String century = "(.*\\d\\d)XX";
		if (timexvalue.matches(century)){
			for (MatchResult m : findMatches(Pattern.compile(century), timexvalue)){
				timexvalue = m.group(1);
			}
		}		
		
		return timexvalue;
	}
	
	public String translatetimex3timex2set(String timexvalue){
		
		// change year
		String year = "(P(\\d)+Y)";
		if (timexvalue.matches(year)){
			timexvalue = "XXXX";
		}
		// change month
		String month = "(P(\\d)+M)";
		if (timexvalue.matches(month)){
			timexvalue = "XXXX-XX";
		}
		// change day
		String day = "(P(\\d)+D)";
		if (timexvalue.matches(day)){
			timexvalue = "XXXX-XX-XX";
		}
		// change hour
		String hour = "(PT(\\d)+H)";
		if (timexvalue.matches(hour)){
			timexvalue = "XXXX-XX-XXTXX";
		}
		// change minute
		String minute = "(PT(\\d)+M)";
		if (timexvalue.matches(minute)){
			timexvalue = "XXXX-XX-XXTXX:XX";
		}
		
		return timexvalue;
	}
	
	/**
	 * Find all the matches of a pattern in a charSequence and return the
	 * results as list.
	 * 
	 * @param pattern
	 * @param s
	 * @return
	 */
	public static Iterable findMatches(Pattern pattern,
			CharSequence s) {
		List results = new ArrayList();

		for (Matcher m = pattern.matcher(s); m.find();)
			results.add(m.toMatchResult());

		return results;
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy