All downloads are free. The search and download functionality uses the official Maven repository.

de.unihd.dbs.uima.reader.tempeval3reader.Tempeval3Reader Maven / Gradle / Ivy

Go to download

HeidelTime is a multilingual cross-domain temporal tagger that extracts temporal expressions from documents and normalizes them according to the TIMEX3 annotation standard.

There is a newer version: 2.2.1
Show newest version
/**
 * 
 */
package de.unihd.dbs.uima.reader.tempeval3reader;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.Queue;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.collection.CollectionReader_ImplBase;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger;
import de.unihd.dbs.uima.types.heideltime.Dct;

/**
 * @author Julian Zell
 *
 */
public class Tempeval3Reader extends CollectionReader_ImplBase {
	private Class component = this.getClass();
	
	// uima descriptor parameter name
	private String PARAM_INPUTDIR = "InputDirectory";
	
	private Integer numberOfDocuments = 0;
	
	private Queue files = new LinkedList();
	
	public void initialize() throws ResourceInitializationException {
		String dirPath = (String) getConfigParameterValue(PARAM_INPUTDIR);
		dirPath = dirPath.trim();
		
		populateFileList(dirPath);
	}

	public void getNext(CAS aCAS) throws IOException, CollectionException {
		JCas jcas;
		
		try {
			jcas = aCAS.getJCas();
		} catch (CASException e) {
			throw new CollectionException(e);
		}

		fillJCas(jcas);
		
		// give an indicator that a file has been processed
		System.err.print(".");
	}

	private void fillJCas(JCas jcas) {
		// grab a file to process
		File f = files.poll();
		try {
			// create xml parsing facilities
			DocumentBuilder db = DocumentBuilderFactory.newInstance().newDocumentBuilder();
			// parse input xml file
			Document doc = db.parse(f);
			
			doc.getDocumentElement().normalize();
			
			// get the  tag's content to set the document text
			NodeList nList = doc.getElementsByTagName("TEXT");
			Node textNode = nList.item(0);
			String text = textNode.getTextContent();
			
			jcas.setDocumentText(text);
			
			// get the  timex tag's value attribute for the dct
			Boolean gotDCT = false;
			String dctText = null;
			try {
				nList = doc.getDocumentElement().getElementsByTagName("DCT");
				nList = ((Element) nList.item(0)).getElementsByTagName("TIMEX3"); // timex3 tag
				Node dctTimex = nList.item(0);
				NamedNodeMap dctTimexAttr = dctTimex.getAttributes();
				Node dctValue = dctTimexAttr.getNamedItem("value");
				dctText = dctValue.getTextContent();
				gotDCT = true;
			} catch(Exception e) {
				gotDCT = false;
			}
			
			if(!gotDCT) 
				try { // try a different location for the DCT timex element
					nList = doc.getDocumentElement().getElementsByTagName("TEXT");
					nList = ((Element) nList.item(0)).getElementsByTagName("TIMEX3"); // timex3 tag
					Node dctTimex = nList.item(0);
					NamedNodeMap dctTimexAttr = dctTimex.getAttributes();
					if(dctTimexAttr.getNamedItem("functionInDocument") != null && dctTimexAttr.getNamedItem("functionInDocument").getTextContent().equals("CREATION_TIME")) {
						Node dctValue = dctTimexAttr.getNamedItem("value");
						dctText = dctValue.getTextContent();
					}
					gotDCT = true;
				} catch(Exception e) {
					gotDCT = false;
				}
			
			// get the document id
			nList = doc.getElementsByTagName("DOCID");
			String filename = null;
			if(nList != null && nList.getLength() > 0)
				filename = nList.item(0).getTextContent();
			else
				filename = f.getName().replaceAll("\\.[^\\.]+$", "");

			Dct dct = new Dct(jcas);
			dct.setBegin(0);
			dct.setEnd(text.length());
			dct.setFilename(filename);
			dct.setValue(dctText);
			dct.setTimexId("t0");
			dct.addToIndexes();
		} catch(Exception e) {
			  e.printStackTrace();
			  Logger.printError(component, "File "+f.getAbsolutePath()+" could not be properly parsed.");
		}
	}

	public boolean hasNext() throws IOException, CollectionException {
	    return files.size() > 0;
	}
	
	public Progress[] getProgress() {
		return new Progress[] { new ProgressImpl(numberOfDocuments-files.size(), numberOfDocuments , Progress.ENTITIES) };
	}
	
	public void close() throws IOException {
		files.clear();
	}

	private void populateFileList(String dirPath) throws ResourceInitializationException {
		ArrayList myFiles = new ArrayList();
		File dir = new File(dirPath);
		
		// check if the given directory path is valid
		if(!dir.exists() || !dir.isDirectory())
			throw new ResourceInitializationException();
		else
			myFiles.addAll(Arrays.asList(dir.listFiles()));
		
		// check for existence and readability; add handle to the list
		for(File f : myFiles) {
			if(!f.exists() || !f.isFile() || !f.canRead()) {
				Logger.printDetail(component, "File \""+f.getAbsolutePath()+"\" was ignored because it either didn't exist, wasn't a file or wasn't readable.");
			} else {
				files.add(f);
			}
		}
		
		numberOfDocuments = files.size();
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy