de.citec.scie.util.CachedJCasUtil Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of scie-core Show documentation
Contains the SCIE main application and the CLI interface. This project integrates the named entity recognition (NER), the PDF import and the classification and interfaces with the UIMA framework. The command line interface can be used to produce a set of UIMA XCAS files.
There is a newer version: 2.0.1
Show newest version
/*
 * SCIE -- Spinal Cord Injury Information Extraction
 * Copyright (C) 2013, 2014
 * Raphael Dickfelder, Jan Göpfert, Benjamin Paaßen, Andreas Stöckel
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package de.citec.scie.util;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.NavigableMap;
import java.util.TreeMap;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;

/**
 * This is a custom wrapper implementation to reduce the necessary calls to UIMA
 * JCasUtil, which seems to be terribly slow. An instance can be retrieved using
 * the "getInstance" function. Note that you do not need to fill the cache
 * explicitly. This is done in the background at the first query. Note that the
 * cache will not be updated afterwards! This is only done if
 * "update" is called.
 *
 * @author Benjamin Paassen - [email protected]
 */
public class CachedJCasUtil {

	/*
	 * Right now we do not need multiple caches for any application. Thus we
	 * choose the mos space-efficient variant of only caching information
	 * for one document at a time.
	 */
	public static final int MAX_NUM_INSTANCES = 1;

	private static final ReentrantLock cacheLock = new ReentrantLock();
	private static final ArrayList cachedDocuments = new ArrayList<>();
	private static final ArrayList cachedInstances = new ArrayList<>();

	private final JCas jcas;

	private CachedJCasUtil(JCas jcas) {
		this.jcas = jcas;
	}

	public JCas getJcas() {
		return jcas;
	}

	/**
	 * Returns the cache for the given JCas instance. Please note that this is
	 * only thread-safe with regard to the runtime of this function. The
	 * returned cache itself is not synchronized. Please ensure
	 * that only one annotator processes a given jcas instance at a time.
	 *
	 * @param jcas a jcas instance.
	 * @return the corresponding CachedJCasUtil.
	 */
	public static CachedJCasUtil getInstance(JCas jcas) {
		cacheLock.lock();
		try {
			int cacheIdx = cachedDocuments.indexOf(jcas);
			final CachedJCasUtil instance;
			if (cacheIdx > -1) {
				/*
				 * If we have a cache for this document, move it to the end of
				 * the list to indicate that it was used recently.
				 */
				cachedDocuments.remove(cacheIdx);
				instance = cachedInstances.remove(cacheIdx);
				cachedDocuments.add(jcas);
				cachedInstances.add(instance);
			} else {
				/*
				 * Otherwise create a new cache.
				 */
				instance = new CachedJCasUtil(jcas);
				cachedDocuments.add(jcas);
				cachedInstances.add(instance);
				/*
				 * If we have now more caches than we want we remove the cache
				 * that was used longest ago.
				 */
				if (cachedDocuments.size() > MAX_NUM_INSTANCES) {
					cachedDocuments.remove(0);
					cachedInstances.remove(0);
				}
			}
			return instance;
		} finally {
			cacheLock.unlock();
		}
	}
	/**
	 * We store a Tree-based cache for each class that is queried.
	 *
	 * Each cache stores the Annotations for a given gives in order of their
	 * starting position.
	 */
	private final HashMap, AnnotationCache> caches
			= new HashMap<>();

	/**
	 * Manually adds the given instance to the respective cache.
	 *
	 * Please note that this constructs the cache if it is not there yet. During
	 * construction all Annotions of the given class will be added to the cache.
	 * If the given Annotation is part of the UIMA indices already, it will thus
	 * be added twice.
	 *
	 * In short: Please do only use this method if you know, what you are doing.
	 * Otherwise use "update" after you have added new Annotations.
	 *
	 * @param 
	 * @param annotationInstance
	 */
	public  void add(X annotationInstance) {
		add((Class) annotationInstance.getClass(), annotationInstance);
	}

	/**
	 * Manually adds the given instance to the respective cache.
	 *
	 * Please note that this constructs the cache if it is not there yet. During
	 * construction all Annotions of the given class will be added to the cache.
	 * If the given Annotation is part of the UIMA indices already, it will thus
	 * be added twice.
	 *
	 * In short: Please do only use this method if you know, what you are doing.
	 * Otherwise use "update" after you have added new Annotations.
	 *
	 * @param 
	 * @param annotationInstance
	 */
	public  void add(Class annotationClass, X annotationInstance) {
		//get the cache
		final AnnotationCache cache = getCache(annotationClass);
		cache.add(annotationInstance);
	}

	public  boolean remove(Class annotationClass, X annotationInstance) {
		//get the cache
		final AnnotationCache cache = getCache(annotationClass);
		return cache.remove(annotationInstance);
	}

	/**
	 * Updates the cache for the given class and inserts all new instances of
	 * the given class.
	 *
	 * @param 
	 * @param annotationClass
	 */
	public  void update(Class annotationClass) {
		final Collection annoInstances = JCasUtil.select(jcas, annotationClass);
		final AnnotationCache cache = new AnnotationCache<>();
		for (final X anno : annoInstances) {
			cache.add(anno);
		}
		caches.put(annotationClass, cache);
	}

	/**
	 * Returns all annotations of the given class.
	 *
	 * The returned list has a well defined order: The output Annotations are
	 * first ordered according to their begin, than according to their end
	 * index.
	 */
	public  List select(Class annotationClass) {
		final AnnotationCache cache = getCache(annotationClass);
		return cache.getAll();
	}

	/**
	 * Returns all annotations of the given class that are contained in the text
	 * region specified by the given begin and end (inclusive).
	 *
	 * The returned list has a well defined order: The output Annotations are
	 * first ordered according to their begin, than according to their end
	 * index.
	 */
	public  List selectCovered(Class annotationClass, int begin, int end) {
		final AnnotationCache cache = getCache(annotationClass);
		return cache.getCovered(begin, end);
	}

	/**
	 * Returns all annotations of the given class that contain at least the text
	 * region specified by the given begin and end (inclusive).
	 *
	 * The returned list has a well defined order: The output Annotations are
	 * first ordered according to their begin, than according to their end
	 * index.
	 */
	public  List selectCovering(Class annotationClass, int begin,
			int end) {
		final AnnotationCache cache = getCache(annotationClass);
		return cache.getCovering(begin, end);
	}

	/**
	 * Returns all annotations of the given class that are overlapping with the
	 * text region specified by the given begin and end (inclusive).
	 *
	 * The returned list has a well defined order: The output Annotations are
	 * first ordered according to their begin, than according to their end
	 * index.
	 */
	public  List selectOverlapping(Class annotationClass, int begin,
			int end) {
		final AnnotationCache cache = getCache(annotationClass);
		return cache.getOverlapping(begin, end);
	}

	/**
	 * This constructs a cache for the given class if necessary or returns it if
	 * it is already cached.
	 */
	private  AnnotationCache getCache(Class annotationClass) {
		AnnotationCache cache = caches.get(annotationClass);
		if (cache == null) {
			final Collection annoInstances = JCasUtil.select(jcas, annotationClass);
			cache = new AnnotationCache<>();
			for (final X anno : annoInstances) {
				cache.add(anno);
			}
			caches.put(annotationClass, cache);
		}
		return cache;
	}

	private static class AnnotationCache {

		/**
		 * We store the annotations themselves in a TreeMap indiced by their
		 * start position. For the case of overlapping annotations we have an
		 * additional hierarchy level of a TreeMap that stores the Annotations
		 * with the same start positions indiced according to their end
		 * position. Unfortunately, even that is not enough, because several
		 * annotations of the same class can exist for the same begin and end
		 * index. Thus we need a LinkedList at that last level of hierarchy.
		 */
		private final TreeMap>> actualCashe
				= new TreeMap<>();
		/**
		 * For the special case of overlapping annotations however we also need
		 * to now how many annotations we need to check in linear time to ensure
		 * that given criteria are met.
		 */
		private int maxLength = 0;

		public AnnotationCache() {
		}

		public void add(X anno) {
			final int length = anno.getEnd() - anno.getBegin();
			if (length > maxLength) {
				maxLength = length;
			}
			TreeMap> overlappingAnnos = actualCashe.get(anno.getBegin());
			if (overlappingAnnos == null) {
				overlappingAnnos = new TreeMap<>();
				actualCashe.put(anno.getBegin(), overlappingAnnos);
			}
			LinkedList samePosAnnos = overlappingAnnos.get(anno.getEnd());
			if (samePosAnnos == null) {
				samePosAnnos = new LinkedList<>();
				overlappingAnnos.put(anno.getEnd(), samePosAnnos);
			}

			samePosAnnos.add(anno);
		}

		public boolean remove(X anno) {
			TreeMap> overlappingAnnos = actualCashe.get(anno.getBegin());
			if (overlappingAnnos == null) {
				return false;
			}
			LinkedList samePosAnnos = overlappingAnnos.get(anno.getEnd());
			if (samePosAnnos == null) {
				return false;
			}
			return samePosAnnos.remove(anno);
		}

		/**
		 * Returns all Annotations. The returned list has a well defined order:
		 * The output Annotations are first ordered according to their begin,
		 * than according to their end index.
		 *
		 * @return
		 */
		public List getAll() {
			final ArrayList returnList = new ArrayList<>(actualCashe.size());
			for (TreeMap> overlappingAnnos : actualCashe.values()) {
				for (LinkedList samePosAnnos : overlappingAnnos.values()) {
					returnList.addAll(samePosAnnos);
				}
			}
			return returnList;
		}

		/**
		 * Returns all annotations stored in this cache that start before (or
		 * at) the start index and end after (or at) the end index.
		 *
		 * The returned list has a well defined order: The output Annotations
		 * are first ordered according to their begin, than according to their
		 * end index.
		 *
		 * @param start
		 * @param end
		 * @return
		 */
		public List getCovering(int start, int end) {
			final ArrayList returnList = new ArrayList<>();
			final int lowerKey = end - maxLength;
			if (lowerKey > start) {
				return returnList;
			}
			/*
			 * We only care about the SubMap of Annotations that begin from
			 *
			 * end-maxLength
			 *
			 * to
			 *
			 * start
			 *
			 */
			final NavigableMap>> subMap
					= actualCashe.subMap(lowerKey, true, start, true);
			//for that subMap we check if the given criteria are met.
			for (final TreeMap> overlappingAnnos : subMap.values()) {
				//get only the annotations that end at or after the specified end.
				final NavigableMap> samePosAnnoMap = overlappingAnnos.
						tailMap(end, true);
				//and add them all.
				for (LinkedList samePosAnnos : samePosAnnoMap.values()) {
					returnList.addAll(samePosAnnos);
				}
			}
			return returnList;
		}

		/**
		 * Returns all annotations stored in this cache that start after (or at)
		 * the start index and end before (or at) the end index.
		 *
		 * The returned list has a well defined order: The output Annotations
		 * are first ordered according to their begin, than according to their
		 * end index.
		 *
		 * @param start
		 * @param end
		 * @return
		 */
		public List getCovered(int start, int end) {
			/*
			 * We only care about the SubMap of Annotations that begin from
			 *
			 * begin
			 *
			 * to
			 *
			 * end
			 */
			final NavigableMap>> subMap
					= actualCashe.subMap(start, true, end, true);
			final ArrayList returnList = new ArrayList<>();
			//for that subMap we check if the given criteria are met.
			for (final TreeMap> overlappingAnnos : subMap.values()) {
				//get only the annotations that end before or at the specified end.
				final NavigableMap> samePosAnnoMap = overlappingAnnos.
						headMap(end, true);
				//and add them all.
				for (LinkedList samePosAnnos : samePosAnnoMap.values()) {
					returnList.addAll(samePosAnnos);
				}
			}
			return returnList;
		}

		/**
		 * Returns all annotations stored in this cache that start before (or
		 * at) the end index and end after (or at) the start index.
		 *
		 * The returned list has a well defined order: The output Annotations
		 * are first ordered according to their begin, than according to their
		 * end index.
		 *
		 * @param start
		 * @param end
		 * @return
		 */
		public List getOverlapping(int start, int end) {
			final ArrayList returnList = new ArrayList<>();
			final int lowerKey = end - maxLength;
			if (lowerKey > start) {
				return returnList;
			}
			/*
			 * We only care about the SubMap of Annotations that begin from
			 *
			 * end-maxLength
			 *
			 * to
			 *
			 * end
			 *
			 */
			final NavigableMap>> subMap
					= actualCashe.subMap(lowerKey, true, end, true);
			//for that subMap we check if the given criteria are met.
			for (final TreeMap> overlappingAnnos : subMap.values()) {
				//get only the annotations that end at or after the specified end.
				final NavigableMap> samePosAnnoMap = overlappingAnnos.
						tailMap(start, true);
				//and add them all.
				for (LinkedList samePosAnnos : samePosAnnoMap.values()) {
					returnList.addAll(samePosAnnos);
				}
			}
			return returnList;
		}

		public List getFollowing(int index, int count) {
			//get the sub map following the given index.
			final NavigableMap>> tailMap
					= actualCashe.tailMap(index, false);
			final ArrayList returnList = new ArrayList<>();
			for (final TreeMap> overlappingAnnos : tailMap.values()) {
				for (final LinkedList samePosAnnos : overlappingAnnos.values()) {
					for (final X anno : samePosAnnos) {
						if (returnList.size() == count) {
							return returnList;
						}
						returnList.add(anno);
					}
				}
			}
			return returnList;

		}
	}
}