/*
 * de.julielab.jcore.utility.JCoReTools -- part of the Maven / Gradle / Ivy artifact "jcore-utilities"
 * (utilities for UIMA compliant components).
 * Note: a newer version of this artifact, 2.6.1, is available.
 */
/**
 * JulesTools.java
 * 
 * Copyright (c) 2006, JULIE Lab.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Common Public License v1.0
 * 

 * Author: muehlhausen
 * 

 * Current version: 1.3
 * Since version:   1.0
 * 

 * Creation date: Dec 11, 2006
 * 

 * Tool for creating new UIMA Objects and other UIMA related things
 **/

package de.julielab.jcore.utility;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.function.Function;
import java.util.zip.GZIPInputStream;

import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.FeatureStructure;
import org.apache.uima.cas.impl.XmiCasDeserializer;
import org.apache.uima.cas.text.AnnotationIndex;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.jcas.cas.StringArray;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.DataResource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;

import de.julielab.jcore.types.Header;

/**
 * A collection of static helper methods for UIMA components.
 * <ul>
 * <li>The binarySearch methods look up an element in a list of annotations that is sorted by the value a given function computes for each element.</li>
 * <li>The addToFSArray methods are useful for adding elements to FSArrays, which are rather awkward to use and, especially, to extend.</li>
 * <li>The addToStringArray methods serve a similar purpose for StringArrays.</li>
 * <li>One of the most used methods of this class is {@link #getDocId(JCas)}, which looks for an annotation of type de.julielab.jcore.types.Header and returns its docId feature value.</li>
 * <li>The {@link #deserializeXmi(CAS, InputStream, int)} method is used in UIMA 2.x to fix issues with special Unicode characters. For more information, refer to the JavaDoc of the method.</li>
 * </ul>
 *
 * @author faessler
 */
public class JCoReTools {

    /**
     * Number of additional slots a new, larger FSArray receives when
     * {@link #addToFSArray(FSArray, FeatureStructure)} has to replace an FSArray that is full.
     */
    public static final int DEFAULT_ADDITION_SIZE = 10;
    /**
     * Logger for this class.
     */
    private static final Logger log = LoggerFactory.getLogger(JCoReTools.class);

    /**
     * <p>
     * Appends {@code newElement} to {@code array}, enlarging the array by {@link #DEFAULT_ADDITION_SIZE}
     * slots if it is full.
     * </p>
     * <p>
     * This is a convenience overload of {@link #addToFSArray(FSArray, FeatureStructure, int)} that passes
     * {@link #DEFAULT_ADDITION_SIZE} as the addition size; see that method for the details of where the
     * element is placed and when a new FSArray is created.
     * </p>
     * <p>
     * Always continue to work with the returned FSArray rather than with {@code array}: the element may
     * have been written into {@code array} in place, but it may just as well live in a newly created array.
     * </p>
     *
     * @param array      The array to which the feature structure should be added
     * @param newElement The feature structure that should be added to the array
     * @return An FSArray containing all entries of {@code array} plus {@code newElement}; either
     * {@code array} itself or a new, larger FSArray.
     */
    public static FSArray addToFSArray(final FSArray array, final FeatureStructure newElement) {
        final int growBy = DEFAULT_ADDITION_SIZE;
        return addToFSArray(array, newElement, growBy);
    }

    /**
     * <p>
     * Returns an FSArray that contains all elements of the given array and {@code newElement}.
     * </p>
     * <p>
     * If {@code array} has trailing {@code null} entries, the new element is written in place to the first
     * position that is {@code null} and only followed by {@code null} entries, and {@code array} itself is
     * returned. If {@code array} is full (its last entry is not {@code null}) or has size 0, a new FSArray
     * of size {@code array.size() + additionSize} is created, all elements of {@code array} are copied
     * into it and {@code newElement} is stored directly after them. Any remaining trailing {@code null}
     * entries of the new array can be filled by later calls without creating yet another FSArray. If
     * {@code array} is {@code null}, a new FSArray of size 1 that only holds {@code newElement} is created
     * in the CAS of {@code newElement}.
     * </p>
     * <p>
     * In any case, it should be assumed that the return value is a new FSArray. Thus, one should not rely
     * on the possible in-place change of the passed array and replace the variable holding array
     * with the return value of this method.
     * </p>
     *
     * @param array        The array to which the feature structure should be added; may be {@code null}
     * @param newElement   The feature structure that should be added to the array; must not be
     *                     {@code null} when {@code array} is {@code null} because the CAS is taken from it
     * @param additionSize The number of slots by which a full array is extended; must be positive
     * @return An FSArray containing all entries from array plus newElement. The
     * returned FSArray will be array if there was enough space to add newElement
     * or a new FSArray otherwise.
     * @throws JCoReUtilitiesException If the JCas required to create a new FSArray cannot be obtained.
     */
    public static FSArray addToFSArray(final FSArray array, final FeatureStructure newElement, final int additionSize) {

        assert (additionSize > 0);

        FSArray outputArray = array;

        try {
            if (null == outputArray) {
                outputArray = new FSArray(newElement.getCAS().getJCas(), 1);
            }

            final int size = outputArray.size();
            // An empty array has no slot at all and must be grown just like a full one; checking the
            // size first also avoids reading index -1.
            if (size == 0 || outputArray.get(size - 1) != null) {
                log.trace("Last element of passed array was not null, thus array is full and a new one is created.");
                final FSArray grownArray = new FSArray(outputArray.getCAS().getJCas(), size + additionSize);
                grownArray.copyFromArray(outputArray.toArray(), 0, 0, size);
                grownArray.set(size, newElement);
                log.trace("New array is of size {}.", grownArray.size());
                return grownArray;
            }
        } catch (CASException e) {
            // Do not continue after a failure: falling through to the in-place branch below would
            // overwrite the last element of a full array.
            throw new JCoReUtilitiesException(e);
        }

        log.trace(
                "There is still room left over in the passed array, new element is appended after the last non-null element.");

        // Walk backwards over the trailing null entries to find the first free slot.
        int insertionIndex = outputArray.size() - 1;
        while ((insertionIndex > 0) && (outputArray.get(insertionIndex - 1) == null)) {
            insertionIndex--;
        }
        log.trace("Last non-null element was found on index {}, adding new element on position {}.",
                insertionIndex - 1, insertionIndex);
        outputArray.set(insertionIndex, newElement);
        return outputArray;
    }

    /**
     * <p>
     * Returns an FSArray that contains all elements of the given array and {@code newElements}.
     * </p>
     * <p>
     * If {@code inputArray} has enough trailing {@code null} entries, the new elements are written in
     * place, starting directly after its last non-{@code null} entry, and {@code inputArray} itself is
     * returned. Otherwise a new FSArray is created whose size is exactly the number of entries of
     * {@code inputArray} up to its last non-{@code null} entry plus {@code newElements.size()}; those
     * entries are copied into it and {@code newElements} are added after them. If {@code inputArray} is
     * {@code null}, a new FSArray of size {@code newElements.size()} is created in the CAS of the first
     * new element.
     * </p>
     * <p>
     * In any case, it should be assumed that the return value is a new FSArray. Thus, one should not rely
     * on the possible in-place change of the passed array and replace the variable holding array
     * with the return value of this method.
     * </p>
     *
     * @param inputArray  The array to which the feature structures should be added; may be {@code null}
     * @param newElements The feature structures that should be added to the array; if {@code null} or
     *                    empty, {@code inputArray} is returned unchanged
     * @return An FSArray containing all entries from inputArray plus newElements. The
     * returned FSArray will be inputArray if there was enough space to add newElements
     * or a new FSArray otherwise.
     * @throws JCoReUtilitiesException If {@code inputArray} is {@code null} and the JCas of the first new
     *                                 element cannot be obtained.
     * @throws RuntimeException        If the JCas of {@code inputArray} cannot be obtained for creating
     *                                 the larger array.
     */
    public static FSArray addToFSArray(final FSArray inputArray,
                                       final Collection<? extends FeatureStructure> newElements) {

        if (null == newElements || newElements.isEmpty())
            return inputArray;

        FSArray array = inputArray;

        if (null == array) {
            try {
                // Size the array for all new elements right away so that no throw-away array is created.
                array = new FSArray(newElements.iterator().next().getCAS().getJCas(), newElements.size());
            } catch (CASException e1) {
                throw new JCoReUtilitiesException(e1);
            }
        }

        // Search for the last non-null element. If none is found,
        // lastElementIndex will actually be -1 after the loop.
        int lastElementIndex = array.size() - 1;
        while (lastElementIndex >= 0 && array.get(lastElementIndex) == null) {
            lastElementIndex--;
        }

        // Is there enough space in the existing array to put all new
        // elements in it?
        final int requiredSpace = lastElementIndex + 1 + newElements.size();
        final FSArray ret;
        if (requiredSpace <= array.size()) {
            log.trace(
                    "Existing array has size {}. Since space for {} elements is required the passed array is kept.",
                    array.size(), requiredSpace);
            ret = array;
        } else {
            log.trace("Passed array has size {} but there are {} elements overall, thus a new FSArray is created.",
                    array.size(), requiredSpace);
            // There is not enough space for all new elements in the given
            // FSArray so create a new one
            try {
                ret = new FSArray(array.getCAS().getJCas(), requiredSpace);
            } catch (CASException e) {
                throw new RuntimeException(e);
            }
            for (int i = 0; i <= lastElementIndex; i++)
                ret.set(i, array.get(i));
        }

        // Add the new elements behind the last non-null entry.
        int currentIndex = lastElementIndex + 1;
        for (final FeatureStructure newElement : newElements) {
            ret.set(currentIndex++, newElement);
        }
        return ret;
    }

    /**
     * Creates a shallow copy of {@code array}: a new FSArray of the same size, in the same CAS, whose
     * entries reference the very same feature structures as the entries of {@code array}.
     *
     * @param array The FSArray to copy.
     * @return A new FSArray with the size and contents of array.
     */
    public static FSArray copyFSArray(FSArray array) {
        final FSArray copy;
        try {
            copy = new FSArray(array.getCAS().getJCas(), array.size());
        } catch (CASException e) {
            throw new JCoReUtilitiesException(e);
        }
        int index = 0;
        while (index < array.size()) {
            copy.set(index, array.get(index));
            index++;
        }
        return copy;
    }

    /**
     * <p>
     * Creates a new string array, copies the values of {@code array} into it and adds {@code element}
     * as its last entry.
     * </p>
     * <p>
     * This method does not handle null values as {@link #addToFSArray(FSArray, FeatureStructure, int)} does:
     * the new array is always exactly one entry longer than {@code array}, regardless of {@code null}
     * entries in {@code array}. To add multiple elements at once, avoiding excessive copying, refer to
     * {@link #addToStringArray(StringArray, String[])}.
     * </p>
     *
     * @param array   The source array to extend. Must not be {@code null} because the new array is
     *                created in the CAS of {@code array}.
     * @param element The element to add.
     * @return A new StringArray with the same contents as array extended by element.
     * @throws NullPointerException    If {@code array} is {@code null}.
     * @throws JCoReUtilitiesException If the JCas required to create the new array cannot be obtained.
     */
    public static StringArray addToStringArray(StringArray array, String element) {
        if (array == null)
            // There is no CAS to create a new array in; fail with an explanation instead of a bare
            // NullPointerException.
            throw new NullPointerException(
                    "The StringArray to extend must not be null because the new array is created in its CAS.");
        try {
            final int oldSize = array.size();
            final StringArray newArray = new StringArray(array.getCAS().getJCas(), oldSize + 1);
            newArray.copyFromArray(array.toArray(), 0, 0, oldSize);
            newArray.set(oldSize, element);
            return newArray;
        } catch (CASException e) {
            throw new JCoReUtilitiesException(e);
        }
    }

    /**
     * <p>
     * Creates a new string array, copies the values of {@code array} into it and appends
     * {@code elements}.
     * </p>
     *
     * @param array    The array to extend. Must not be {@code null} (unless {@code elements} is
     *                 {@code null}) because the new array is created in the CAS of {@code array}.
     * @param elements The elements to add into a new array.
     * @return A new StringArray containing all values of array plus elements, or {@code null} if
     * {@code elements} is {@code null}.
     * @throws NullPointerException    If {@code array} is {@code null} while {@code elements} is not.
     * @throws JCoReUtilitiesException If the JCas required to create the new array cannot be obtained.
     */
    public static StringArray addToStringArray(StringArray array, String[] elements) {
        if (null == elements)
            return null;
        if (array == null)
            // There is no CAS to create a new array in; fail with an explanation instead of a bare
            // NullPointerException.
            throw new NullPointerException(
                    "The StringArray to extend must not be null because the new array is created in its CAS.");
        try {
            final int oldSize = array.size();
            final StringArray newArray = new StringArray(array.getCAS().getJCas(), oldSize + elements.length);
            newArray.copyFromArray(array.toArray(), 0, 0, oldSize);
            newArray.copyFromArray(elements, 0, oldSize, elements.length);
            return newArray;
        } catch (CASException e) {
            throw new JCoReUtilitiesException(e);
        }
    }

    /**
     * Writes one line per entry of the FSArray to System.out, each consisting of the entry's index
     * followed by the string representation of the feature structure stored there.
     *
     * @param array The array to be printed
     */
    public static void printFSArray(FSArray array) {
        int position = 0;
        while (position < array.size()) {
            System.out.println("fs[" + position + "] =  " + array.get(position));
            position++;
        }
    }

    /**
     * Prints all annotations of the given type to System.out, one line per annotation in the form
     * {@code [begin-end] coveredText}.
     *
     * @param jCas The JCas whose annotation index should be printed.
     * @param type The type ID passed to {@link JCas#getAnnotationIndex(int)}, e.g. {@code Header.type}.
     */
    public static void printAnnotationIndex(JCas jCas, int type) {
        // The iterator must be typed: with a raw Iterator, next() returns Object and cannot be
        // assigned to an Annotation without a cast.
        for (Iterator<Annotation> it = jCas.getAnnotationIndex(type).iterator(); it.hasNext(); ) {
            Annotation a = it.next();
            System.out.println("[" + a.getBegin() + "-" + a.getEnd() + "] " + a.getCoveredText());
        }
    }

    /**
     * <p>
     * Returns the document ID of the document in the JCas.
     * </p>
     * <p>
     * The ID is read from the <code>docId</code> feature of the first annotation of type
     * <code>de.julielab.jcore.types.Header</code> (or a subtype) found in the annotation index. If the
     * JCas contains no such annotation, <code>null</code> is returned.
     * </p>
     *
     * @param aJCas The JCas holding the document whose ID is requested.
     * @return The value of {@link de.julielab.jcore.types.Header#getDocId()} of the first header
     * annotation, or <code>null</code> if there is no header annotation.
     */
    public static String getDocId(JCas aJCas) {
        final AnnotationIndex<Annotation> headers = aJCas.getAnnotationIndex(Header.type);
        final FSIterator<Annotation> cursor = headers.iterator();
        if (cursor.hasNext()) {
            final Header firstHeader = (Header) cursor.next();
            return firstHeader.getDocId();
        }
        return null;
    }

    /**
     * <p>
     * Deserializes an UTF-8 encoded XMI input stream into the given CAS.
     * </p>
     * <p>
     * This method has largely been taken directly from
     * {@link XmiCasDeserializer#deserialize(InputStream, CAS)}. However, the
     * given input stream is explicitly transformed into an UTF-8 encoded
     * {@link InputSource} for the XML parsing process. This is necessary
     * because the Xerces internal UTF-8 handling is faulty with Unicode
     * characters above the BMP (see
     * https://issues.apache.org/jira/browse/XERCESJ-1257). Thus, this method
     * explicitly uses UTF-8 encoding. For other encodings, use the default UIMA
     * deserialization mechanism.
     * </p>
     * <p>
     * The {@code attributeBufferSize} parameter only has an effect if the
     * julielab Xerces version is on the classpath. Then, the XMLStringBuffer
     * initial size is set via the system property
     * {@code julielab.xerces.attributebuffersize}. This can be very helpful for
     * documents because UIMA stores the document text as an attribute to the
     * sofa element in the XMI format. Such long attribute values are not
     * expected by Xerces which initializes its attribute buffers with a size of
     * 32 chars. Then, reading a large sofa (= document text) string results in
     * a very long process of resizing the buffer array and copying the old
     * buffer contents into the larger array. By setting a larger size from the
     * beginning, a lot of time can be saved.
     * </p>
     * <p>
     * The system property is cleared again when this method returns, whether parsing succeeded or
     * failed. The given input stream is not closed by this method.
     * </p>
     *
     * @param cas                 The CAS to populate.
     * @param is                  The XMI data stream to populate the CAS with.
     * @param attributeBufferSize The initial attribute buffer size to request from the julielab Xerces
     *                            version; values less than or equal to 0 leave the system property unset.
     * @throws SAXException If the XML reader cannot be created or the XMI data cannot be parsed.
     * @throws IOException  If reading from the input stream fails.
     * @see <a href="https://issues.apache.org/jira/browse/XERCESJ-1257">XERCESJ-1257</a>
     */
    public static void deserializeXmi(CAS cas, InputStream is, int attributeBufferSize)
            throws SAXException, IOException {
        // Decode the bytes ourselves so that Xerces never applies its own UTF-8 decoding.
        Reader reader = new InputStreamReader(is, StandardCharsets.UTF_8);
        InputSource source = new InputSource(reader);
        source.setEncoding("UTF-8");

        if (attributeBufferSize > 0)
            System.setProperty("julielab.xerces.attributebuffersize", String.valueOf(attributeBufferSize));

        try {
            XMLReader xmlReader = XMLReaderFactory.createXMLReader();
            XmiCasDeserializer deser = new XmiCasDeserializer(cas.getTypeSystem());
            ContentHandler handler = deser.getXmiCasHandler(cas, false, null, -1);
            xmlReader.setContentHandler(handler);
            xmlReader.parse(source);
        } finally {
            // Clear the property even when parsing fails so that the buffer size does not leak
            // into unrelated parser instances created later.
            System.clearProperty("julielab.xerces.attributebuffersize");
        }
    }

    public static > int binarySearch(List annotations,
                                                                                   Function comparisonValueFunction, R searchValue) {
        return binarySearch(annotations, comparisonValueFunction, searchValue, 0, annotations.size() - 1);
    }

    public static > int binarySearch(List annotations,
                                                                                   Function comparisonValueFunction, R searchValue, int from, int to) {
        assert from <= to : "End offset is smaller than begin offset";
        int lookupIndex = from + (to - from) / 2;
        T annotation = annotations.get(lookupIndex);
        R comparisonValue = comparisonValueFunction.apply(annotation);
        int comparison = searchValue.compareTo(comparisonValue);
        if (comparison == 0)
            return lookupIndex;
        else if (comparison < 0) {
            if (from > lookupIndex - 1)
                return -(lookupIndex) - 1;
            return binarySearch(annotations, comparisonValueFunction, searchValue, from, lookupIndex - 1);
        } else {
            if (to < lookupIndex + 1)
                return -(lookupIndex + 1) - 1;
            return binarySearch(annotations, comparisonValueFunction, searchValue, lookupIndex + 1, to);
        }
    }

    /**
     * <p>
     * Helper method to transparently handle GZIPPed external resource files.
     * </p>
     * When using external resources for analysis engines in UIMA, typically a custom object implementing {@link org.apache.uima.resource.SharedResourceObject}
     * is created as the resource provider. Since the overhead in handling external resources is mostly done when the resource is rather large, file
     * resources are commonly compressed using GZIP. This method takes the input stream of the {@link DataResource} object
     * passed by UIMA to {@link org.apache.uima.resource.SharedResourceObject#load(DataResource)} and checks if its URI
     * ends with .gzip or .gz, ignoring case. If so, the input stream is wrapped into a {@link GZIPInputStream}. This way a gzipped or
     * plain resource file can be used without further code adaptions.
     * <p>
     * If wrapping fails, e.g. because the data is not in GZIP format, the underlying resource stream
     * is closed before the exception is propagated.
     * </p>
     *
     * @param resource The {@link DataResource} object passed to {@link org.apache.uima.resource.SharedResourceObject#load(DataResource)}.
     * @return The original input stream, if the resource URI did not end in .gz or .gzip, a GZIP input stream otherwise.
     * @throws IOException If reading the resource file fails.
     */
    public static InputStream resolveExternalResourceGzipInputStream(DataResource resource) throws IOException {
        // Locale.ROOT keeps the suffix check independent of the default locale: with a Turkish
        // default locale, "GZIP" would otherwise be lower-cased with a dotless i and not be recognised.
        final String lcUriString = resource.getUri().toString().toLowerCase(Locale.ROOT);
        final InputStream is = resource.getInputStream();
        if (!lcUriString.endsWith(".gz") && !lcUriString.endsWith(".gzip"))
            return is;
        try {
            return new GZIPInputStream(is);
        } catch (IOException e) {
            // The GZIP header could not be read; the caller never sees the raw stream, so close it here.
            try {
                is.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }
}