All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.parser.ctakes.CTAKESUtils Maven / Gradle / Ivy

There is a newer version: 3.0.0-BETA2
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.ctakes;

import java.io.IOException;
import java.io.OutputStream;
import java.net.URISyntaxException;

import org.apache.ctakes.typesystem.type.refsem.UmlsConcept;
import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
import org.apache.uima.UIMAFramework;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.cas.impl.XCASSerializer;
import org.apache.uima.cas.impl.XmiCasSerializer;
import org.apache.uima.cas.impl.XmiSerializationSharedData;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceSpecifier;
import org.apache.uima.util.InvalidXMLException;
import org.apache.uima.util.XMLInputSource;
import org.apache.uima.util.XmlCasSerializer;
import org.xml.sax.SAXException;

/**
 * This class provides methods to extract biomedical information from plain text
 * using {@see CTAKESContentHandler} that relies on Apache cTAKES.
 * 
 * 

* Apache cTAKES is built on top of Apache * UIMA framework and OpenNLP * toolkit. *

*/ public class CTAKESUtils { // UMLS username property private final static String CTAKES_UMLS_USER = "ctakes.umlsuser"; // UMLS password property private final static String CTAKES_UMLS_PASS = "ctakes.umlspw"; /** * Returns a new UIMA Analysis Engine (AE). This method ensures that only * one instance of an AE is created. * *

* An Analysis Engine is a component responsible for analyzing unstructured * information, discovering and representing semantic content. Unstructured * information includes, but is not restricted to, text documents. *

* * @param aeDescriptor * pathname for XML file including an AnalysisEngineDescription * that contains all of the information needed to instantiate and * use an AnalysisEngine. * @param umlsUser * UMLS username for NLM database * @param umlsPass * UMLS password for NLM database * @return an Analysis Engine for analyzing unstructured information. * @throws IOException * if any I/O error occurs. * @throws InvalidXMLException * if the input XML is not valid or does not specify a valid * ResourceSpecifier. * @throws ResourceInitializationException * if a failure occurred during production of the resource. * @throws URISyntaxException * if URL of the resource is not formatted strictly according to * to RFC2396 and cannot be converted to a URI. */ public static AnalysisEngine getAnalysisEngine(String aeDescriptor, String umlsUser, String umlsPass) throws IOException, InvalidXMLException, ResourceInitializationException, URISyntaxException { // UMLS user ID and password. String aeDescriptorPath = CTAKESUtils.class.getResource(aeDescriptor) .toURI().getPath(); // get Resource Specifier from XML XMLInputSource aeIputSource = new XMLInputSource(aeDescriptorPath); ResourceSpecifier aeSpecifier = UIMAFramework.getXMLParser() .parseResourceSpecifier(aeIputSource); // UMLS user ID and password if ((umlsUser != null) && (!umlsUser.isEmpty()) && (umlsPass != null) && (!umlsPass.isEmpty())) { /* * It is highly recommended that you change UMLS credentials in the * XML configuration file instead of giving user and password using * CTAKESConfig. */ System.setProperty(CTAKES_UMLS_USER, umlsUser); System.setProperty(CTAKES_UMLS_PASS, umlsPass); } // create AE AnalysisEngine ae = UIMAFramework.produceAnalysisEngine(aeSpecifier); return ae; } /** * Returns a new JCas () appropriate for the given Analysis Engine. This * method ensures that only one instance of a JCas is created. A Jcas is a * Java Cover Classes based Object-oriented CAS (Common Analysis System) * API. * *

* Important: It is highly recommended that you reuse CAS objects rather * than creating new CAS objects prior to each analysis. This is because CAS * objects may be expensive to create and may consume a significant amount * of memory. *

* * @param ae * AnalysisEngine used to create an appropriate JCas object. * @return a JCas object appropriate for the given AnalysisEngine. * @throws ResourceInitializationException * if a CAS could not be created because this AnalysisEngine's * CAS metadata (type system, type priorities, or FS indexes) * are invalid. */ public static JCas getJCas(AnalysisEngine ae) throws ResourceInitializationException { JCas jcas = ae.newJCas(); return jcas; } /** * Serializes a CAS in the given format. * * @param jcas * CAS (Common Analysis System) to be serialized. * @param type * type of cTAKES (UIMA) serializer used to write CAS. * @param prettyPrint * {@code true} to do pretty printing of output. * @param stream * {@see OutputStream} object used to print out information * extracted by using cTAKES. * @throws SAXException * if there was a SAX exception. * @throws IOException * if any I/O error occurs. */ public static void serialize(JCas jcas, CTAKESSerializer type, boolean prettyPrint, OutputStream stream) throws SAXException, IOException { if (type == CTAKESSerializer.XCAS) { XCASSerializer.serialize(jcas.getCas(), stream, prettyPrint); } else if (type == CTAKESSerializer.XMI) { XmiCasSerializer.serialize(jcas.getCas(), jcas.getTypeSystem(), stream, prettyPrint, new XmiSerializationSharedData()); } else { XmlCasSerializer.serialize(jcas.getCas(), jcas.getTypeSystem(), stream); } } /** * Returns the annotation value based on the given annotation type. * * @param annotation * {@see IdentifiedAnnotation} object. * @param property * {@see CTAKESAnnotationProperty} enum used to identify the * annotation type. * @return the annotation value. */ public static String getAnnotationProperty(IdentifiedAnnotation annotation, CTAKESAnnotationProperty property) { String value = null; if (property == CTAKESAnnotationProperty.BEGIN) { value = Integer.toString(annotation.getBegin()); } else if (property == CTAKESAnnotationProperty.END) { value = Integer.toString(annotation.getEnd()); } else if (property == CTAKESAnnotationProperty.CONDITIONAL) { value = Boolean.toString(annotation.getConditional()); } else if (property == CTAKESAnnotationProperty.CONFIDENCE) { value = Float.toString(annotation.getConfidence()); } else if (property == CTAKESAnnotationProperty.DISCOVERY_TECNIQUE) { value = Integer.toString(annotation.getDiscoveryTechnique()); } else if (property == CTAKESAnnotationProperty.GENERIC) { value = Boolean.toString(annotation.getGeneric()); } else if (property == CTAKESAnnotationProperty.HISTORY_OF) { value = Integer.toString(annotation.getHistoryOf()); } else if (property == CTAKESAnnotationProperty.ID) { value = Integer.toString(annotation.getId()); } else if (property == CTAKESAnnotationProperty.ONTOLOGY_CONCEPT_ARR) { FSArray mentions = annotation.getOntologyConceptArr(); StringBuilder sb = new StringBuilder(); if (mentions != null) { for (int i = 0; i < mentions.size(); i++) { if (mentions.get(i) instanceof UmlsConcept) { UmlsConcept concept = (UmlsConcept) mentions.get(i); sb.append(concept.getCui()); if (i < mentions.size() - 1) { sb.append(","); } } } } value = sb.toString(); } else if (property == CTAKESAnnotationProperty.POLARITY) { value = Integer.toString(annotation.getPolarity()); } return value; } /** * Resets cTAKES objects, if created. This method ensures that new cTAKES * objects (a.k.a., Analysis Engine and JCas) will be created if getters of * this class are called. * * @param ae UIMA Analysis Engine * @param jcas JCas object */ public static void reset(AnalysisEngine ae, JCas jcas) { // Analysis Engine resetAE(ae); // JCas resetCAS(jcas); jcas = null; } /** * Resets the CAS (Common Analysis System), emptying it of all content. * * @param jcas JCas object */ public static void resetCAS(JCas jcas) { if (jcas != null) { jcas.reset(); } } /** * Resets the AE (AnalysisEngine), releasing all resources held by the * current AE. * * @param ae UIMA Analysis Engine */ public static void resetAE(AnalysisEngine ae) { if (ae != null) { ae.destroy(); ae = null; } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy