All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.tika.parser.ctakes.CTAKESContentHandler Maven / Gradle / Ivy

There is a newer version: 3.0.0-BETA2
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.ctakes;

import java.util.Collection;
import java.util.Iterator;

import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.sax.ContentHandlerDecorator;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Class used to extract biomedical information while parsing.
 *
 * 

* This class relies on Apache cTAKES * that is a natural language processing system for extraction of information * from electronic medical record clinical free-text. *

*/ public class CTAKESContentHandler extends ContentHandlerDecorator { // Prefix used for metadata including cTAKES annotations public static String CTAKES_META_PREFIX = "ctakes:"; // Configuration object for CTAKESContentHandler private CTAKESConfig config = null; // StringBuilder object used to build the clinical free-text for cTAKES private StringBuilder sb = null; // Metadata object used for cTAKES annotations private Metadata metadata = null; // UIMA Analysis Engine private AnalysisEngine ae = null; // JCas object for working with the CAS (Common Analysis System) private JCas jcas = null; /** * Creates a new {@see CTAKESContentHandler} for the given {@see * ContentHandler} and Metadata objects. * * @param handler * the {@see ContentHandler} object to be decorated. * @param metadata * the {@see Metadata} object that will be populated using * biomedical information extracted by cTAKES. * @param config * the {@see CTAKESConfig} object used to configure the handler. */ public CTAKESContentHandler(ContentHandler handler, Metadata metadata, CTAKESConfig config) { super(handler); this.metadata = metadata; this.config = config; this.sb = new StringBuilder(); } /** * Creates a new {@see CTAKESContentHandler} for the given {@see * ContentHandler} and Metadata objects. * * @param handler * the {@see ContentHandler} object to be decorated. * @param metadata * the {@see Metadata} object that will be populated using * biomedical information extracted by cTAKES. */ public CTAKESContentHandler(ContentHandler handler, Metadata metadata) { this(handler, metadata, new CTAKESConfig()); } /** * Default constructor. 
*/ public CTAKESContentHandler() { this(new DefaultHandler(), new Metadata()); } @Override public void characters(char[] ch, int start, int length) throws SAXException { if (config.isText()) { sb.append(ch, start, length); } super.characters(ch, start, length); } @Override public void endDocument() throws SAXException { try { // create an Analysis Engine if (ae == null) { ae = CTAKESUtils.getAnalysisEngine(config.getAeDescriptorPath(), config.getUMLSUser(), config.getUMLSPass()); } // create a JCas, given an AE if (jcas == null) { jcas = CTAKESUtils.getJCas(ae); } // get metadata to process StringBuilder metaText = new StringBuilder(); String[] metadataToProcess = config.getMetadata(); if (metadataToProcess != null) { for (String name : config.getMetadata()) { for (String value : metadata.getValues(name)) { metaText.append(value); metaText.append(System.lineSeparator()); } } } // analyze text jcas.setDocumentText(metaText.toString() + sb.toString()); ae.process(jcas); // add annotations to metadata metadata.add(CTAKES_META_PREFIX + "schema", config.getAnnotationPropsAsString()); CTAKESAnnotationProperty[] annotationPros = config.getAnnotationProps(); Collection collection = JCasUtil.select(jcas, IdentifiedAnnotation.class); Iterator iterator = collection.iterator(); while (iterator.hasNext()) { IdentifiedAnnotation annotation = iterator.next(); StringBuilder annotationBuilder = new StringBuilder(); annotationBuilder.append(annotation.getCoveredText()); if (annotationPros != null) { for (CTAKESAnnotationProperty property : annotationPros) { annotationBuilder.append(config.getSeparatorChar()); annotationBuilder.append(CTAKESUtils.getAnnotationProperty(annotation, property)); } } metadata.add(CTAKES_META_PREFIX + annotation.getType().getShortName(), annotationBuilder.toString()); } if (config.isSerialize()) { // serialize data CTAKESUtils.serialize(jcas, config.getSerializerType(), config.isPrettyPrint(), config.getOutputStream()); } } catch (Exception e) { throw new 
SAXException(e.getMessage()); } finally { CTAKESUtils.resetCAS(jcas); } } /** * Returns metadata that includes cTAKES annotations. * * @return {@Metadata} object that includes cTAKES annotations. */ public Metadata getMetadata() { return metadata; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy