de.citec.scie.Annotator Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of scie-core Show documentation
Contains the SCIE main application and the CLI interface. This project integrates the named entity recognition (NER), the PDF import and the classification and interfaces with the UIMA framework. The command line interface can be used to produce a set of UIMA XCAS files.
The newest version!
/*
 * SCIE -- Spinal Cord Injury Information Extraction
 * Copyright (C) 2013, 2014
 * Raphael Dickfelder, Jan Göpfert, Benjamin Paaßen, Andreas Stöckel
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package de.citec.scie;

import de.bwaldvogel.liblinear.SolverType;
import de.citec.scie.analysis.AnalysisAnnotator;
import de.citec.scie.annotators.AnnotationCounter;
import de.citec.scie.annotators.relations.AnimalAnnotator;
import de.citec.scie.annotators.relations.InjuryAnnotator;
import de.citec.scie.annotators.relations.ResultAnnotator;
import de.citec.scie.annotators.relations.TreatmentAnnotator;
import de.citec.scie.classifiers.Classifier;
import de.citec.scie.classifiers.LibLinearClassifier;
import de.citec.scie.typesystem.Typesystem;
import de.citec.scie.util.ResourceFinder;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import org.apache.uima.UIMAException;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.jcas.JCas;

/**
 * The Annotator class is used to run the complete analysis on the incoming
 * input stream or file and to return a UIMA JCas instance with the relevant
 * data.
 *
 * @author Andreas Stöckel -- [email protected]
 */
public class Annotator {

	private final AnnotatorPipeline relationalAnnotators = setupRelationalAnnotators();
	private final AnalysisEngine engine = Typesystem.getAnalysisEngine(Constants.ANNOTATOR);

	public static final String CLASSIFIER_DIR = "classifiers";

	public static Classifier loadClassifier(String name, SolverType type) {
		final File classifier_dir = ResourceFinder.find("data/classifiers");
		if (!classifier_dir.exists() || !classifier_dir.isDirectory()) {
			throw new RuntimeException("The classifiers directory does not exist!");
		}
		final File modelFile = new File(classifier_dir, name + ".model");
		if (!modelFile.exists()) {
			throw new RuntimeException(modelFile.getAbsolutePath() + " was not found!");
		}
		final File featuresFile = new File(classifier_dir, name + ".features");
		if (!featuresFile.exists()) {
			throw new UnsupportedOperationException(featuresFile.getAbsolutePath() + " was not found!");
		}
		try {
			final FileReader modelReader = new FileReader(modelFile);
			final FileReader featuresReader = new FileReader(featuresFile);
			final LibLinearClassifier out = new LibLinearClassifier(type);
			out.readParameters(modelReader, featuresReader);
			return out;
		} catch (IOException ex) {
			throw new RuntimeException(ex);
		}
	}

	public static AnnotatorPipeline setupRelationalAnnotators() {
		final AnnotatorPipeline out = new AnnotatorPipeline();
		/*
		 * The ANIMAL Relation has an Organism Core and the slots:
		 * 
		 * Age
		 * Gender
		 * Number
		 * Weight
		 * 
		 * In that order. For core, each slot and the relation itself we need
		 * the fitting classifier.
		 */
		final Classifier animalCore = loadClassifier(
				"AnimalCore", SolverType.L1R_LR);
		final Classifier animalAge = loadClassifier(
				"AnimalAge", SolverType.L1R_LR);
		final Classifier animalGender = loadClassifier(
				"AnimalGender", SolverType.L1R_LR);
		final Classifier animalNumber = loadClassifier(
				"AnimalNumber", SolverType.L1R_LR);
		final Classifier animalWeight = loadClassifier(
				"AnimalWeight", SolverType.L1R_LR);
		final Classifier[] animalSlotClassifiers
				= {animalAge, animalGender, animalNumber, animalWeight};
		final Classifier animalRelation = loadClassifier(
				"AnimalRelation", SolverType.L2R_LR);
		out.getAnnotators().add(new AnimalAnnotator(
				animalCore, animalSlotClassifiers, animalRelation));
		/*
		 * The INJURY Relation has an InjuryType Core and the slots:
		 * 
		 * Duration
		 * InjuryDevice
		 * InjuryHeight
		 * 
		 * in that order. For core, each slot and the relation itself we need
		 * the fitting classifier.
		 */
		final Classifier injuryCore = loadClassifier(
				"InjuryCore", SolverType.L1R_LR);
		final Classifier injuryDuration = loadClassifier(
				"InjuryDuration", SolverType.L1R_LR);
		final Classifier injuryDevice = loadClassifier(
				"InjuryDevice", SolverType.L1R_LR);
		final Classifier injuryHeight = loadClassifier(
				"InjuryHeight", SolverType.L1R_LR);
		final Classifier[] injurySlotClassifiers
				= {injuryDuration, injuryDevice, injuryHeight};
		final Classifier injuryRelation = loadClassifier(
				"InjuryRelation", SolverType.L2R_LR);
		out.getAnnotators().add(new InjuryAnnotator(
				injuryCore, injurySlotClassifiers, injuryRelation));
		/*
		 * The TREATMENT Relation has a Drug Core and the slots:
		 * 
		 * Delivery
		 * Dose
		 * 
		 * in that order. For core, each slot and the relation itself we need
		 * the fitting classifier.
		 */
		final Classifier treatmentCore = loadClassifier(
				"TreatmentCore", SolverType.L1R_LR);
		final Classifier treatmentDelivery = loadClassifier(
				"TreatmentDelivery", SolverType.L1R_LR);
		final Classifier treatmentDose = loadClassifier(
				"TreatmentDose", SolverType.L1R_LR);
		final Classifier[] treatmentSlotClassifiers
				= {treatmentDelivery, treatmentDose};
		final Classifier treatmentRelation = loadClassifier(
				"TreatmentRelation", SolverType.L1R_LR);
		out.getAnnotators().add(new TreatmentAnnotator(
				treatmentCore, treatmentSlotClassifiers, treatmentRelation));
		/*
		 * The RESULT Relation has a InvestigationMethods Core and the slots:
		 * 
		 * p-Value
		 * Significance
		 * Trend
		 * 
		 * in that order. For core, each slot and the relation itself we need
		 * the fitting classifier.
		 */
		final Classifier resultCore = loadClassifier(
				"ResultCore", SolverType.L1R_LR);
		final Classifier resultPValue = loadClassifier(
				"ResultPValue", SolverType.L1R_LR);
		final Classifier resultSignificance = loadClassifier(
				"ResultSignificance", SolverType.L1R_LR);
		final Classifier resultTrend = loadClassifier(
				"ResultTrend", SolverType.L1R_LR);
		final Classifier[] resultSlotClassifiers
				= {resultPValue, resultSignificance, resultTrend};
		final Classifier resultRelation = loadClassifier(
				"ResultRelation", SolverType.L2R_LR);
		out.getAnnotators().add(new ResultAnnotator(
				resultCore, resultSlotClassifiers, resultRelation));
		/*
		 * For active learning we also annotate fitting annotations on top.
		 */
		out.getAnnotators().add(new AnalysisAnnotator());
		return out;
	}

	/**
	 * Reads the given input stream into a single string containing its contents
	 * interpreted as UTF-8 encoded text.
	 *
	 * @param is is the input stream from which the text should be loaded.
	 * @return a string containing the text.
	 * @throws IOException if something goes wrong.
	 */
	private String readTextStream(InputStream is) throws IOException {
		final BufferedReader reader = new BufferedReader(
				new InputStreamReader(is, "UTF-8"));
		final StringBuilder txt = new StringBuilder();
		String currentLine;
		while ((currentLine = reader.readLine()) != null) {
			txt.append(currentLine);
			txt.append("\n");
		}
		return txt.toString();
	}

	/**
	 * Runs the SCIE annotations on the given input stream.
	 *
	 * @param is is the input stream from which the input document should be
	 * read.
	 * @param isPdf specifies whether the input stream points at a PDF document.
	 * If false, the input stream is interpreted as UTF-8 encoded text.
	 * @return a JCas instance containing the annotated text.
	 * @throws UIMAException if a UIMAException occurs.
	 * @throws IOException if an IOException occurs.
	 */
	public JCas annotateInputStream(InputStream is, boolean isPdf)
			throws UIMAException, IOException {
		// Get the underlying type system
		JCas jcas = Typesystem.getJCas(Constants.TYPESYSTEM);

		// Either read the given input stream is PDF or as plain UTF-8 text
		if (isPdf) {
			PDFImporter.importPdf(is, jcas);
		} else {
			jcas.setDocumentText(readTextStream(is));
		}

		// Reset the AnnotationCounter
		AnnotationCounter.reset();

		// Run NER.
		engine.process(jcas);

		// Run the relational annotators.
		relationalAnnotators.process(jcas);

		return jcas;
	}

	/**
	 * Runs the SCIE annotations on the given input stream.
	 *
	 * @param inputFile is the input file on which the analysis should run.
	 * @param isPdf specifies whether the input file points at a PDF document.
	 * If false, the input file is interpreted as UTF-8 encoded text.
	 * @return a JCas instance containing the annotated text.
	 * @throws UIMAException if a UIMAException occurs.
	 * @throws IOException if an IOException occurs.
	 */
	public JCas annotateInputFile(File inputFile, boolean isPdf)
			throws UIMAException, IOException {
		try (FileInputStream fis = new FileInputStream(inputFile);
				BufferedInputStream bfis = new BufferedInputStream(fis)) {
			return annotateInputStream(bfis, isPdf);
		}
	}

}