All Downloads are FREE. Search and download functionalities are using the official Maven repository.

de.citec.scie.PDFImporter Maven / Gradle / Ivy

Go to download

Contains the SCIE main application and the CLI interface. This project integrates the named entity recognition (NER), the PDF import and the classification and interfaces with the UIMA framework. The command line interface can be used to produce a set of UIMA XCAS files.

There is a newer version: 2.0.1
Show newest version
/*
 * SCIE -- Spinal Cord Injury Information Extraction
 * Copyright (C) 2013, 2014
 * Raphael Dickfelder, Jan Göpfert, Benjamin Paaßen, Andreas Stöckel
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package de.citec.scie;

import de.citec.scie.annotators.structure.StructureAnnotator;
import de.citec.scie.pdf.PDFStructuredTextExtractor;
import de.citec.scie.pdf.structure.Document;
import java.io.IOException;
import java.io.InputStream;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.jcas.JCas;

/**
 * Thin wrapper around the SCIE PDF text extraction that feeds the result
 * into a UIMA {@link JCas}.
 *
 * @author Benjamin Paassen - [email protected]
 *
 */
public class PDFImporter {

	/**
	 * Reads a PDF from the given stream, stores its text in the given CAS and
	 * then runs a {@link StructureAnnotator} built from the extracted
	 * document on that CAS.
	 *
	 * @param input stream handed to
	 * {@link PDFStructuredTextExtractor#importAsDocument}; this method does
	 * not close it itself.
	 * @param target CAS that receives the document text and is subsequently
	 * processed by the structure annotator.
	 * @throws IOException if the import fails with an I/O error, or wrapping
	 * the {@link AnalysisEngineProcessException} raised while annotating the
	 * structure (the original exception is kept as the cause).
	 */
	public static void importPdf(InputStream input, JCas target) throws IOException {
		// Step 1: let the bundled structured text extractor parse the PDF.
		final Document document = PDFStructuredTextExtractor.importAsDocument(input);

		// Step 2: render the document as text (this also indexes the objects
		// of the document hierarchy, starting at offset 0) and hand the text
		// over to the CAS.
		target.setDocumentText(document.indexedToString(0));

		// Step 3: transfer the structure information into UIMA. The annotator
		// failure type is not part of this method's signature, so it is
		// re-thrown as an IOException carrying the original as its cause.
		final StructureAnnotator annotator = new StructureAnnotator(document);
		try {
			annotator.process(target);
		} catch (AnalysisEngineProcessException cause) {
			throw new IOException(cause);
		}
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy