de.citec.scie.PDFImporter Maven / Gradle / Ivy
/*
* SCIE -- Spinal Cord Injury Information Extraction
* Copyright (C) 2013, 2014
* Raphael Dickfelder, Jan Göpfert, Benjamin Paaßen, Andreas Stöckel
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package de.citec.scie;
import de.citec.scie.annotators.structure.StructureAnnotator;
import de.citec.scie.pdf.PDFStructuredTextExtractor;
import de.citec.scie.pdf.structure.Document;
import java.io.IOException;
import java.io.InputStream;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.jcas.JCas;
/**
* This class is a wrapper for the SCIE PDFStructuredTextExtractor.
*
* @author Benjamin Paassen - [email protected]
*
*/
public class PDFImporter {

    /**
     * Imports a PDF from the given stream into the given CAS.
     *
     * The stream is handed to
     * {@link PDFStructuredTextExtractor#importAsDocument}, the resulting
     * document is rendered to text and stored as the document text of
     * {@code target}, and finally a {@link StructureAnnotator} built from
     * that document is run on {@code target}.
     *
     * @param input the stream the PDF is read from.
     * @param target the CAS that receives the document text and on which the
     * structure annotator is run.
     * @throws IOException if the structure annotator fails with an
     * {@link AnalysisEngineProcessException} (which is wrapped as the cause),
     * or if an {@code IOException} is raised by one of the calls made here.
     */
    public static void importPdf(InputStream input, JCas target) throws IOException {
        // Use the PDFStructuredTextExtractor from the included package.
        final Document structuredDoc = PDFStructuredTextExtractor.importAsDocument(input);

        // Render the document as text. This call is also expected to index
        // all objects in the hierarchy, so it is kept ahead of the annotation
        // step (NOTE(review): meaning of the 0 argument is not visible here).
        target.setDocumentText(structuredDoc.indexedToString(0));

        // Copy the structure information over to the UIMA side.
        annotateStructure(structuredDoc, target);
    }

    /**
     * Runs a {@link StructureAnnotator} created from {@code doc} on
     * {@code cas}, rethrowing an {@link AnalysisEngineProcessException} as an
     * {@link IOException} with the original exception as its cause.
     */
    private static void annotateStructure(Document doc, JCas cas) throws IOException {
        final StructureAnnotator annotator = new StructureAnnotator(doc);
        try {
            annotator.process(cas);
        } catch (AnalysisEngineProcessException cause) {
            throw new IOException(cause);
        }
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy