// de.citec.scie.pdf.PDFStructuredTextExtractor Maven / Gradle / Ivy
/*
* SCIE -- Spinal Cord Injury Information Extraction
* Copyright (C) 2013, 2014
* Raphael Dickfelder, Jan Göpfert, Benjamin Paaßen, Andreas Stöckel
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package de.citec.scie.pdf;
import de.citec.scie.pdf.structure.Document;
import de.citec.scie.pdf.structure.Page;
import de.citec.scie.pdf.structure.Paragraph;
import de.citec.scie.pdf.structure.Text;
import de.citec.scie.pdf.structure.Text.VerticalAlignment;
import de.citec.scie.pdf.structure.TextBlock;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.util.PDFStreamEngine;
import org.apache.pdfbox.util.ResourceLoader;
import org.apache.pdfbox.util.TextPosition;
/**
 * This class takes a PDF file as input and extracts its text into an
 * HTML-like hierarchical object structure (see the package "structure" for
 * the classes themselves): a Document holds Pages, a Page holds TextBlocks,
 * a TextBlock holds Paragraphs and a Paragraph holds Text runs. A new Text
 * run is started whenever font name, font size or vertical alignment change.
 *
 * @author Benjamin Paassen - [email protected]
 *
 */
public class PDFStructuredTextExtractor {

    /**
     * Minimum average number of characters per paragraph inside a TextBlock.
     * Blocks whose paragraphs are shorter on average get merged into a single
     * paragraph by {@link #paragraphSanityCheck(TextBlock)}.
     */
    public static final int MINIMUMPARSIZE = 80;

    /**
     * Assumes the given InputStream to contain PDF data and parses it.
     * The parsed data is transformed to a Document object. Both the parsed
     * PDF document and the given stream are closed before this method
     * returns, whether it succeeds or not.
     *
     * @param input an input stream containing PDF data.
     * @return a Document object containing the text and structure of the given
     * PDF.
     * @throws IOException is thrown if anything goes wrong during either
     * stream reading or parsing, if the PDF contains no pages, or if the
     * document is empty after the cleanup step.
     */
    public static Document importAsDocument(InputStream input) throws IOException {
        PDDocument doc = null;
        try {
            final PDFParser parser = new PDFParser(input);
            parser.parse();
            doc = parser.getPDDocument();

            final Document outDoc = new Document();
            final List<?> allPages = doc.getDocumentCatalog().getAllPages();
            if (allPages.isEmpty()) {
                throw new IOException("PDFBox did not find any pages!");
            }
            //page numbers are 1-based.
            int pageNum = 0;
            for (final Object pageObj : allPages) {
                pageNum++;
                outDoc.content.add(extractPage((PDPage) pageObj, pageNum));
            }
            //clean up page numbers and other redundant text blocks in the document.
            final DocumentBlockCleaner cleaner = new DocumentBlockCleaner();
            cleaner.blockCleanup(outDoc);
            if (outDoc.content.isEmpty()) {
                throw new IOException("After cleanup the document contained nothing!");
            }
            return outDoc;
        } finally {
            //nested so that a failing doc.close() can not prevent the input
            //stream from being closed.
            try {
                if (doc != null) {
                    doc.close();
                }
            } finally {
                input.close();
            }
        }
    }

    /**
     * Assumes the given InputStream to contain PDF data and parses it.
     * The parsed data is returned as plain text.
     *
     * @param input an input stream containing PDF data.
     * @return a plain text String containing the text inside the PDF.
     * @throws IOException is thrown if anything goes wrong during either
     * stream reading or parsing.
     */
    public static String importAsString(InputStream input) throws IOException {
        return importAsDocument(input).indexedToString(0);
    }

    /**
     * Assumes the given InputStream to contain PDF data and parses it.
     * The parsed data is returned as an InputStream containing the plain
     * text data of the PDF input stream.
     *
     * @param input an input stream containing PDF data.
     * @return an InputStream (ByteArrayInputStream with UTF-8 encoding)
     * containing the plain text data of the PDF input stream.
     * @throws IOException is thrown if anything goes wrong during either
     * stream reading or parsing.
     */
    public static InputStream importAsInputStream(InputStream input) throws IOException {
        final String plainText = importAsString(input);
        return new ByteArrayInputStream(plainText.getBytes("UTF-8"));
    }

    /**
     * Converts a single PDF page into an output Page.
     *
     * @param page the PDF page to process.
     * @param pageNum the 1-based number that is stored in the output page.
     * @return the output Page with one TextBlock per usable split block.
     * @throws IOException if processing the page content fails.
     */
    private static Page extractPage(PDPage page, int pageNum) throws IOException {
        final Page outPage = new Page();
        outPage.setPageNumber(pageNum);

        //collect all glyphs of the page.
        final PDPagePreprocessor preProc = new PDPagePreprocessor(page);
        preProc.process();

        //one rank estimator per page.
        final TextBlockRankEstimator blockRankEst = new TextBlockRankEstimator();

        //We create a TextBlock for every PreTextBlock the page content is split into.
        for (final PreTextBlock splitBlock : preProc.getPreTextBlock().split()) {
            if (!hasGlyphs(splitBlock)) {
                //nothing to initialise font information from; the lookups
                //below would fail with an IndexOutOfBoundsException.
                continue;
            }
            final TextBlock outTextBlock = new TextBlock();
            outPage.content.add(outTextBlock);
            blockRankEst.addBlock(outTextBlock, splitBlock);
            fillTextBlock(outTextBlock, splitBlock);
        }

        for (final TextBlock outBlock : outPage.content) {
            //merge implausibly short paragraphs, then set the block rank.
            paragraphSanityCheck(outBlock);
            outBlock.setRelativeFontSize(blockRankEst.getRelativeFontSize(outBlock));
        }
        return outPage;
    }

    /**
     * Fills the given output TextBlock with the Paragraphs and Text runs
     * found in the given split block. The split block must contain at least
     * one glyph in its first line (see {@link #hasGlyphs(PreTextBlock)}).
     *
     * @param outTextBlock the output block that receives the paragraphs.
     * @param splitBlock the pre-processed block to read glyphs from.
     * @throws IOException if reading font information fails.
     */
    private static void fillTextBlock(TextBlock outTextBlock, PreTextBlock splitBlock)
            throws IOException {
        Paragraph outPar = new Paragraph();
        outTextBlock.content.add(outPar);
        Text outText = new Text();
        outPar.content.add(outText);

        //set up estimators
        final ParagraphEstimator parEst = new ParagraphEstimator(splitBlock);
        WhiteSpaceEstimator spaceEst = new WhiteSpaceEstimator();
        VerticalAlignmentEstimator vAlignEst
                = new VerticalAlignmentEstimator(splitBlock.lines.get(0));

        //Font, font size and vertical alignment of the current Text are our
        //criterion to decide whether a glyph still belongs to it. Initialise
        //them from the very first glyph of the block.
        TextPosition firstGlyph = splitBlock.lines.get(0).content.get(0);
        applyFont(outText, firstGlyph);
        outText.setVerticalAlignment(vAlignEst.calculateAlignment(firstGlyph));

        //buffer for the characters of the current Text.
        StringBuilder currentTextBuilder = new StringBuilder();
        for (final PreTextLine line : splitBlock.lines) {
            vAlignEst = new VerticalAlignmentEstimator(line);

            if (parEst.isNewParagraph(line)) {
                outPar = new Paragraph();
                outTextBlock.content.add(outPar);
                //finish the previous Text without the whitespace that was
                //appended for the line break. Only an actual trailing space
                //is removed: the buffer may be empty or end with a hyphen.
                dropTrailingSpace(currentTextBuilder);
                outText.setText(currentTextBuilder.toString());
                //start the first Text of the new paragraph.
                outText = new Text();
                outPar.content.add(outText);
                currentTextBuilder = new StringBuilder();
                firstGlyph = line.content.get(0);
                applyFont(outText, firstGlyph);
                //NOTE(review): unlike at block start, the vertical alignment
                //of this Text is not initialised here -- confirm whether that
                //is intended.
            }

            for (final TextPosition glyph : line.content) {
                final String glyphFont = fontNameOf(glyph);
                final boolean fontEquals = glyphFont == null
                        ? outText.getFontName() == null
                        : glyphFont.equals(outText.getFontName());
                final float glyphFontSize = glyph.getFontSizeInPt();
                final VerticalAlignment glyphAlignment = vAlignEst.calculateAlignment(glyph);

                //if font, font size or vertical alignment differ, close the
                //current Text and open a new one carrying the glyph's style.
                if (!fontEquals
                        || glyphFontSize != outText.getFontSize()
                        || glyphAlignment != outText.getVerticalAlignment()) {
                    outText.setText(currentTextBuilder.toString());
                    outText = new Text();
                    outPar.content.add(outText);
                    currentTextBuilder = new StringBuilder();
                    outText.setFontName(glyphFont);
                    outText.setFontSize(glyphFontSize);
                    outText.setVerticalAlignment(glyphAlignment);
                    //whitespace estimation starts over for the new Text.
                    spaceEst = new WhiteSpaceEstimator();
                }
                //check if we have to add a whitespace in front of the glyph.
                if (spaceEst.hasWhiteSpace(glyph)) {
                    currentTextBuilder.append(' ');
                }
                currentTextBuilder.append(glyph.getCharacter());
            }

            //We separate lines with a whitespace because not every line break
            //in the PDF is an actual paragraph break; only paragraph breaks
            //are reflected in the structure. No whitespace is added after a
            //hyphen at the end of a line.
            final int length = currentTextBuilder.length();
            if (length > 0 && currentTextBuilder.charAt(length - 1) != '-') {
                currentTextBuilder.append(' ');
            }
        }
        outText.setText(currentTextBuilder.toString());
    }

    /**
     * Tells whether the first line of the given block exists and contains at
     * least one glyph.
     */
    private static boolean hasGlyphs(PreTextBlock block) {
        return !block.lines.isEmpty() && !block.lines.get(0).content.isEmpty();
    }

    /**
     * Tells whether the given glyph has a font with a font descriptor.
     */
    private static boolean hasFontDescriptor(TextPosition glyph) throws IOException {
        return glyph.getFont() != null && glyph.getFont().getFontDescriptor() != null;
    }

    /**
     * Returns the font name from the glyph's font descriptor, or null if the
     * glyph has no font or no font descriptor.
     */
    private static String fontNameOf(TextPosition glyph) throws IOException {
        if (!hasFontDescriptor(glyph)) {
            return null;
        }
        return glyph.getFont().getFontDescriptor().getFontName();
    }

    /**
     * Copies the font size and, if a font descriptor is available, the font
     * name of the given glyph to the given Text.
     */
    private static void applyFont(Text target, TextPosition glyph) throws IOException {
        target.setFontSize(glyph.getFontSizeInPt());
        if (hasFontDescriptor(glyph)) {
            target.setFontName(fontNameOf(glyph));
        }
    }

    /**
     * Removes the last character of the given builder if, and only if, it is
     * a space. Empty builders are left untouched.
     */
    private static void dropTrailingSpace(StringBuilder builder) {
        final int last = builder.length() - 1;
        if (last >= 0 && builder.charAt(last) == ' ') {
            builder.deleteCharAt(last);
        }
    }

    /**
     * This does not only check the sanity of a given TextBlock but also
     * corrects it if it does not seem sane: if the block has more than one
     * paragraph and the average paragraph length is below
     * {@link #MINIMUMPARSIZE} characters, all Text runs are moved into one
     * single paragraph.
     *
     * @param outBlock a TextBlock.
     */
    private static void paragraphSanityCheck(TextBlock outBlock) {
        if (outBlock.content.size() <= 1) {
            //zero or one paragraph: nothing to merge.
            return;
        }
        //compute the average paragraph size inside the block.
        int accumulatedSize = 0;
        for (final Paragraph par : outBlock.content) {
            for (final Text text : par.content) {
                accumulatedSize += text.getText().length();
            }
        }
        final double avgSize = (double) accumulatedSize / (double) outBlock.content.size();
        if (avgSize >= MINIMUMPARSIZE) {
            return;
        }
        //the paragraphs are too small: as a fallback put all text into one
        //huge paragraph.
        final Paragraph merged = new Paragraph();
        for (final Paragraph par : outBlock.content) {
            merged.content.addAll(par.content);
        }
        outBlock.content.clear();
        outBlock.content.add(merged);
    }

    /**
     * Stream engine that feeds every TextPosition found on one page into a
     * single PreTextBlock.
     */
    private static class PDPagePreprocessor extends PDFStreamEngine {

        /**
         * The properties path to make the PDFStreamEngine work.
         */
        private static final String propertiesPath
                = "org/apache/pdfbox/resources/PDFTextStripper.properties";
        /**
         * The current page that is analyzed.
         */
        private final PDPage page;
        /**
         * The PreTextBlock that represents the page content in our
         * preprocessing step. We split this later on.
         */
        private final PreTextBlock preTextBlock = new PreTextBlock();

        /**
         * @param page the page whose content is collected.
         * @throws IOException if the engine properties can not be loaded.
         */
        public PDPagePreprocessor(PDPage page) throws IOException {
            super(ResourceLoader.loadProperties(propertiesPath, true));
            this.page = page;
        }

        /**
         * This starts the processing. Pages without a content stream are
         * left unprocessed, so the PreTextBlock simply stays empty.
         *
         * @throws IOException if the page content can not be processed.
         */
        public void process() throws IOException {
            if (page.getContents() == null) {
                //nothing to process on a page without content stream.
                return;
            }
            //start the PDFStreamEngine
            processStream(page, page.findResources(), page.getContents().getStream());
        }

        @Override
        protected void processTextPosition(TextPosition text) {
            //add the TextPosition to the PreTextBlock. This does histogram management automatically.
            preTextBlock.addTextPosition(text);
        }

        /**
         * @return the PreTextBlock holding everything collected so far.
         */
        public PreTextBlock getPreTextBlock() {
            return preTextBlock;
        }
    }
}