// de.citec.scie.pdf.PDFStructuredTextExtractor Maven / Gradle / Ivy

/*
 * SCIE -- Spinal Cord Injury Information Extraction
 * Copyright (C) 2013, 2014
 * Raphael Dickfelder, Jan Göpfert, Benjamin Paaßen, Andreas Stöckel
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package de.citec.scie.pdf;

import de.citec.scie.pdf.structure.Document;
import de.citec.scie.pdf.structure.Page;
import de.citec.scie.pdf.structure.Paragraph;
import de.citec.scie.pdf.structure.Text;
import de.citec.scie.pdf.structure.Text.VerticalAlignment;
import de.citec.scie.pdf.structure.TextBlock;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.util.PDFStreamEngine;
import org.apache.pdfbox.util.ResourceLoader;
import org.apache.pdfbox.util.TextPosition;

/**
 * This class takes a PDF file as input and extracts its text into an
 * HTML-like hierarchical object structure (see the package "structure" for the
 * classes themselves).
 *
 * @author Benjamin Paassen - [email protected]
 *
 */
public class PDFStructuredTextExtractor {

	/**
	 * Threshold used by the paragraph sanity check: if a TextBlock has more
	 * than one paragraph and the average text length per paragraph (in
	 * characters) is below this value, all paragraphs of the block are merged
	 * into a single one.
	 */
	public static final int MINIMUMPARSIZE = 80;

	/**
	 * Assumes the given InputStream to contain PDF data and parses it.
	 * The parsed data is transformed to a Document object: one Page per PDF
	 * page, one TextBlock per split PreTextBlock of that page, and inside each
	 * TextBlock a list of Paragraphs made of Text runs that share font name,
	 * font size and vertical alignment.
	 * <p>
	 * The given stream is always closed before this method returns, whether
	 * parsing succeeded or not.
	 *
	 * @param input an input stream containing PDF data. It is closed by this
	 * method.
	 * @return a Document object containing the text and structure of the given
	 * PDF.
	 * @throws IOException is thrown if anything goes wrong during either
	 * stream reading or parsing, if the PDF contains no pages, or if the
	 * document is empty after the block cleanup.
	 */
	public static Document importAsDocument(InputStream input) throws IOException {
		PDDocument doc = null;
		try {
			final PDFParser parser = new PDFParser(input);
			parser.parse();
			doc = parser.getPDDocument();

			//create output instance.
			final Document outDoc = new Document();
			//iterate through all pages.
			final List allPages = doc.getDocumentCatalog().getAllPages();
			if (allPages.isEmpty()) {
				throw new IOException("PDFBox did not find any pages!");
			}
			int pageNum = 0;
			for (final Object pageObj : allPages) {
				//create the output page and add it to the output Document.
				pageNum++;
				final Page outPage = new Page();
				outPage.setPageNumber(pageNum);
				outDoc.content.add(outPage);
				//Preprocess the PDPage.
				final PDPagePreprocessor preProc = new PDPagePreprocessor((PDPage) pageObj);
				preProc.process();
				//get the TextBlockRankEstimator instance for this page ready.
				final TextBlockRankEstimator blockRankEst = new TextBlockRankEstimator();
				//We create a TextBlock for every PreTextBlock the preprocessed page is split into.
				for (final PreTextBlock splitBlock : preProc.getPreTextBlock().split()) {
					//The first glyph of the first line seeds the text format. Without it
					//the block cannot be converted, so it is skipped instead of failing
					//the whole import with an IndexOutOfBoundsException.
					if (splitBlock.lines.isEmpty()
							|| splitBlock.lines.get(0).content.isEmpty()) {
						continue;
					}
					final TextBlock outTextBlock = new TextBlock();
					outPage.content.add(outTextBlock);
					blockRankEst.addBlock(outTextBlock, splitBlock);
					fillTextBlock(outTextBlock, splitBlock);
				}
				for (final TextBlock outBlock : outPage.content) {
					//Do a sanity check regarding paragraphs and remove the paragraphs if they do not
					//seem sane.
					paragraphSanityCheck(outBlock);
					//set the block rank for each TextBlock
					outBlock.setRelativeFontSize(blockRankEst.getRelativeFontSize(outBlock));
				}
			}

			//clean up page numbers and other redundant textblocks in the documents
			final DocumentBlockCleaner cleaner = new DocumentBlockCleaner();
			cleaner.blockCleanup(outDoc);
			if (outDoc.content.isEmpty()) {
				throw new IOException("After cleanup the document contained nothing!");
			}
			return outDoc;
		} finally {
			//close the input even if closing the PDDocument fails.
			try {
				if (doc != null) {
					doc.close();
				}
			} finally {
				input.close();
			}
		}
	}

	/**
	 * Fills the given TextBlock with the Paragraphs and Text runs of one
	 * PreTextBlock. The block must contain at least one line whose first line
	 * has at least one glyph.
	 */
	private static void fillTextBlock(final TextBlock outTextBlock,
			final PreTextBlock splitBlock) throws IOException {
		Paragraph outPar = new Paragraph();
		outTextBlock.content.add(outPar);
		Text outText = new Text();
		outPar.content.add(outText);
		//set up estimators
		final ParagraphEstimator parEst = new ParagraphEstimator(splitBlock);
		WhiteSpaceEstimator spaceEst = new WhiteSpaceEstimator();
		/*
		 * Font, font size and vertical alignment of the current Text object
		 * are our criterion to decide whether a glyph still belongs to it.
		 * The first glyph of the block defines the initial values.
		 */
		final PreTextLine firstLine = splitBlock.lines.get(0);
		initTextFormat(outText, firstLine.content.get(0),
				new VerticalAlignmentEstimator(firstLine));
		//this is our string buffer.
		StringBuilder currentTextBuilder = new StringBuilder();
		for (final PreTextLine line : splitBlock.lines) {
			final VerticalAlignmentEstimator vAlignEst = new VerticalAlignmentEstimator(line);
			//if we have a new paragraph, create a new paragraph object.
			if (parEst.isNewParagraph(line)) {
				//Finish the previous Text, but drop the blank that was appended for the
				//last line break. Only an actual blank is removed; an empty buffer or a
				//buffer ending with '-' (no blank appended) is left untouched.
				final int lastIndex = currentTextBuilder.length() - 1;
				if (lastIndex >= 0 && currentTextBuilder.charAt(lastIndex) == ' ') {
					currentTextBuilder.deleteCharAt(lastIndex);
				}
				outText.setText(currentTextBuilder.toString());
				outPar = new Paragraph();
				outTextBlock.content.add(outPar);
				outText = new Text();
				outPar.content.add(outText);
				currentTextBuilder = new StringBuilder();
				//Initialize the new Text exactly like the first Text of a block,
				//including its vertical alignment, so the first glyph of the paragraph
				//is not split off into yet another Text object.
				initTextFormat(outText, line.content.get(0),
						new VerticalAlignmentEstimator(line));
				//a new Text object always starts with a fresh whitespace estimator.
				spaceEst = new WhiteSpaceEstimator();
			}
			for (final TextPosition glyph : line.content) {
				final String glyphFont = fontNameOf(glyph);
				final float glyphFontSize = glyph.getFontSizeInPt();
				final VerticalAlignment glyphAlignment = vAlignEst.calculateAlignment(glyph);
				final boolean fontEquals = glyphFont == null
						? outText.getFontName() == null
						: glyphFont.equals(outText.getFontName());
				/*
				 * if font, font size or vertical alignment are not equal,
				 * change the Text object.
				 */
				if (!fontEquals
						|| glyphFontSize != outText.getFontSize()
						|| glyphAlignment != outText.getVerticalAlignment()) {
					outText.setText(currentTextBuilder.toString());
					outText = new Text();
					outPar.content.add(outText);
					currentTextBuilder = new StringBuilder();
					outText.setFontName(glyphFont);
					outText.setFontSize(glyphFontSize);
					outText.setVerticalAlignment(glyphAlignment);
					spaceEst = new WhiteSpaceEstimator();
				}
				//check if we have to add a whitespace in front of the glyph.
				if (spaceEst.hasWhiteSpace(glyph)) {
					currentTextBuilder.append(' ');
				}
				currentTextBuilder.append(glyph.getCharacter());
			}
			//we separate each line break with a whitespace because not every line break in
			//the pdf is an actual paragraph. We only take paragraph breaks into account
			//for the structure. After a '-' no whitespace is added.
			final int length = currentTextBuilder.length();
			if (length > 0 && currentTextBuilder.charAt(length - 1) != '-') {
				currentTextBuilder.append(' ');
			}
		}
		outText.setText(currentTextBuilder.toString());
	}

	/**
	 * Copies font size, font name (only if the glyph has a font descriptor)
	 * and vertical alignment of the given glyph to the given Text object.
	 */
	private static void initTextFormat(final Text text, final TextPosition glyph,
			final VerticalAlignmentEstimator alignmentEstimator) throws IOException {
		text.setFontSize(glyph.getFontSizeInPt());
		if (glyph.getFont() != null && glyph.getFont().getFontDescriptor() != null) {
			text.setFontName(glyph.getFont().getFontDescriptor().getFontName());
		}
		text.setVerticalAlignment(alignmentEstimator.calculateAlignment(glyph));
	}

	/**
	 * Returns the font name of the given glyph or null if the glyph has no
	 * font or no font descriptor.
	 */
	private static String fontNameOf(final TextPosition glyph) throws IOException {
		if (glyph.getFont() != null && glyph.getFont().getFontDescriptor() != null) {
			return glyph.getFont().getFontDescriptor().getFontName();
		}
		return null;
	}

	/**
	 * Parses the PDF data of the given InputStream via
	 * {@link #importAsDocument(InputStream)} and returns the result as plain
	 * text.
	 *
	 * @param input an input stream containing PDF data.
	 * @return a plain text String containing the text inside the PDF.
	 * @throws IOException is thrown if anything goes wrong during either
	 * stream reading or parsing.
	 */
	public static String importAsString(InputStream input) throws IOException {
		final Document parsedDocument = importAsDocument(input);
		//serialize the whole structure starting at index 0.
		return parsedDocument.indexedToString(0);
	}

	/**
	 * Parses the PDF data of the given InputStream via
	 * {@link #importAsString(InputStream)} and wraps the resulting plain text
	 * into a new InputStream.
	 *
	 * @param input an input stream containing PDF data.
	 * @return an InputStream (ByteArrayInputStream with UTF-8 encoding)
	 * containing the plain text data of the PDF input stream.
	 * @throws IOException is thrown if anything goes wrong during either
	 * stream reading or parsing.
	 */
	public static InputStream importAsInputStream(InputStream input) throws IOException {
		final String plainText = importAsString(input);
		final byte[] encodedText = plainText.getBytes("UTF-8");
		return new ByteArrayInputStream(encodedText);
	}

	/**
	 * Checks whether the paragraph structure of the given TextBlock seems
	 * sane and repairs it if not: if the block has more than one paragraph
	 * and the average text length per paragraph is below
	 * {@link #MINIMUMPARSIZE}, all Text objects are moved into one single
	 * paragraph that replaces the previous ones.
	 *
	 * @param outBlock a TextBlock.
	 */
	private static void paragraphSanityCheck(TextBlock outBlock) {
		final int paragraphCount = outBlock.content.size();
		if (paragraphCount <= 1) {
			//a single paragraph (or none) can not be merged any further.
			return;
		}
		//sum up the text length of all paragraphs inside the block.
		int totalLength = 0;
		for (final Paragraph paragraph : outBlock.content) {
			for (final Text run : paragraph.content) {
				totalLength += run.getText().length();
			}
		}
		final double meanLength = ((double) totalLength) / ((double) paragraphCount);
		if (meanLength >= MINIMUMPARSIZE) {
			//paragraphs are long enough on average; keep the structure.
			return;
		}
		//fallback solution: the paragraphs are too small, so we collect all text
		//in one huge paragraph.
		final Paragraph merged = new Paragraph();
		for (final Paragraph paragraph : outBlock.content) {
			merged.content.addAll(paragraph.content);
		}
		outBlock.content.clear();
		outBlock.content.add(merged);
	}

	/**
	 * A PDFStreamEngine that runs over the content stream of one single page
	 * and hands every TextPosition it encounters to a PreTextBlock, which is
	 * split into the actual text blocks later on.
	 */
	private static class PDPagePreprocessor extends PDFStreamEngine {

		/**
		 * The properties path to make the PDFStreamEngine work.
		 */
		private static final String PROPERTIES_PATH
				= "org/apache/pdfbox/resources/PDFTextStripper.properties";
		/**
		 * The current page that is analyzed.
		 */
		private final PDPage page;
		/**
		 * The PreTextBlock that represents the page content in our
		 * PreProcessing step. We split this later on.
		 */
		private final PreTextBlock preTextBlock = new PreTextBlock();

		/**
		 * Creates a preprocessor for the given page.
		 *
		 * @param page the page that shall be analyzed.
		 * @throws IOException if the stream engine properties can not be
		 * loaded.
		 */
		public PDPagePreprocessor(PDPage page) throws IOException {
			super(ResourceLoader.loadProperties(PROPERTIES_PATH, true));
			this.page = page;
		}

		/**
		 * This starts the processing. A page without a content stream is
		 * left alone, which results in an empty PreTextBlock.
		 *
		 * @throws IOException if the content stream can not be processed.
		 */
		public void process() throws IOException {
			//PDFBox returns null here for pages that have no content stream;
			//there is nothing to process then.
			if (page.getContents() == null) {
				return;
			}
			//start the PDFStreamEngine
			processStream(page, page.findResources(), page.getContents().getStream());
		}

		/**
		 * Callback of the PDFStreamEngine for each text position found in
		 * the page content.
		 *
		 * @param text the text position that was found.
		 */
		@Override
		protected void processTextPosition(TextPosition text) {
			//add the TextPosition to the PreTextBlock. This does histogram management automatically.
			preTextBlock.addTextPosition(text);
		}

		/**
		 * @return the PreTextBlock containing all text positions collected
		 * so far.
		 */
		public PreTextBlock getPreTextBlock() {
			return preTextBlock;
		}
	}
}