// de.citec.scie.pdf.PreTextBlock Maven / Gradle / Ivy

/*
 * SCIE -- Spinal Cord Injury Information Extraction
 * Copyright (C) 2013, 2014
 * Raphael Dickfelder, Jan Göpfert, Benjamin Paaßen, Andreas Stöckel
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package de.citec.scie.pdf;

import java.util.ArrayDeque;
import java.util.ArrayList;
import org.apache.pdfbox.util.TextPosition;

/**
 * A PreTextBlock represents a ThreadBead with some additional information. A
 * ThreadBead is a TextBlock on a Page in a PDF document. Please note that this
 * might look like a very useful concept to structure the page but not all PDFs
 * use it. So it might very well be the case, that there are more text blocks on
 * the page than there are ThreadBeads. Thus this is called a "PreTextBlock".
 *
 * @author Benjamin Paassen - bpaassen(at)techfak.uni-bielefeld.de
 */
public class PreTextBlock {

	/**
	 * Minimum average block size a split suggestion must reach to be
	 * considered sane (see {@link #isSaneSplitSuggestion}). The size is
	 * measured in the same unit as {@link #getSize()}.
	 */
	public static final int MINIMUMBLOCKSIZE = 150;

	/**
	 * Tolerance factor of the gap criterion: a line gap that is more than
	 * YTOL times larger or more than YTOL times smaller than the reference
	 * gap starts a new block.
	 */
	private static final float YTOL = 3f;

	/** The lines of this block in the order in which they were added. */
	public final ArrayList<PreTextLine> lines = new ArrayList<>();
	/** Histogram of the vertical distances between consecutive lines. */
	public final Histogramm<Float> yDistHisto = new Histogramm<>();
	/**
	 * Histogram of the line lengths as reported by PreTextLine.length().
	 */
	// NOTE(review): the element type of this histogram is not determinable
	// from this file (it depends on the return type of PreTextLine.length()),
	// so it is left raw here. Please restore the proper type argument.
	public final Histogramm lengthHisto = new Histogramm<>();

	public PreTextBlock() {
	}

	/**
	 * Sums up the number of content elements of all lines in this block.
	 *
	 * @return the total number of content elements in this block.
	 */
	public int getSize() {
		int size = 0;
		for (final PreTextLine line : lines) {
			size += line.content.size();
		}
		return size;
	}

	/**
	 * Appends a single TextPosition to this block. The text is added to the
	 * last line if that line accepts it (PreTextLine.isPartOfLine), otherwise
	 * the last line is closed and a new line is started.
	 *
	 * @param text the TextPosition to append.
	 */
	public void addTextPosition(final TextPosition text) {
		PreTextLine line;
		if (lines.isEmpty()) {
			line = new PreTextLine();
			lines.add(line);
		} else {
			line = lines.get(lines.size() - 1);
			if (!line.isPartOfLine(text)) {
				if (lines.size() > 1) {
					//the current line is complete now, so record its distance
					//to the line before it.
					final PreTextLine lastLine = lines.get(lines.size() - 2);
					final float lastLineY = lastLine.yHisto.getMaxElement();
					final float lineY = line.yHisto.getMaxElement();
					final float lineYDist = lineY - lastLineY;
					yDistHisto.addDataPoint(lineYDist);
				}
				line.setX_End();
				line = new PreTextLine();
				lines.add(line);
			}
		}
		//NOTE(review): the length is recorded once per TextPosition and before
		//the element is added, so partial line lengths end up in the histogram.
		//Kept as is because callers may rely on it - please confirm the intent.
		lengthHisto.addDataPoint(line.length());
		line.addElement(text);
	}

	/**
	 * Appends a complete line to this block and updates the length histogram
	 * and (if there is a previous line) the line distance histogram.
	 *
	 * @param line the line to append.
	 */
	public void addLine(final PreTextLine line) {
		lengthHisto.addDataPoint(line.length());
		if (!lines.isEmpty()) {
			final PreTextLine lastLine = lines.get(lines.size() - 1);
			final float lastLineY = lastLine.yHisto.getMaxElement();
			final float lineY = line.yHisto.getMaxElement();
			final float lineYDist = lineY - lastLineY;
			yDistHisto.addDataPoint(lineYDist);
		}
		lines.add(line);
	}

	/**
	 * This is supposed to split a TextBlock representing a whole page into
	 * different blocks that might represent
	 *
	 * <ul>
	 * <li>columns in a two-column text</li>
	 * <li>Headings</li>
	 * <li>Foot notes</li>
	 * <li>Tables and figures</li>
	 * <li>The document abstract</li>
	 * <li>etc.</li>
	 * </ul>
	 *
	 * Three split suggestions (by font, by font size and by line gap) are
	 * computed independently and then merged greedily.
	 *
	 * @return a list of PreTextBlocks that are a split of this one. The list
	 * is empty if this block has no lines and contains only this block if no
	 * sane split suggestion was found.
	 */
	public ArrayList<PreTextBlock> split() {
		if (lines.isEmpty()) {
			//we return an empty list if we have no lines because in that case this block is
			//definitely not interesting.
			return new ArrayList<>();
		}
		final ArrayList<PreTextBlock> actualSplit = mergeSplitSuggestions(
				splitByFont(), splitByFontSize(), splitByGap());
		if (actualSplit.isEmpty()) {
			actualSplit.add(this);
		}
		return actualSplit;
	}

	/**
	 * Font Criterion: We start with the most usual font for the first line
	 * and check if we find a line of text that has another usual font. If so
	 * we start a new TextBlock from it and keep looking for other font
	 * changes. Must only be called if this block has at least one line.
	 *
	 * @return the suggested blocks in line order; empty if the first line has
	 * no usable font information.
	 */
	private ArrayDeque<PreTextBlock> splitByFont() {
		final ArrayDeque<PreTextBlock> fontSplit = new ArrayDeque<>();
		final PreTextLine firstLine = lines.get(0);
		if (firstLine.fontHisto.getBackingMap().isEmpty()) {
			return fontSplit;
		}
		String referenceFont = firstLine.fontHisto.getMaxElement();
		//the referenceFont might be null if the font information is not well defined in the PDF
		//(might be the case with OCR).
		if (referenceFont == null) {
			return fontSplit;
		}
		PreTextBlock newBlock = new PreTextBlock();
		for (final PreTextLine line : lines) {
			if (!line.fontHisto.getBackingMap().isEmpty()) {
				final String currentFont = line.fontHisto.getMaxElement();
				//a null font carries no information (see above), so it must
				//neither trigger a split nor become the new reference.
				if (currentFont != null && !currentFont.equals(referenceFont)) {
					//if the font changes, create a new block.
					fontSplit.add(newBlock);
					newBlock = new PreTextBlock();
					referenceFont = currentFont;
				}
			}
			newBlock.addLine(line);
		}
		fontSplit.add(newBlock);
		return fontSplit;
	}

	/**
	 * Font Size Criterion: We start with the most usual font size for the
	 * first line and check if we find a line of text that has another usual
	 * font size. If so we start a new TextBlock from it and keep looking for
	 * other font size changes. Must only be called if this block has at least
	 * one line.
	 *
	 * @return the suggested blocks in line order; empty if the first line has
	 * no font size information.
	 */
	private ArrayDeque<PreTextBlock> splitByFontSize() {
		final ArrayDeque<PreTextBlock> fontSizeSplit = new ArrayDeque<>();
		final PreTextLine firstLine = lines.get(0);
		if (firstLine.fontSizeHisto.getBackingMap().isEmpty()) {
			return fontSizeSplit;
		}
		float referenceFontSize = firstLine.fontSizeHisto.getMaxElement();
		PreTextBlock newBlock = new PreTextBlock();
		for (final PreTextLine line : lines) {
			if (!line.fontSizeHisto.getBackingMap().isEmpty()) {
				final float currentFontSize = line.fontSizeHisto.getMaxElement();
				if (currentFontSize != referenceFontSize) {
					//if the font size changes, create a new block.
					fontSizeSplit.add(newBlock);
					newBlock = new PreTextBlock();
					referenceFontSize = currentFontSize;
				}
			}
			newBlock.addLine(line);
		}
		fontSizeSplit.add(newBlock);
		return fontSizeSplit;
	}

	/**
	 * Gap Criterion: We start a new block each time there is a significant
	 * gap, i.e. a line distance that deviates from the most usual line
	 * distance of this block by more than the factor YTOL in either
	 * direction. Must only be called if this block has at least one line.
	 *
	 * @return the suggested blocks in line order; empty if no line distances
	 * have been recorded for this block.
	 */
	private ArrayDeque<PreTextBlock> splitByGap() {
		final ArrayDeque<PreTextBlock> gapSplit = new ArrayDeque<>();
		if (yDistHisto.getBackingMap().isEmpty()) {
			return gapSplit;
		}
		final float referenceGap = yDistHisto.getMaxElement();
		float lastY = lines.get(0).yHisto.getMaxElement();
		PreTextBlock newBlock = new PreTextBlock();
		newBlock.addLine(lines.get(0));
		for (int i = 1; i < lines.size(); i++) {
			final float currentY = lines.get(i).yHisto.getMaxElement();
			final float currentGap = currentY - lastY;
			if (currentGap > YTOL * referenceGap || referenceGap > YTOL * currentGap) {
				//if the gap is unusually large (or small), start a new block.
				gapSplit.add(newBlock);
				newBlock = new PreTextBlock();
			}
			newBlock.addLine(lines.get(i));
			lastY = currentY;
		}
		gapSplit.add(newBlock);
		return gapSplit;
	}

	/**
	 * Finds a compromise between all criteria. We want to maximize the number
	 * of TextBlocks that remains, but we are not allowed to have overlapping
	 * TextBlocks. So we use a greedy approach taking always the shortest
	 * block found. Only suggestions that pass
	 * {@link #isSaneSplitSuggestion} take part. The given queues are
	 * consumed by this method.
	 *
	 * @param fontSplit the suggestion of the font criterion.
	 * @param fontSizeSplit the suggestion of the font size criterion.
	 * @param gapSplit the suggestion of the gap criterion.
	 * @return the merged split; empty if no suggestion was sane.
	 */
	private ArrayList<PreTextBlock> mergeSplitSuggestions(
			final ArrayDeque<PreTextBlock> fontSplit,
			final ArrayDeque<PreTextBlock> fontSizeSplit,
			final ArrayDeque<PreTextBlock> gapSplit) {
		//the actual output.
		final ArrayList<PreTextBlock> actualSplit = new ArrayList<>();
		//the different split suggestions.
		final ArrayList<ArrayDeque<PreTextBlock>> splits = new ArrayList<>();
		//the current starting lines for each suggestion.
		final ArrayList<Integer> startLines = new ArrayList<>();
		/*
		 * only want to use split suggestions that seem sane.
		 */
		final int totalSize = getSize();
		if (isSaneSplitSuggestion(fontSplit, totalSize)) {
			splits.add(fontSplit);
			startLines.add(0);
		}
		if (isSaneSplitSuggestion(fontSizeSplit, totalSize)) {
			splits.add(fontSizeSplit);
			startLines.add(0);
		}
		if (isSaneSplitSuggestion(gapSplit, totalSize)) {
			splits.add(gapSplit);
			startLines.add(0);
		}
		if (splits.isEmpty()) {
			return actualSplit;
		}

		//denotes at which line our next block has to start.
		int currentLine = 0;
		while (currentLine < lines.size()) {
			//look for the split suggestion that has a matching start and the shortest block size.
			int minSize = Integer.MAX_VALUE;
			PreTextBlock minBlock = null;
			for (int i = 0; i < splits.size(); i++) {
				final int start = startLines.get(i);
				if (start != currentLine) {
					continue;
				}
				//peekFirst yields null for an exhausted suggestion.
				final PreTextBlock candidate = splits.get(i).peekFirst();
				if (candidate != null && candidate.lines.size() < minSize) {
					minSize = candidate.lines.size();
					minBlock = candidate;
				}
			}

			if (minBlock == null || minSize == 0) {
				//Defensive fallback: no suggestion offers a non-empty block that
				//starts here. Without this guard a null entry would be emitted
				//and currentLine would overflow (or never advance). Keep the
				//remaining lines together in one final block instead.
				final PreTextBlock rest = new PreTextBlock();
				for (int i = currentLine; i < lines.size(); i++) {
					rest.addLine(lines.get(i));
				}
				actualSplit.add(rest);
				break;
			}

			//put the respective block into the output list.
			actualSplit.add(minBlock);
			currentLine += minSize;
			//and poll the first block from each queue until we are at the current line or after it.
			for (int i = 0; i < splits.size(); i++) {
				final ArrayDeque<PreTextBlock> suggestion = splits.get(i);
				int currentStart = startLines.get(i);
				while (currentStart < currentLine && !suggestion.isEmpty()) {
					currentStart += suggestion.poll().lines.size();
				}
				startLines.set(i, currentStart);
			}
		}
		return actualSplit;
	}

	/**
	 * The first sanity check is that we have something to split at all (there
	 * is more than one suggested block). The second sanity check is that the
	 * suggested blocks should - on average - not contain less than
	 * MINIMUMBLOCKSIZE characters.
	 *
	 * @param splitSuggestion a split suggestion.
	 * @param referenceSize the size of the TextBlock that is split.
	 * @return true if the suggestion seems sane.
	 */
	private static boolean isSaneSplitSuggestion(
			final ArrayDeque<PreTextBlock> splitSuggestion,
			final int referenceSize) {
		if (splitSuggestion.size() < 2) {
			return false;
		}
		final double avgSplitSize = (double) referenceSize / (double) splitSuggestion.size();
		return avgSplitSize >= MINIMUMBLOCKSIZE;
	}
}