All Downloads are FREE. Search and download functionalities are using the official Maven repository.

de.citec.scie.pdf.PreTextBlock Maven / Gradle / Ivy

/*
 * SCIE -- Spinal Cord Injury Information Extraction
 * Copyright (C) 2013, 2014
 * Raphael Dickfelder, Jan Göpfert, Benjamin Paaßen, Andreas Stöckel
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package de.citec.scie.pdf;

import java.util.ArrayDeque;
import java.util.ArrayList;
import org.apache.pdfbox.util.TextPosition;

/**
 * A PreTextBlock represents a ThreadBead with some additional information. A
 * ThreadBead is a TextBlock on a Page in a PDF document. Please note that this
 * might look like a very useful concept to structure the page but not all PDFs
 * use it. So it might very well be the case, that there are more text blocks on
 * the page than there are ThreadBeads. Thus this is called a "PreTextBlock".
 *
 * @author Benjamin Paassen - bpaassen(at)techfak.uni-bielefeld.de
 */
public class PreTextBlock {

	public final ArrayList lines = new ArrayList<>();
	public final Histogramm yDistHisto = new Histogramm<>();
	public final Histogramm lengthHisto = new Histogramm<>();

	public PreTextBlock() {
	}

	public int getSize() {
		int size = 0;
		for (final PreTextLine line : lines) {
			size += line.content.size();
		}
		return size;
	}

	public void addTextPosition(final TextPosition text) {
		PreTextLine line;
		if (lines.isEmpty()) {
			line = new PreTextLine();
			lines.add(line);
		} else {
			line = lines.get(lines.size() - 1);
			if (!line.isPartOfLine(text)) {
				if (lines.size() > 1) {
					final PreTextLine lastLine = lines.get(lines.size() - 2);
					final float lastLineY = lastLine.yHisto.getMaxElement();
					final float lineY = line.yHisto.getMaxElement();
					final float lineYDist = lineY - lastLineY;
					yDistHisto.addDataPoint(lineYDist);
				}
				line.setX_End();
				line = new PreTextLine();
				lines.add(line);
			}
		}
		lengthHisto.addDataPoint(line.length());
		line.addElement(text);
	}

	public void addLine(final PreTextLine line) {
		lengthHisto.addDataPoint(line.length());
		if (!lines.isEmpty()) {
			final PreTextLine lastLine = lines.get(lines.size() - 1);
			final float lastLineY = lastLine.yHisto.getMaxElement();
			final float lineY = line.yHisto.getMaxElement();
			final float lineYDist = lineY - lastLineY;
			yDistHisto.addDataPoint(lineYDist);
		}
		lines.add(line);
	}
	private static final float YTOL = 3f;

	/**
	 * This is supposed to split a TextBlock representing a whole page into
	 * different blocks that might represent
	 *
	 * 
    *
  • columns in a two-column text
  • *
  • Headings
  • *
  • Foot notes
  • *
  • Tables and figures
  • *
  • The document abstract
  • *
  • etc.
  • *
* * @return a list of PreTextBlocks that are a split of this one. */ public ArrayList split() { if (lines.isEmpty()) { //we return an empty list if we have new lines because in that case this block is //definitely not interesting. return new ArrayList<>(); } /* * Font Criterion: We start with the most usual font for the first line * and check if we find a line of text that has another usual font. * If so we start a new TextBlock from it and keep looking for other * font changes. */ PreTextBlock newBlock = new PreTextBlock(); final ArrayDeque fontSplit = new ArrayDeque<>(); if (!lines.get(0).fontHisto.getBackingMap().isEmpty()) { String referenceFont = lines.get(0).fontHisto.getMaxElement(); //the referenceFont might be null if the font information is not well defined in the PDF //(might be the case with OCR). if (referenceFont != null) { for (final PreTextLine line : lines) { if (!line.fontHisto.getBackingMap().isEmpty()) { final String currentFont = line.fontHisto.getMaxElement(); if (!currentFont.equals(referenceFont)) { //if the font changes, create a new block. fontSplit.add(newBlock); newBlock = new PreTextBlock(); referenceFont = currentFont; } } newBlock.addLine(line); } fontSplit.add(newBlock); } } /* * Font Size Criterion: We start with the most usual font size for the * first line and check if we find a line of text that has another usual * font size. If so we start a new TextBlock from it and keep looking * for other font size changes. */ final ArrayDeque fontSizeSplit = new ArrayDeque<>(); if (!lines.get(0).fontSizeHisto.getBackingMap().isEmpty()) { float referenceFontSize = lines.get(0).fontSizeHisto.getMaxElement(); newBlock = new PreTextBlock(); for (final PreTextLine line : lines) { if (!line.fontSizeHisto.getBackingMap().isEmpty()) { final float currentFontSize = line.fontSizeHisto.getMaxElement(); if (currentFontSize != referenceFontSize) { //if the font size changes, create a new block. 
fontSizeSplit.add(newBlock); newBlock = new PreTextBlock(); referenceFontSize = currentFontSize; } } newBlock.addLine(line); } fontSizeSplit.add(newBlock); } /* * Gap Criterion: We start a new block each time there is a * significant gap. */ final ArrayDeque gapSplit = new ArrayDeque<>(); if (!yDistHisto.getBackingMap().isEmpty()) { final float referenceGap = yDistHisto.getMaxElement(); float lastY = lines.get(0).yHisto.getMaxElement(); newBlock = new PreTextBlock(); newBlock.addLine(lines.get(0)); for (int i = 1; i < lines.size(); i++) { final float currentY = lines.get(i).yHisto.getMaxElement(); final float currentGap = currentY - lastY; if (currentGap > YTOL * referenceGap || referenceGap > YTOL * currentGap) { //if the gap is unusually large, start a new block. gapSplit.add(newBlock); newBlock = new PreTextBlock(); } newBlock.addLine(lines.get(i)); lastY = currentY; } gapSplit.add(newBlock); } /* * Now we have to find a compromise between all criteria. We want to * maximize the number of TextBlocks that remains, but we are not * allowed to have overlapping TextBlocks. So we use a greedy * approach taking always the shortest block found. */ //the actual output. final ArrayList actualSplit = new ArrayList<>(); //a variable to denote at which line our next block has to start. int currentLine = 0; //the different split suggestions. final ArrayList< ArrayDeque> splits = new ArrayList<>(); //the current starting lines for each suggestion. ArrayList startLines = new ArrayList<>(); /* * only want to use split suggestions that seem sane. */ final int totalSize = getSize(); if (isSaneSplitSuggestion(fontSplit, totalSize)) { splits.add(fontSplit); startLines.add(0); } if (isSaneSplitSuggestion(fontSizeSplit, totalSize)) { splits.add(fontSizeSplit); startLines.add(0); } if (isSaneSplitSuggestion(gapSplit, totalSize)) { splits.add(gapSplit); startLines.add(0); } //if there are split suggestions at all, start the algorithm. 
if (!splits.isEmpty()) { while (currentLine < lines.size()) { //look for the split suggestion that has a matching start and the shortest block size. int minSize = Integer.MAX_VALUE; PreTextBlock minBlock = null; for (int i = 0; i < splits.size(); i++) { if (!splits.get(i).isEmpty()) { final int start = startLines.get(i); if (start == currentLine) { final PreTextBlock currentBlockSuggestion = splits.get(i). peekFirst(); if (currentBlockSuggestion != null) { if (currentBlockSuggestion.lines.size() < minSize) { minSize = currentBlockSuggestion.lines.size(); minBlock = currentBlockSuggestion; } } } } } //if we have found that put the respective block into the output list. actualSplit.add(minBlock); currentLine += minSize; //and poll the first block from each queue until we are at the current line or after it. for (int i = 0; i < splits.size(); i++) { int currentStart = startLines.get(i); while (currentStart < currentLine && !splits.get(i).isEmpty()) { final PreTextBlock block = splits.get(i).poll(); currentStart += block.lines.size(); } startLines.set(i, currentStart); } } } if (actualSplit.isEmpty()) { actualSplit.add(this); } return actualSplit; } public static final int MINIMUMBLOCKSIZE = 150; /** * The first sanity check is that we have something to split at all (there * is more than one splitting point). The second sanity check is that the * suggested blocks should - on average - not contain less than * MINIMUMBLOCKSIZE characters. * * @param splitSuggestion a split suggestion. * @param referenceSize the size of the TextBlock that is split. * @return true if the suggestion seems sane. */ private boolean isSaneSplitSuggestion(ArrayDeque splitSuggestion, int referenceSize) { if (splitSuggestion.size() < 2) { return false; } final double avgSplitSize = (double) referenceSize / (double) splitSuggestion.size(); return avgSplitSize >= MINIMUMBLOCKSIZE; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy