// de.citec.scie.pdf.PreTextBlock (Maven / Gradle / Ivy)
/*
* SCIE -- Spinal Cord Injury Information Extraction
* Copyright (C) 2013, 2014
* Raphael Dickfelder, Jan Göpfert, Benjamin Paaßen, Andreas Stöckel
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package de.citec.scie.pdf;
import java.util.ArrayDeque;
import java.util.ArrayList;
import org.apache.pdfbox.util.TextPosition;
/**
 * A PreTextBlock represents a ThreadBead with some additional information. A
 * ThreadBead is a TextBlock on a Page in a PDF document. Please note that this
 * might look like a very useful concept to structure the page, but not all PDFs
 * use it. So it might very well be the case that there are more text blocks on
 * the page than there are ThreadBeads. Thus this is called a "PreTextBlock".
 *
 * @author Benjamin Paassen - bpaassen(at)techfak.uni-bielefeld.de
 */
public class PreTextBlock {

    /**
     * Minimum average number of characters per suggested block for a split
     * suggestion to be considered sane.
     */
    public static final int MINIMUMBLOCKSIZE = 150;

    /**
     * Tolerance factor of the gap criterion: a line gap counts as unusual if
     * it differs from the most common gap by more than this factor (in either
     * direction).
     */
    private static final float YTOL = 3f;

    /** The lines of this block, in the order in which they were added. */
    public final ArrayList<PreTextLine> lines = new ArrayList<>();

    /** Histogram of the vertical distances between consecutive lines. */
    public final Histogramm<Float> yDistHisto = new Histogramm<>();

    /**
     * Histogram of the values returned by {@code PreTextLine.length()}.
     *
     * NOTE(review): the element type of this histogram depends on the return
     * type of {@code PreTextLine.length()}, which is not visible from this
     * file, so the declaration is deliberately left without a type argument.
     * Please add the matching type argument (presumably Integer or Float).
     */
    public final Histogramm lengthHisto = new Histogramm<>();

    /** Creates an empty block without any lines. */
    public PreTextBlock() {
    }

    /**
     * Counts the content elements of all lines of this block.
     *
     * @return the sum of {@code content.size()} over all lines.
     */
    public int getSize() {
        int size = 0;
        for (final PreTextLine line : lines) {
            size += line.content.size();
        }
        return size;
    }

    /**
     * Adds a single text position to this block. The text position is
     * appended to the last line if that line accepts it (according to
     * {@code PreTextLine.isPartOfLine}); otherwise the last line is closed
     * and a new line is started.
     *
     * @param text the text position to add.
     */
    public void addTextPosition(final TextPosition text) {
        PreTextLine line;
        if (lines.isEmpty()) {
            line = new PreTextLine();
            lines.add(line);
        } else {
            line = lines.get(lines.size() - 1);
            if (!line.isPartOfLine(text)) {
                if (lines.size() > 1) {
                    //the last line is complete now, so record its distance
                    //to the line before it.
                    final PreTextLine lastLine = lines.get(lines.size() - 2);
                    final float lastLineY = lastLine.yHisto.getMaxElement();
                    final float lineY = line.yHisto.getMaxElement();
                    yDistHisto.addDataPoint(lineY - lastLineY);
                }
                line.setX_End();
                line = new PreTextLine();
                lines.add(line);
            }
        }
        //note that the length is recorded before the new element is added.
        lengthHisto.addDataPoint(line.length());
        line.addElement(text);
    }

    /**
     * Appends a complete line to this block and updates the length histogram
     * and - if there is a preceding line - the line distance histogram.
     *
     * @param line the line to append.
     */
    public void addLine(final PreTextLine line) {
        lengthHisto.addDataPoint(line.length());
        if (!lines.isEmpty()) {
            final PreTextLine lastLine = lines.get(lines.size() - 1);
            final float lastLineY = lastLine.yHisto.getMaxElement();
            final float lineY = line.yHisto.getMaxElement();
            yDistHisto.addDataPoint(lineY - lastLineY);
        }
        lines.add(line);
    }

    /**
     * This is supposed to split a TextBlock representing a whole page into
     * different blocks that might represent
     *
     * <ul>
     * <li>columns in a two-column text</li>
     * <li>Headings</li>
     * <li>Foot notes</li>
     * <li>Tables and figures</li>
     * <li>The document abstract</li>
     * <li>etc.</li>
     * </ul>
     *
     * Three split suggestions are computed (by font, by font size and by line
     * gap). The suggestions that pass the sanity check are then merged
     * greedily. If no blocks result from that, the returned list contains
     * just this block.
     *
     * @return a list of PreTextBlocks that are a split of this one. The list
     * is empty if this block has no lines.
     */
    public ArrayList<PreTextBlock> split() {
        if (lines.isEmpty()) {
            //we return an empty list if we have no lines because in that
            //case this block is definitely not interesting.
            return new ArrayList<>();
        }
        //we only want to use split suggestions that seem sane.
        final int totalSize = getSize();
        final ArrayList<ArrayDeque<PreTextBlock>> splits = new ArrayList<>();
        final ArrayDeque<PreTextBlock> fontSplit = splitByFont();
        if (isSaneSplitSuggestion(fontSplit, totalSize)) {
            splits.add(fontSplit);
        }
        final ArrayDeque<PreTextBlock> fontSizeSplit = splitByFontSize();
        if (isSaneSplitSuggestion(fontSizeSplit, totalSize)) {
            splits.add(fontSizeSplit);
        }
        final ArrayDeque<PreTextBlock> gapSplit = splitByGap();
        if (isSaneSplitSuggestion(gapSplit, totalSize)) {
            splits.add(gapSplit);
        }
        //if there are split suggestions at all, start the algorithm.
        final ArrayList<PreTextBlock> actualSplit = splits.isEmpty()
                ? new ArrayList<PreTextBlock>()
                : mergeSplitSuggestions(splits);
        if (actualSplit.isEmpty()) {
            actualSplit.add(this);
        }
        return actualSplit;
    }

    /**
     * Font Criterion: We start with the most usual font of the first line and
     * check if we find a line of text that has another usual font. If so we
     * start a new block from it and keep looking for other font changes.
     * Must only be called if this block has at least one line.
     *
     * @return the suggested blocks in line order; empty if the first line
     * has no usable font information.
     */
    private ArrayDeque<PreTextBlock> splitByFont() {
        final ArrayDeque<PreTextBlock> fontSplit = new ArrayDeque<>();
        final PreTextLine firstLine = lines.get(0);
        if (firstLine.fontHisto.getBackingMap().isEmpty()) {
            return fontSplit;
        }
        String referenceFont = firstLine.fontHisto.getMaxElement();
        //the referenceFont might be null if the font information is not well
        //defined in the PDF (might be the case with OCR).
        if (referenceFont == null) {
            return fontSplit;
        }
        PreTextBlock newBlock = new PreTextBlock();
        for (final PreTextLine line : lines) {
            if (!line.fontHisto.getBackingMap().isEmpty()) {
                final String currentFont = line.fontHisto.getMaxElement();
                //for the same reason the font of any later line might be
                //null; such a line is treated like a line without font
                //information and never starts a new block.
                if (currentFont != null && !currentFont.equals(referenceFont)) {
                    //if the font changes, create a new block.
                    fontSplit.add(newBlock);
                    newBlock = new PreTextBlock();
                    referenceFont = currentFont;
                }
            }
            newBlock.addLine(line);
        }
        fontSplit.add(newBlock);
        return fontSplit;
    }

    /**
     * Font Size Criterion: We start with the most usual font size of the
     * first line and check if we find a line of text that has another usual
     * font size. If so we start a new block from it and keep looking for
     * other font size changes. Must only be called if this block has at
     * least one line.
     *
     * @return the suggested blocks in line order; empty if the first line
     * has no font size information.
     */
    private ArrayDeque<PreTextBlock> splitByFontSize() {
        final ArrayDeque<PreTextBlock> fontSizeSplit = new ArrayDeque<>();
        final PreTextLine firstLine = lines.get(0);
        if (firstLine.fontSizeHisto.getBackingMap().isEmpty()) {
            return fontSizeSplit;
        }
        float referenceFontSize = firstLine.fontSizeHisto.getMaxElement();
        PreTextBlock newBlock = new PreTextBlock();
        for (final PreTextLine line : lines) {
            if (!line.fontSizeHisto.getBackingMap().isEmpty()) {
                final float currentFontSize = line.fontSizeHisto.getMaxElement();
                if (currentFontSize != referenceFontSize) {
                    //if the font size changes, create a new block.
                    fontSizeSplit.add(newBlock);
                    newBlock = new PreTextBlock();
                    referenceFontSize = currentFontSize;
                }
            }
            newBlock.addLine(line);
        }
        fontSizeSplit.add(newBlock);
        return fontSizeSplit;
    }

    /**
     * Gap Criterion: We start a new block each time the vertical distance to
     * the previous line differs significantly from the most usual line
     * distance of this block. Must only be called if this block has at least
     * one line.
     *
     * @return the suggested blocks in line order; empty if no line distances
     * have been recorded for this block.
     */
    private ArrayDeque<PreTextBlock> splitByGap() {
        final ArrayDeque<PreTextBlock> gapSplit = new ArrayDeque<>();
        if (yDistHisto.getBackingMap().isEmpty()) {
            return gapSplit;
        }
        final float referenceGap = yDistHisto.getMaxElement();
        float lastY = lines.get(0).yHisto.getMaxElement();
        PreTextBlock newBlock = new PreTextBlock();
        newBlock.addLine(lines.get(0));
        for (int i = 1; i < lines.size(); i++) {
            final float currentY = lines.get(i).yHisto.getMaxElement();
            final float currentGap = currentY - lastY;
            if (currentGap > YTOL * referenceGap || referenceGap > YTOL * currentGap) {
                //if the gap is unusually large or unusually small (which
                //includes negative gaps), start a new block.
                gapSplit.add(newBlock);
                newBlock = new PreTextBlock();
            }
            newBlock.addLine(lines.get(i));
            lastY = currentY;
        }
        gapSplit.add(newBlock);
        return gapSplit;
    }

    /**
     * Finds a compromise between all split suggestions. We want to maximize
     * the number of blocks that remain, but we are not allowed to have
     * overlapping blocks. So we use a greedy approach, always taking the
     * shortest suggested block that starts at the current line.
     *
     * @param splits the split suggestions; each one is a queue of blocks in
     * line order. The queues are consumed by this method.
     * @return the merged list of blocks; empty if the suggestions turned out
     * to be inconsistent.
     */
    private ArrayList<PreTextBlock> mergeSplitSuggestions(
            final ArrayList<ArrayDeque<PreTextBlock>> splits) {
        //the actual output.
        final ArrayList<PreTextBlock> actualSplit = new ArrayList<>();
        //the current starting lines for each suggestion.
        final ArrayList<Integer> startLines = new ArrayList<>();
        for (int i = 0; i < splits.size(); i++) {
            startLines.add(0);
        }
        //a variable to denote at which line our next block has to start.
        int currentLine = 0;
        while (currentLine < lines.size()) {
            //look for the split suggestion that has a matching start and
            //the shortest block size.
            int minSize = Integer.MAX_VALUE;
            PreTextBlock minBlock = null;
            for (int i = 0; i < splits.size(); i++) {
                if (startLines.get(i) != currentLine) {
                    continue;
                }
                //peekFirst() returns null for an exhausted suggestion.
                final PreTextBlock candidate = splits.get(i).peekFirst();
                if (candidate != null && candidate.lines.size() < minSize) {
                    minSize = candidate.lines.size();
                    minBlock = candidate;
                }
            }
            if (minBlock == null) {
                //defensive: no suggestion continues at the current line.
                //Instead of adding null to the output and overflowing the
                //line counter we give up; the caller then falls back to the
                //unsplit block.
                actualSplit.clear();
                break;
            }
            //put the respective block into the output list.
            actualSplit.add(minBlock);
            currentLine += minSize;
            //and poll the first block from each queue until we are at the
            //current line or after it.
            for (int i = 0; i < splits.size(); i++) {
                final ArrayDeque<PreTextBlock> suggestion = splits.get(i);
                int currentStart = startLines.get(i);
                while (currentStart < currentLine && !suggestion.isEmpty()) {
                    currentStart += suggestion.poll().lines.size();
                }
                startLines.set(i, currentStart);
            }
        }
        return actualSplit;
    }

    /**
     * The first sanity check is that we have something to split at all (there
     * is more than one suggested block). The second sanity check is that the
     * suggested blocks should - on average - not contain less than
     * MINIMUMBLOCKSIZE characters.
     *
     * @param splitSuggestion a split suggestion.
     * @param referenceSize the size of the TextBlock that is split.
     * @return true if the suggestion seems sane.
     */
    private static boolean isSaneSplitSuggestion(
            final ArrayDeque<PreTextBlock> splitSuggestion,
            final int referenceSize) {
        if (splitSuggestion.size() < 2) {
            return false;
        }
        final double avgSplitSize = (double) referenceSize / (double) splitSuggestion.size();
        return avgSplitSize >= MINIMUMBLOCKSIZE;
    }
}