// de.citec.scie.pdf.ParagraphEstimator Maven / Gradle / Ivy
/*
* SCIE -- Spinal Cord Injury Information Extraction
* Copyright (C) 2013, 2014
* Raphael Dickfelder, Jan Göpfert, Benjamin Paaßen, Andreas Stöckel
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package de.citec.scie.pdf;
/**
 * Decides, line by line, whether a line break also marks the start of a new
 * paragraph.
 *
 * <p>The estimator is stateful: it remembers the line handed to the most
 * recent {@link #isNewParagraph(PreTextLine)} call and compares the next line
 * against it. Lines are therefore expected to be passed in reading order.
 *
 * @author Benjamin Paassen - bpaassen(at)techfak.uni-bielefeld.de
 */
public class ParagraphEstimator {

    /**
     * Length criterion: the previous line counts as "much shorter" when its
     * length is below this fraction of the current line's length.
     */
    private static final double LINE_END_TOL = 0.8;

    /**
     * Start criterion: largest rightward shift of the line start, expressed
     * as a fraction of the current line's length (i.e. 3%), that is still
     * treated as the same paragraph.
     */
    private static final double X_TOL = 0.03;

    /**
     * Gap criterion: a gap larger than this multiple of the block's
     * {@code yDistHisto} maximum element is treated as a paragraph break.
     */
    private static final double Y_TOL = 1.2;

    /** Block whose {@code yDistHisto} supplies the reference line gap. */
    private final PreTextBlock block;

    /** Line seen by the previous call; {@code null} until the first call. */
    private PreTextLine lastLine = null;

    /**
     * Creates an estimator for the lines of the given block.
     *
     * @param block the block whose {@code yDistHisto} is consulted by the gap
     *              criterion
     */
    public ParagraphEstimator(PreTextBlock block) {
        this.block = block;
    }

    /**
     * Reports whether the given line starts a new paragraph relative to the
     * line passed to the previous call, and then remembers the given line for
     * the next call.
     *
     * <p>The very first call always answers {@code false}, because there is
     * no previous line to compare against.
     *
     * @param line the current line
     * @return {@code true} if any of the length, start or gap criteria
     *         indicates a paragraph break; {@code false} otherwise
     */
    public boolean isNewParagraph(final PreTextLine line) {
        final PreTextLine previous = lastLine;
        final boolean paragraphBreak = previous != null && breaksParagraph(previous, line);
        // The remembered line is only advanced once the verdict has been
        // computed successfully.
        lastLine = line;
        return paragraphBreak;
    }

    /**
     * Applies the three heuristics, in order, to a pair of consecutive lines.
     *
     * @param previous the line seen before {@code current}
     * @param current  the line being classified
     * @return {@code true} as soon as one criterion fires
     */
    private boolean breaksParagraph(final PreTextLine previous, final PreTextLine current) {
        final float previousLength = previous.length();
        final float currentLength = current.length();

        // Length criterion: the previous line is much shorter than the
        // current one.
        if (previousLength < LINE_END_TOL * currentLength) {
            return true;
        }

        // Start criterion: the current line starts further right than the
        // previous one by more than 3% of the current line's length.
        final float startShift = (current.getX_start() - previous.getX_start()) / currentLength;
        if (startShift > X_TOL) {
            return true;
        }

        // Gap criterion: only applicable when both lines carry y-histogram
        // data.
        if (current.yHisto.getBackingMap().isEmpty()
                || previous.yHisto.getBackingMap().isEmpty()) {
            return false;
        }
        final float gap = current.yHisto.getMaxElement() - previous.yHisto.getMaxElement();

        // Bigger than the usual gap of the block (if the block has gap data).
        final boolean largerThanUsual = !block.yDistHisto.getBackingMap().isEmpty()
                && gap > Y_TOL * block.yDistHisto.getMaxElement();

        // A negative gap is taken as a change of column in 2-column text.
        return largerThanUsual || gap < 0;
    }
}