All Downloads are FREE. Search and download functionalities are using the official Maven repository.

de.citec.scie.pdf.ParagraphEstimator Maven / Gradle / Ivy

/*
 * SCIE -- Spinal Cord Injury Information Extraction
 * Copyright (C) 2013, 2014
 * Raphael Dickfelder, Jan Göpfert, Benjamin Paaßen, Andreas Stöckel
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package de.citec.scie.pdf;

/**
 * Decides, line by line, whether a line break also starts a new paragraph.
 * <p>
 * The estimator is stateful: it remembers the line handed to the previous call
 * of {@link #isNewParagraph(PreTextLine)} and judges every new line against
 * it. Each call therefore has to receive the line that follows the one from
 * the call before.
 *
 * @author Benjamin Paassen - bpaassen(at)techfak.uni-bielefeld.de
 */
public class ParagraphEstimator {

	/** A previous line shorter than this fraction of the current line signals a break. */
	private static final double LINEENDTOL = 0.8;
	/** Allowed rightward shift of the line start, as a fraction of the current line length. */
	private static final double XTOL = 0.03;
	/** Factor applied to the block's reference line distance before a gap counts as oversized. */
	private static final double YTOL = 1.2;
	/** Block whose {@code yDistHisto} provides the reference line distance. */
	private final PreTextBlock block;
	/** Line received by the most recent call; {@code null} until the first call. */
	private PreTextLine lastLine = null;

	/**
	 * Creates an estimator for the lines of the given block.
	 *
	 * @param block the block whose {@code yDistHisto} is consulted by the gap
	 *              criterion
	 */
	public ParagraphEstimator(PreTextBlock block) {
		this.block = block;
	}

	/**
	 * Reports whether the given line opens a new paragraph relative to the line
	 * passed to the previous call. The very first call has nothing to compare
	 * against and always answers {@code false}. The given line becomes the
	 * reference for the next call.
	 *
	 * @param line the line following the previously submitted one
	 * @return {@code true} if one of the paragraph criteria fires, otherwise
	 *         {@code false}
	 */
	public boolean isNewParagraph(final PreTextLine line) {
		final PreTextLine previous = lastLine;
		final boolean paragraphStart = previous != null && startsParagraphAfter(previous, line);
		// Remember the line only after the decision has been made, so that a
		// failing criterion leaves the stored reference line untouched.
		lastLine = line;
		return paragraphStart;
	}

	/**
	 * Applies the line length, line start and gap criteria, in that order.
	 *
	 * @param previous the line seen by the preceding call
	 * @param current  the line being judged
	 * @return {@code true} as soon as one criterion indicates a new paragraph
	 */
	private boolean startsParagraphAfter(final PreTextLine previous, final PreTextLine current) {
		final float previousLength = previous.length();
		final float currentLength = current.length();

		// Line length criterion: the previous line is much shorter than the
		// current one (below LINEENDTOL times its length).
		if (previousLength < LINEENDTOL * currentLength) {
			return true;
		}

		// Line start criterion: the current line begins further right than the
		// previous one by more than XTOL of the current line's length.
		// NOTE(review): a zero-length current line turns this into a float
		// division by zero (Infinity/NaN, no exception) - confirm whether such
		// lines can reach this point.
		final float relativeShift = (current.getX_start() - previous.getX_start()) / currentLength;
		if (relativeShift > XTOL) {
			return true;
		}

		return hasParagraphGap(previous, current);
	}

	/**
	 * Gap criterion: the vertical distance between the two lines is either
	 * larger than YTOL times the block's reference distance or negative. A
	 * negative gap is taken as a change of column in 2-column text. The check
	 * is skipped when either line has an empty {@code yHisto}.
	 *
	 * @param previous the line seen by the preceding call
	 * @param current  the line being judged
	 * @return {@code true} if the gap indicates a new paragraph
	 */
	private boolean hasParagraphGap(final PreTextLine previous, final PreTextLine current) {
		if (current.yHisto.getBackingMap().isEmpty()
				|| previous.yHisto.getBackingMap().isEmpty()) {
			return false;
		}
		final float gap = current.yHisto.getMaxElement() - previous.yHisto.getMaxElement();
		// presumably yDistHisto.getMaxElement() is the usual line distance of
		// the block - verify against the histogram implementation
		final boolean oversized = !block.yDistHisto.getBackingMap().isEmpty()
				&& gap > YTOL * block.yDistHisto.getMaxElement();
		return oversized || gap < 0;
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy