All Downloads are FREE. Search and download functionalities are using the official Maven repository.

de.citec.scie.pdf.TextBlockRankEstimator Maven / Gradle / Ivy

/*
 * SCIE -- Spinal Cord Injury Information Extraction
 * Copyright (C) 2013, 2014
 * Raphael Dickfelder, Jan Göpfert, Benjamin Paaßen, Andreas Stöckel
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package de.citec.scie.pdf;

import de.citec.scie.pdf.structure.TextBlock;
import java.util.ArrayList;

/**
 * This estimator has the purpose to determine if a TextBlock has a larger usual
 * Font Size as the usual Font Size for the whole page, an equal or a smaller
 * one. A larger one is interpreted as a header, a smaller one as a footnote, an
 * equal one as body text.
 *
 * @author Benjamin Paassen - bpaassen(at)techfak.uni-bielefeld.de
 */
public class TextBlockRankEstimator {

	private final Histogramm pageFontSizeHisto = new Histogramm<>();
	private final ArrayList blocks = new ArrayList<>();
	private final ArrayList> blockFontSizeHistos = new ArrayList<>();

	public TextBlockRankEstimator() {
	}

	public void addBlock(final TextBlock block, final PreTextBlock preTextBlock) {
		blocks.add(block);
		final Histogramm blockFontSizeHisto = new Histogramm<>();
		for (final PreTextLine line : preTextBlock.lines) {
			blockFontSizeHisto.addAll(line.fontSizeHisto);
		}
		blockFontSizeHistos.add(blockFontSizeHisto);
		pageFontSizeHisto.addAll(blockFontSizeHisto);
	}

	/**
	 * Returns the relativ font size of this block in relation to the whole
	 * page.
	 *
	 * @param block a textBlock
	 * @return the relativ font size of this block in relation to the whole
	 * page.
	 */
	public double getRelativeFontSize(final TextBlock block) {
		final int blockIdx = blocks.indexOf(block);
		if (blockIdx == -1) {
			return -1;
		}
		final Histogramm blockFontSizeHisto = blockFontSizeHistos.get(blockIdx);
		final double blockFontSize = blockFontSizeHisto.getMaxElement();
		final double pageFontSize = pageFontSizeHisto.getMaxElement();
		return blockFontSize / pageFontSize;
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy