All downloads are free. The search and download features use the official Maven repository.

de.citec.scie.pdf.PreTextLine Maven / Gradle / Ivy

/*
 * SCIE -- Spinal Cord Injury Information Extraction
 * Copyright (C) 2013, 2014
 * Raphael Dickfelder, Jan Göpfert, Benjamin Paaßen, Andreas Stöckel
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package de.citec.scie.pdf;

import java.util.ArrayList;
import org.apache.pdfbox.util.TextPosition;

/**
 * This just aggregates all TextPosition objects that are part of one line. As
 * not every line break in the original PDF document has meaning for the
 * structure of the text we usually throw the line information away and just use
 * it for the processing. Thus this is called a "PreTextLine".
 *
 * @author Benjamin Paassen - bpaassen(at)techfak.uni-bielefeld.de
 */
public class PreTextLine {

	/** Direction-adjusted x coordinate of the first element added to this line. */
	private float x_start;
	/**
	 * Direction-adjusted x coordinate of the last element, as captured by the
	 * most recent call to {@link #setX_End()}. It is NOT updated by
	 * {@link #addElement(TextPosition)}.
	 */
	private float x_end;
	/** Collects the direction-adjusted y coordinate of every added element. */
	public final Histogramm<Float> yHisto = new Histogramm<>();
	/** Collects the font size (in pt) of every added element. */
	public final Histogramm<Float> fontSizeHisto = new Histogramm<>();
	/**
	 * Collects the font name of every added element that has both a font and a
	 * font descriptor.
	 */
	public final Histogramm<String> fontHisto = new Histogramm<>();
	/** All elements of this line, in the order in which they were added. */
	public final ArrayList<TextPosition> content = new ArrayList<>();

	/**
	 * Creates an empty line.
	 */
	public PreTextLine() {
	}

	/**
	 * Appends the given element to this line and records its y coordinate,
	 * font size and (if available) font name in the respective histograms.
	 * The first element added also defines the start x coordinate of the line.
	 *
	 * @param pos the element to append.
	 */
	public void addElement(final TextPosition pos) {
		if (content.isEmpty()) {
			x_start = pos.getXDirAdj();
		}
		content.add(pos);
		yHisto.addDataPoint(pos.getYDirAdj());
		fontSizeHisto.addDataPoint(pos.getFontSizeInPt());
		// Font information is optional; only record a name if one is reachable.
		if (pos.getFont() != null && pos.getFont().getFontDescriptor() != null) {
			fontHisto.addDataPoint(pos.getFont().getFontDescriptor().getFontName());
		}
	}

	/**
	 * @return the direction-adjusted x coordinate of the first element of this
	 * line (0 if no element has been added yet).
	 */
	public float getX_start() {
		return x_start;
	}

	/**
	 * @return the end x coordinate as stored by the last call to
	 * {@link #setX_End()} (0 if it was never called).
	 */
	public float getX_end() {
		return x_end;
	}

	/**
	 * Stores the direction-adjusted x coordinate of the current last element
	 * as the end of this line. Does nothing if the line is still empty.
	 */
	public void setX_End() {
		if (content.isEmpty()) {
			// No last element to read from; keep the previous value.
			return;
		}
		x_end = content.get(content.size() - 1).getXDirAdj();
	}

	/**
	 * @return the difference between the stored end and start x coordinates.
	 * Only meaningful after {@link #setX_End()} has been called.
	 */
	public float length() {
		return x_end - x_start;
	}

	/**
	 * Decides whether the given element continues this line, judged against
	 * the last element that was added.
	 *
	 * @param pos the candidate element.
	 * @return true if this line is empty or if the element passes both the
	 * x and the y criterion below; false if it starts a new line.
	 */
	public boolean isPartOfLine(final TextPosition pos) {
		if (content.isEmpty()) {
			return true;
		}
		final TextPosition lastPos = content.get(content.size() - 1);
		/*
		 * X criterion: if even the right edge of the new element lies to the
		 * left of the x position of the last element, we have a line break.
		 */
		final float oldXPos = lastPos.getXDirAdj();
		final float newXPos = pos.getXDirAdj();
		if (newXPos + pos.getWidthDirAdj() < oldXPos) {
			return false;
		}
		/*
		 * Y criterion: if the y value is not even remotely overlapping with
		 * our last position we have a line break, too. The accepted window is
		 * one height of the last element on one side of its y position and
		 * two heights on the other.
		 */
		final float oldYStart = lastPos.getYDirAdj();
		final float oldHeight = lastPos.getHeightDir();
		final float newY = pos.getYDirAdj();
		if (newY < oldYStart - oldHeight || newY > oldYStart + 2 * oldHeight) {
			return false;
		}

		return true;
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy