// de.citec.scie.pdf.PreTextLine
/*
* SCIE -- Spinal Cord Injury Information Extraction
* Copyright (C) 2013, 2014
* Raphael Dickfelder, Jan Göpfert, Benjamin Paaßen, Andreas Stöckel
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package de.citec.scie.pdf;
import java.util.ArrayList;
import org.apache.pdfbox.util.TextPosition;
/**
* This just aggregates all TextPosition objects that are part of one line. As
* not every line break in the original PDF document has meaning for the
* structure of the text we usually throw the line information away and just use
* it for the processing. Thus this is called a "PreTextLine".
*
* @author Benjamin Paassen - bpaassen(at)techfak.uni-bielefeld.de
*/
public class PreTextLine {

    /** X value of the first element added to this line (0 until an element is added). */
    private float x_start;
    /** X value of the last element, as captured by the most recent {@link #setX_End()} call. */
    private float x_end;

    /** Histogram over the Y values ({@code getYDirAdj()}) of all added elements. */
    public final Histogramm<Float> yHisto = new Histogramm<>();
    /** Histogram over the font sizes ({@code getFontSizeInPt()}) of all added elements. */
    public final Histogramm<Float> fontSizeHisto = new Histogramm<>();
    /** Histogram over the font names of all added elements that expose a font descriptor. */
    public final Histogramm<String> fontHisto = new Histogramm<>();
    /** All elements of this line, in insertion order. */
    public final ArrayList<TextPosition> content = new ArrayList<>();

    /**
     * Creates an empty line.
     */
    public PreTextLine() {
    }

    /**
     * Appends an element to this line and records its Y value, font size and
     * (if available) font name in the respective histograms. The first
     * element added also defines the start X value of the line.
     *
     * Note that this does not update the end X value; call
     * {@link #setX_End()} once the line is complete.
     *
     * @param pos the element to append.
     */
    public void addElement(final TextPosition pos) {
        if (content.isEmpty()) {
            x_start = pos.getXDirAdj();
        }
        content.add(pos);
        yHisto.addDataPoint(pos.getYDirAdj());
        fontSizeHisto.addDataPoint(pos.getFontSizeInPt());
        // Elements without a font or font descriptor are skipped in the font histogram.
        if (pos.getFont() != null && pos.getFont().getFontDescriptor() != null) {
            fontHisto.addDataPoint(pos.getFont().getFontDescriptor().getFontName());
        }
    }

    /**
     * @return the X value of the first element that was added to this line.
     */
    public float getX_start() {
        return x_start;
    }

    /**
     * @return the end X value as stored by the last call to
     * {@link #setX_End()}.
     */
    public float getX_end() {
        return x_end;
    }

    /**
     * Stores the X value ({@code getXDirAdj()}) of the currently last element
     * as end of this line. The width of that element is not included.
     *
     * @throws IndexOutOfBoundsException if no element has been added yet.
     */
    public void setX_End() {
        x_end = content.get(content.size() - 1).getXDirAdj();
    }

    /**
     * @return the difference between the stored end X value and the start X
     * value of this line.
     */
    public float length() {
        return x_end - x_start;
    }

    /**
     * Decides whether the given element continues this line, judged against
     * the last element added so far. An empty line accepts every element.
     *
     * @param pos the candidate element.
     * @return true if the element passes both the X and the Y criterion,
     * false if it is considered to start a new line.
     */
    public boolean isPartOfLine(final TextPosition pos) {
        if (content.isEmpty()) {
            return true;
        }
        final TextPosition lastPos = content.get(content.size() - 1);
        /*
         * X criterion: if the new element ends (X value plus width) before
         * the X value of the last element, we have a line break.
         */
        final float oldXPos = lastPos.getXDirAdj();
        final float newXPos = pos.getXDirAdj();
        if (newXPos + pos.getWidthDirAdj() < oldXPos) {
            return false;
        }
        /*
         * Y criterion: if the new Y value lies more than one height of the
         * last element below, or more than two heights above, the Y value of
         * the last element, we have a line break, too.
         */
        final float oldYStart = lastPos.getYDirAdj();
        final float oldHeight = lastPos.getHeightDir();
        final float newY = pos.getYDirAdj();
        final boolean belowRange = newY < oldYStart - oldHeight;
        final boolean aboveRange = newY > oldYStart + 2 * oldHeight;
        return !(belowRange || aboveRange);
    }
}