// de.citec.scie.pdf.WhiteSpaceEstimator Maven / Gradle / Ivy
/*
* SCIE -- Spinal Cord Injury Information Extraction
* Copyright (C) 2013, 2014
* Raphael Dickfelder, Jan Göpfert, Benjamin Paaßen, Andreas Stöckel
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package de.citec.scie.pdf;
import org.apache.pdfbox.util.TextPosition;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Estimates whether a whitespace should be inserted in front of a text
 * position, given the text position that was passed in immediately before it.
 * This is based on the work of Ben Litchfield in the PDFTextStripper of Apache
 * PDFBox.
 *
 * <p>An instance is stateful: every call to
 * {@link #hasWhiteSpace(TextPosition)} stores the given position and a running
 * average character width, which are then used by the next call. The fields
 * are plain mutable fields and no synchronization is performed.</p>
 *
 * <p>NOTE(review): the heuristic only compares direction-adjusted X
 * coordinates, so it presumably expects positions of one line in reading
 * order -- confirm against the caller.</p>
 */
public class WhiteSpaceEstimator {

    /**
     * Fraction of the space-character width that a gap has to exceed to count
     * as a whitespace. This tolerance value was determined by the Apache
     * PDFBox programmers.
     */
    private static final float SPACINGTOL = 0.5f;

    /**
     * Fraction of the average character width that a gap has to exceed to
     * count as a whitespace. This tolerance value was determined by the Apache
     * PDFBox programmers.
     */
    private static final float AVGCHARTOL = 0.3f;

    /** The position handed to the previous call, or null before the first call. */
    private TextPosition lastPosition = null;

    /** The average character width computed by the previous call. */
    private float prevAvgCharWidth = -1;

    public WhiteSpaceEstimator() {
    }

    /**
     * Decides whether a whitespace lies between the previously seen text
     * position and the given one, and then remembers the given position for
     * the next call.
     *
     * @param position the text position that follows the previously passed
     *                 one.
     * @return true if the given position starts further right than the end of
     *         the previous position plus a tolerance margin and the previous
     *         position does not already end with a space; false otherwise,
     *         and always false on the first call.
     */
    public boolean hasWhiteSpace(final TextPosition position) {
        final float currentCharWidth = meanCharWidth(position);

        if (lastPosition == null) {
            // First position ever seen: there is nothing to compare against,
            // so only record the state for the next call.
            lastPosition = position;
            prevAvgCharWidth = currentCharWidth;
            return false;
        }

        final float positionX = position.getXDirAdj();
        final float endOfLastTextX = lastPosition.getXDirAdj()
                + lastPosition.getWidthDirAdj();

        /* Estimate the expected width of the space based on the
         * space character with some margin. */
        final float wordSpacing = position.getWidthOfSpace();
        final float lastWordSpacing = lastPosition.getWidthOfSpace();
        final float deltaSpace;
        // NaN never compares equal to anything (not even to itself), so the
        // check has to go through Float.isNaN to have any effect.
        if (wordSpacing == 0 || Float.isNaN(wordSpacing)) {
            // No usable space width: make this margin lose the comparison
            // against the character-width margin below.
            deltaSpace = Float.MAX_VALUE;
        } else if (lastWordSpacing < 0) {
            deltaSpace = wordSpacing * SPACINGTOL;
        } else {
            deltaSpace = ((wordSpacing + lastWordSpacing) / 2f) * SPACINGTOL;
        }

        /* Estimate the expected width of the space based on the
         * average character width with some margin. This calculation does not
         * make a true average (average of averages) but we found that it gave
         * the best results after numerous experiments. Based on experiments we
         * also found that .3 worked well. */
        final float averageCharWidth;
        if (prevAvgCharWidth < 0) {
            averageCharWidth = currentCharWidth;
        } else {
            averageCharWidth = (prevAvgCharWidth + currentCharWidth) / 2f;
        }
        final float deltaCharWidth = averageCharWidth * AVGCHARTOL;

        // Compare the margins obtained by the average method and the
        // wordSpacing method and pick the smaller one. The explicit comparison
        // (rather than Math.min) keeps the character-width margin whenever
        // deltaSpace is NaN.
        final float margin = (deltaCharWidth > deltaSpace)
                ? deltaSpace : deltaCharWidth;
        final float expectedStartOfNextWordX = endOfLastTextX + margin;

        /* We have a space if the next word starts after we expected it to
         * start. Only bother reporting a space if the last character was not
         * a space already. */
        final String lastCharacter = lastPosition.getCharacter();
        final boolean hasWhitespace = expectedStartOfNextWordX < positionX
                && lastCharacter != null
                && !lastCharacter.endsWith(" ");

        // set the last values
        lastPosition = position;
        prevAvgCharWidth = averageCharWidth;
        return hasWhitespace;
    }

    /**
     * Computes the direction-adjusted width of the given position divided by
     * the number of entries in its individual-widths array.
     */
    private static float meanCharWidth(final TextPosition position) {
        final float positionWidth = position.getWidthDirAdj();
        // The current amount of characters in a word
        final int wordCharCount = position.getIndividualWidths().length;
        return positionWidth / wordCharCount;
    }
}