All Downloads are FREE. Search and download functionalities are using the official Maven repository.

de.citec.scie.pdf.WhiteSpaceEstimator Maven / Gradle / Ivy

/*
 * SCIE -- Spinal Cord Injury Information Extraction
 * Copyright (C) 2013, 2014
 * Raphael Dickfelder, Jan Göpfert, Benjamin Paaßen, Andreas Stöckel
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package de.citec.scie.pdf;

import org.apache.pdfbox.util.TextPosition;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * This is based on the work of Ben Litchfield in the PDFTextStripper of Apache
 * PDFBox. It estimates whether a whitespace separates a text position from the
 * text position that was passed in before it.
 *
 * <p>The estimator is stateful: it remembers the text position and the average
 * character width seen on the previous call to
 * {@link #hasWhiteSpace(TextPosition)}, so positions have to be fed in reading
 * order and one instance must not be shared between independent text runs.
 */
public class WhiteSpaceEstimator {

	/**
	 * Fraction of the width of the space character that a gap has to exceed to
	 * count as whitespace. These tolerance values were determined by the Apache
	 * PDFBox programmers.
	 */
	private static final float SPACINGTOL = 0.5f;
	/**
	 * Fraction of the average character width that a gap has to exceed to
	 * count as whitespace.
	 */
	private static final float AVGCHARTOL = 0.3f;
	/** The text position handed in on the previous call, null before the first call. */
	private TextPosition lastPosition = null;
	/** The average character width computed on the previous call, negative if unknown. */
	private float prevAvgCharWidth = -1;

	public WhiteSpaceEstimator() {
	}

	/**
	 * Decides whether there is a whitespace between the text position of the
	 * previous call and the given one, and remembers the given position for
	 * the next call.
	 *
	 * @param position the current text position; it becomes the "last"
	 * position for the following call.
	 * @return true if the given position starts further right than expected
	 * and the previous position does not already end with a space character;
	 * always false on the first call, because there is no previous position.
	 */
	public boolean hasWhiteSpace(final TextPosition position) {
		if (lastPosition == null) {
			//nothing to compare against yet, just set the last values
			lastPosition = position;
			prevAvgCharWidth = averageCharWidthOf(position);
			return false;
		}
		final float positionX = position.getXDirAdj();
		final float endOfLastTextX = lastPosition.getXDirAdj()
				+ lastPosition.getWidthDirAdj();
		final float lastWordSpacing = lastPosition.getWidthOfSpace();

		/* Estimate the expected width of the space based on the
		 * space character with some margin. */
		final float wordSpacing = position.getWidthOfSpace();
		final float deltaSpace;
		//NaN never compares equal to anything, so it has to be tested with
		//Float.isNaN; an unusable space width disables this criterion.
		if ((wordSpacing == 0) || Float.isNaN(wordSpacing)) {
			deltaSpace = Float.MAX_VALUE;
		} else if (lastWordSpacing < 0) {
			deltaSpace = (wordSpacing * SPACINGTOL);
		} else {
			deltaSpace = (((wordSpacing + lastWordSpacing) / 2f)
					* SPACINGTOL);
		}

		/* Estimate the expected width of the space based on the
		 * average character width with some margin. This calculation does not
		 * make a true average (average of averages) but we found that it gave the
		 * best results after numerous experiments. Based on experiments we also found that
		 * .3 worked well. */
		final float currentCharWidth = averageCharWidthOf(position);
		final float averageCharWidth;
		if (prevAvgCharWidth < 0) {
			averageCharWidth = currentCharWidth;
		} else {
			averageCharWidth = (prevAvgCharWidth + currentCharWidth) / 2f;
		}
		final float deltaCharWidth = (averageCharWidth * AVGCHARTOL);

		//Compares the values obtained by the average method and the wordSpacing method and picks
		//the smaller number. The explicit comparison (instead of Math.min) keeps deltaCharWidth
		//whenever deltaSpace is not a usable number.
		final float expectedStartOfNextWordX;
		if (deltaCharWidth > deltaSpace) {
			expectedStartOfNextWordX = endOfLastTextX + deltaSpace;
		} else {
			expectedStartOfNextWordX = endOfLastTextX + deltaCharWidth;
		}

		/*
		 * we have a space if the next word starts after we expected it to start.
		 * Only bother reporting a space if the last character was not a space.
		 */
		final String lastCharacter = lastPosition.getCharacter();
		final boolean hasWhitespace = expectedStartOfNextWordX < positionX
				&& lastCharacter != null
				&& !lastCharacter.endsWith(" ");

		//set the last values
		lastPosition = position;
		prevAvgCharWidth = averageCharWidth;
		return hasWhitespace;
	}

	/**
	 * Computes the width of the given text position divided by the number of
	 * individual character widths it reports.
	 */
	private static float averageCharWidthOf(final TextPosition position) {
		final float positionWidth = position.getWidthDirAdj();
		//The current amount of characters in a word
		final int wordCharCount = position.getIndividualWidths().length;
		return positionWidth / wordCharCount;
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy