All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.itextpdf.text.pdf.parser.LocationTextExtractionStrategy Maven / Gradle / Ivy

There is a newer version: 5.5.13.3
Show newest version
/*
 * $Id: LocationTextExtractionStrategy.java 4784 2011-03-15 08:33:00Z blowagie $
 *
 * This file is part of the iText (R) project.
 * Copyright (c) 1998-2011 1T3XT BVBA
 * Authors: Kevin Day, Bruno Lowagie, Paulo Soares, et al.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License version 3
 * as published by the Free Software Foundation with the addition of the
 * following permission added to Section 15 as permitted in Section 7(a):
 * FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY 1T3XT,
 * 1T3XT DISCLAIMS THE WARRANTY OF NON INFRINGEMENT OF THIRD PARTY RIGHTS.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Affero General Public License for more details.
 * You should have received a copy of the GNU Affero General Public License
 * along with this program; if not, see http://www.gnu.org/licenses or write to
 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA, 02110-1301 USA, or download the license from the following URL:
 * http://itextpdf.com/terms-of-use/
 *
 * The interactive user interfaces in modified source and object code versions
 * of this program must display Appropriate Legal Notices, as required under
 * Section 5 of the GNU Affero General Public License.
 *
 * In accordance with Section 7(b) of the GNU Affero General Public License,
 * a covered work must retain the producer line in every PDF that is created
 * or manipulated using iText.
 *
 * You can be released from the requirements of the license by purchasing
 * a commercial license. Buying such a license is mandatory as soon as you
 * develop commercial activities involving the iText software without
 * disclosing the source code of your own applications.
 * These activities include: offering paid services to customers as an ASP,
 * serving PDFs on the fly in a web application, shipping iText with a closed
 * source product.
 *
 * For more information, please contact iText Software Corp. at this
 * address: sales@itextpdf.com
 */
package com.itextpdf.text.pdf.parser;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;


/**
 * Development preview - this class (and all of the parser classes) are still experiencing
 * heavy development, and are subject to change both behavior and interface.
 * 
* A text extraction renderer that keeps track of relative position of text on page * The resultant text will be relatively consistent with the physical layout that most * PDF files have on screen. *
* This renderer keeps track of the orientation and distance (both perpendicular * and parallel) to the unit vector of the orientation. Text is ordered by * orientation, then perpendicular, then parallel distance. Text with the same * perpendicular distance, but different parallel distance is treated as being on * the same line. *
* This renderer also uses a simple strategy based on the font metrics to determine if * a blank space should be inserted into the output. * * @since 5.0.2 */ public class LocationTextExtractionStrategy implements TextExtractionStrategy { /** set to true for debugging */ static boolean DUMP_STATE = false; /** a summary of all found text */ private final List locationalResult = new ArrayList(); /** * Creates a new text extraction renderer. */ public LocationTextExtractionStrategy() { } /** * @see com.itextpdf.text.pdf.parser.RenderListener#beginTextBlock() */ public void beginTextBlock(){ } /** * @see com.itextpdf.text.pdf.parser.RenderListener#endTextBlock() */ public void endTextBlock(){ } /** * Returns the result so far. * @return a String with the resulting text. */ public String getResultantText(){ if (DUMP_STATE) dumpState(); Collections.sort(locationalResult); StringBuffer sb = new StringBuffer(); TextChunk lastChunk = null; for (TextChunk chunk : locationalResult) { if (lastChunk == null){ sb.append(chunk.text); } else { if (chunk.sameLine(lastChunk)){ float dist = chunk.distanceFromEndOf(lastChunk); if (dist < -chunk.charSpaceWidth) sb.append(' '); // we only insert a blank space if the trailing character of the previous string wasn't a space, and the leading character of the current string isn't a space else if (dist > chunk.charSpaceWidth/2.0f && chunk.text.charAt(0) != ' ' && lastChunk.text.charAt(lastChunk.text.length()-1) != ' ') sb.append(' '); sb.append(chunk.text); } else { sb.append('\n'); sb.append(chunk.text); } } lastChunk = chunk; } return sb.toString(); } /** Used for debugging only */ private void dumpState(){ for (Iterator iterator = locationalResult.iterator(); iterator.hasNext(); ) { TextChunk location = (TextChunk) iterator.next(); location.printDiagnostics(); System.out.println(); } } /** * * @see com.itextpdf.text.pdf.parser.RenderListener#renderText(com.itextpdf.text.pdf.parser.TextRenderInfo) */ public void renderText(TextRenderInfo 
renderInfo) { LineSegment segment = renderInfo.getBaseline(); TextChunk location = new TextChunk(renderInfo.getText(), segment.getStartPoint(), segment.getEndPoint(), renderInfo.getSingleSpaceWidth()); locationalResult.add(location); } /** * Represents a chunk of text, it's orientation, and location relative to the orientation vector */ private static class TextChunk implements Comparable{ /** the text of the chunk */ final String text; /** the starting location of the chunk */ final Vector startLocation; /** the ending location of the chunk */ final Vector endLocation; /** unit vector in the orientation of the chunk */ final Vector orientationVector; /** the orientation as a scalar for quick sorting */ final int orientationMagnitude; /** perpendicular distance to the orientation unit vector (i.e. the Y position in an unrotated coordinate system) * we round to the nearest integer to handle the fuzziness of comparing floats */ final int distPerpendicular; /** distance of the start of the chunk parallel to the orientation unit vector (i.e. the X position in an unrotated coordinate system) */ final float distParallelStart; /** distance of the end of the chunk parallel to the orientation unit vector (i.e. 
the X position in an unrotated coordinate system) */ final float distParallelEnd; /** the width of a single space character in the font of the chunk */ final float charSpaceWidth; public TextChunk(String string, Vector startLocation, Vector endLocation, float charSpaceWidth) { this.text = string; this.startLocation = startLocation; this.endLocation = endLocation; this.charSpaceWidth = charSpaceWidth; orientationVector = endLocation.subtract(startLocation).normalize(); orientationMagnitude = (int)(Math.atan2(orientationVector.get(Vector.I2), orientationVector.get(Vector.I1))*1000); // see http://mathworld.wolfram.com/Point-LineDistance2-Dimensional.html // the two vectors we are crossing are in the same plane, so the result will be purely // in the z-axis (out of plane) direction, so we just take the I3 component of the result Vector origin = new Vector(0,0,1); distPerpendicular = (int)(startLocation.subtract(origin)).cross(orientationVector).get(Vector.I3); distParallelStart = orientationVector.dot(startLocation); distParallelEnd = orientationVector.dot(endLocation); } private void printDiagnostics(){ System.out.println("Text (@" + startLocation + " -> " + endLocation + "): " + text); System.out.println("orientationMagnitude: " + orientationMagnitude); System.out.println("distPerpendicular: " + distPerpendicular); System.out.println("distParallel: " + distParallelStart); } /** * @param as the location to compare to * @return true is this location is on the the same line as the other */ public boolean sameLine(TextChunk as){ if (orientationMagnitude != as.orientationMagnitude) return false; if (distPerpendicular != as.distPerpendicular) return false; return true; } /** * Computes the distance between the end of 'other' and the beginning of this chunk * in the direction of this chunk's orientation vector. 
Note that it's a bad idea * to call this for chunks that aren't on the same line and orientation, but we don't * explicitly check for that condition for performance reasons. * @param other * @return the number of spaces between the end of 'other' and the beginning of this chunk */ public float distanceFromEndOf(TextChunk other){ float distance = distParallelStart - other.distParallelEnd; return distance; } /** * Compares based on orientation, perpendicular distance, then parallel distance * @see java.lang.Comparable#compareTo(java.lang.Object) */ public int compareTo(TextChunk rhs) { if (this == rhs) return 0; // not really needed, but just in case int rslt; rslt = compareInts(orientationMagnitude, rhs.orientationMagnitude); if (rslt != 0) return rslt; rslt = compareInts(distPerpendicular, rhs.distPerpendicular); if (rslt != 0) return rslt; // note: it's never safe to check floating point numbers for equality, and if two chunks // are truly right on top of each other, which one comes first or second just doesn't matter // so we arbitrarily choose this way. rslt = distParallelStart < rhs.distParallelStart ? -1 : 1; return rslt; } /** * * @param int1 * @param int2 * @return comparison of the two integers */ private static int compareInts(int int1, int int2){ return int1 == int2 ? 0 : int1 < int2 ? -1 : 1; } } /** * no-op method - this renderer isn't interested in image events * @see com.itextpdf.text.pdf.parser.RenderListener#renderImage(com.itextpdf.text.pdf.parser.ImageRenderInfo) * @since 5.0.1 */ public void renderImage(ImageRenderInfo renderInfo) { // do nothing } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy