All Downloads are FREE. The search and download functionality uses the official Maven repository.

com.aowagie.text.pdf.parser.SimpleTextExtractingPdfContentStreamProcessor Maven / Gradle / Ivy

/*
 * Copyright 2008 by Kevin Day.
 *
 * The contents of this file are subject to the Mozilla Public License Version 1.1
 * (the "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the License.
 *
 * The Original Code is 'iText, a free JAVA-PDF library'.
 *
 * The Initial Developer of the Original Code is Bruno Lowagie. Portions created by
 * the Initial Developer are Copyright (C) 1999-2008 by Bruno Lowagie.
 * All Rights Reserved.
 * Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer
 * are Copyright (C) 2000-2008 by Paulo Soares. All Rights Reserved.
 *
 * Contributor(s): all the names of the contributors are added in the source code
 * where applicable.
 *
 * Alternatively, the contents of this file may be used under the terms of the
 * LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the
 * provisions of LGPL are applicable instead of those above.  If you wish to
 * allow use of your version of this file only under the terms of the LGPL
 * License and not to allow others to use your version of this file under
 * the MPL, indicate your decision by deleting the provisions above and
 * replace them with the notice and other provisions required by the LGPL.
 * If you do not delete the provisions above, a recipient may use your version
 * of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE.
 *
 * This library is free software; you can redistribute it and/or modify it
 * under the terms of the MPL as stated above or under the terms of the GNU
 * Library General Public License as published by the Free Software Foundation;
 * either version 2 of the License, or any later version.
 *
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more
 * details.
 *
 * If you didn't download this code from the following link, you should check if
 * you aren't using an obsolete version:
 * http://www.lowagie.com/iText/
 */
package com.aowagie.text.pdf.parser;

/**
 * A simple text extraction processor.
 *
 * <p>Every string handed to {@link #displayText(String, Matrix)} is appended to an
 * internal buffer. Before appending, a line break is inserted when the {@code I32}
 * component of the current text line matrix differs from the one recorded for the
 * previous string; otherwise a single space is inserted when the horizontal gap
 * between the end of the previous string and the start of this one exceeds half
 * the width of a space glyph.
 *
 * @since	2.1.4
 */
class SimpleTextExtractingPdfContentStreamProcessor extends PdfContentStreamProcessor {

    /*
     * The fields below deliberately have no explicit "= null" initializer.
     * Field initializers run after the superclass constructor returns, so an
     * explicit initializer would overwrite whatever a constructor-time call to
     * reset() had assigned (whether the superclass makes such a call is not
     * visible from this file).
     */

    /** Text line matrix in effect for the previously displayed string; {@code null} until one has been displayed. */
    private Matrix lastTextLineMatrix;
    /** Ending text matrix of the previously displayed string; {@code null} until one has been displayed. */
    private Matrix lastEndingTextMatrix;

    /** Accumulates the extracted text; created by {@link #reset()} or lazily on first use. */
    private StringBuilder result;

    /**
     * Creates a new text extraction processor.
     */
    public SimpleTextExtractingPdfContentStreamProcessor() {
    }

    /**
     * Resets the superclass state, forgets the previously displayed string and
     * discards any text collected so far.
     */
    @Override
    public void reset() {
        super.reset();
        this.lastTextLineMatrix = null;
        this.lastEndingTextMatrix = null;
        this.result = new StringBuilder();
    }

    /**
     * Returns the result so far.
     * @return	a String with the resulting text; the empty string if nothing has
     *          been collected yet (including when {@link #reset()} was never called).
     */
    public String getResultantText() {
        return this.result == null ? "" : this.result.toString();
    }

    /**
     * Writes text to the result, preceded by a line break or a space when the
     * position of the text relative to the previously displayed string calls for it.
     * @param text	The text that needs to be displayed
     * @param endingTextMatrix	a text matrix; remembered and used as the end position
     *          of this string when the next string is displayed
     * @see com.aowagie.text.pdf.parser.PdfContentStreamProcessor#displayText(java.lang.String, com.aowagie.text.pdf.parser.Matrix)
     */
    @Override
    public void displayText(final String text, final Matrix endingTextMatrix) {
        // Guard against being invoked before reset() has created the buffer.
        if (this.result == null) {
            this.result = new StringBuilder();
        }

        // A change in I32 (presumably the vertical translation) between the previous
        // and the current text line matrix is treated as the start of a new line.
        final boolean hardReturn = this.lastTextLineMatrix != null
                && this.lastTextLineMatrix.get(Matrix.I32) != getCurrentTextLineMatrix().get(Matrix.I32);

        if (hardReturn) {
            this.result.append('\n');
        } else if (this.lastEndingTextMatrix != null) {
            final float currentX = getCurrentTextMatrix().get(Matrix.I31);
            final float lastEndX = this.lastEndingTextMatrix.get(Matrix.I31);

            // Width of a space computed from the graphics state only; it has not yet
            // been transformed by the text matrix. The division by 1000 presumably
            // converts the font's glyph units to text-space units.
            final float spaceGlyphWidth = gs().font.getWidth(' ') / 1000f;
            final float spaceWidth = (spaceGlyphWidth * gs().fontSize + gs().characterSpacing + gs().wordSpacing)
                    * gs().horizontalScaling;

            // Push the width through the current text matrix and measure the resulting
            // I31 displacement so it is comparable with currentX and lastEndX.
            final Matrix scaled = new Matrix(spaceWidth, 0).multiply(getCurrentTextMatrix());
            final float scaledSpaceWidth = scaled.get(Matrix.I31) - getCurrentTextMatrix().get(Matrix.I31);

            // A gap wider than half a space is taken to be a word boundary.
            if (currentX - lastEndX > scaledSpaceWidth / 2f) {
                this.result.append(' ');
            }
        }
        // Otherwise this is the first string since the last reset: nothing to insert.

        this.result.append(text);

        this.lastTextLineMatrix = getCurrentTextLineMatrix();
        this.lastEndingTextMatrix = endingTextMatrix;
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy