All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.fit.pdfdom.PDFBoxTree Maven / Gradle / Ivy

Go to download

Pdf2Dom is a PDF parser that converts documents to an HTML DOM representation. The obtained DOM tree may then be serialized to an HTML file or processed further. The inline CSS definitions contained in the resulting document are used to make the HTML page as similar as possible to the PDF input. A command-line utility for converting PDF documents to HTML is included in the distribution package. Pdf2Dom may also be used as an independent Java library with a standard DOM interface for your DOM-based applications, or as an alternative parser for the CSSBox rendering engine in order to add PDF processing capability to CSSBox.

There is a newer version: 2.0.3
Show newest version
/**
 * PDFBoxTree.java
 * (c) Radek Burget, 2011
 *
 * Pdf2Dom is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Pdf2Dom is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with CSSBox. If not, see <http://www.gnu.org/licenses/>.
 *
 * Created on 27.9.2011, 16:56:55 by burgetr
 */
package org.fit.pdfdom;

import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import java.io.IOException;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.Vector;

import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColor;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorN;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorSpace;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceCMYKColor;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceGrayColor;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceRGBColor;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColor;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorN;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorSpace;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceCMYKColor;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceGrayColor;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceRGBColor;
import org.apache.pdfbox.contentstream.operator.state.SetFlatness;
import org.apache.pdfbox.contentstream.operator.state.SetLineCapStyle;
import org.apache.pdfbox.contentstream.operator.state.SetLineDashPattern;
import org.apache.pdfbox.contentstream.operator.state.SetLineJoinStyle;
import org.apache.pdfbox.contentstream.operator.state.SetLineMiterLimit;
import org.apache.pdfbox.contentstream.operator.state.SetLineWidth;
import org.apache.pdfbox.contentstream.operator.state.SetRenderingIntent;
import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSNumber;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.*;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.util.Matrix;
import org.fit.pdfdom.resource.ImageResource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static org.apache.pdfbox.pdmodel.graphics.state.RenderingMode.*;

/**
 * A generic tree of boxes created from a PDF file. It processes the PDF document and calls
 * the appropriate abstract methods in order to render a page, text box, etc. The particular
 * implementations are expected to implement these actions in order to build the resulting
 * document tree.
 *
 * @author burgetr
 */
public abstract class PDFBoxTree extends PDFTextStripper
{
    /** Class-wide logger; never reassigned, hence final. */
    private static final Logger log = LoggerFactory.getLogger(PDFBoxTree.class);

    /** Length units used in the generated CSS */
    public static final String UNIT = "pt";

    /** Known font names that are recognized in the PDF files */
    protected static String[] cssFontFamily = { "Times New Roman", "Times", "Garamond", "Helvetica", "Arial Narrow", "Arial", "Verdana", "Courier New", "MS Sans Serif" };

    /** Known font subtypes recognized in PDF files */
    protected static String[] pdFontType =    { "normal", "roman",  "bold",   "italic", "bolditalic" };
    /** Font weights corresponding (by index) to the font subtypes in {@link PDFBoxTree#pdFontType} */
    protected static String[] cssFontWeight = { "normal", "normal", "bold",   "normal", "bold"  };
    /** Font styles corresponding (by index) to the font subtypes in {@link PDFBoxTree#pdFontType} */
    protected static String[] cssFontStyle =  { "normal", "normal", "normal", "italic", "italic"  };

    /** When set to true, the graphics in the PDF file will be ignored. */
    protected boolean disableGraphics = false;
    /** When set to true, the embedded images will be ignored. */
    protected boolean disableImages = false;
    /** When set to true, the image data will not be transferred to the HTML data: url. */
    protected boolean disableImageData = false;
    /** First page to be processed */
    protected int startPage;
    /** Last page to be processed */
    protected int endPage;

    /** Table of embedded fonts */
    protected FontTable fontTable;

    /** The PDF page currently being processed */
    protected PDPage pdpage;

    /** Current text X coordinate (the X coordinate of the last encountered text box). */
    protected float cur_x;
    /** Current text Y coordinate (the Y coordinate of the last encountered text box). */
    protected float cur_y;

    /** Current path construction position (X) */
    protected float path_x;
    /** Current path construction position (Y) */
    protected float path_y;
    /** Starting path construction position (X) */
    protected float path_start_x;
    /** Starting path construction position (Y) */
    protected float path_start_y;

    /** Previous positioned text. */
    protected TextPosition lastText = null;

    /** Last diacritic if any */
    protected TextPosition lastDia = null;

    /** The text box currently being created. */
    protected StringBuilder textLine;

    /** Current text line metrics; null while no text line is being built */
    protected TextMetrics textMetrics;

    /** Current graphics path; the elements are the PathSegment objects collected from the path operators */
    protected Vector graphicsPath;

    /** The style of the future box being modified by the operators */
    protected BoxStyle style;

    /** The style of the text line being created */
    protected BoxStyle curstyle;



    /**
     * Creates the box tree. Configures the underlying text stripper (position sorting and
     * suppression of duplicate overlapping text), registers the operators used for tracking
     * the graphics state and initializes the internal state via {@link #init()}.
     * @throws IOException declared because the superclass constructor is invoked here
     */
    public PDFBoxTree() throws IOException
    {
        super();
        //explicit super. calls: the superclass implementations are used even when a subclass overrides them
        super.setSortByPosition(true);
        super.setSuppressDuplicateOverlappingText(true);

        //add operators for tracking the graphic state
        addOperator(new SetStrokingColorSpace());
        addOperator(new SetNonStrokingColorSpace());
        addOperator(new SetLineDashPattern());
        addOperator(new SetStrokingDeviceGrayColor());
        addOperator(new SetNonStrokingDeviceGrayColor());
        addOperator(new SetFlatness());
        addOperator(new SetLineJoinStyle());
        addOperator(new SetLineCapStyle());
        addOperator(new SetStrokingDeviceCMYKColor());
        addOperator(new SetNonStrokingDeviceCMYKColor());
        addOperator(new SetLineMiterLimit());
        addOperator(new SetStrokingDeviceRGBColor());
        addOperator(new SetNonStrokingDeviceRGBColor());
        addOperator(new SetRenderingIntent());
        addOperator(new SetStrokingColor());
        addOperator(new SetNonStrokingColor());
        addOperator(new SetStrokingColorN());
        addOperator(new SetNonStrokingColorN());
        addOperator(new SetFontAndSize());
        addOperator(new SetLineWidth());

        //private, so no overridable method is called from the constructor
        init();
    }

    /**
     * Internal initialization: sets the default page range and creates fresh working
     * objects (box style, text buffer, graphics path and font table).
     */
    private void init()
    {
        //page range: no restriction until the setters are used
        startPage = 0;
        endPage = Integer.MAX_VALUE;

        //text state
        style = new BoxStyle(UNIT);
        textLine = new StringBuilder();
        textMetrics = null;

        //graphics and resources
        graphicsPath = new Vector();
        fontTable = new FontTable();
    }


    /**
     * Processes a single page when its number lies within the configured page range
     * ({@link #startPage} to {@link #endPage}, both inclusive); other pages are skipped.
     * For a processed page the font table is updated, a new output page is started, the
     * content is processed by the superclass and the last pending text box is finished.
     * @param page the page to be processed
     * @throws IOException when the superclass fails to process the page contents
     */
    @Override
    public void processPage(PDPage page) throws IOException
    {
        final int pageNo = getCurrentPageNo();
        if (pageNo < startPage || pageNo > endPage)
            return; //outside of the requested page range

        pdpage = page;
        updateFontTable();
        startNewPage();
        super.processPage(page);
        //flush the text collected for the last box of the page
        finishBox();
    }

    /**
     * Tells whether the graphics processing is switched off.
     * @return {@code true} when the graphic operators are ignored by this parser
     */
    public boolean getDisableGraphics()
    {
        return this.disableGraphics;
    }

    /**
     * Disables or enables the processing of the graphic operators in the PDF files.
     * @param disableGraphics when set to {@code true} the graphics in the source file is ignored.
     */
    public void setDisableGraphics(boolean disableGraphics)
    {
        this.disableGraphics = disableGraphics;
    }

    /**
     * Tells whether the processing of embedded images is switched off.
     * @return {@code true} when embedded images are ignored by this parser
     */
    public boolean getDisableImages()
    {
        return this.disableImages;
    }

    /**
     * Disables or enables the processing of images contained in the PDF files.
     * @param disableImages when set to {@code true} the images in the source file are ignored.
     */
    public void setDisableImages(boolean disableImages)
    {
        this.disableImages = disableImages;
    }

    /**
     * Tells whether the copying of image data is switched off.
     * @return {@code true} when the image data is not copied to the output
     */
    public boolean getDisableImageData()
    {
        return this.disableImageData;
    }

    /**
     * Disables or enables copying of the image data to the resulting DOM tree.
     * @param disableImageData when set to {@code true} the image data is not copied to the document tree.
     * The eventual img elements will have an empty src attribute.
     */
    public void setDisableImageData(boolean disableImageData)
    {
        this.disableImageData = disableImageData;
    }

    /**
     * Obtains the first page of the range checked in {@link #processPage(PDPage)}.
     * @return the number of the first page to be processed
     */
    @Override
    public int getStartPage()
    {
        return this.startPage;
    }

    /**
     * Sets the first page of the range checked in {@link #processPage(PDPage)}.
     * @param startPage the number of the first page to be processed
     */
    @Override
    public void setStartPage(int startPage)
    {
        this.startPage = startPage;
    }

    /**
     * Obtains the last page of the range checked in {@link #processPage(PDPage)}.
     * @return the number of the last page to be processed
     */
    @Override
    public int getEndPage()
    {
        return this.endPage;
    }

    /**
     * Sets the last page of the range checked in {@link #processPage(PDPage)}.
     * @param endPage the number of the last page to be processed
     */
    @Override
    public void setEndPage(int endPage)
    {
        this.endPage = endPage;
    }

    //===========================================================================================

    /**
     * Adds a new page to the resulting document and makes it a current (active) page.
     * Called from {@link #processPage(PDPage)} before the page contents are processed.
     */
    protected abstract void startNewPage();

    /**
     * Creates a new text box in the current page. The style and position of the text are contained
     * in the {@link PDFBoxTree#curstyle} property.
     * @param data The text contents.
     * @param metrics The metrics collected for the text line that is being rendered.
     */
    protected abstract void renderText(String data, TextMetrics metrics);

    /**
     * Adds a graphics path to the current page.
     * @param path the segments of the path to be rendered (the list may be empty)
     * @param stroke should there be a stroke around?
     * @param fill should the path be filled?
     * @throws IOException when the path cannot be rendered
     */
    protected abstract void renderPath(List path, boolean stroke, boolean fill) throws IOException;

    /**
     * Adds an image to the current page.
     * @param x the X coordinate of the image
     * @param y the Y coordinate of the image
     * @param width the width of the image
     * @param height the height of the image
     * @param data the image resource to be rendered
     * @throws IOException when the image cannot be rendered
     */
    protected abstract void renderImage(float x, float y, float width, float height, ImageResource data) throws IOException;

    /**
     * Tries to interpret a path as a rectangle. The path is accepted when it consists
     * of exactly four segments whose end points use exactly two distinct X and two
     * distinct Y coordinates.
     * @param path the path segments to be examined
     * @return the array {min X, min Y, max X, max Y} or {@code null} when the path
     * cannot be interpreted as a rectangle
     */
    protected float[] toRectangle(List<PathSegment> path)
    {
        if (path == null || path.size() != 4)
            return null; //four segments required

        //collect the distinct coordinates used by the segment end points
        Set<Float> xc = new HashSet<Float>();
        Set<Float> yc = new HashSet<Float>();
        for (PathSegment line : path)
        {
            xc.add(line.getX1());
            xc.add(line.getX2());
            yc.add(line.getY1());
            yc.add(line.getY2());
        }

        if (xc.size() != 2 || yc.size() != 2)
            return null; //two different X and Y coordinates required

        return new float[]{Collections.min(xc), Collections.min(yc), Collections.max(xc), Collections.max(yc)};
    }

    /**
     * Registers the fonts referenced by the resources of the current page in the font table.
     * An I/O failure is logged and does not interrupt the processing.
     */
    protected void updateFontTable()
    {
        final PDResources pageResources = pdpage.getResources();
        if (pageResources == null)
            return; //nothing to register for this page

        try
        {
            processFontResources(pageResources, fontTable);
        }
        catch (IOException e)
        {
            log.error("Error processing font resources: Exception: {} {}", e.getMessage(), e.getClass());
        }
    }

    /**
     * Adds the supported fonts found in the given resources to the font table and
     * descends into the resources of the contained form XObjects.
     * Supported are TrueType fonts, Type 0 fonts with a CIDFontType2 descendant and
     * Type1C fonts; other fonts are skipped with a warning.
     * @param resources the resources to be scanned
     * @param table the font table to be filled
     * @throws IOException when a font or an XObject cannot be read
     */
    private void processFontResources(PDResources resources, FontTable table) throws IOException
    {
        String fontNotSupportedMessage = "Font: {} skipped because type '{}' is not supported.";

        for (COSName key : resources.getFontNames())
        {
            PDFont font = resources.getFont(key);
            if (font == null)
            {
                //the resource lookup may yield no font; dereferencing it below would fail
                log.warn("Font resource '{}' could not be loaded and is skipped.", key.getName());
                continue;
            }

            if (font instanceof PDTrueTypeFont)
            {
                table.addEntry(font);
                log.debug("Font: {} TTF", font.getName());
            }
            else if (font instanceof PDType0Font)
            {
                PDCIDFont descendantFont = ((PDType0Font) font).getDescendantFont();
                if (descendantFont instanceof PDCIDFontType2)
                    table.addEntry(font);
                else
                    log.warn(fontNotSupportedMessage, font.getName(), font.getClass().getSimpleName());
            }
            else if (font instanceof PDType1CFont)
                table.addEntry(font);
            else
                log.warn(fontNotSupportedMessage, font.getName(), font.getClass().getSimpleName());
        }

        for (COSName name : resources.getXObjectNames())
        {
            PDXObject xobject = resources.getXObject(name);
            if (xobject instanceof PDFormXObject)
            {
                PDFormXObject xObjectForm = (PDFormXObject) xobject;
                PDResources formResources = xObjectForm.getResources();
                //do not recurse into a form that shares the resources being processed
                //NOTE(review): only the direct self-reference is detected here; longer reference cycles are not
                if (formResources != null && formResources != resources && formResources.getCOSObject() != resources.getCOSObject())
                    processFontResources(formResources, table);
            }
        }

    }

    //===========================================================================================

    /**
     * Intercepts the content stream operators that are relevant for the box tree
     * (text spacing, path construction and painting, image invocation) and then
     * passes every operator to the superclass.
     * @param operator the operator being processed
     * @param arguments the operands of the operator
     * @throws IOException when rendering a path or an image fails or when the superclass fails
     */
    @Override
    protected void processOperator(Operator operator, List<COSBase> arguments)
            throws IOException
    {
        String operation = operator.getName();

        //word spacing
        if (operation.equals("Tw"))
        {
            //the operand may be missing in a malformed stream; the operator is still passed to the superclass below
            if (!arguments.isEmpty())
                style.setWordSpacing(getLength(arguments.get(0)));
        }

        //letter spacing
        else if (operation.equals("Tc"))
        {
            if (!arguments.isEmpty())
                style.setLetterSpacing(getLength(arguments.get(0)));
        }

        //graphics
        else if (operation.equals("m")) //move
        {
            if (!disableGraphics)
            {
                if (arguments.size() == 2)
                {
                    float[] pos = transformPosition(getLength(arguments.get(0)), getLength(arguments.get(1)));
                    path_x = pos[0];
                    path_y = pos[1];
                    path_start_x = pos[0];
                    path_start_y = pos[1];
                }
            }
        }
        else if (operation.equals("l")) //line
        {
            if (!disableGraphics)
            {
                if (arguments.size() == 2)
                {
                    float[] pos = transformPosition(getLength(arguments.get(0)), getLength(arguments.get(1)));
                    graphicsPath.add(new PathSegment(path_x, path_y, pos[0], pos[1]));
                    path_x = pos[0];
                    path_y = pos[1];
                }
            }
        }
        else if (operation.equals("h")) //end subpath
        {
            if (!disableGraphics)
            {
                graphicsPath.add(new PathSegment(path_x, path_y, path_start_x, path_start_y));
                //closing a subpath moves the current point back to the subpath start
                path_x = path_start_x;
                path_y = path_start_y;
            }
        }

        //rectangle
        else if (operation.equals("re"))
        {
            if (!disableGraphics)
            {
                if (arguments.size() == 4)
                {
                    float x = getLength(arguments.get(0));
                    float y = getLength(arguments.get(1));
                    float width = getLength(arguments.get(2));
                    float height = getLength(arguments.get(3));

                    float[] p1 = transformPosition(x, y);
                    float[] p2 = transformPosition(x + width, y + height);

                    graphicsPath.add(new PathSegment(p1[0], p1[1], p2[0], p1[1]));
                    graphicsPath.add(new PathSegment(p2[0], p1[1], p2[0], p2[1]));
                    graphicsPath.add(new PathSegment(p2[0], p2[1], p1[0], p2[1]));
                    graphicsPath.add(new PathSegment(p1[0], p2[1], p1[0], p1[1]));
                }
            }
        }

        //fill
        else if (operation.equals("f") || operation.equals("F") || operation.equals("f*"))
        {
            renderPath(graphicsPath, false, true);
            graphicsPath.removeAllElements();
        }

        //stroke
        else if (operation.equals("S"))
        {
            renderPath(graphicsPath, true, false);
            graphicsPath.removeAllElements();
        }
        else if (operation.equals("s")) //close and stroke
        {
            //with the graphics disabled no segment may be collected; the path then stays empty
            if (!disableGraphics)
                graphicsPath.add(new PathSegment(path_x, path_y, path_start_x, path_start_y));
            renderPath(graphicsPath, true, false);
            graphicsPath.removeAllElements();
        }

        //stroke and fill
        else if (operation.equals("B") || operation.equals("B*"))
        {
            renderPath(graphicsPath, true, true);
            graphicsPath.removeAllElements();
        }
        else if (operation.equals("b") || operation.equals("b*")) //close, stroke and fill
        {
            //with the graphics disabled no segment may be collected; the path then stays empty
            if (!disableGraphics)
                graphicsPath.add(new PathSegment(path_x, path_y, path_start_x, path_start_y));
            renderPath(graphicsPath, true, true);
            graphicsPath.removeAllElements();
        }

        //cancel path
        else if (operation.equals("n"))
        {
            graphicsPath.removeAllElements();
        }

        //invoke named object - images
        else if (operation.equals("Do"))
        {
            if (!disableImages)
                processImageOperation(arguments);
        }

        super.processOperator(operator, arguments);
    }

    /**
     * Handles the invocation of a named XObject. When the object is an image, it is
     * rotated according to the current transformation, its position on the page is
     * computed and it is passed to {@link #renderImage(float, float, float, float, ImageResource)}.
     * Other XObjects and invocations without a valid object name are ignored here.
     * @param arguments the operands of the invoking operator; the first one is expected to be the object name
     * @throws IOException when the image cannot be read or rendered
     */
    protected void processImageOperation(List<COSBase> arguments) throws IOException
    {
        //a malformed stream may provide no operand or an operand of another type
        if (arguments == null || arguments.isEmpty() || !(arguments.get(0) instanceof COSName))
        {
            log.warn("Object invocation without a valid object name ignored.");
            return;
        }

        COSName objectName = (COSName) arguments.get(0);
        PDXObject xobject = getResources().getXObject(objectName);
        if (xobject instanceof PDImageXObject)
        {
            PDImageXObject pdfImage = (PDImageXObject) xobject;
            BufferedImage outputImage = pdfImage.getImage();
            outputImage = rotateImage(outputImage);

            ImageResource imageData = new ImageResource(getTitle(), outputImage);

            Rectangle2D bounds = calculateImagePosition(pdfImage);
            float x = (float) bounds.getX();
            float y = (float) bounds.getY();

            renderImage(x, y, (float) bounds.getWidth(), (float) bounds.getHeight(), imageData);
        }
    }

    /**
     * Rotates an image by the rotation contained in the current transformation matrix
     * reduced by the rotation of the current page. The position and size of the image
     * are expressed by CSS attributes, therefore only the rotation is taken from the
     * matrix (no whitespace offset caused by translations).
     * @param outputImage the image to be rotated
     * @return the image returned by {@code ImageUtils.rotateImage}
     */
    private BufferedImage rotateImage(BufferedImage outputImage)
    {
        AffineTransform ctmTransform = getGraphicsState().getCurrentTransformationMatrix().createAffineTransform();
        double matrixAngle = Math.atan2(ctmTransform.getShearY(), ctmTransform.getScaleY());
        double pageAngle = Math.toRadians(pdpage.getRotation());
        return ImageUtils.rotateImage(outputImage, matrixAngle - pageAngle);
    }

    /**
     * Computes the bounding rectangle of an image in the coordinates of the output page.
     * The raster bounds of the image are mapped by the current transformation matrix
     * (scaled by the inverse image size with a flipped Y axis) followed by the current
     * page transformation.
     * @param pdfImage the image being placed
     * @return the bounds of the transformed image rectangle
     * @throws IOException when the image data cannot be read
     */
    private Rectangle2D calculateImagePosition(PDImageXObject pdfImage) throws IOException
    {
        //createAffineTransform() provides a new transform object that is modified below
        AffineTransform imageToUser = getGraphicsState().getCurrentTransformationMatrix().createAffineTransform();
        Rectangle2D rasterBounds = pdfImage.getImage().getRaster().getBounds();

        imageToUser.scale(1.0 / pdfImage.getWidth(), -1.0 / pdfImage.getHeight());
        imageToUser.translate(0, -pdfImage.getHeight());

        AffineTransform imageToPage = createCurrentPageTransformation();
        imageToPage.concatenate(imageToUser);

        return imageToPage.createTransformedShape(rasterBounds).getBounds2D();
    }

    /**
     * Consumes a single positioned text chunk. Diacritics are remembered and merged
     * into the following text that contains them; chunks that are empty after trimming
     * are ignored. Other chunks are appended to the text line being built; a new box is
     * started when the chunk is too far from the previous one, when the text direction
     * changes or when the style changes.
     * @param text the positioned text
     */
    @Override
    protected void processTextPosition(TextPosition text)
    {
        if (text.isDiacritic())
        {
            lastDia = text;
            return;
        }
        if (text.getUnicode().trim().isEmpty())
            return; //whitespace only

        //attach the pending diacritic (if it belongs to this text)
        if (lastDia != null)
        {
            if (text.contains(lastDia))
                text.mergeDiacritic(lastDia);
            lastDia = null;
        }

        cur_x = text.getX();
        cur_y = text.getY();

        //should we split the boxes?
        boolean split;
        if (lastText == null)
        {
            split = true;
        }
        else
        {
            float distx = text.getX() - (lastText.getX() + lastText.getWidth());
            float disty = text.getY() - lastText.getY();
            boolean directionChanged =
                    isReversed(getTextDirectionality(text)) != isReversed(getTextDirectionality(lastText));
            split = distx > 1.0f || distx < -6.0f || Math.abs(disty) > 1.0f || directionChanged;
        }

        //if the style changed, we should split the boxes
        updateStyle(style, text);
        if (!style.equals(curstyle))
            split = true;

        if (split) //start of a new box
        {
            //finish current box (if any)
            if (lastText != null)
                finishBox();
            //start a new box
            curstyle = new BoxStyle(style);
        }

        textLine.append(text.getUnicode());
        if (textMetrics != null)
            textMetrics.append(text);
        else
            textMetrics = new TextMetrics(text);
        lastText = text;
    }

    /**
     * Finishes the current box: when some text has been collected, its position and
     * line height are stored to the current style, the text is rendered and the text
     * buffer and metrics are reset. The text is reversed when its first character
     * has a right-to-left directionality.
     */
    protected void finishBox()
    {
        if (textLine.length() == 0)
            return; //nothing collected

        boolean reversed = isReversed(Character.getDirectionality(textLine.charAt(0)));
        String content = reversed ? textLine.reverse().toString() : textLine.toString();

        curstyle.setLeft(textMetrics.getX());
        curstyle.setTop(textMetrics.getTop());
        curstyle.setLineHeight(textMetrics.getHeight());

        renderText(content, textMetrics);

        //start with a clean buffer for the next box
        textLine = new StringBuilder();
        textMetrics = null;
    }

    /**
     * Checks whether the text directionality corresponds to reversed text (very rough).
     * @param directionality the directionality constant as defined in {@link Character}
     * @return {@code true} for the right-to-left, right-to-left Arabic, right-to-left
     * embedding and right-to-left override directionalities, {@code false} otherwise
     */
    protected boolean isReversed(byte directionality)
    {
        return directionality == Character.DIRECTIONALITY_RIGHT_TO_LEFT
                || directionality == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC
                || directionality == Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING
                || directionality == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE;
    }

    /**
     * Updates the text style according to a new text position: font size, line height
     * and, when the font has a name, the font weight, style and family. Finally, the
     * colors are updated according to the current text rendering mode.
     * <p>
     * The weight and style are derived from all the subtypes in {@link #pdFontType}
     * contained in the font name, so that a name containing e.g. both "roman" and "bold"
     * is recognized as bold and "bolditalic" as both bold and italic.
     * @param bstyle the style to be updated
     * @param text the text position
     */
    protected void updateStyle(BoxStyle bstyle, TextPosition text)
    {
        String font = text.getFont().getName();

        bstyle.setFontSize(text.getFontSizeInPt());
        bstyle.setLineHeight(text.getHeight());

        if (font != null)
        {
            //locale-independent lower-casing (the default locale may map 'I' to a dotless i)
            String lcFont = font.toLowerCase(java.util.Locale.ROOT);

            //font style and weight: start from the defaults and let every matching
            //subtype contribute its non-default value
            String weight = cssFontWeight[0];
            String fstyle = cssFontStyle[0];
            for (int i = 0; i < pdFontType.length; i++)
            {
                if (lcFont.contains(pdFontType[i]))
                {
                    if (!cssFontWeight[i].equals(cssFontWeight[0]))
                        weight = cssFontWeight[i];
                    if (!cssFontStyle[i].equals(cssFontStyle[0]))
                        fstyle = cssFontStyle[i];
                }
            }
            bstyle.setFontWeight(weight);
            bstyle.setFontStyle(fstyle);

            //font family
            //If it's a known common font don't embed in html output to save space
            String family = findKnownFontFamily(font);
            if (family.isEmpty())
            {
                family = fontTable.getUsedName(text.getFont());
                if (family == null)
                    family = font;
            }
            bstyle.setFontFamily(family);
        }

        //NOTE(review): this helper updates the 'style' field, not necessarily 'bstyle'
        updateStyleForRenderingMode();
    }

    /**
     * Finds the first known font family from {@link #cssFontFamily} whose name (lower-cased,
     * with the whitespace removed) is contained in the lower-cased font name.
     * @param font the font name, must not be {@code null}
     * @return the known font family name or an empty string when no family matches
     */
    private String findKnownFontFamily(String font) {
        //locale-independent lower-casing (the default locale may map 'I' to a dotless i)
        String lcFont = font.toLowerCase(java.util.Locale.ROOT);
        for (String fontFamilyOn : cssFontFamily)
        {
            String key = fontFamilyOn.toLowerCase(java.util.Locale.ROOT).replaceAll("\\s+", "");
            if (lcFont.contains(key))
                return fontFamilyOn;
        }

        return "";
    }

    /**
     * Sets the text color and stroke color of {@link #style} according to the current
     * text rendering mode: a color that is not used by the mode is set to the
     * transparent color.
     */
    private void updateStyleForRenderingMode()
    {
        final String fill = colorString(getGraphicsState().getNonStrokingColor());
        final String stroke = colorString(getGraphicsState().getStrokingColor());

        style.setColor(isTextFillEnabled() ? fill : BoxStyle.transparentColor);
        style.setStrokeColor(isTextStrokeEnabled() ? stroke : BoxStyle.transparentColor);
    }

    /**
     * Checks whether the current text rendering mode strokes the text.
     * @return {@code true} for the stroke and fill-and-stroke modes (with or without clipping)
     */
    private boolean isTextStrokeEnabled()
    {
        final RenderingMode current = getGraphicsState().getTextState().getRenderingMode();
        final boolean strokeOnly = current == STROKE || current == STROKE_CLIP;
        final boolean fillAndStroke = current == FILL_STROKE || current == FILL_STROKE_CLIP;
        return strokeOnly || fillAndStroke;
    }

    /**
     * Checks whether the current text rendering mode fills the text.
     * @return {@code true} for the fill and fill-and-stroke modes (with or without clipping)
     */
    private boolean isTextFillEnabled()
    {
        final RenderingMode current = getGraphicsState().getTextState().getRenderingMode();
        final boolean fillOnly = current == FILL || current == FILL_CLIP;
        final boolean fillAndStroke = current == FILL_STROKE || current == FILL_STROKE_CLIP;
        return fillOnly || fillAndStroke;
    }

    /**
     * Obtains the box that defines the layout of the current page. Despite the method
     * name, the crop box of the page is returned.
     * @return the crop box rectangle of the current page
     */
    protected PDRectangle getCurrentMediaBox()
    {
        return pdpage.getCropBox();
    }

    //===========================================================================================

    /**
     * Transforms a length according to the current transformation matrix.
     * The length is used as the X translation of a matrix that is multiplied by the
     * current transformation matrix; the X translation of the product is the result.
     * @param w the length to be transformed
     * @return the transformed length
     */
    protected float transformLength(float w)
    {
        Matrix shift = new Matrix();
        shift.setValue(2, 0, w);
        Matrix current = getGraphicsState().getCurrentTransformationMatrix();
        return shift.multiply(current).getTranslateX();
    }

    /**
     * Transforms a position according to the current transformation matrix and current page transformation.
     * @param x the X coordinate to be transformed
     * @param y the Y coordinate to be transformed
     * @return the array {X, Y} containing the transformed coordinates
     */
    protected float[] transformPosition(float x, float y)
    {
        Point2D.Float userPoint = super.transformedPoint(x, y);
        Point2D pagePoint = createCurrentPageTransformation().transform(userPoint, null);
        float pageX = (float) pagePoint.getX();
        float pageY = (float) pagePoint.getY();
        return new float[]{pageX, pageY};
    }

    /**
     * Creates the transformation from the PDF page coordinates to the coordinates of the
     * output page. It takes the page rotation and the crop box into account and flips
     * the Y axis.
     * @return a new transformation for the current page
     */
    protected AffineTransform createCurrentPageTransformation()
    {
        final PDRectangle crop = pdpage.getCropBox();
        final int rotation = pdpage.getRotation();
        final AffineTransform result = new AffineTransform();

        //shift applied for the rotated pages; other rotations get no shift
        if (rotation == 90)
            result.translate(crop.getHeight(), 0);
        else if (rotation == 180)
            result.translate(crop.getWidth(), crop.getHeight());
        else if (rotation == 270)
            result.translate(0, crop.getWidth());

        result.rotate(Math.toRadians(rotation));
        //flip the Y axis and move the crop box corner to the origin
        result.translate(0, crop.getHeight());
        result.scale(1, -1);
        result.translate(-crop.getLowerLeftX(), -crop.getLowerLeftY());

        return result;
    }

    /**
     * Obtains an integer number from a PDF number value.
     * @param value the PDF value of the Integer or Float type
     * @return the corresponding integer value or 0 when the value is not a number
     */
    protected int intValue(COSBase value)
    {
        return (value instanceof COSNumber) ? ((COSNumber) value).intValue() : 0;
    }

    /**
     * Obtains a floating point number from a PDF number value.
     * @param value the PDF value of the Integer or Float type
     * @return the corresponding numeric value or 0 when the value is not a number
     */
    protected float floatValue(COSBase value)
    {
        return (value instanceof COSNumber) ? ((COSNumber) value).floatValue() : 0;
    }

    /**
     * Obtains a length in points from a PDF number value.
     * @param value the PDF value of the Integer or Float type
     * @return the resulting length in points
     */
    protected float getLength(COSBase value)
    {
        //no conversion is done right now, we count in PDF units
        final float length = floatValue(value);
        return length;
    }

    /**
     * Obtains a string from a PDF value.
     * @param value the PDF value of the String, Integer or Float type
     * @return the corresponding string value; numbers are converted via their float
     * value and other values give an empty string
     */
    protected String stringValue(COSBase value)
    {
        if (value instanceof COSString)
        {
            COSString str = (COSString) value;
            return str.getString();
        }
        if (value instanceof COSNumber)
        {
            COSNumber num = (COSNumber) value;
            return String.valueOf(num.floatValue());
        }
        return "";
    }

    /**
     * Creates a CSS hexadecimal color specification (#rrggbb) from the color component values.
     * Components out of the 0..255 range are clamped to the range so that the result
     * always has exactly six hexadecimal digits.
     * @param ir red value (0..255)
     * @param ig green value (0..255)
     * @param ib blue value (0..255)
     * @return the #rrggbb string
     */
    protected String colorString(int ir, int ig, int ib)
    {
        int r = Math.max(0, Math.min(255, ir));
        int g = Math.max(0, Math.min(255, ig));
        int b = Math.max(0, Math.min(255, ib));
        return String.format("#%02x%02x%02x", r, g, b);
    }

    /**
     * Creates a CSS color specification from the color component values given as
     * fractions. Every component is multiplied by 255 and truncated to an integer.
     * @param r red value (0..1)
     * @param g green value (0..1)
     * @param b blue value (0..1)
     * @return the color string created by {@link #colorString(int, int, int)}
     */
    protected String colorString(float r, float g, float b)
    {
        final int red = (int) (r * 255);
        final int green = (int) (g * 255);
        final int blue = (int) (b * 255);
        return colorString(red, green, blue);
    }

    /**
     * Creates a CSS color specification from a PDF color. The color is converted to RGB
     * using its color space.
     * @param pdcolor the PDF color to be converted
     * @return the color string or {@code null} when the conversion fails (the failure is logged)
     */
    protected String colorString(PDColor pdcolor)
    {
        try
        {
            float[] rgb = pdcolor.getColorSpace().toRGB(pdcolor.getComponents());
            return colorString(rgb[0], rgb[1], rgb[2]);
        }
        catch (IOException e)
        {
            log.error("colorString: IOException: {}", e.getMessage());
        }
        catch (UnsupportedOperationException e)
        {
            log.error("colorString: UnsupportedOperationException: {}", e.getMessage());
        }
        //the conversion failed
        return null;
    }

    /**
     * Obtains the title of the document from the document information.
     * @return the document title or "PDF Document" when the title is missing or empty
     */
    protected String getTitle() {
        final String declared = document.getDocumentInformation().getTitle();
        return (declared == null || declared.isEmpty()) ? "PDF Document" : declared;
    }

    /**
     * Obtains the directionality of a positioned text based on its Unicode string.
     * @param text the positioned text
     * @return the directionality as returned by {@link #getTextDirectionality(String)}
     */
    protected byte getTextDirectionality(TextPosition text)
    {
        final String unicode = text.getUnicode();
        return getTextDirectionality(unicode);
    }

    /**
     * Obtains the directionality of a string based on its first character.
     * @param s the string to be examined
     * @return the directionality of the first character or
     * {@link Character#DIRECTIONALITY_UNDEFINED} for an empty string
     */
    protected byte getTextDirectionality(String s)
    {
        return s.isEmpty()
                ? Character.DIRECTIONALITY_UNDEFINED
                : Character.getDirectionality(s.charAt(0));
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy