All downloads are FREE. The search and download functionality uses the official Maven repository.

org.fit.cssbox.pdf.CSSBoxTree Maven / Gradle / Ivy

Go to download

Pdf2Dom is a PDF parser that converts documents to an HTML DOM representation. The obtained DOM tree may then be serialized to an HTML file or processed further. The inline CSS definitions contained in the resulting document are used to make the HTML page as similar as possible to the PDF input. A command-line utility for converting PDF documents to HTML is included in the distribution package. Pdf2Dom may also be used as an independent Java library with a standard DOM interface for your DOM-based applications, or as an alternative parser for the CSSBox rendering engine in order to add PDF processing capability to CSSBox.

There is a newer version: 2.0.3
Show newest version
/**
 * CSSBoxTree.java
 * (c) Radek Burget, 2011
 *
 * Pdf2Dom is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *  
 * Pdf2Dom is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *  
 * You should have received a copy of the GNU Lesser General Public License
 * along with CSSBox. If not, see <http://www.gnu.org/licenses/>.
 *
 * Created on 27.9.2011, 16:39:00 by burgetr
 */
package org.fit.cssbox.pdf;

import java.awt.Dimension;
import java.awt.Graphics2D;
import java.io.IOException;
import java.net.URL;
import java.util.Arrays;
import java.util.List;

import javax.xml.parsers.ParserConfigurationException;

import cz.vutbr.web.css.CSSFactory;
import cz.vutbr.web.css.Declaration;
import cz.vutbr.web.css.NodeData;
import cz.vutbr.web.css.Term;
import cz.vutbr.web.css.TermFactory;
import cz.vutbr.web.css.TermFunction;
import cz.vutbr.web.css.TermColor;
import cz.vutbr.web.css.TermNumeric.Unit;
import cz.vutbr.web.csskit.Color;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.fit.cssbox.layout.BlockBox;
import org.fit.cssbox.layout.BlockReplacedBox;
import org.fit.cssbox.layout.BrowserConfig;
import org.fit.cssbox.layout.ReplacedImage;
import org.fit.cssbox.layout.TextBox;
import org.fit.cssbox.layout.Viewport;
import org.fit.cssbox.layout.VisualContext;
import org.fit.pdfdom.BoxStyle;
import org.fit.pdfdom.PDFDomTree;
import org.fit.pdfdom.PathSegment;
import org.fit.pdfdom.TextMetrics;
import org.fit.pdfdom.resource.ImageResource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Text;
import static org.fit.pdfdom.BoxStyle.transparentColor;

/**
 * This class implements direct creation of a CSSBox tree from a PDF file. It creates a tree of boxes compatible
 * with the original CSSBox {@link org.fit.cssbox.layout.BoxFactory} result for HTML documents. The resulting tree contains the boxes
 * together with their styles. Its further processing (layout and positioning) is the same as for the tree of boxes
 * obtained from the HTML documents.
 * 
 * @author burgetr
 */
public class CSSBoxTree extends PDFDomTree
{
    private static final Logger log = LoggerFactory.getLogger(CSSBoxTree.class);

    /** Scale factor for unknown fonts - it is used to prevent overlapping the boxes when an inappropriate font is used */
    protected float unknownFontScale = 0.95f;
    
    /** Length units used in the output */
    protected Unit unit = Unit.pt;
    
    /** Root graphics context */
    protected Graphics2D g;
    /** Root visual context */
    protected VisualContext ctx;
    /** Preferred dimensions of the result */
    protected Dimension dim;
    /** Base URL used for eventual references in the input file */
    protected URL baseurl;
    
    /** The resulting viewport */
    protected Viewport viewport;
    /** HTML box */
    protected BlockBox html;
    /** BODY box */
    protected BlockBox body;
    /** The box representing the page that is currently being created */
    protected BlockBox pagebox;

    /** Used CSSBox configuration */
    protected BrowserConfig config;
    
    /** Internal counter for assigning the node IDs */
    protected int next_order;
    
    /**
     * Creates a new instance bound to certain graphic context.
     * @param g The graphic context used for displaying the rendered result. It is used for obtaining the font metrics.
     * @param ctx The initial CSSBox visual context used for the viewport.
     * @param dim The initial dimensions of the viewport. The resulting dimensions may be updated according to the page contents.
     * @param baseurl Base url used for loading referenced objects.
     * @throws IOException
     * @throws ParserConfigurationException
     */
    public CSSBoxTree(Graphics2D g, VisualContext ctx, Dimension dim, URL baseurl) throws IOException, ParserConfigurationException
    {
        super();
        this.g = g;
        this.ctx = ctx;
        this.dim = dim;
        this.baseurl = baseurl;
        init();
    }
    
    /**
     * Internal initialization: resets the box order counter.
     */
    private void init()
    {
        next_order = 0;
    }

    /**
     * Obtains the CSSBox configuration that is passed to the viewport.
     * @return the configuration, or {@code null} when none has been set
     */
    public BrowserConfig getConfig()
    {
        return config;
    }

    /**
     * Sets the CSSBox configuration. The value is handed to the viewport when the document
     * is created, so it has to be set before the processing starts in order to take effect.
     * @param config the configuration to be used
     */
    public void setConfig(BrowserConfig config)
    {
        this.config = config;
    }

    /**
     * Obtains the resulting viewport that represents the root node of the resulting box tree.
     * @return the viewport
     */
    public Viewport getViewport()
    {
        return viewport;
    }

    /**
     * Obtains the first unused ID of the box. This can be used for obtaining the box count.
     * @return the first unused box ID
     */
    public int getLastId()
    {
        return next_order;
    }

    /**
     * Processes the given range of pages of a PDF document and builds the box tree.
     * @param pdfdocument the source PDF document
     * @param startPage the value passed to {@code setStartPage()}
     * @param endPage the value passed to {@code setEndPage()}
     * @throws IOException when {@code createDOM()} fails
     */
    public void processDocument(PDDocument pdfdocument, int startPage, int endPage) throws IOException
    {
        setStartPage(startPage);
        setEndPage(endPage);
        createDOM(pdfdocument);
    }
    
    /**
     * Creates the DOM document and the root of the box tree on top of it: the viewport,
     * the HTML box and the BODY box.
     */
    @Override
    protected void createDocument() throws ParserConfigurationException
    {
        super.createDocument();
        //create viewport with the initial dimension
        Element vp = createAnonymousElement(getDocument(), "Xdiv", "block");
        Element root = getDocument().getDocumentElement();
        viewport = new Viewport(vp, g, ctx, null, root, dim.width, dim.height);
        viewport.setConfig(config);
        //create the root boxes
        html = createBlock(viewport, root, false);
        html.setStyle(createBlockStyle());
        viewport.addSubBox(html);
        body = createBlock(html, (Element) root.getElementsByTagName("body").item(0), false);
        body.setStyle(createBodyStyle());
        html.addSubBox(body);
    }

    //===========================================================================================
    
    /**
     * Starts a new page in the DOM and creates the corresponding page box as a child of the BODY box.
     */
    @Override
    protected void startNewPage()
    {
        super.startNewPage();
        pagebox = createBlock(body, curpage, false);
        pagebox.setStyle(createPageStyle());
        body.addSubBox(pagebox);
    }

    /**
     * Renders a text chunk: creates its DOM element, an absolutely positioned block box
     * for it and a text box inside that block.
     * @param data the text to be rendered
     * @param metrics the text metrics; its width is used for the element and the box
     */
    @Override
    protected void renderText(String data, TextMetrics metrics)
    {
        //DOM element
        Element el = createTextElement(data, metrics.getWidth());
        curpage.appendChild(el);
        //Block box
        BlockBox block = createBlock(pagebox, el, false);
        block.setStyle(createTextStyle(curstyle, metrics.getWidth()));
        pagebox.addSubBox(block);
        //Text box
        TextBox text = createTextBox(block, (Text) el.getFirstChild());
        block.addSubBox(text);
    }

    /**
     * Renders a path. When {@code toRectangle()} recognizes the path as a rectangle, a single
     * rectangle box is created. Otherwise, when the path is stroked, every segment is rendered
     * as a separate line box. A path that is neither a rectangle nor stroked produces no output.
     * @param path the path segments
     * @param stroke should the path be stroked?
     * @param fill should the path be filled?
     */
    @Override
    protected void renderPath(List<PathSegment> path, boolean stroke, boolean fill)
    {
        float[] rect = toRectangle(path);
        if (rect != null)
        {
            //rect holds two corner points; convert them to position and size
            final float x = rect[0];
            final float y = rect[1];
            final float width = rect[2] - rect[0];
            final float height = rect[3] - rect[1];
            //DOM element
            Element el = createRectangleElement(x, y, width, height, stroke, fill);
            curpage.appendChild(el);
            //Block box
            BlockBox block = createBlock(pagebox, el, false);
            block.setStyle(createRectangleStyle(x, y, width, height, stroke, fill));
            pagebox.addSubBox(block);
        }
        else if (stroke)
        {
            for (PathSegment segm : path)
            {
                //DOM element
                Element el = createLineElement(segm.getX1(), segm.getY1(), segm.getX2(), segm.getY2());
                curpage.appendChild(el);
                //Block box
                BlockBox block = createBlock(pagebox, el, false);
                block.setStyle(createLineStyle(segm.getX1(), segm.getY1(), segm.getX2(), segm.getY2()));
                pagebox.addSubBox(block);
            }
        }
    }
    
    /**
     * Renders an image: creates its DOM element and a replaced block box positioned
     * as a rectangle without any stroke or fill.
     * @param x The X coordinate of the image.
     * @param y The Y coordinate of the image.
     * @param width The width of the image.
     * @param height The height of the image.
     * @param resource The image resource.
     * @throws IOException
     */
    @Override
    protected void renderImage(float x, float y, float width, float height, ImageResource resource) throws IOException
    {
        //DOM element
        Element el = createImageElement(x, y, width, height, resource);
        curpage.appendChild(el);
        //Image box
        BlockBox block = createBlock(pagebox, el, true);
        block.setStyle(createRectangleStyle(x, y, width, height, false, false));
        pagebox.addSubBox(block);
    }
    
    //===========================================================================================
    
    /**
     * Creates a new DOM element that represents an anonymous box in a document.
     * @param doc the document
     * @param name the anonymous element name (generally arbitrary)
     * @param display the display style value for the block
     * @return the new element
     */
    protected Element createAnonymousElement(Document doc, String name, String display)
    {
        Element div = doc.createElement(name);
        div.setAttribute("class", "Xanonymous");
        div.setAttribute("style", "display:" + display);
        return div;
    }
    
    /**
     * Creates a new block box from the given element with the given parent. No style is assigned to the resulting box. 
     * @param parent The parent box in the tree of boxes.
     * @param n The element that this box belongs to.
     * @param replaced When set to true, a replaced block box will be created. Otherwise, a normal non-replaced block will be created.
     * @return The new block box.
     */
    protected BlockBox createBlock(BlockBox parent, Element n, boolean replaced)
    {
        BlockBox root;
        if (replaced)
        {
            BlockReplacedBox rbox = new BlockReplacedBox(n, (Graphics2D) parent.getGraphics().create(), parent.getVisualContext().create());
            rbox.setViewport(viewport);
            //the replaced content is the image referenced by the 'src' attribute of the element
            rbox.setContentObj(new ReplacedImage(rbox, rbox.getVisualContext(), baseurl, n.getAttribute("src")));
            root = rbox;
        }
        else
        {
            root = new BlockBox(n, (Graphics2D) parent.getGraphics().create(), parent.getVisualContext().create());
            root.setViewport(viewport);
        }
        root.setBase(baseurl);
        root.setParent(parent);
        root.setContainingBlockBox(parent);
        root.setClipBlock(viewport);
        root.setOrder(next_order++);
        return root;
    }
    
    /**
     * Creates a text box with the given parent and text node assigned.
     * @param contblock The parent node (and the containing block at the same time)
     * @param n The corresponding text node in the DOM tree.
     * @return The new text box.
     */
    protected TextBox createTextBox(BlockBox contblock, Text n)
    {
        TextBox text = new TextBox(n, (Graphics2D) contblock.getGraphics().create(), contblock.getVisualContext().create());
        text.setOrder(next_order++);
        text.setContainingBlockBox(contblock);
        text.setClipBlock(contblock);
        text.setViewport(viewport);
        text.setBase(baseurl);
        return text;
    }

    /**
     * Creates the style declaration for a text box based on the given {@link BoxStyle} structure.
     * @param style The source box style.
     * @param width The width of the text box in the output units.
     * @return The element style definition.
     */
    protected NodeData createTextStyle(BoxStyle style, float width)
    {
        NodeData ret = CSSFactory.createNodeData();
        TermFactory tf = CSSFactory.getTermFactory();
        ret.push(createDeclaration("position", tf.createIdent("absolute")));
        ret.push(createDeclaration("overflow", tf.createIdent("hidden")));
        ret.push(createDeclaration("left", tf.createLength(style.getLeft(), unit)));
        ret.push(createDeclaration("top", tf.createLength(style.getTop(), unit)));
        ret.push(createDeclaration("line-height", tf.createLength(style.getLineHeight(), unit)));
        if (style.getFontFamily() != null)
            ret.push(createDeclaration("font-family", tf.createString(style.getFontFamily())));
        if (style.getFontSize() != 0)
        {
            float size = (float) style.getFontSize();
            //no font family known: shrink the text a bit to prevent overlapping boxes
            if (style.getFontFamily() == null)
                size = size * unknownFontScale;
            ret.push(createDeclaration("font-size", tf.createLength(size, unit)));
        }
        if (style.getFontWeight() != null)
            ret.push(createDeclaration("font-weight", tf.createIdent(style.getFontWeight())));
        if (style.getFontStyle() != null)
            ret.push(createDeclaration("font-style", tf.createIdent(style.getFontStyle())));
        if (style.getWordSpacing() != 0)
            ret.push(createDeclaration("word-spacing", tf.createLength((float) style.getWordSpacing(), unit)));
        if (style.getLetterSpacing() != 0)
            ret.push(createDeclaration("letter-spacing", tf.createLength((float) style.getLetterSpacing(), unit)));
        if (style.getColor() != null)
        {
            String fillColor = style.getColor();

            // text stroke css attrs don't render atm but can use stroke as fall back for fill if fill is transparent
            if (fillColor.equals(transparentColor) && style.getStrokeColor() != null)
                fillColor = style.getStrokeColor();

            ret.push(createDeclaration("color", createTermColor(fillColor)));
        }

        ret.push(createDeclaration("width", tf.createLength(width, unit)));

        return ret;
    }

    /**
     * Creates a color term from a color string. Strings starting with {@code rgba} are split
     * to their comma-separated components that are parsed as integers (surrounding whitespace
     * is ignored); any other string is passed to the term factory directly.
     * @param color The color specification.
     * @return The resulting color term.
     * @throws NumberFormatException when a component of an {@code rgba} string is not an integer
     */
    private static TermColor createTermColor(String color)
    {
        TermFactory tf = CSSFactory.getTermFactory();

        if (color.startsWith("rgba"))
        {
            color = color.replaceAll("rgba|\\)|\\(", "");
            String[] params = color.split(",");

            int[] colorValues = new int[params.length];
            for (int i = 0; i < params.length; i++)
                colorValues[i] = Integer.parseInt(params[i].trim());

            // NOTE(review): the alpha component is parsed as an integer as well, fractional
            // values such as 0.5 are rejected - confirm the expected input range
            TermColor termColor = tf.createColor(0, 0, 0);
            termColor.setValue(new Color(colorValues[0], colorValues[1], colorValues[2], colorValues[3]));

            return termColor;
        }
        else
            return tf.createColor(color);
    }

    /**
     * Creates a block style definition that contains the {@code display: block} declaration only.
     * @return The block style definition.
     */
    protected NodeData createBlockStyle()
    {
        NodeData ret = CSSFactory.createNodeData();
        TermFactory tf = CSSFactory.getTermFactory();
        ret.push(createDeclaration("display", tf.createIdent("block")));
        return ret;
    }
    
    /**
     * Creates a style definition used for the body element.
     * @return The body style definition.
     */
    protected NodeData createBodyStyle()
    {
        NodeData ret = createBlockStyle();
        TermFactory tf = CSSFactory.getTermFactory();
        ret.push(createDeclaration("background-color", tf.createColor(255, 255, 255)));
        return ret;
    }
    
    /**
     * Creates a style definition used for pages. The page size is taken from the current
     * media box; the width and height are swapped for pages rotated by 90 or 270 degrees.
     * @return The page style definition.
     */
    protected NodeData createPageStyle()
    {
        NodeData ret = createBlockStyle();
        TermFactory tf = CSSFactory.getTermFactory();
        ret.push(createDeclaration("position", tf.createIdent("relative")));
        ret.push(createDeclaration("border-width", tf.createLength(1f, Unit.px)));
        ret.push(createDeclaration("border-style", tf.createIdent("solid")));
        ret.push(createDeclaration("border-color", tf.createColor(0, 0, 255)));
        ret.push(createDeclaration("margin", tf.createLength(0.5f, Unit.em)));

        PDRectangle layout = getCurrentMediaBox();
        if (layout != null)
        {
            float w = layout.getWidth();
            float h = layout.getHeight();
            final int rot = pdpage.getRotation();
            if (rot == 90 || rot == 270)
            {
                float x = w; w = h; h = x;
            }
            
            ret.push(createDeclaration("width", tf.createLength(w, unit)));
            ret.push(createDeclaration("height", tf.createLength(h, unit)));
        }
        else
            log.warn("No media box found");
        
        return ret;
    }
    
    /**
     * Creates the style definition used for a rectangle element based on the given properties of the rectangle
     * @param x The X coordinate of the rectangle.
     * @param y The Y coordinate of the rectangle.
     * @param width The width of the rectangle.
     * @param height The height of the rectangle.
     * @param stroke Should there be a stroke around?
     * @param fill Should the rectangle be filled?
     * @return The resulting element style definition.
     */
    protected NodeData createRectangleStyle(float x, float y, float width, float height, boolean stroke, boolean fill)
    {
        float lineWidth = transformLength((float) getGraphicsState().getLineWidth());
        //the border is never thinner than 1 unit
        float lw = (lineWidth < 1f) ? 1f : lineWidth;
        //the content size is reduced by the border width when the border is present
        float wcor = stroke ? lw : 0.0f;
        
        NodeData ret = CSSFactory.createNodeData();
        TermFactory tf = CSSFactory.getTermFactory();
        ret.push(createDeclaration("position", tf.createIdent("absolute")));
        ret.push(createDeclaration("left", tf.createLength(x, unit)));
        ret.push(createDeclaration("top", tf.createLength(y, unit)));
        ret.push(createDeclaration("width", tf.createLength(width - wcor, unit)));
        ret.push(createDeclaration("height", tf.createLength(height - wcor, unit)));
        
        if (stroke)
        {
            ret.push(createDeclaration("border-width", tf.createLength(lw, unit)));
            ret.push(createDeclaration("border-style", tf.createIdent("solid")));
            String color = colorString(getGraphicsState().getStrokingColor());
            //guarded the same way as the fill color below; no border color is declared when unavailable
            if (color != null)
                ret.push(createDeclaration("border-color", tf.createColor(color)));
        }
        
        if (fill)
        {
            String color = colorString(getGraphicsState().getNonStrokingColor());
            if (color != null)
                ret.push(createDeclaration("background-color", tf.createColor(color)));
        }

        return ret;
    }
    
    /**
     * Creates the style definition used for a line element. The line is represented by an
     * absolutely positioned box with a single border side; a {@code rotate()} transformation
     * is added when the line angle is not zero.
     * @param x1 The X coordinate of the first end point.
     * @param y1 The Y coordinate of the first end point.
     * @param x2 The X coordinate of the second end point.
     * @param y2 The Y coordinate of the second end point.
     * @return The resulting element style definition.
     */
    protected NodeData createLineStyle(float x1, float y1, float x2, float y2)
    {
        HtmlDivLine line = new HtmlDivLine(x1, y1, x2, y2);
        String bside = line.getBorderSide();

        NodeData ret = CSSFactory.createNodeData();
        TermFactory tf = CSSFactory.getTermFactory();
        ret.push(createDeclaration("position", tf.createIdent("absolute")));
        ret.push(createDeclaration("left", tf.createLength(line.getLeft(), unit)));
        ret.push(createDeclaration("top", tf.createLength(line.getTop(), unit)));
        ret.push(createDeclaration("width", tf.createLength(line.getWidth(), unit)));
        ret.push(createDeclaration("height", tf.createLength((float) line.getHeight(), unit)));
        ret.push(createDeclaration(bside + "-width", tf.createLength(line.getLineStrokeWidth(), unit)));
        ret.push(createDeclaration(bside + "-style", tf.createIdent("solid")));
        String color = colorString(getGraphicsState().getStrokingColor());
        //same guard as used for the rectangle fill color; no border color is declared when unavailable
        if (color != null)
            ret.push(createDeclaration(bside + "-color", tf.createColor(color)));

        if (line.getAngleDegrees() != 0)
        {
            List<Term<?>> args = Arrays.<Term<?>>asList(
                    tf.createAngle(String.valueOf(line.getAngleDegrees()), Unit.deg, 1));
            TermFunction rotate = tf.createFunction("rotate", args);
            ret.push(createDeclaration("transform", rotate));
        }

        return ret;
    }
    
    /**
     * Creates a single property declaration.
     * @param property Property name.
     * @param term Property value.
     * @return The resulting declaration.
     */
    protected Declaration createDeclaration(String property, Term<?> term)
    {
        Declaration d = CSSFactory.getRuleFactory().createDeclaration();
        d.unlock();
        d.setProperty(property);
        d.add(term);
        return d;
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy