
org.fit.pdfdom.PDFDomTree Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of pdf2dom Show documentation
Show all versions of pdf2dom Show documentation
Pdf2Dom is a PDF parser that converts the documents to a HTML DOM representation. The obtained DOM tree may be then serialized to a HTML file or further processed. The inline CSS definitions contained in the resulting document are used for making the HTML page as similar as possible to the PDF input. A command-line utility for converting the PDF documents to HTML is included in the distribution package. Pdf2Dom may be also used as an independent Java library with a standard DOM interface for your DOM-based applications or as an alternative parser for the CSSBox rendering engine in order to add the PDF processing capability to CSSBox.
/**
* PDFDomTree.java
* (c) Radek Burget, 2011
*
* Pdf2Dom is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Pdf2Dom is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with CSSBox. If not, see .
*
* Created on 13.9.2011, 14:17:24 by burgetr
*/
package org.fit.pdfdom;
import java.io.IOException;
import java.io.Writer;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.pdfbox.exceptions.WrappedIOException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentType;
import org.w3c.dom.Element;
import org.w3c.dom.Text;
import org.w3c.dom.bootstrap.DOMImplementationRegistry;
import org.w3c.dom.ls.DOMImplementationLS;
import org.w3c.dom.ls.LSOutput;
import org.w3c.dom.ls.LSSerializer;
/**
* A DOM representation of a PDF file.
*
* @author burgetr
*/
public class PDFDomTree extends PDFBoxTree
{
/** Default style placed in the begining of the resulting document */
protected String defaultStyle = ".page{position:relative; border:1px solid blue;margin:0.5em}\n" +
".p,.r{position:absolute;}";
/** The resulting document representing the PDF file. */
protected Document doc;
/** The head element of the resulting document. */
protected Element head;
/** The body element of the resulting document. */
protected Element body;
/** The title element of the resulting document. */
protected Element title;
/** The element representing the page currently being created in the resulting document. */
protected Element curpage;
/** Text element counter for assigning IDs to the text elements. */
protected int textcnt;
/** Page counter for assigning IDs to the pages. */
protected int pagecnt;
/**
* Creates a new PDF DOM parser.
* @throws IOException
* @throws ParserConfigurationException
*/
public PDFDomTree() throws IOException, ParserConfigurationException
{
super();
init();
}
/**
* Internal initialization.
* @throws ParserConfigurationException
*/
private void init() throws ParserConfigurationException
{
pagecnt = 0;
textcnt = 0;
}
/**
* Creates a new empty HTML document tree.
* @throws ParserConfigurationException
*/
protected void createDocument() throws ParserConfigurationException
{
DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance();
DocumentBuilder builder = builderFactory.newDocumentBuilder();
DocumentType doctype = builder.getDOMImplementation().createDocumentType("html", "-//W3C//DTD XHTML 1.1//EN", "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd");
doc = builder.getDOMImplementation().createDocument("http://www.w3.org/1999/xhtml", "html", doctype);
head = doc.createElement("head");
Element meta = doc.createElement("meta");
meta.setAttribute("http-equiv", "content-type");
meta.setAttribute("content", "text/html;charset=utf-8");
head.appendChild(meta);
title = doc.createElement("title");
title.setTextContent("PDF Document");
head.appendChild(title);
Element gs = doc.createElement("style");
gs.setAttribute("type", "text/css");
gs.setTextContent(defaultStyle);
head.appendChild(gs);
body = doc.createElement("body");
Element root = doc.getDocumentElement();
root.appendChild(head);
root.appendChild(body);
}
/**
* Obtains the resulting document tree.
* @return The DOM root element.
*/
public Document getDocument()
{
return doc;
}
@Override
public void processDocument(PDDocument document, int startPage, int endPage)
throws IOException
{
try {
createDocument();
} catch (ParserConfigurationException e) {
throw new WrappedIOException("Error: parser configuration error", e);
}
super.processDocument(document, startPage, endPage);
//use the PDF title
String doctitle = document.getDocumentInformation().getTitle();
if (doctitle != null && doctitle.trim().length() > 0)
title.setTextContent(doctitle);
}
/**
* Parses a PDF document and serializes the resulting DOM tree to an output. This requires
* a DOM Level 3 capable implementation to be available.
*/
@Override
public void writeText(PDDocument doc, Writer outputStream) throws IOException
{
try
{
DOMImplementationRegistry registry = DOMImplementationRegistry.newInstance();
DOMImplementationLS impl = (DOMImplementationLS)registry.getDOMImplementation("LS");
LSSerializer writer = impl.createLSSerializer();
LSOutput output = impl.createLSOutput();
writer.getDomConfig().setParameter("format-pretty-print", true);
output.setCharacterStream(outputStream);
processDocument(doc);
writer.write(getDocument(), output);
} catch (ClassCastException e) {
throw new WrappedIOException("Error: cannot initialize the DOM serializer", e);
} catch (ClassNotFoundException e) {
throw new WrappedIOException("Error: cannot initialize the DOM serializer", e);
} catch (InstantiationException e) {
throw new WrappedIOException("Error: cannot initialize the DOM serializer", e);
} catch (IllegalAccessException e) {
throw new WrappedIOException("Error: cannot initialize the DOM serializer", e);
}
}
//===========================================================================================
@Override
protected void startNewPage()
{
curpage = createPageElement();
body.appendChild(curpage);
}
@Override
protected void renderText(String data)
{
curpage.appendChild(createTextElement(data));
}
@Override
protected void renderRectangle(float x, float y, float width, float height, boolean stroke, boolean fill)
{
curpage.appendChild(createRectangleElement(x, y, width, height, stroke, fill));
}
@Override
protected void renderImage(float x, float y, float width, float height, String mimetype, byte[] data)
{
curpage.appendChild(createImageElement(x, y, width, height, mimetype, data));
}
//===========================================================================================
/**
* Creates an element that represents a single page.
* @return the resulting DOM element
*/
protected Element createPageElement()
{
String pstyle = "";
PDRectangle layout = getCurrentMediaBox();
if (layout != null)
{
pstyle = "width:" + layout.getUpperRightX() + UNIT + ";"
+ "height:" + layout.getUpperRightY() + UNIT;
}
else
System.err.println("Warning: no media box found");
Element el = doc.createElement("div");
el.setAttribute("id", "page_" + (pagecnt++));
el.setAttribute("class", "page");
el.setAttribute("style", pstyle);
return el;
}
/**
* Creates an element that represents a single positioned box with no content.
* @return the resulting DOM element
*/
protected Element createTextElement()
{
Element el = doc.createElement("div");
el.setAttribute("id", "p" + (textcnt++));
el.setAttribute("class", "p");
el.setAttribute("style", curstyle.toString());
return el;
}
/**
* Creates an element that represents a single positioned box containing the specified text string.
* @param data the text string to be contained in the created box.
* @return the resulting DOM element
*/
protected Element createTextElement(String data)
{
Element el = createTextElement();
Text text = doc.createTextNode(data);
el.appendChild(text);
return el;
}
/**
* Creates an element that represents a rectangle drawn at the specified coordinates in the page.
* @param x the X coordinate of the rectangle
* @param y the Y coordinate of the rectangle
* @param width the width of the rectangle
* @param height the height of the rectangle
* @param stroke should there be a stroke around?
* @param fill should the rectangle be filled?
* @return the resulting DOM element
*/
protected Element createRectangleElement(float x, float y, float width, float height, boolean stroke, boolean fill)
{
String color = "black";
if (strokingColor != null)
color = strokingColor;
StringBuilder pstyle = new StringBuilder(50);
pstyle.append("left:").append(style.formatLength(x)).append(';');
pstyle.append("bottom:").append(style.formatLength(y)).append(';');
pstyle.append("width:").append(style.formatLength(width)).append(';');
pstyle.append("height:").append(style.formatLength(height)).append(';');
if (stroke)
{
//float old = lineWidth;
lineWidth = transformLength((float) getGraphicsState().getLineWidth());
/*if (lineWidth != old)
System.out.println("LW:" + old + "->" + lineWidth);*/
String lw = lineWidth == 0 ? "1px" : lineWidth + "pt";
pstyle.append("border:").append(lw).append(" solid ").append(color).append(';');
}
if (fill)
{
pstyle.append("background-color:").append(style.getColor()).append(';');
}
Element el = doc.createElement("div");
el.setAttribute("class", "r");
el.setAttribute("style", pstyle.toString());
el.appendChild(doc.createEntityReference("nbsp"));
return el;
}
/**
* Creates an element that represents an image drawn at the specified coordinates in the page.
* @param x the X coordinate of the image
* @param y the Y coordinate of the image
* @param width the width coordinate of the image
* @param height the height coordinate of the image
* @param type the image type: "png"
or "jpeg"
* @param data the image data depending on the specified type
* @return
*/
protected Element createImageElement(float x, float y, float width, float height, String mimetype, byte[] data)
{
StringBuilder pstyle = new StringBuilder("position:absolute;");
pstyle.append("left:").append(x).append(UNIT).append(';');
pstyle.append("bottom:").append(y).append(UNIT).append(';');
pstyle.append("width:").append(width).append(UNIT).append(';');
pstyle.append("height:").append(height).append(UNIT).append(';');
//pstyle.append("border:1px solid red;");
Element el = doc.createElement("img");
el.setAttribute("style", pstyle.toString());
if (!disableImageData)
{
char[] cdata = Base64Coder.encode(data);
String imgdata = "data:" + mimetype + ";base64," + new String(cdata);
el.setAttribute("src", imgdata);
}
else
el.setAttribute("src", "");
return el;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy