org.jpedal.objects.structuredtext.MarkedContentGenerator Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of OpenViewerFX Show documentation
Show all versions of OpenViewerFX Show documentation
An Open Source JavaFX PDF Viewer
/*
* ===========================================
* Java Pdf Extraction Decoding Access Library
* ===========================================
*
* Project Info: http://www.idrsolutions.com
* Help section for developers at http://www.idrsolutions.com/support/
*
* (C) Copyright 1997-2017 IDRsolutions and Contributors.
*
* This file is part of JPedal/JPDF2HTML5
*
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* ---------------
* MarkedContentGenerator.java
* ---------------
*/
package org.jpedal.objects.structuredtext;
import java.util.HashMap;
import java.util.Map;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.jpedal.PdfDecoderInt;
import org.jpedal.io.ObjectDecoder;
import org.jpedal.io.ObjectStore;
import org.jpedal.io.PdfObjectReader;
import org.jpedal.objects.PdfPageData;
import org.jpedal.objects.PdfResources;
import org.jpedal.objects.layers.PdfLayerList;
import org.jpedal.objects.raw.*;
import org.jpedal.parser.PdfStreamDecoder;
import org.jpedal.parser.ValueTypes;
import org.jpedal.render.SwingDisplay;
import org.jpedal.utils.LogWriter;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Text;
/**
 * Builds an XML (DOM) Document describing the marked content of a PDF file
 */
public class MarkedContentGenerator {
//reader for the PDF file; assigned in getMarkedContentTree and used there to resolve the struct tree root
private PdfObjectReader currentPdfFile;
//XML builder created in setupTree
private DocumentBuilder db;
//XML document being built; returned by getMarkedContentTree
private Document doc;
//top-level "TaggedPDF-doc" element created in buildTree
private Element root;
//cleared after the tree is built; the code which cached decoded pages in it is currently commented out
private final Map pageStreams=new HashMap();
//assigned in getMarkedContentTree
private PdfResources res;
//layer list taken from res in getMarkedContentTree
private PdfLayerList layers;
//assigned in getMarkedContentTree
private PdfPageData pdfPageData;
private boolean isDecoding;
//when true, diagnostic messages are printed to System.out
static boolean debug;
//used to indent debug output
static String indent="";
final Map reverseLookup=new HashMap();
//RoleMap entries (String key to String value) loaded by readRoleMap and consulted in readChildNode
//NOTE(review): declared as a raw Map - confirm the intended generic type
final Map rolemapLookup = new HashMap();
//when true, getMarkedContentTree does not build the XML tree and readChildNode skips page decoding and text content
boolean isHTML;
/**
 * Main entry point.
 *
 * Stores the supplied objects on this instance, resolves the structure
 * tree root and, unless isHTML is set, creates a fresh XML document and
 * fills it. The structure tree is used when the root object has a
 * ParentTree dictionary; otherwise decodePageForMarkedContent(1, null, doc)
 * is called (presumably page 1 - confirm against that method).
 *
 * @param res resources supplying the StructTreeRoot object and layer list
 * @param pdfPageData page data, stored for later use
 * @param currentPdfFile reader used to resolve PDF objects
 * @return the current value of doc (no new document is created when isHTML is set)
 */
public Document getMarkedContentTree(final PdfResources res, final PdfPageData pdfPageData, final PdfObjectReader currentPdfFile) {

    final PdfObject structRoot = res.getPdfObject(PdfResources.StructTreeRootObj);
    //the MarkInfo object (PdfResources.MarkInfoObj) is not used at present

    //keep hold of the collaborators for the other methods
    this.res = res;
    this.layers = res.getPdfLayerList();
    this.pdfPageData = pdfPageData;
    this.currentPdfFile = currentPdfFile;

    //read values as needed
    currentPdfFile.checkResolved(structRoot);

    if (isHTML) {
        return doc;
    }

    //create the empty XML tree to add data onto
    setupTree();

    boolean treeAvailable = false;
    if (structRoot != null) {
        treeAvailable = structRoot.getDictionary(PdfDictionary.ParentTree) != null;
    }

    if (debug) {
        System.out.println("hastree=" + treeAvailable);
    }

    if (!treeAvailable) {
        //no structure tree, so work from the page stream
        try {
            decodePageForMarkedContent(1, null, doc);
        } catch (final Exception e) {
            LogWriter.writeLog("Exception: " + e.getMessage());
        }
        return doc;
    }

    //scan PDF and add nodes to XML tree
    buildTree(structRoot);

    //flush all objects
    pageStreams.clear();

    return doc;
}
/**
 * Create a blank XML document and add comments to say it was created by JPedal.
 *
 * @throws IllegalStateException if no DocumentBuilder can be obtained
 *         (previously this surfaced as a NullPointerException on db)
 */
private void setupTree() {

    try {
        final DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        db = dbf.newDocumentBuilder();
    } catch (final ParserConfigurationException e) {
        LogWriter.writeLog("Exception: " + e.getMessage());

        //a builder kept from an earlier call is still usable; with none at all we cannot continue
        if (db == null) {
            throw new IllegalStateException("Unable to create XML DocumentBuilder", e);
        }
    }

    doc = db.newDocument();

    doc.appendChild(doc.createComment(" Created from JPedal "));
    doc.appendChild(doc.createComment(" http://www.idrsolutions.com "));
}
/**
 * Walk the PDF structure tree and mirror it in the XML document.
 *
 * Loads the RoleMap (when one is present), creates the "TaggedPDF-doc"
 * root element and then traverses the content tree.
 *
 * @param structTreeRootObj structure tree root object (must not be null)
 */
private void buildTree(final PdfObject structTreeRootObj) {

    //load any name mappings first so they are available during traversal
    final PdfObject roleMapObj = structTreeRootObj.getDictionary(PdfDictionary.RoleMap);
    if (roleMapObj != null) {
        readRoleMap(roleMapObj);
    }

    //top-level element which all content hangs from
    root = doc.createElement("TaggedPDF-doc");
    doc.appendChild(root);

    traverseContentTree(structTreeRootObj);
}
/**
 * Copy every key/value pair of the RoleMap dictionary into rolemapLookup.
 *
 * @param roleMap RoleMap dictionary object
 */
private void readRoleMap(final PdfObject roleMap) {

    for (final PdfKeyPairsIterator pairs = roleMap.getKeyPairsIterator(); pairs.hasMorePairs(); pairs.nextPair()) {

        final String name = pairs.getNextKeyAsString();
        final String mappedName = pairs.getNextValueAsString();

        rolemapLookup.put(name, mappedName);
    }
}
/**
 * Read the K entry of the structure tree root and process it.
 *
 * K can be a dictionary or an array, so the dictionary form is tried
 * first and the array form is used when no dictionary is returned.
 *
 * @param structTreeRootObj structure tree root object
 */
public void traverseContentTree(final PdfObject structTreeRootObj) {

    final PdfObject kidDictionary = structTreeRootObj.getDictionary(PdfDictionary.K);

    if (kidDictionary != null) {

        if (debug) {
            System.out.println("read child=" + kidDictionary.getObjectRefAsString());
        }

        readChildNode(kidDictionary, root, null, "");
        return;
    }

    //no dictionary so treat K as an array
    final PdfArrayIterator kids = structTreeRootObj.getMixedArray(PdfDictionary.K);

    if (debug) {
        System.out.println("Karray=");
    }

    readKarray(kids, root, null, "");

    if (debug) {
        System.out.println("Karray read");
    }
}
/**
 * Read one structure element and add it (and, by recursion, its kids) to the XML tree.
 *
 * The element's S name is mapped through rolemapLookup when an entry exists.
 * A "Span" is collapsed into its parent element; any other name becomes a new
 * XML element appended to root. The K entry is then followed as an array, a
 * dictionary or an integer value, in that order.
 *
 * @param K structure element being read
 * @param root XML element to attach to (may be null)
 * @param pageStream decoded page content looked up by the K integer rendered as a
 *        String, or null if no page has been decoded yet for this branch
 * @param fullS dotted path of the S names of the ancestors
 */
private void readChildNode(final PdfObject K, final Element root, Map pageStream, String fullS) {

    //remember the indent on entry so it is restored exactly on exit
    //(previously a fixed 3 characters were removed regardless of what had been added)
    final String callerIndent = indent;

    if (debug) {
        indent += " ";
        System.out.println(indent + "read child node " + K.getObjectRefAsString() + ' ' + K.getInt(PdfDictionary.K));
    }

    final PdfArrayIterator Karray = K.getMixedArray(PdfDictionary.K);
    final int Kint = K.getInt(PdfDictionary.K);
    final PdfObject Kdict = K.getDictionary(PdfDictionary.K);
    final String lang = K.getTextStreamValue(PdfDictionary.Lang);

    String S = K.getName(PdfDictionary.S);
    if (rolemapLookup.containsKey(S)) {
        //explicit cast so the lookup compiles with the raw Map declaration
        S = (String) rolemapLookup.get(S);
    }

    fullS = fullS + '.' + S;

    Element child = null;

    if (debug) {
        System.out.println(indent + "S= " + S + ' ');

        if (S == null) {
            System.out.println("S is null in " + K.getObjectRefAsString());
        }
    }

    //add child but collapse /Span into main Tag
    if (S != null) {
        if (S.equals("Span")) {
            child = root;
        } else if (doc != null) {
            //only touch the new element when a document exists to create it from
            child = doc.createElement(cleanName(S));

            if (lang != null) {
                child.setAttribute("xml:lang", lang);
            }

            if (root != null) {
                root.appendChild(child);
            }
        }
    }

    //get page object
    final PdfObject Pg = K.getDictionary(PdfDictionary.Pg);
    if (Pg != null && pageStream == null && !isHTML) {

        if (debug) {
            System.out.println(indent + "decode page ");
        }

        pageStream = new HashMap();

        try {
            decodePageForMarkedContent(-1, Pg, pageStream); //-1 deliberate dummy value as should not be used

            //20130717 - caching of decoded pages in pageStreams disabled by Mark for memory issues
            //ie Postgres_Plus_Cloud_Database_Getting_Started_Guide_20130219.pdf
        } catch (final Exception e) {
            LogWriter.writeLog("Exception: " + e.getMessage());
        }
    }

    if (debug) {
        System.out.println(indent + "page decoded karray" + Karray + " Kdict=" + Kdict + " kint=" + Kint);
    }

    if (Karray != null) {
        readKarray(Karray, child, pageStream, fullS);
    } else if (Kdict != null) {
        readChildNode(Kdict, child, pageStream, fullS);
    } else if (Kint != -1 && !isHTML) { // actual value
        //reached the bottom so allow recursion to unwind naturally
        addContentToNode(pageStream, String.valueOf(Kint), child);
    } else if (K.getTextStreamValue(PdfDictionary.T) != null) {
        //entry with a T value - deliberately ignored
    } else if (debug) {
        System.out.println("unimplemented " + K.getObjectRefAsString());
    }

    if (debug) {
        System.out.println(indent + "child node read " + K.getObjectRefAsString());
        indent = callerIndent;
    }
}
/**
 * Append the text stored in pageStream under Kint to child as an XML text node.
 *
 * Does nothing when isHTML is set. A missing page stream, missing text,
 * missing document or missing target element is skipped rather than
 * causing a NullPointerException.
 *
 * @param pageStream decoded page content keyed by String (may be null)
 * @param Kint key of the content to add
 * @param child XML element to receive the text (may be null)
 */
private void addContentToNode(final Map pageStream, final String Kint, final Element child) {

    if (isHTML) {
        return;
    }

    //no page was decoded for this branch, so there is nothing to look up
    String text = pageStream == null ? null : (String) pageStream.get(Kint);

    if (text != null) {
        text = handleXMLCharacters(text);

        //child is null when the structure element had no S name
        if (doc != null && child != null) {
            final Text textNode = doc.createTextNode(text);
            child.appendChild(textNode);
        }
    }

    if (debug) {
        System.out.println(indent + " added " + text);
    }
}
/**
 * Convert the XML entities &amp;lt; and &amp;gt; in text back into literal
 * angle brackets.
 *
 * The result is placed in a DOM text node by the caller, so it must hold
 * the literal characters rather than entities. The previous code replaced
 * each bracket with itself and so changed nothing.
 *
 * NOTE(review): assumes the extracted text arrives with these two entities
 * already encoded - confirm against the page decoder output.
 *
 * @param text text to convert (must not be null)
 * @return text with &amp;lt; and &amp;gt; decoded
 */
private static String handleXMLCharacters(String text) {

    //literal replacement - no regular expression is needed
    text = text.replace("&lt;", "<");
    text = text.replace("&gt;", ">");

    return text;
}
private void readKarray(final PdfArrayIterator Karray, final Element root, final Map pageStream, final String fullS) {
final int count=Karray.getTokenCount();
PdfObject kidObj;
String KValue;
for(int i=0;i