// Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
// You can buy this project and download/modify it as often as you want.
/*
* ===========================================
* Java Pdf Extraction Decoding Access Library
* ===========================================
*
* Project Info: http://www.idrsolutions.com
* Help section for developers at http://www.idrsolutions.com/support/
*
* (C) Copyright 1997-2016 IDRsolutions and Contributors.
*
* This file is part of JPedal/JPDF2HTML5
*
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* ---------------
* MarkedContentGenerator.java
* ---------------
*/
package org.jpedal.objects.structuredtext;
import java.util.HashMap;
import java.util.Map;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.jpedal.PdfDecoderInt;
import org.jpedal.io.ObjectStore;
import org.jpedal.io.PdfObjectReader;
import org.jpedal.objects.PdfPageData;
import org.jpedal.objects.PdfResources;
import org.jpedal.objects.layers.PdfLayerList;
import org.jpedal.objects.raw.*;
import org.jpedal.parser.PdfStreamDecoder;
import org.jpedal.parser.ValueTypes;
import org.jpedal.render.SwingDisplay;
import org.jpedal.utils.LogWriter;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Text;
/**
* Extracts the marked (tagged) content of a PDF as an XML document.
*/
public class MarkedContentGenerator {
//PDF file access object; stored by getMarkedContentTree and used there to resolve the structure tree root
private PdfObjectReader currentPdfFile;
//XML builder created in setupTree
private DocumentBuilder db;
//XML document being assembled; this is the value getMarkedContentTree returns
private Document doc;
//top-level "TaggedPDF-doc" element, created in buildTree
private Element root;
//cleared once the tree has been built; the code that cached decoded pages in it is currently commented out in readChildNode
private final Map pageStreams=new HashMap();
//resources handed to getMarkedContentTree (source of the structure tree root and the layer list)
private PdfResources res;
//layer list taken from res in getMarkedContentTree
private PdfLayerList layers;
//page data handed to getMarkedContentTree
private PdfPageData pdfPageData;
//NOTE(review): not read or written in the part of the class reviewed here - confirm usage
private boolean isDecoding;
//when true, trace output is written to System.out
static boolean debug;
//used to indent debug output
static String indent="";
//NOTE(review): not read or written in the part of the class reviewed here - confirm usage
final Map reverseLookup=new HashMap();
//when true, no XML tree is set up, pages are not decoded and no text is added to nodes
boolean isHTML;
/**
 * Main entry point.
 *
 * Stores the supplied objects on this instance, resolves the structure tree
 * root and, unless isHTML is set, builds a new XML document from it. When the
 * structure tree root has a ParentTree the document is built from the
 * structure tree; otherwise it is filled in by decodePageForMarkedContent.
 *
 * @param res resources providing the structure tree root and the layer list
 * @param pdfPageData page data, kept for later use
 * @param currentPdfFile reader used to resolve the structure tree root
 * @return the XML document (whatever doc currently holds when isHTML is set)
 */
public Document getMarkedContentTree(final PdfResources res, final PdfPageData pdfPageData, final PdfObjectReader currentPdfFile) {
    final PdfObject treeRootObj = res.getPdfObject(PdfResources.StructTreeRootObj);

    //keep hold of everything the other methods rely on
    this.res = res;
    this.layers = res.getPdfLayerList();
    this.pdfPageData = pdfPageData;
    this.currentPdfFile = currentPdfFile;

    //read values as needed
    this.currentPdfFile.checkResolved(treeRootObj);

    //nothing is built in HTML mode
    if (isHTML) {
        return doc;
    }

    //create the empty XML document to add data onto
    setupTree();

    boolean treePresent = false;
    if (treeRootObj != null) {
        treePresent = treeRootObj.getDictionary(PdfDictionary.ParentTree) != null;
    }

    if (debug) {
        System.out.println("hastree=" + treePresent);
    }

    if (!treePresent) {
        //no structure tree to follow, so take the content from the page stream
        try {
            decodePageForMarkedContent(1, null, doc);
        } catch (final Exception e) {
            LogWriter.writeLog("Exception: " + e.getMessage());
        }
        return doc;
    }

    //scan PDF and add nodes to XML tree, then flush all cached objects
    buildTree(treeRootObj);
    pageStreams.clear();

    return doc;
}
/**
 * Creates a blank XML document and adds comments saying it was created by JPedal.
 *
 * @throws IllegalStateException if no DocumentBuilder can be created and none
 *         is available from an earlier call (previously this surfaced as a
 *         NullPointerException with the real cause only in the log)
 */
private void setupTree() {
    try {
        final DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        db = dbf.newDocumentBuilder();
    } catch (final ParserConfigurationException e) {
        LogWriter.writeLog("Exception: " + e.getMessage());
        //a builder left over from an earlier call is still usable; without one we cannot continue
        if (db == null) {
            throw new IllegalStateException("Unable to create XML DocumentBuilder", e);
        }
    }

    doc = db.newDocument();
    doc.appendChild(doc.createComment(" Created from JPedal "));
    doc.appendChild(doc.createComment(" http://www.idrsolutions.com "));
}
/**
 * Adds the "TaggedPDF-doc" root element to the document and then walks the
 * PDF structure object, creating the XML tree beneath that root.
 */
private void buildTree(PdfObject structTreeRootObj) {
    final Element topElement = doc.createElement("TaggedPDF-doc");
    this.root = topElement;
    doc.appendChild(topElement);

    traverseContentTree(structTreeRootObj);
}
/**
 * Reads the K value of the supplied structure object and walks it, adding
 * what is found beneath the current root element.
 *
 * K can be a dictionary or an array, so both options are checked. If neither
 * is present the method returns without adding anything.
 *
 * @param structTreeRootObj structure object whose K entry is to be walked
 */
public void traverseContentTree(PdfObject structTreeRootObj) {

    final PdfObject K = structTreeRootObj.getDictionary(PdfDictionary.K);

    if (K != null) {
        if (debug) {
            System.out.println("read child=" + K.getObjectRefAsString());
        }
        readChildNode(K, root, null, "");
        return;
    }

    final PdfArrayIterator Karray = structTreeRootObj.getMixedArray(PdfDictionary.K);

    //readKarray dereferences the iterator straight away, so guard it here
    //in the same way readChildNode does before making the same call
    if (Karray == null) {
        if (debug) {
            System.out.println("Karray is null");
        }
        return;
    }

    if (debug) {
        System.out.println("Karray=");
    }

    readKarray(Karray, root, null, "");

    if (debug) {
        System.out.println("Karray read");
    }
}
/**
 * Processes one structure element: creates the matching XML element (a Span
 * is collapsed into its parent), decodes the element's page if no page
 * content has been passed down yet, and then follows the element's K entry.
 *
 * @param K structure element to process
 * @param root XML element the new element is attached to (may be null)
 * @param pageStream decoded page content passed down from the parent, or null
 * @param fullS dot-separated path of S names leading to this element
 */
private void readChildNode(final PdfObject K, final Element root,Map pageStream, String fullS) {

    //remember the indent on entry so the debug trace unwinds by exactly what this call added
    final String indentOnEntry = indent;

    if(debug){
        indent += " ";
        System.out.println(indent+"read child node "+K.getObjectRefAsString()+ ' ' +K.getInt(PdfDictionary.K));
    }

    //K may hold an array, a dictionary or a plain integer, so fetch all three forms
    final PdfArrayIterator Karray = K.getMixedArray(PdfDictionary.K);
    final int Kint = K.getInt(PdfDictionary.K);
    final PdfObject Kdict = K.getDictionary(PdfDictionary.K);
    final String lang = K.getTextStreamValue(PdfDictionary.Lang);
    final String S = K.getName(PdfDictionary.S);

    fullS=fullS+ '.' +S;

    Element child=null;

    if(debug){
        System.out.println(indent+"S= "+S+ ' ');
        if(S==null){
            System.out.println("S is null in "+K.getObjectRefAsString());
        }
    }

    //add child but collapse /Span into main Tag
    if(S!=null){
        if (S.equals("Span")) {
            child = root;
        } else if(doc!=null){
            child = doc.createElement(cleanName(S));

            //only touch the new element inside this branch: with no document
            //(for example in HTML mode) there is no element to decorate or attach
            if (lang != null) {
                child.setAttribute("xml:lang", lang);
            }
            if(root!=null){
                root.appendChild(child);
            }
        }
    }

    //get page object
    final PdfObject Pg=K.getDictionary(PdfDictionary.Pg);

    if(Pg!=null && pageStream==null && !isHTML){

        if(debug) {
            System.out.println(indent + "decode page ");
        }

        pageStream=new HashMap();
        try {
            decodePageForMarkedContent(-1, Pg,pageStream); //-1 deliberate dummy value as should not be used
            //20130717 - caching in pageStreams disabled by Mark for memory issues
            //ie Postgres_Plus_Cloud_Database_Getting_Started_Guide_20130219.pdf
        } catch (final Exception e) {
            LogWriter.writeLog("Exception: " + e.getMessage());
        }
    }

    if(debug) {
        System.out.println(indent + "page decoded karray" + Karray + " Kdict=" + Kdict + " kint=" + Kint);
    }

    if (Karray != null) {
        readKarray(Karray,child,pageStream, fullS);
    }else if(Kdict!=null){
        readChildNode(Kdict, child,pageStream,fullS);
    } else if (Kint != -1 && !isHTML) { // actual value
        //reached the bottom so allow recursion to unwind naturally
        addContentToNode(pageStream, String.valueOf(Kint), child);
    } else if(K.getTextStreamValue(PdfDictionary.T)!=null){
        //element carries a T value - nothing is added for it
    } else if(debug){
        System.out.println("unimplemented "+K.getObjectRefAsString());
    }

    if(debug){
        System.out.println(indent+"child node read "+K.getObjectRefAsString());
        //restore rather than trim a fixed number of characters, which could run past the start of the string
        indent=indentOnEntry;
    }
}
/**
 * Looks up the text stored for the given key in the decoded page content
 * and appends it to the supplied element as a text node. Does nothing in
 * HTML mode.
 *
 * @param pageStream decoded page content keyed by the K value as a String;
 *        may be null when no page was decoded for this branch
 * @param Kint key to look up
 * @param child element to receive the text; may be null when the structure
 *        element had no S name
 */
private void addContentToNode(final Map pageStream, final String Kint, final Element child) {

    if (isHTML) {
        return;
    }

    //no decoded page means there is nothing to look up
    String text = null;
    if (pageStream != null) {
        text = (String) pageStream.get(Kint);
    }

    if (text != null) {
        text = handleXMLCharacters(text);

        //need both a document to create the node and an element to hang it on
        if (doc != null && child != null) {
            child.appendChild(doc.createTextNode(text));
        }
    }

    if (debug) {
        System.out.println(indent + " added " + text);
    }
}
/**
 * Converts the XML entities &amp;lt; and &amp;gt; in the supplied text back into
 * the characters they stand for.
 *
 * The caller passes the result to Document.createTextNode, which takes the
 * raw characters, so any entities in the text have to be decoded first.
 *
 * @param text text to convert (must not be null)
 * @return the text with &amp;lt; and &amp;gt; replaced by their characters
 */
private static String handleXMLCharacters(String text) {
    //literal replacement - no regular expression is needed for fixed strings
    text = text.replace("&lt;", "<");
    text = text.replace("&gt;", ">");
    return text;
}
private void readKarray(final PdfArrayIterator Karray, final Element root, final Map pageStream, String fullS) {
final int count=Karray.getTokenCount();
PdfObject kidObj;
String KValue;
for(int i=0;i