// org.jpedal.objects.structuredtext.MarkedContentGenerator Maven / Gradle / Ivy

/*
 * ===========================================
 * Java Pdf Extraction Decoding Access Library
 * ===========================================
 *
 * Project Info:  http://www.idrsolutions.com
 * Help section for developers at http://www.idrsolutions.com/support/
 *
 * (C) Copyright 1997-2016 IDRsolutions and Contributors.
 *
 * This file is part of JPedal/JPDF2HTML5
 *
     This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License as published by the Free Software Foundation; either
    version 2.1 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA


 *
 * ---------------
 * MarkedContentGenerator.java
 * ---------------
 */
package org.jpedal.objects.structuredtext;

import java.util.HashMap;
import java.util.Map;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.jpedal.PdfDecoderInt;
import org.jpedal.io.ObjectStore;
import org.jpedal.io.PdfObjectReader;
import org.jpedal.objects.PdfPageData;
import org.jpedal.objects.PdfResources;
import org.jpedal.objects.layers.PdfLayerList;
import org.jpedal.objects.raw.*;
import org.jpedal.parser.PdfStreamDecoder;
import org.jpedal.parser.ValueTypes;
import org.jpedal.render.SwingDisplay;
import org.jpedal.utils.LogWriter;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Text;

/**
 * Extracts the marked (tagged) content of a PDF as an XML Document.
 */
public class MarkedContentGenerator {
    
    //reader used to resolve PDF objects (set on each call to getMarkedContentTree)
    private PdfObjectReader currentPdfFile;
    
    //builder created in setupTree() and used to make the empty XML document
    private DocumentBuilder db;
    
    //XML document being built and returned by getMarkedContentTree
    private Document doc;
    
    //top-level "TaggedPDF-doc" element created in buildTree()
    private Element root;
    
    //cleared once the tree has been built; the code which cached decoded pages here is currently commented out
    private final Map pageStreams=new HashMap();
    
    //resources passed in to getMarkedContentTree
    private PdfResources res;
    
    //layer list taken from res in getMarkedContentTree
    private PdfLayerList layers;
    
    //page data passed in to getMarkedContentTree
    private PdfPageData pdfPageData;
    
    //NOTE(review): not referenced in the visible part of this file - confirm usage
    private boolean isDecoding;
    
    //when true, progress is traced to System.out
    static boolean debug;
    
    //used to indent debug output
    static String indent="";
    
    //NOTE(review): not referenced in the visible part of this file - confirm usage
    final Map reverseLookup=new HashMap();
    
    //when true, no XML document is created and page content is neither decoded nor added to nodes
    boolean isHTML;
    
    /**
     * Main entry point.
     *
     * Stores the supplied objects on this instance, resolves the StructTreeRoot
     * object and, unless isHTML is set, creates a new XML document and fills it
     * either from the structure tree (when it has a ParentTree) or from the
     * page stream.
     *
     * @param res resources holding the StructTreeRoot object and the layer list
     * @param pdfPageData page data kept for later use
     * @param currentPdfFile reader used to resolve PDF objects
     * @return the current XML document (not rebuilt when isHTML is set)
     */
    public Document getMarkedContentTree(final PdfResources res, final PdfPageData pdfPageData, final PdfObjectReader currentPdfFile) {

        final PdfObject structTreeRootObj = res.getPdfObject(PdfResources.StructTreeRootObj);

        this.res = res;
        this.layers = res.getPdfLayerList();
        this.pdfPageData = pdfPageData;
        this.currentPdfFile = currentPdfFile;

        //read values as needed
        this.currentPdfFile.checkResolved(structTreeRootObj);

        //in HTML mode no XML tree is built
        if (isHTML) {
            return doc;
        }

        //create the empty XML document to add data onto
        setupTree();

        final boolean hasTree = structTreeRootObj != null
                && structTreeRootObj.getDictionary(PdfDictionary.ParentTree) != null;

        if (debug) {
            System.out.println("hastree=" + hasTree);
        }

        if (!hasTree) {

            //no structure tree so work from the page stream
            try {
                decodePageForMarkedContent(1, null, doc);
            } catch (final Exception e) {
                LogWriter.writeLog("Exception: " + e.getMessage());
            }

            return doc;
        }

        //scan PDF and add nodes to XML tree
        buildTree(structTreeRootObj);

        //flush all objects
        pageStreams.clear();

        return doc;
    }
    
    /**
     * Create a blank XML document and add comments to say it was created by JPedal.
     * (The root element is added later by buildTree.)
     *
     * @throws IllegalStateException if a DocumentBuilder cannot be created and
     *         none is available from an earlier call
     */
    private void setupTree() {

        try {
            db = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        } catch (final ParserConfigurationException e) {
            LogWriter.writeLog("Exception: " + e.getMessage());

            //a builder from an earlier call can still be used, but with no builder
            //at all fail with the real cause rather than a NullPointerException below
            if (db == null) {
                throw new IllegalStateException("Unable to create XML DocumentBuilder", e);
            }
        }

        doc = db.newDocument();

        doc.appendChild(doc.createComment(" Created from JPedal "));
        doc.appendChild(doc.createComment(" http://www.idrsolutions.com "));

    }
    
    /**
     * Add the "TaggedPDF-doc" root element to the XML document and then walk
     * the PDF struct object to create the XML tree beneath it.
     */
    private void buildTree(PdfObject structTreeRootObj) {

        final Element docRoot = doc.createElement("TaggedPDF-doc");
        root = docRoot;
        doc.appendChild(docRoot);

        traverseContentTree(structTreeRootObj);
    }

    /**
     * Read the K value of the struct object and process it under the current
     * root element. K can be a dictionary or an array so both options are checked.
     * If neither is present there is nothing to traverse and the method returns.
     *
     * @param structTreeRootObj struct object whose K entry is read
     */
    public void traverseContentTree(PdfObject structTreeRootObj) {

        final PdfObject K =structTreeRootObj.getDictionary(PdfDictionary.K);

        if(K !=null){

            if(debug) {
                System.out.println("read child=" + K.getObjectRefAsString());
            }

            readChildNode(K, root,null,"");
            return;
        }

        final PdfArrayIterator Karray=structTreeRootObj.getMixedArray(PdfDictionary.K);

        //readChildNode treats a null result from getMixedArray as "no array" so do the
        //same here - readKarray would otherwise fail on the null iterator
        if(Karray==null){

            if(debug) {
                System.out.println("no K value to read");
            }

            return;
        }

        if(debug) {
            System.out.println("Karray=");
        }

        readKarray(Karray, root,null, "");

        if(debug) {
            System.out.println("Karray read");
        }
    }
    
    /**
     * Process one struct node: create its XML element (a /Span is collapsed into
     * the parent element), decode the node's page if it names one and no page
     * content has been passed down, and then follow its K value (array,
     * dictionary or integer).
     *
     * @param K struct node to read
     * @param root XML element new elements are appended to (may be null)
     * @param pageStream decoded page content passed down from a parent, or null
     * @param fullS dot-separated S names of the parents
     */
    private void readChildNode(final PdfObject K, final Element root,Map pageStream, String fullS) {

        if(debug){
            indent += "   ";
            System.out.println(indent+"read child node "+K.getObjectRefAsString()+ ' ' +K.getInt(PdfDictionary.K));
        }

        final PdfArrayIterator Karray = K.getMixedArray(PdfDictionary.K);
        final int Kint = K.getInt(PdfDictionary.K);

        final PdfObject Kdict = K.getDictionary(PdfDictionary.K);

        final String lang = K.getTextStreamValue(PdfDictionary.Lang);
        final String S = K.getName(PdfDictionary.S);

        fullS=fullS+ '.' +S;

        Element child=null;

        if(debug){
            System.out.println(indent+"S= "+S+ ' ');

            if(S==null){
                System.out.println("S is null in "+K.getObjectRefAsString());
            }
        }

        //add child but collapse /Span into main Tag
        if(S!=null){
            if (S.equals("Span")) {
                child = root;
            } else if(doc!=null){

                //without a document (setupTree is skipped when isHTML is set) there is
                //no element to create, so the attribute and append are skipped as well
                child = doc.createElement(cleanName(S));

                if (lang != null) {
                    child.setAttribute("xml:lang", lang);
                }

                if(root!=null){
                    root.appendChild(child);
                }
            }
        }

        //get page object
        final PdfObject Pg=K.getDictionary(PdfDictionary.Pg);

        if(Pg!=null && pageStream==null && !isHTML){

            if(debug) {
                System.out.println(indent + "decode page ");
            }

            pageStream=new HashMap();
            try {
                decodePageForMarkedContent(-1, Pg,pageStream); //-1 deliberate bum value as should not be used

                //20130717 - caching in pageStreams disabled by Mark for memory issues
                //ie Postgres_Plus_Cloud_Database_Getting_Started_Guide_20130219.pdf

            } catch (final Exception e) {
                LogWriter.writeLog("Exception: " + e.getMessage());
            }
        }

        if(debug) {
            System.out.println(indent + "page decoded karray" + Karray + " Kdict=" + Kdict + " kint=" + Kint);
        }

        if (Karray != null) {
            readKarray(Karray,child,pageStream, fullS);
        }else if(Kdict!=null){
            readChildNode(Kdict, child,pageStream,fullS);
        } else if (Kint != -1 && !isHTML) { // actual value

            //reached the bottom so allow recursion to unwind naturally
            addContentToNode(pageStream, String.valueOf(Kint), child);
        } else if(K.getTextStreamValue(PdfDictionary.T)!=null){
            //nodes with a T value are recognised but nothing is added for them
        } else if(debug){
            System.out.println("unimplemented "+K.getObjectRefAsString());
        }

        if(debug){
            System.out.println(indent+"child node read "+K.getObjectRefAsString());

            //debug is a static flag which may have been switched on part way through
            //the recursion, so never trim more than is there
            indent=indent.substring(0,Math.max(0,indent.length()-3));
        }
    }
    
    /**
     * Look up the text stored in the decoded page content under the given key
     * and append it to the element as a text node. Does nothing in HTML mode.
     *
     * @param pageStream decoded page content keyed by the K integer as a String;
     *        null when the node named no page and none was passed down
     * @param Kint key to look up
     * @param child element to append to; null when the node had no S value
     */
    private void addContentToNode(final Map pageStream, final String Kint, final Element child) {

        if(isHTML){
            return;
        }

        String text = null;

        //no decoded page content means there is nothing to look up
        if(pageStream!=null){
            text = (String) pageStream.get(Kint);
        }

        if (text != null) {
            text = handleXMLCharacters(text);

            //need both a document to create the node and an element to hold it
            if(doc!=null && child!=null){
                final Text textNode = doc.createTextNode(text);
                child.appendChild(textNode);
            }
        }

        if(debug) {
            System.out.println(indent + " added " + text);
        }
    }
    
    /**
     * Run the text through the '<' and '>' replacements and return the result.
     *
     * NOTE(review): as written each replacement substitutes the character with
     * itself, so the text comes back unchanged. The result is passed to
     * doc.createTextNode by addContentToNode - confirm whether any escaping is
     * actually wanted before changing this.
     *
     * @param text text to process (must not be null)
     * @return the processed text
     */
    private static String handleXMLCharacters(String text) {

        final String lessThanHandled = text.replace("<", "<");

        return lessThanHandled.replace(">", ">");
    }
    
    private void readKarray(final PdfArrayIterator Karray, final Element root, final Map pageStream, String fullS) {
        
        final int count=Karray.getTokenCount();
        PdfObject kidObj;
        String KValue;
        
        for(int i=0;i