All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.jpedal.objects.structuredtext.StructuredContentHandler Maven / Gradle / Ivy

There is a newer version: 20151002
Show newest version
/*
 * ===========================================
 * Java Pdf Extraction Decoding Access Library
 * ===========================================
 *
 * Project Info:  http://www.idrsolutions.com
 * Help section for developers at http://www.idrsolutions.com/support/
 *
 * (C) Copyright 1997-2015 IDRsolutions and Contributors.
 *
 * This file is part of JPedal/JPDF2HTML5
 *
     This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License as published by the Free Software Foundation; either
    version 2.1 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA


 *
 * ---------------
 * StructuredContentHandler.java
 * ---------------
 */
package org.jpedal.objects.structuredtext;

import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

import org.jpedal.objects.raw.PdfDictionary;
import org.jpedal.objects.raw.PdfObject;
import org.jpedal.render.DynamicVectorRenderer;
import org.w3c.dom.Element;
import org.w3c.dom.Document;
import org.w3c.dom.Node;

/**
 * structured content
 */
public class StructuredContentHandler {

    /**store entries from BMC*/
    private final Map markedContentProperties;

    /**handle nested levels of marked content*/
    int markedContentLevel;

    /**stream of marked content*/
    private StringBuffer markedContentSequence;

    private static final boolean debug=false;

    private boolean contentExtracted;

    private String currentKey;

    final Map keys, dictionaries;
    Map values;

    final boolean buildDirectly;
    
    DynamicVectorRenderer current;

    Document doc;

    Element root;

	private float x1,y1,x2,y2;
    boolean isHTML;
    
    public StructuredContentHandler(final Object markedContent) {

        //build either tree of lookuptable
        if(markedContent instanceof Map){
        	buildDirectly=false;
            values=(Map)markedContent;
        }else{
            buildDirectly=true;
            doc=(Document)markedContent;
            root = doc.createElement("TaggedPDF-doc");
		    doc.appendChild(root);
        }
        
        if(debug) {
            System.out.println("BuildDirectly=" + buildDirectly);
        }

        //this.currentPdfFile=currentPdfFile;

        markedContentProperties=new HashMap();
        markedContentLevel = 0;

        markedContentSequence = new StringBuffer();

        currentKey="";

        
        keys=new HashMap();
        
        dictionaries=new HashMap();

    }

    public void DP(final PdfObject BDCobj) {

        if(debug){
            System.out.println("DP----------------------------------------------------------"+markedContentLevel);

            System.out.println(BDCobj);

            System.out.println("BDCobj="+BDCobj);

        }


    }

    public void BDC(final PdfObject BDCobj) {

        //if start of sequence, reinitialise settings
        if (markedContentLevel == 0) {
            markedContentSequence = new StringBuffer();
        }

        markedContentLevel++;

        //only used in direct mode and breaks non-direct code so remove
        if(buildDirectly) {
            BDCobj.setIntNumber(PdfDictionary.MCID, -1);
        }
        
        final int MCID=BDCobj.getInt(PdfDictionary.MCID);
        
        //save key

        if(MCID!=-1) {
            keys.put(markedContentLevel, String.valueOf(MCID));
        }
        
        dictionaries.put(String.valueOf(markedContentLevel),BDCobj);
    	
        
        if(debug){
            System.out.println("BDC----------------------------------------------------------"+markedContentLevel+" MCID="+MCID);
            System.out.println("BDCobj="+BDCobj);
        }
        
        
    }


    public void BMC(String op) {

        op = setBMCvalues(op);

		if(buildDirectly){
			//read any dictionay work out type
			//PdfObject dict=(PdfObject) dictionaries.get(currentKey);
			//boolean isBMC=dict==null;

			//add node with name for BMC
			if(op!=null){
				//System.out.println(op+" "+root.getElementsByTagName(op));
				Element newRoot=(Element) root.getElementsByTagName(op).item(0);
				
				if(newRoot==null){
					newRoot=doc.createElement(op);
					root.appendChild(newRoot);
				}
				root=newRoot;
			}
        }
	}

    String setBMCvalues(String op) {
        
        //stip off /
        if(op.startsWith("/")) {
            op = op.substring(1);
        }
        
        //if start of sequence, reinitialise settings
        if (markedContentLevel == 0 && !isHTML) {
            markedContentSequence = new StringBuffer();
        }
        markedContentProperties.put(markedContentLevel,op);
        markedContentLevel++;
        if(debug) {
            System.out.println("BMC----------------------------------------------------------level=" + markedContentLevel + " raw op=" + op);
        }
        
        //save label and any dictionary
        keys.put(markedContentLevel,op);

        return op;
    }


    public void EMC() {

    	setEMCValues();
        
        if(buildDirectly){

			final PdfObject BDCobj=(PdfObject) dictionaries.get(currentKey);
			
			final boolean isBMC=(BDCobj==null);
			
			if(debug) {
                System.out.println(isBMC + " " + currentKey + ' ' + BDCobj + " markedContentSequence=" + markedContentSequence);
            }

            //any custom tags
            if(BDCobj!=null){
                final Map metadata=BDCobj.getOtherDictionaries();
                if(metadata!=null){
                    final Iterator customValues=metadata.keySet().iterator();
                    Object key;
                    while(customValues.hasNext()){
                        key=customValues.next();
                        root.setAttribute(key.toString(), metadata.get(key).toString());

                        //if(addCoordinates){
                            root.setAttribute("x1", String.valueOf((int) x1));
                            root.setAttribute("y1", String.valueOf((int) y1));
                            root.setAttribute("x2", String.valueOf((int) x2));
                            root.setAttribute("y2", String.valueOf((int) y2));
                        //}
                    }
                }
            }
            
			//add node with name for BMC
			if(isBMC){
				if(currentKey!=null){
					
					final Node child=doc.createTextNode(stripEscapeChars(markedContentSequence.toString()));
					
					root.appendChild(child);

					final Node oldRoot=root.getParentNode();
					if(oldRoot instanceof Element) {
                        root = (Element) oldRoot;
                    }
				}
			}else{
				//get root key on dictionary (should only be 1)
				//and create node
				//Iterator keys=dict.keySet().iterator();
				String S="p";//(String) keys.next();
				

                //System.out.println("dict="+BDCobj.getObjectRefAsString());
                
				if(S==null) {
                    S = "p";
                }

				final Element tag = doc.createElement(S);
				root.appendChild(tag);

				//add the text
				final Node child=doc.createTextNode(markedContentSequence.toString());
				tag.appendChild(child);
            }

			//reset
			markedContentSequence=new StringBuffer();


        }else{
        	
        	final String ContentSequence = markedContentSequence.toString();

        	//System.out.println(currentKey+" "+markedContentSequence);
            if(debug) {
                System.out.println("write out " + currentKey + " text=" + markedContentSequence + '<');
            }
            
            final PdfObject BDCobj=(PdfObject) (dictionaries.get(String.valueOf(markedContentLevel)));
            
           // System.out.println("BDCobj="+BDCobj+" currentKey="+currentKey);
            
           
            //reset on MCID tag
            int MCID=-1;
            if(BDCobj!=null) {
                MCID = BDCobj.getInt(PdfDictionary.MCID);
            }
            
            if(MCID!=-1){
            	values.put(String.valueOf(MCID),ContentSequence);
            	//System.out.println(MCID+" "+ContentSequence);
                markedContentSequence=new StringBuffer();
            }
            
            //remove used dictionary
            dictionaries.remove(String.valueOf(markedContentLevel));

        }

        if (markedContentLevel > 0) {
            markedContentLevel--;
        }

        if(debug) {
            System.out.println("EMC----------------------------------------------------------" + markedContentLevel);
        }

        
    }

    void setEMCValues() {
        
        //set flag to show some content
        contentExtracted=true;
        
        /**
         * add current structure to tree
         **/
        currentKey=(String)keys.get(markedContentLevel);
        
        //if no MCID use current level as key
        if(currentKey==null) {
            currentKey = String.valueOf(markedContentLevel);
        }
        
        if(debug) {
            System.out.println("currentKey=" + currentKey + ' ' + keys);
        }
    }

    /**store the actual text in the stream*/
    public void setText(final StringBuffer current_value, final float x1, final float y1, final float x2, final float y2) {

    	if(markedContentSequence.length()==0){
    		markedContentSequence=current_value;
    		
    		//lose space at start
    		if(markedContentSequence.length()>0 && markedContentSequence.charAt(0)==' ') {
                markedContentSequence.deleteCharAt(0);
            }
    		
    	}else{ //add space to tidy up
    		
    		//char c=' ',c2=' ';

    		//if(current_value.length()>0)
    		//	c=current_value.charAt(0);

    		//int len=markedContentSequence.length()-1;
    		//if(len>0)
    		//	c2=markedContentSequence.charAt(len);

    		//if(c2!='-' && c!='-' && c!='.')
    		//	markedContentSequence.append(' ');

    		//System.out.println("\nbit=>"+current_value+"<");
    		//System.out.println("whole=>"+markedContentSequence+"<");

    		markedContentSequence.append(current_value);

    	}
    	
    	this.x1=x1;
    	this.y1=y1;
    	this.x2=x2;
    	this.y2=y2;

    }


    
    
    //delete escape chars such as \( but allow for \\
	private static String stripEscapeChars(final Object dict) {
		char c,lastC=' ';
		
		final StringBuilder str=new StringBuilder((String) dict);
		int length=str.length();
		for(int ii=0;ii




© 2015 - 2024 Weber Informatics LLC | Privacy Policy