org.apache.pdfbox.util.PDFStreamEngine Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of pdfbox Show documentation
The Apache PDFBox library is an open source Java tool for working with PDF documents.
There is a newer version: 3.0.2
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.util;

import java.io.IOException;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.Stack;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.exceptions.WrappedIOException;

import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;

import org.apache.pdfbox.pdmodel.common.PDMatrix;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDFontFactory;
import org.apache.pdfbox.pdmodel.font.PDType3Font;

import org.apache.pdfbox.pdmodel.graphics.PDExtendedGraphicsState;
import org.apache.pdfbox.pdmodel.graphics.PDGraphicsState;
import org.apache.pdfbox.pdmodel.graphics.color.PDColorSpace;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObject;

import org.apache.pdfbox.util.operator.OperatorProcessor;

/**
 * This class will run through a PDF content stream and execute certain operations
 * and provide a callback interface for clients that want to do things with the stream.
 * See the PDFTextStripper class for an example of how to use this class.
 *
 * @author Ben Litchfield
 * @version $Revision: 1.38 $
 */
public class PDFStreamEngine
{

    /**
     * Log instance.
     */
    private static final Log LOG = LogFactory.getLog(PDFStreamEngine.class);

    /**
     * The PDF operators that are ignored by this engine.
     */
    private final Set unsupportedOperators = new HashSet();
    
    private PDGraphicsState graphicsState = null;

    private Matrix textMatrix = null;
    private Matrix textLineMatrix = null;
    private Stack graphicsStack = new Stack();

    private Map operators = new HashMap();

    private Stack streamResourcesStack = new Stack();

    private PDPage page;

    private int validCharCnt;
    private int totalCharCnt;

    /**
     * Flag to skip malformed or otherwise unparseable input where possible.
     */
    private boolean forceParsing = false;

    /**
     * Constructor.
     */
    public PDFStreamEngine()
    {
        //default constructor
        validCharCnt = 0;
        totalCharCnt = 0;
        
    }

    /**
     * Constructor with engine properties.  The property keys are all
     * PDF operators, the values are class names used to execute those
     * operators. An empty value means that the operator will be silently
     * ignored.
     *
     * @param properties The engine properties.
     *
     * @throws IOException If there is an error setting the engine properties.
     */
    public PDFStreamEngine( Properties properties ) throws IOException
    {
        if( properties == null ) 
        {
            throw new NullPointerException( "properties cannot be null" );
        }
        Enumeration names = properties.propertyNames();
        for ( Object name : Collections.list( names ) )
        {
            String operator = name.toString();
            String processorClassName = properties.getProperty( operator );
            if( "".equals( processorClassName ) )
            {
                unsupportedOperators.add( operator );
            }
            else
            {
                try
                {
                    Class klass = Class.forName( processorClassName );
                    OperatorProcessor processor =
                        (OperatorProcessor) klass.newInstance();
                    registerOperatorProcessor( operator, processor );
                }
                catch( Exception e )
                {
                    throw new WrappedIOException(
                            "OperatorProcessor class " + processorClassName
                            + " could not be instantiated", e );
                }
            }
        }
        validCharCnt = 0;
        totalCharCnt = 0;
    }

    /**
     * Indicates if force parsing is activated.
     * 
     * @return true if force parsing is active
     */
    public boolean isForceParsing() 
    {
        return forceParsing;
    }

    /**
     * Enable/Disable force parsing.
     * 
     * @param forceParsingValue true activates force parsing
     */
    public void setForceParsing(boolean forceParsingValue) 
    {
        forceParsing = forceParsingValue;
    }

    /**
     * Register a custom operator processor with the engine.
     *
     * @param operator The operator as a string.
     * @param op Processor instance.
     */
    public void registerOperatorProcessor( String operator, OperatorProcessor op )
    {
        op.setContext( this );
        operators.put( operator, op );
    }

    /**
     * This method must be called between processing documents.  The
     * PDFStreamEngine caches information for the document between pages
     * and this will release the cached information.  This only needs
     * to be called if processing a new document.
     *
     */
    public void resetEngine()
    {
        validCharCnt = 0;
        totalCharCnt = 0;
    }

    /**
     * This will process the contents of the stream.
     *
     * @param aPage The page.
     * @param resources The location to retrieve resources.
     * @param cosStream the Stream to execute.
     *
     *
     * @throws IOException if there is an error accessing the stream.
     */
    public void processStream( PDPage aPage, PDResources resources, COSStream cosStream ) throws IOException
    {
        graphicsState = new PDGraphicsState(aPage.findCropBox());
        textMatrix = null;
        textLineMatrix = null;
        graphicsStack.clear();
        streamResourcesStack.clear();
        processSubStream( aPage, resources, cosStream );
    }

    /**
     * Process a sub stream of the current stream.
     *
     * @param aPage The page used for drawing.
     * @param resources The resources used when processing the stream.
     * @param cosStream The stream to process.
     *
     * @throws IOException If there is an exception while processing the stream.
     */
    public void processSubStream(PDPage aPage, PDResources resources, COSStream cosStream) throws IOException 
    {
        page = aPage;
        if (resources != null)
        {
            streamResourcesStack.push(resources);
            try
            {
                processSubStream(cosStream);
            }
            finally
            {
                streamResourcesStack.pop().clear();
            }
        }
        else
        {
            processSubStream(cosStream);
        }
    }

    private void processSubStream(COSStream cosStream) throws IOException 
    {
        List arguments = new ArrayList();
        PDFStreamParser parser = new PDFStreamParser(cosStream, forceParsing);
        try 
        {
            Iterator