All downloads are free. The search and download functionality uses the official Maven repository.

org.apache.pdfbox.util.PDFTextStripperByArea Maven / Gradle / Ivy

Go to download

The Apache PDFBox library is an open source Java tool for working with PDF documents.

There is a newer version: 3.0.2
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.util;

import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Vector;

import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDStream;

/**
 * This will extract text from a specified region in the PDF.
 *
 * @author Ben Litchfield
 * @version $Revision: 1.5 $
 */
public class PDFTextStripperByArea extends PDFTextStripper
{
    private List regions = new ArrayList();
    private Map regionArea = new HashMap();
    private Map>> regionCharacterList = 
        new HashMap>>();
    private Map regionText = new HashMap();

    /**
     * Constructor.
     * @throws IOException If there is an error loading properties.
     */
    public PDFTextStripperByArea() throws IOException
    {
        super();
        setPageSeparator( "" );
    }

        
    /**
     * Instantiate a new PDFTextStripperArea object. Loading all of the operator
     * mappings from the properties object that is passed in. Does not convert
     * the text to more encoding-specific output.
     * 
     * @param props
     *            The properties containing the mapping of operators to
     *            PDFOperator classes.
     * 
     * @throws IOException
     *             If there is an error reading the properties.
     */
    public PDFTextStripperByArea(Properties props) throws IOException
    {
        super(props);
        setPageSeparator("");
    }

    /**
     * Instantiate a new PDFTextStripperArea object. This object will load
     * properties from PDFTextStripper.properties and will apply
     * encoding-specific conversions to the output text.
     * 
     * @param encoding
     *            The encoding that the output will be written in.
     * @throws IOException
     *             If there is an error reading the properties.
     */
    public PDFTextStripperByArea(String encoding) throws IOException
    {
        super(encoding);
        setPageSeparator("");
    }
    
   /**
     * Add a new region to group text by.
     *
     * @param regionName The name of the region.
     * @param rect The rectangle area to retrieve the text from.
     */
    public void addRegion( String regionName, Rectangle2D rect )
    {
        regions.add( regionName );
        regionArea.put( regionName, rect );
    }

    /**
     * Delete a region to group text by. If the region does not exist, this method does nothing.
     *
     * @param regionName The name of the region to delete.
     */
    public void removeRegion(String regionName)
    {
        regions.remove(regionName);
        regionArea.remove(regionName);
    }

    /**
     * Get the list of regions that have been setup.
     *
     * @return A list of java.lang.String objects to identify the region names.
     */
    public List getRegions()
    {
        return regions;
    }

    /**
     * Get the text for the region, this should be called after extractRegions().
     *
     * @param regionName The name of the region to get the text from.
     * @return The text that was identified in that region.
     */
    public String getTextForRegion( String regionName )
    {
        StringWriter text = regionText.get( regionName );
        return text.toString();
    }

    /**
     * Process the page to extract the region text.
     *
     * @param page The page to extract the regions from.
     * @throws IOException If there is an error while extracting text.
     */
    public void extractRegions( PDPage page ) throws IOException
    {
        Iterator regionIter = regions.iterator();
        while( regionIter.hasNext() )
        {
            setStartPage(getCurrentPageNo());
            setEndPage(getCurrentPageNo());
            //reset the stored text for the region so this class
            //can be reused.
            String regionName = regionIter.next();
            Vector> regionCharactersByArticle = new Vector>();
            regionCharactersByArticle.add( new ArrayList() );
            regionCharacterList.put( regionName, regionCharactersByArticle );
            regionText.put( regionName, new StringWriter() );
        }

        PDStream contentStream = page.getContents();
        if( contentStream != null )
        {
            COSStream contents = contentStream.getStream();
            processPage( page, contents );
        }
    }

    
    /**
     * {@inheritDoc}
     */
    protected void processTextPosition( TextPosition text )
    {
        Iterator regionIter = regionArea.keySet().iterator();
        while( regionIter.hasNext() )
        {
            String region = regionIter.next();
            Rectangle2D rect = regionArea.get( region );
            if( rect.contains( text.getX(), text.getY() ) )
            {
                charactersByArticle = (Vector)regionCharacterList.get( region );
                super.processTextPosition( text );
            }
        }
    }

    
    /**
     * This will print the processed page text to the output stream.
     *
     * @throws IOException If there is an error writing the text.
     */
    protected void writePage() throws IOException
    {
        Iterator regionIter = regionArea.keySet().iterator();
        while( regionIter.hasNext() )
        {
            String region = regionIter.next();
            charactersByArticle = (Vector)regionCharacterList.get( region );
            output = regionText.get( region );
            super.writePage();
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy