org.apache.pdfbox.util.Splitter Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of pdfbox Show documentation
The Apache PDFBox library is an open source Java tool for working with PDF documents.
There is a newer version: 3.0.2
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.util;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.interactive.action.type.PDAction;
import org.apache.pdfbox.pdmodel.interactive.action.type.PDActionGoTo;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageDestination;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
 * Split a document into several other documents.
 *
 * @author Mario Ivankovits
 * @author Ben Litchfield
 * @version $Revision: 1.7 $
 */
public class Splitter
{

    /**
     * The source PDF document.
     */
    protected PDDocument pdfDocument;

    /**
     * The document that currently receives the split pages.
     */
    protected PDDocument currentDocument = null;

    /** Number of pages per split document. */
    private int splitAtPage = 1;
    /** First page to process, one based; the default accepts every page. */
    private int startPage = Integer.MIN_VALUE;
    /** Last page to process, one based; the default accepts every page. */
    private int endPage = Integer.MAX_VALUE;
    /** The documents created by the current call to {@link #split(PDDocument)}. */
    private List<PDDocument> newDocuments = null;

    /**
     * The current page number that we are processing, zero based.
     */
    protected int pageNumber = 0;

    /**
     * This will take a document and split it into several other documents.
     * The per-run state (page counter, current target document and result list)
     * is reset on every call, so one Splitter can be used for several documents.
     *
     * @param document The document to split.
     *
     * @return A list of all the split documents.
     *
     * @throws IOException If there is an IOError
     */
    public List<PDDocument> split( PDDocument document ) throws IOException
    {
        // Without this reset a second call would continue counting where the
        // previous one stopped and could append pages to the previous result.
        pageNumber = 0;
        currentDocument = null;
        newDocuments = new ArrayList<PDDocument>();
        pdfDocument = document;

        // Raw list on purpose: processPages(List) keeps its original signature.
        List pages = pdfDocument.getDocumentCatalog().getAllPages();
        processPages(pages);
        return newDocuments;
    }

    /**
     * This will tell the splitting algorithm where to split the pages.  The default
     * is 1, so every page will become a new document.  If it was two then each document
     * would contain 2 pages.  So if the source document had 5 pages it would split into
     * 3 new documents, 2 documents containing 2 pages and 1 document containing one
     * page.
     *
     * @param split The number of pages each split document should contain.
     *
     * @throws IllegalArgumentException If split is smaller than one.
     */
    public void setSplitAtPage( int split )
    {
        if( split <= 0 )
        {
            throw new IllegalArgumentException( "Error split must be at least one page." );
        }
        splitAtPage = split;
    }

    /**
     * This will return how many pages each split document will contain.
     *
     * @return The split parameter.
     */
    public int getSplitAtPage()
    {
        return splitAtPage;
    }

    /**
     * This will set the start page.
     *
     * @param start the start page, one based
     *
     * @throws IllegalArgumentException If start is smaller than one.
     */
    public void setStartPage( int start )
    {
        if( start <= 0 )
        {
            throw new IllegalArgumentException( "Error start page must be at least one." );
        }
        startPage = start;
    }

    /**
     * This will return the start page.
     *
     * @return The start page.
     */
    public int getStartPage()
    {
        return startPage;
    }

    /**
     * This will set the end page.
     *
     * @param end the end page, one based
     *
     * @throws IllegalArgumentException If end is smaller than one.
     */
    public void setEndPage( int end )
    {
        if( end <= 0 )
        {
            throw new IllegalArgumentException( "Error end page must be at least one." );
        }
        endPage = end;
    }

    /**
     * This will return the end page.
     *
     * @return The end page.
     */
    public int getEndPage()
    {
        return endPage;
    }

    /**
     * Interface method to handle the start of the page processing.  Pages before
     * the start page are skipped, pages inside the start/end range are passed to
     * {@link #processNextPage(PDPage)} and iteration stops at the first page
     * after the end page.
     *
     * @param pages The list of pages from the source document.
     *
     * @throws IOException If an IO error occurs.
     */
    protected void processPages(List pages) throws IOException
    {
        Iterator iter = pages.iterator();
        while( iter.hasNext() )
        {
            PDPage page = (PDPage)iter.next();
            // pageNumber is zero based while startPage and endPage are one based
            int oneBasedPage = pageNumber + 1;
            if (oneBasedPage > endPage)
            {
                // nothing behind the end page is of interest
                break;
            }
            if (oneBasedPage >= startPage)
            {
                // processNextPage advances pageNumber itself
                processNextPage( page );
            }
            else
            {
                // page before the requested range: only count it
                pageNumber++;
            }
        }
    }

    /**
     * Interface method, you can control where a document gets split by implementing
     * this method.  By default a split occurs at every page.  If you wanted to split
     * based on some complex logic then you could override this method.  For example.
     * <pre>
     * protected void createNewDocumentIfNecessary()
     * {
     *     if( isPrime( pageNumber ) )
     *     {
     *         super.createNewDocumentIfNecessary();
     *     }
     * }
     * </pre>
     *
     * @throws IOException If there is an error creating the new document.
     */
    protected void createNewDocumentIfNecessary() throws IOException
    {
        if (isNewDocNecessary())
        {
            createNewDocument();
        }
    }

    /**
     * Check if it is necessary to create a new document.  A new document is needed
     * when none exists yet, or when the number of pages processed since the start
     * page is a multiple of the split size.
     *
     * @return true If a new document should be created.
     */
    protected boolean isNewDocNecessary()
    {
        if (currentDocument == null)
        {
            return true;
        }
        // Count relative to the first processed page so that every split document,
        // including the first one, receives splitAtPage pages when a start page is set.
        // With the default start page this is simply pageNumber % splitAtPage.
        int firstPageIndex = Math.max(startPage, 1) - 1;
        return (pageNumber - firstPageIndex) % splitAtPage == 0;
    }

    /**
     * Create a new document to write the split contents to.  The document
     * information, the viewer preferences and the resources of the page tree root
     * are taken over from the source document, and the new document is appended
     * to the result list.
     *
     * @throws IOException If there is a problem creating the new document.
     */
    protected void createNewDocument() throws IOException
    {
        currentDocument = new PDDocument();
        currentDocument.setDocumentInformation(pdfDocument.getDocumentInformation());
        PDDocumentCatalog catalog = pdfDocument.getDocumentCatalog();
        PDDocumentCatalog currentCatalog = currentDocument.getDocumentCatalog();
        currentCatalog.setViewerPreferences(catalog.getViewerPreferences());
        // copy global resources to the new pdf document
        currentCatalog.getPages().setResources(catalog.getPages().getResources());
        newDocuments.add(currentDocument);
    }

    /**
     * Interface to start processing a new page.  The page is imported into the
     * current target document (which is created first if necessary), its boxes,
     * resources and rotation are set from the source page, and the page counter
     * is advanced.
     *
     * @param page The page that is about to get processed.
     *
     * @throws IOException If there is an error creating the new document.
     */
    protected void processNextPage( PDPage page ) throws IOException
    {
        createNewDocumentIfNecessary();
        PDPage imported = currentDocument.importPage( page );
        imported.setCropBox( page.findCropBox() );
        imported.setMediaBox( page.findMediaBox() );
        // only the resources of the page will be copied
        imported.setResources( page.getResources() );
        imported.setRotation( page.findRotation() );
        // remove page links to avoid copying not needed resources
        processAnnotations(imported);
        pageNumber++;
    }

    /**
     * Clears the page references of all annotations of the imported page and of
     * the page destinations of its link annotations.
     *
     * @param imported The page that was imported into the current document.
     *
     * @throws IOException If the annotations or destinations cannot be read.
     */
    private void processAnnotations(PDPage imported) throws IOException
    {
        List<PDAnnotation> annotations = imported.getAnnotations();
        for (PDAnnotation annotation : annotations)
        {
            if (annotation instanceof PDAnnotationLink)
            {
                PDAnnotationLink link = (PDAnnotationLink)annotation;
                PDDestination destination = link.getDestination();
                if (destination == null)
                {
                    // no direct destination: fall back to the one of a GoTo action
                    PDAction action = link.getAction();
                    if (action instanceof PDActionGoTo)
                    {
                        destination = ((PDActionGoTo)action).getDestination();
                    }
                }
                if (destination instanceof PDPageDestination)
                {
                    // TODO preserve links to pages within the splitted result
                    ((PDPageDestination) destination).setPage(null);
                }
            }
            // TODO preserve links to pages within the splitted result
            annotation.setPage(null);
        }
    }

}