All downloads are FREE. The search and download functionality uses the official Maven repository.

org.apache.pdfbox.util.Splitter Maven / Gradle / Ivy

Go to download

The Apache PDFBox library is an open source Java tool for working with PDF documents.

There is a newer version: 3.0.2
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.util;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.interactive.action.type.PDAction;
import org.apache.pdfbox.pdmodel.interactive.action.type.PDActionGoTo;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageDestination;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
 * Split a document into several other documents.
 *
 * <p>The splitter keeps its progress in mutable fields ({@link #pageNumber},
 * {@link #currentDocument}); these are reset at the beginning of every call to
 * {@link #split(PDDocument)}, so one configured instance can be used for
 * several source documents, one after the other.</p>
 *
 * @author Mario Ivankovits ([email protected])
 * @author Ben Litchfield
 * @version $Revision: 1.7 $
 */
public class Splitter
{

    /**
     * The source PDF document.
     */
    protected PDDocument pdfDocument;

    /**
     * The current PDF document that contains the splitted page.
     */
    protected PDDocument currentDocument = null;

    /** Number of pages per result document. */
    private int splitAtPage = 1;
    /** First page to process, one based; the default means "no lower bound". */
    private int startPage = Integer.MIN_VALUE;
    /** Last page to process, one based and inclusive; the default means "no upper bound". */
    private int endPage = Integer.MAX_VALUE;
    /** The documents created by the current/last call to split(). */
    private List<PDDocument> newDocuments = null;

    /**
     * The current page number that we are processing, zero based.
     */
    protected int pageNumber = 0;

    /**
     * This will take a document and split into several other documents.
     *
     * @param document The document to split.
     *
     * @return A list of all the split documents.
     *
     * @throws IOException If there is an IOError
     */
    public List<PDDocument> split( PDDocument document ) throws IOException
    {
        newDocuments = new ArrayList<PDDocument>();
        pdfDocument = document;
        // Start from a clean state: without this reset a second call on the same
        // instance would continue counting pages where the previous call stopped
        // and could append pages to a document of the previous result.
        currentDocument = null;
        pageNumber = 0;

        List pages = pdfDocument.getDocumentCatalog().getAllPages();
        processPages(pages);
        return newDocuments;
    }

    /**
     * This will tell the splitting algorithm where to split the pages.  The default
     * is 1, so every page will become a new document.  If it was two then each document would
     * contain 2 pages.  So if the source document had 5 pages it would split into
     * 3 new documents, 2 documents containing 2 pages and 1 document containing one
     * page.
     *
     * @param split The number of pages each split document should contain.
     *
     * @throws IllegalArgumentException If split is smaller than one.
     */
    public void setSplitAtPage( int split )
    {
        if( split <= 0 )
        {
            throw new IllegalArgumentException( "Error split must be at least one page." );
        }
        splitAtPage = split;
    }

    /**
     * This will return how many pages each split document will contain.
     *
     * @return The split parameter.
     */
    public int getSplitAtPage()
    {
        return splitAtPage;
    }

    /**
     * This will set the start page.
     *
     * @param start the start page, one based
     *
     * @throws IllegalArgumentException If start is smaller than one.
     */
    public void setStartPage( int start )
    {
        if( start <= 0 )
        {
            throw new IllegalArgumentException( "Error start page must be at least one." );
        }
        startPage = start;
    }

    /**
     * This will return the start page.
     *
     * @return The start page, or Integer.MIN_VALUE if none has been set.
     */
    public int getStartPage()
    {
        return startPage;
    }

    /**
     * This will set the end page.
     *
     * @param end the end page, one based and inclusive
     *
     * @throws IllegalArgumentException If end is smaller than one.
     */
    public void setEndPage( int end )
    {
        if( end <= 0 )
        {
            throw new IllegalArgumentException( "Error end page must be at least one." );
        }
        endPage = end;
    }

    /**
     * This will return the end page.
     *
     * @return The end page, or Integer.MAX_VALUE if none has been set.
     */
    public int getEndPage()
    {
        return endPage;
    }

    /**
     * Interface method to handle the start of the page processing.  Pages before
     * the start page are skipped, pages up to and including the end page are passed
     * to {@link #processNextPage(PDPage)}, and the iteration stops as soon as the
     * end page has been passed.
     *
     * @param pages The list of pages from the source document.
     *
     * @throws IOException If an IO error occurs.
     */
    protected void processPages(List pages) throws IOException
    {
        // The parameter stays untyped because this method is part of the protected
        // extension API; the elements are cast to PDPage when they are processed.
        for( Object element : pages )
        {
            // pageNumber is zero based while startPage/endPage are one based
            int oneBasedPage = pageNumber + 1;
            if( oneBasedPage > endPage )
            {
                // everything from here on lies behind the requested range
                break;
            }
            if( oneBasedPage >= startPage )
            {
                // processNextPage() advances pageNumber itself
                processNextPage( (PDPage)element );
            }
            else
            {
                pageNumber++;
            }
        }
    }

    /**
     * Interface method, you can control where a document gets split by implementing
     * this method.  By default a split occurs at every page.  If you wanted to split
     * based on some complex logic then you could override this method.  For example.
     * <pre>
     * protected void createNewDocumentIfNecessary()
     * {
     *     if( isPrime( pageNumber ) )
     *     {
     *         super.createNewDocumentIfNecessary();
     *     }
     * }
     * </pre>
     *
     * @throws IOException If there is an error creating the new document.
     */
    protected void createNewDocumentIfNecessary() throws IOException
    {
        if (isNewDocNecessary())
        {
            createNewDocument();
        }
    }

    /**
     * Check if it is necessary to create a new document.  That is the case when
     * no document has been created yet, or when the current page starts a new
     * chunk of {@link #getSplitAtPage()} pages counted from the first processed page.
     *
     * @return true If a new document should be created.
     */
    protected boolean isNewDocNecessary()
    {
        if( currentDocument == null )
        {
            return true;
        }
        // Count from the first selected page, not from the first page of the source,
        // otherwise a start page that is not aligned to splitAtPage cuts the first
        // chunk short.  Math.max is applied before the subtraction because the
        // default start page is Integer.MIN_VALUE and would overflow.
        int firstProcessedIndex = Math.max( 1, startPage ) - 1;
        return (pageNumber - firstProcessedIndex) % splitAtPage == 0;
    }

    /**
     * Create a new document to write the splitted contents to.  The new document
     * receives the document information, the viewer preferences and the resources
     * of the page tree of the source document, and is appended to the result list.
     *
     * @throws IOException If there is an problem creating the new document.
     */
    protected void createNewDocument() throws IOException
    {
        currentDocument = new PDDocument();
        currentDocument.setDocumentInformation(pdfDocument.getDocumentInformation());
        PDDocumentCatalog catalog = pdfDocument.getDocumentCatalog();
        PDDocumentCatalog currentCatalog = currentDocument.getDocumentCatalog();
        currentCatalog.setViewerPreferences(catalog.getViewerPreferences());
        // copy global resources to the new pdf document
        currentCatalog.getPages().setResources(catalog.getPages().getResources());
        newDocuments.add(currentDocument);
    }

    /**
     * Interface to start processing a new page.  The page is imported into the
     * current result document (which is created first if necessary) and the
     * page counter is advanced.
     *
     * @param page The page that is about to get processed.
     *
     * @throws IOException If there is an error creating the new document.
     */
    protected void processNextPage( PDPage page ) throws IOException
    {
        createNewDocumentIfNecessary();
        PDPage imported = currentDocument.importPage( page );
        imported.setCropBox( page.findCropBox() );
        imported.setMediaBox( page.findMediaBox() );
        // only the resources of the page will be copied
        imported.setResources( page.getResources() );
        imported.setRotation( page.findRotation() );
        // remove page links to avoid copying not needed resources
        processAnnotations(imported);
        pageNumber++;
    }

    /**
     * Clears the page references held by the annotations of an imported page:
     * the target page of link destinations (set directly on the link or through
     * a GoTo action) and the page entry of every annotation.
     *
     * @param imported The page that has just been imported into the result document.
     *
     * @throws IOException If the annotations or destinations cannot be read.
     */
    private void processAnnotations(PDPage imported) throws IOException
    {
        List<PDAnnotation> annotations = imported.getAnnotations();
        for (PDAnnotation annotation : annotations)
        {
            if (annotation instanceof PDAnnotationLink)
            {
                PDAnnotationLink link = (PDAnnotationLink)annotation;
                PDDestination destination = link.getDestination();
                if (destination == null)
                {
                    // no direct destination, fall back to the one of a GoTo action
                    PDAction action = link.getAction();
                    if (action instanceof PDActionGoTo)
                    {
                        destination = ((PDActionGoTo)action).getDestination();
                    }
                }
                if (destination instanceof PDPageDestination)
                {
                    // TODO preserve links to pages within the splitted result
                    ((PDPageDestination) destination).setPage(null);
                }
            }
            // TODO preserve links to pages within the splitted result
            annotation.setPage(null);
        }
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy