All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.pdfbox.io.ScratchFile Maven / Gradle / Ivy

Go to download

The Apache PDFBox library is an open source Java tool for working with PDF documents.

There is a newer version: 3.0.2
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.io;

import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.BitSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Implements a memory page handling mechanism as base for creating (multiple)
 * {@link RandomAccess} buffers each having its set of pages (implemented by
 * {@link ScratchFileBuffer}). A buffer is created calling {@link #createBuffer()}.
 * 
 * 

Pages can be stored in main memory or in a temporary file. A mixed mode is supported, storing a certain number of pages in memory and only the additional ones in a temporary file (defined by the maximum main memory to be used).

* *

Pages can be marked as 'free' in order to re-use them. For in-memory pages this will release the used memory, while for pages in the temporary file this simply marks the area as free for re-use.

* *

If a temporary file was created (done with the first page to be stored in the temporary file) it is deleted when {@link ScratchFile#close()} is called.

* *

Using this class for {@link RandomAccess} buffers allows direct control of the maximum memory usage and allows processing large files for which we would otherwise get an {@link OutOfMemoryError} when using {@link RandomAccessBuffer}.

* *

This base class for providing pages is thread safe (the buffer implementations are not).

*/ public class ScratchFile implements Closeable { private static final Log LOG = LogFactory.getLog(ScratchFile.class); /** number of pages by which we enlarge the scratch file (reduce I/O-operations) */ private static final int ENLARGE_PAGE_COUNT = 16; /** in case of unrestricted main memory usage this is the initial number of pages * {@link #inMemoryPages} is setup for */ private static final int INIT_UNRESTRICTED_MAINMEM_PAGECOUNT = 100000; private static final int PAGE_SIZE = 4096; private final Object ioLock = new Object(); private final File scratchFileDirectory; /** scratch file; only to be accessed under synchronization of {@link #ioLock} */ private File file; /** random access to scratch file; only to be accessed under synchronization of {@link #ioLock} */ private java.io.RandomAccessFile raf; private volatile int pageCount = 0; private final BitSet freePages = new BitSet(); /** holds pointers to in-memory page content; will be initialized once in case of restricted * main memory, otherwise it is enlarged as needed and first initialized to a size of * {@link #INIT_UNRESTRICTED_MAINMEM_PAGECOUNT} */ private volatile byte[][] inMemoryPages; private final int inMemoryMaxPageCount; private final int maxPageCount; private final boolean useScratchFile; private final boolean maxMainMemoryIsRestricted; private volatile boolean isClosed = false; /** * Initializes page handler. If a scratchFileDirectory is supplied, * then the scratch file will be created in that directory. * *

All pages will be stored in the scratch file.

*
 * @param scratchFileDirectory The directory in which to create the scratch file
 *                             or {@code null} to create it in the default temporary directory.
 *
 * @throws IOException If a scratch file directory was given but does not exist.
 */
    public ScratchFile(File scratchFileDirectory) throws IOException
    {
        // builds a setting via setupTempFileOnly() with the given temp dir and
        // delegates to the MemoryUsageSetting based constructor
        this(MemoryUsageSetting.setupTempFileOnly().setTempDir(scratchFileDirectory));
    }

    /**
     * Initializes page handler. If a scratchFileDirectory is supplied,
     * then the scratch file will be created in that directory.
     *
     *

Depending on the size of allowed memory usage, a number of pages (memorySize/{@link #PAGE_SIZE}) will be stored in-memory and only additional pages will be written to/read from the scratch file.

* * @param memUsageSetting set how memory/temporary files are used for buffering streams etc. * * @throws IOException If scratch file directory was given but don't exist. */ public ScratchFile(MemoryUsageSetting memUsageSetting) throws IOException { maxMainMemoryIsRestricted = (!memUsageSetting.useMainMemory()) || memUsageSetting.isMainMemoryRestricted(); useScratchFile = maxMainMemoryIsRestricted ? memUsageSetting.useTempFile() : false; scratchFileDirectory = useScratchFile ? memUsageSetting.getTempDir() : null; if ((scratchFileDirectory != null) && (!scratchFileDirectory.isDirectory())) { throw new IOException("Scratch file directory does not exist: " + this.scratchFileDirectory); } maxPageCount = memUsageSetting.isStorageRestricted() ? (int) Math.min(Integer.MAX_VALUE, memUsageSetting.getMaxStorageBytes() / PAGE_SIZE) : Integer.MAX_VALUE; inMemoryMaxPageCount = memUsageSetting.useMainMemory() ? (memUsageSetting.isMainMemoryRestricted() ? (int) Math.min(Integer.MAX_VALUE, memUsageSetting.getMaxMainMemoryBytes() / PAGE_SIZE) : Integer.MAX_VALUE) : 0; inMemoryPages = new byte[maxMainMemoryIsRestricted ? inMemoryMaxPageCount : INIT_UNRESTRICTED_MAINMEM_PAGECOUNT][]; freePages.set(0, inMemoryPages.length); } /** * Getter for an instance using only unrestricted main memory for buffering * (same as new ScratchFile(MemoryUsageSetting.setupMainMemoryOnly())). * * @return instance configured to only use main memory with no size restriction */ public static ScratchFile getMainMemoryOnlyInstance() { try { return new ScratchFile(MemoryUsageSetting.setupMainMemoryOnly()); } catch (IOException ioe) { // cannot happen for main memory setup LOG.error("Unexpected exception occurred creating main memory scratch file instance: " + ioe.getMessage() ); return null; } } /** * Returns a new free page, either from free page pool * or by enlarging scratch file (may be created). 
* * @return index of new page */ int getNewPage() throws IOException { synchronized (freePages) { int idx = freePages.nextSetBit( 0 ); if (idx < 0) { enlarge(); idx = freePages.nextSetBit( 0 ); if (idx < 0) { throw new IOException("Maximum allowed scratch file memory exceeded."); } } freePages.clear(idx); if (idx >= pageCount) { pageCount = idx + 1; } return idx; } } /** * This will provide new free pages by either enlarging the scratch file * by a number of pages defined by {@link #ENLARGE_PAGE_COUNT} - in case * scratch file usage is allowed - or increase the {@link #inMemoryPages} * array in case main memory was not restricted. If neither of both is * allowed/the case than free pages count won't be changed. The same is true * if no new pages could be added because we reached the maximum of * {@link Integer#MAX_VALUE} pages. * *

If scratch file usage is allowed and the scratch file does not exist yet, it will be created.

* *

Only to be called under synchronization on {@link #freePages}.

*/ private void enlarge() throws IOException { synchronized (ioLock) { checkClosed(); if (pageCount >= maxPageCount) { return; } if (useScratchFile) { // create scratch file is needed if ( raf == null ) { file = File.createTempFile("PDFBox", ".tmp", scratchFileDirectory); try { raf = new java.io.RandomAccessFile(file, "rw"); } catch (IOException e) { if (!file.delete()) { LOG.warn("Error deleting scratch file: " + file.getAbsolutePath()); } throw e; } } long fileLen = raf.length(); long expectedFileLen = ((long)pageCount - inMemoryMaxPageCount) * PAGE_SIZE; if (expectedFileLen != fileLen) { throw new IOException("Expected scratch file size of " + expectedFileLen + " but found " + fileLen); } // enlarge if we do not overflow if (pageCount + ENLARGE_PAGE_COUNT > pageCount) { fileLen += ENLARGE_PAGE_COUNT * PAGE_SIZE; raf.setLength(fileLen); freePages.set(pageCount, pageCount + ENLARGE_PAGE_COUNT); } } else if (!maxMainMemoryIsRestricted) { // increase number of in-memory pages int oldSize = inMemoryPages.length; int newSize = (int) Math.min( ((long)oldSize) * 2, Integer.MAX_VALUE); // this handles integer overflow if (newSize > oldSize) { byte[][] newInMemoryPages = new byte[newSize][]; System.arraycopy(inMemoryPages, 0, newInMemoryPages, 0, oldSize); inMemoryPages = newInMemoryPages; freePages.set(oldSize, newSize); } } } } /** * Returns byte size of a page. * * @return byte size of a page */ int getPageSize() { return PAGE_SIZE; } /** * Reads the page with specified index. * * @param pageIdx index of page to read * * @return byte array of size {@link #PAGE_SIZE} filled with page data read from file * * @throws IOException */ byte[] readPage(int pageIdx) throws IOException { if ((pageIdx < 0) || (pageIdx >= pageCount)) { checkClosed(); throw new IOException("Page index out of range: " + pageIdx + ". 
Max value: " + (pageCount - 1) ); } // check if we have the page in memory if (pageIdx < inMemoryMaxPageCount) { byte[] page = inMemoryPages[pageIdx]; // handle case that we are closed if (page == null) { checkClosed(); throw new IOException("Requested page with index " + pageIdx + " was not written before."); } return page; } synchronized (ioLock) { if (raf == null) { checkClosed(); throw new IOException("Missing scratch file to read page with index " + pageIdx + " from."); } byte[] page = new byte[PAGE_SIZE]; raf.seek(((long)pageIdx - inMemoryMaxPageCount) * PAGE_SIZE); raf.readFully(page); return page; } } /** * Writes updated page. Page is either kept in-memory if pageIdx < {@link #inMemoryMaxPageCount} * or is written to scratch file. * *

The provided page byte array must not be re-used for other pages since we store it as is in case of in-memory handling.

* * @param pageIdx index of page to write * @param page page to write (length has to be {@value #PAGE_SIZE}) * * @throws IOException in case page index is out of range or page has wrong length * or writing to file failed */ void writePage(int pageIdx, byte[] page) throws IOException { if ((pageIdx<0) || (pageIdx>=pageCount)) { checkClosed(); throw new IOException("Page index out of range: " + pageIdx + ". Max value: " + (pageCount - 1) ); } if (page.length != PAGE_SIZE) { throw new IOException("Wrong page size to write: " + page.length + ". Expected: " + PAGE_SIZE ); } if (pageIdx < inMemoryMaxPageCount) { if (maxMainMemoryIsRestricted) { inMemoryPages[pageIdx] = page; } else { // need synchronization since inMemoryPages may change synchronized (ioLock) { inMemoryPages[pageIdx] = page; } } // in case we were closed in between throw exception checkClosed(); } else { synchronized (ioLock) { checkClosed(); raf.seek(((long)pageIdx - inMemoryMaxPageCount) * PAGE_SIZE); raf.write(page); } } } /** * Checks if this page handler has already been closed. If so, * an {@link IOException} is thrown. * * @throws IOException If {@link #close()} has already been called. */ void checkClosed() throws IOException { if (isClosed) { throw new IOException("Scratch file already closed"); } } /** * Creates a new buffer using this page handler. * * @return A new buffer. * * @throws IOException If an error occurred. */ public RandomAccess createBuffer() throws IOException { return new ScratchFileBuffer(this); } /** * Creates a new buffer using this page handler and initializes it with the * data read from provided input stream (input stream is copied to buffer). * The buffer data pointer is reset to point to first byte. * * @param input The input stream that is to be copied into the buffer. * @return A new buffer containing data read from input stream. * * @throws IOException If an error occurred. 
*/ public RandomAccess createBuffer(InputStream input) throws IOException { ScratchFileBuffer buf = new ScratchFileBuffer(this); byte[] byteBuffer = new byte[8192]; int bytesRead = 0; while ((bytesRead = input.read(byteBuffer)) > -1) { buf.write(byteBuffer, 0, bytesRead); } buf.seek(0); return buf; } /** * Allows a buffer which is cleared/closed to release its pages to be re-used. * * @param pageIndexes pages indexes of pages to release * @param count number of page indexes contained in provided array */ void markPagesAsFree(int[] pageIndexes, int off, int count) { synchronized (freePages) { for (int aIdx = off; aIdx < count; aIdx++) { int pageIdx = pageIndexes[aIdx]; if ((pageIdx>=0) && (pageIdx




© 2015 - 2024 Weber Informatics LLC | Privacy Policy