org.apache.pdfbox.io.ScratchFile Maven / Gradle / Ivy
Show all versions of pdfbox Show documentation
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pdfbox.io;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.BitSet;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
* Implements a memory page handling mechanism as base for creating (multiple)
* {@link RandomAccess} buffers each having its set of pages (implemented by
* {@link ScratchFileBuffer}). A buffer is created calling {@link #createBuffer()}.
*
* Pages can be stored in main memory or in a temporary file. A mixed mode
* is supported storing a certain amount of pages in memory and only the
* additional ones in temporary file (defined by maximum main memory to
* be used).
*
* Pages can be marked as 'free' in order to re-use them. For in-memory pages
* this will release the used memory while for pages in temporary file this
* simply marks the area as free to re-use.
*
* If a temporary file was created (done with the first page to be stored
* in temporary file) it is deleted when {@link ScratchFile#close()} is called.
*
* Using this class for {@link RandomAccess} buffers allows for a direct control
* on the maximum memory usage and allows processing large files for which we
* otherwise would get an {@link OutOfMemoryError} in case of using {@link RandomAccessBuffer}.
*
* This base class for providing pages is thread safe (the buffer implementations are not).
*/
public class ScratchFile implements Closeable
{
/** logger of this class */
private static final Log LOG = LogFactory.getLog(ScratchFile.class);
/** number of pages by which we enlarge the scratch file (reduce I/O-operations) */
private static final int ENLARGE_PAGE_COUNT = 16;
/** in case of unrestricted main memory usage this is the initial number of pages
* {@link #inMemoryPages} is setup for */
private static final int INIT_UNRESTRICTED_MAINMEM_PAGECOUNT = 100000;
/** size of a single page in bytes */
private static final int PAGE_SIZE = 4096;
/** monitor guarding {@link #file} and {@link #raf}; it is also held while the
* {@link #inMemoryPages} array is replaced by a larger one */
private final Object ioLock = new Object();
/** directory the scratch file is created in; <code>null</code> if no scratch file is used
* or no directory was configured */
private final File scratchFileDirectory;
/** scratch file; only to be accessed under synchronization of {@link #ioLock} */
private File file;
/** random access to scratch file; only to be accessed under synchronization of {@link #ioLock} */
private java.io.RandomAccessFile raf;
/** one more than the highest page index handed out so far; page indexes passed to
* the read/write methods must be smaller than this value */
private volatile int pageCount = 0;
/** a set bit marks the page with that index as free; accessed under synchronization
* on the bit set itself */
private final BitSet freePages = new BitSet();
/** holds pointers to in-memory page content; will be initialized once in case of restricted
* main memory, otherwise it is enlarged as needed and first initialized to a size of
* {@link #INIT_UNRESTRICTED_MAINMEM_PAGECOUNT} */
private volatile byte[][] inMemoryPages;
/** pages with an index below this value are kept in {@link #inMemoryPages}, all other
* pages are stored in the scratch file */
private final int inMemoryMaxPageCount;
/** upper limit for {@link #pageCount}; once reached no further pages are added */
private final int maxPageCount;
/** whether additional pages may be provided by creating/enlarging a scratch file */
private final boolean useScratchFile;
/** <code>true</code> if main memory is not used at all or its usage is limited */
private final boolean maxMainMemoryIsRestricted;
/** checked by {@link #checkClosed()} to reject operations on a closed instance */
private volatile boolean isClosed = false;
/**
 * Creates a page handler that keeps all of its pages in a scratch file.
 *
 * This is a shortcut for the {@link MemoryUsageSetting} based constructor
 * using a temp-file-only setup with the given directory.
 *
 * @param scratchFileDirectory the directory in which the scratch file is created,
 *                             or {@code null} to create it in the default
 *                             temporary directory
 *
 * @throws IOException if a scratch file directory was given but does not exist
 */
public ScratchFile(File scratchFileDirectory) throws IOException
{
    this(
        MemoryUsageSetting
            .setupTempFileOnly()
            .setTempDir(scratchFileDirectory)
    );
}
/**
* Initializes page handler. If a scratchFileDirectory
is supplied,
* then the scratch file will be created in that directory.
*
* Depending on the size of allowed memory usage a number of pages (memorySize/{@link #PAGE_SIZE})
* will be stored in-memory and only additional pages will be written to/read from scratch file.
*
* @param memUsageSetting set how memory/temporary files are used for buffering streams etc.
*
* @throws IOException If scratch file directory was given but don't exist.
*/
public ScratchFile(MemoryUsageSetting memUsageSetting) throws IOException
{
maxMainMemoryIsRestricted = (!memUsageSetting.useMainMemory()) || memUsageSetting.isMainMemoryRestricted();
useScratchFile = maxMainMemoryIsRestricted ? memUsageSetting.useTempFile() : false;
scratchFileDirectory = useScratchFile ? memUsageSetting.getTempDir() : null;
if ((scratchFileDirectory != null) && (!scratchFileDirectory.isDirectory()))
{
throw new IOException("Scratch file directory does not exist: " + this.scratchFileDirectory);
}
maxPageCount = memUsageSetting.isStorageRestricted() ?
(int) Math.min(Integer.MAX_VALUE, memUsageSetting.getMaxStorageBytes() / PAGE_SIZE) :
Integer.MAX_VALUE;
inMemoryMaxPageCount = memUsageSetting.useMainMemory() ?
(memUsageSetting.isMainMemoryRestricted() ?
(int) Math.min(Integer.MAX_VALUE, memUsageSetting.getMaxMainMemoryBytes() / PAGE_SIZE) :
Integer.MAX_VALUE) :
0;
inMemoryPages = new byte[maxMainMemoryIsRestricted ? inMemoryMaxPageCount : INIT_UNRESTRICTED_MAINMEM_PAGECOUNT][];
freePages.set(0, inMemoryPages.length);
}
/**
 * Getter for an instance using only unrestricted main memory for buffering
 * (same as {@code new ScratchFile(MemoryUsageSetting.setupMainMemoryOnly())}).
 *
 * @return instance configured to only use main memory with no size restriction;
 *         {@code null} if the constructor unexpectedly failed with an
 *         {@link IOException} (the failure is logged)
 */
public static ScratchFile getMainMemoryOnlyInstance()
{
    try
    {
        return new ScratchFile(MemoryUsageSetting.setupMainMemoryOnly());
    }
    catch (IOException ioe)
    {
        // cannot happen for main memory setup; hand the exception itself to the
        // logger so the stack trace is not lost should it happen nevertheless
        LOG.error("Unexpected exception occurred creating main memory scratch file instance: " + ioe.getMessage(), ioe);
        return null;
    }
}
/**
 * Hands out a free page. The page is taken from the pool of free pages; if
 * the pool is empty, new pages are requested first, which may enlarge (or
 * create) the scratch file.
 *
 * @return index of the new page
 *
 * @throws IOException if no free page is available even after trying to
 *                     enlarge, or if enlarging failed
 */
int getNewPage() throws IOException
{
    synchronized (freePages)
    {
        int pageIdx = freePages.nextSetBit(0);
        if (pageIdx < 0)
        {
            // pool exhausted: try to obtain more pages and look again
            enlarge();
            pageIdx = freePages.nextSetBit(0);
        }
        if (pageIdx < 0)
        {
            throw new IOException("Maximum allowed scratch file memory exceeded.");
        }

        // page is in use from now on
        freePages.clear(pageIdx);

        // keep track of the highest page index handed out
        if (pageCount <= pageIdx)
        {
            pageCount = pageIdx + 1;
        }
        return pageIdx;
    }
}
/**
* This will provide new free pages by either enlarging the scratch file
* by a number of pages defined by {@link #ENLARGE_PAGE_COUNT} - in case
* scratch file usage is allowed - or by increasing the {@link #inMemoryPages}
* array in case main memory was not restricted. If neither of the two
* applies then the free pages count won't be changed. The same is true
* if no new pages could be added because we reached the maximum of
* {@link Integer#MAX_VALUE} pages or the configured maximum page count.
*
* If scratch file usage is allowed and the scratch file does not exist already
* it will be created.
*
* Only to be called under synchronization on {@link #freePages}.
*
* @throws IOException if this instance was already closed, the scratch file
* could not be created or resized, or the scratch file does not have the expected length
*/
private void enlarge() throws IOException
{
synchronized (ioLock)
{
checkClosed();
// page limit reached: leave without adding free pages
if (pageCount >= maxPageCount)
{
return;
}
if (useScratchFile)
{
// create scratch file if needed
if ( raf == null )
{
file = File.createTempFile("PDFBox", ".tmp", scratchFileDirectory);
try
{
raf = new java.io.RandomAccessFile(file, "rw");
}
catch (IOException e)
{
// opening failed: do not leave the just created temporary file behind
if (!file.delete())
{
LOG.warn("Error deleting scratch file: " + file.getAbsolutePath());
}
throw e;
}
}
// consistency check: the file has to hold exactly the pages
// with an index of inMemoryMaxPageCount and above
long fileLen = raf.length();
long expectedFileLen = ((long)pageCount - inMemoryMaxPageCount) * PAGE_SIZE;
if (expectedFileLen != fileLen)
{
throw new IOException("Expected scratch file size of " + expectedFileLen + " but found " + fileLen);
}
// enlarge if we do not overflow
// (the int sum wraps to a smaller value if it exceeds Integer.MAX_VALUE)
if (pageCount + ENLARGE_PAGE_COUNT > pageCount)
{
fileLen += ENLARGE_PAGE_COUNT * PAGE_SIZE;
raf.setLength(fileLen);
// mark the newly added pages as free
freePages.set(pageCount, pageCount + ENLARGE_PAGE_COUNT);
}
}
else if (!maxMainMemoryIsRestricted)
{
// increase number of in-memory pages
int oldSize = inMemoryPages.length;
int newSize = (int) Math.min( ((long)oldSize) * 2, Integer.MAX_VALUE); // this handles integer overflow
// no growth possible if the array already has the maximum length
if (newSize > oldSize)
{
byte[][] newInMemoryPages = new byte[newSize][];
System.arraycopy(inMemoryPages, 0, newInMemoryPages, 0, oldSize);
inMemoryPages = newInMemoryPages;
// mark the newly added slots as free pages
freePages.set(oldSize, newSize);
}
}
}
}
/**
 * Provides the fixed size of a page as used by this page handler.
 *
 * @return number of bytes a single page holds
 */
int getPageSize()
{
    return PAGE_SIZE;
}
/**
 * Provides the content of the page with the given index.
 *
 * For a page kept in memory the stored byte array itself is returned; for a
 * page kept in the scratch file a new array is filled from the file.
 *
 * @param pageIdx index of page to read
 *
 * @return byte array of size {@link #PAGE_SIZE} holding the page data
 *
 * @throws IOException if this instance was closed, the index is out of range,
 *                     the page was never written, the scratch file is missing
 *                     or reading from it failed
 */
byte[] readPage(int pageIdx) throws IOException
{
    final boolean inRange = (pageIdx >= 0) && (pageIdx < pageCount);
    if (!inRange)
    {
        // report a closed instance in preference to the range problem
        checkClosed();
        throw new IOException("Page index out of range: " + pageIdx + ". Max value: " + (pageCount - 1) );
    }

    // pages beyond the in-memory range live in the scratch file
    if (pageIdx >= inMemoryMaxPageCount)
    {
        synchronized (ioLock)
        {
            if (raf == null)
            {
                checkClosed();
                throw new IOException("Missing scratch file to read page with index " + pageIdx + " from.");
            }
            final long fileOffset = ((long) pageIdx - inMemoryMaxPageCount) * PAGE_SIZE;
            final byte[] fromFile = new byte[PAGE_SIZE];
            raf.seek(fileOffset);
            raf.readFully(fromFile);
            return fromFile;
        }
    }

    // in-memory page
    final byte[] cached = inMemoryPages[pageIdx];
    if (cached != null)
    {
        return cached;
    }

    // nothing stored at this index: either we were closed or the page was never written
    checkClosed();
    throw new IOException("Requested page with index " + pageIdx + " was not written before.");
}
/**
 * Stores an updated page. The page is kept in memory if
 * {@code pageIdx < inMemoryMaxPageCount}, otherwise it is written to the
 * scratch file.
 *
 * The provided byte array must not be re-used by the caller for other pages
 * because for in-memory handling the array is stored as is (no copy is made).
 *
 * @param pageIdx index of page to write
 * @param page    page to write (length has to be {@value #PAGE_SIZE})
 *
 * @throws IOException in case this instance was closed, the page index is out
 *                     of range, the page has the wrong length or writing to
 *                     the file failed
 */
void writePage(int pageIdx, byte[] page) throws IOException
{
    final boolean inRange = (pageIdx >= 0) && (pageIdx < pageCount);
    if (!inRange)
    {
        checkClosed();
        throw new IOException("Page index out of range: " + pageIdx + ". Max value: " + (pageCount - 1) );
    }

    if (page.length != PAGE_SIZE)
    {
        throw new IOException("Wrong page size to write: " + page.length + ". Expected: " + PAGE_SIZE );
    }

    // pages beyond the in-memory range go to the scratch file
    if (pageIdx >= inMemoryMaxPageCount)
    {
        synchronized (ioLock)
        {
            checkClosed();
            final long fileOffset = ((long) pageIdx - inMemoryMaxPageCount) * PAGE_SIZE;
            raf.seek(fileOffset);
            raf.write(page);
        }
        return;
    }

    if (!maxMainMemoryIsRestricted)
    {
        // need synchronization since inMemoryPages may change
        synchronized (ioLock)
        {
            inMemoryPages[pageIdx] = page;
        }
    }
    else
    {
        // array is never replaced in restricted mode, plain store is sufficient
        inMemoryPages[pageIdx] = page;
    }

    // in case we were closed in between throw exception
    checkClosed();
}
/**
 * Guards operations against use after {@link #close()}: does nothing while
 * this page handler is open and fails once it has been closed.
 *
 * @throws IOException If {@link #close()} has already been called.
 */
void checkClosed() throws IOException
{
    if (!isClosed)
    {
        return;
    }
    throw new IOException("Scratch file already closed");
}
/**
 * Provides a new, empty buffer whose pages are managed by this page handler.
 *
 * @return the newly created buffer
 *
 * @throws IOException If an error occurred.
 */
public RandomAccess createBuffer() throws IOException
{
    final ScratchFileBuffer buffer = new ScratchFileBuffer(this);
    return buffer;
}
/**
 * Provides a new buffer managed by this page handler which is pre-filled
 * with all data read from the given input stream. After copying, the
 * buffer position is set back to the first byte.
 *
 * The input stream is read until its end but is not closed by this method.
 *
 * @param input The input stream that is to be copied into the buffer.
 * @return A new buffer containing data read from input stream.
 *
 * @throws IOException If an error occurred.
 */
public RandomAccess createBuffer(InputStream input) throws IOException
{
    final ScratchFileBuffer target = new ScratchFileBuffer(this);
    final byte[] chunk = new byte[8192];

    // copy chunk-wise until the stream signals its end with a negative count
    for (int n = input.read(chunk); n > -1; n = input.read(chunk))
    {
        target.write(chunk, 0, n);
    }

    target.seek(0);
    return target;
}
/**
* Allows a buffer which is cleared/closed to release its pages to be re-used.
*
* @param pageIndexes pages indexes of pages to release
* @param count number of page indexes contained in provided array
*/
void markPagesAsFree(int[] pageIndexes, int off, int count) {
synchronized (freePages)
{
for (int aIdx = off; aIdx < count; aIdx++)
{
int pageIdx = pageIndexes[aIdx];
if ((pageIdx>=0) && (pageIdx