org.archive.util.ms.DefaultBlockFileSystem Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of heritrix-commons Show documentation
The Archive Commons Code Libraries project contains general Java utility libraries, as used by the Heritrix crawler and other projects.
The newest version!
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.util.ms;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.Map;

import org.archive.io.SeekInputStream;
import org.archive.util.ArchiveUtils;
import org.archive.util.LRU;


/**
 * Default implementation of the Block File System.
 * 
 * The overall structure of a BlockFileSystem file (such as a .doc file) is
 * as follows.  The file is divided into blocks, which are of uniform length
 * (512 bytes).  The first block (at file pointer 0) is called the header
 * block.  It's used to look up other blocks in the file.
 * 
 * 
Subfiles contained within the .doc file are organized using a Block
 * Allocation Table, or BAT.  The BAT is basically a linked list; given a 
 * block number, the BAT will tell you the next block number.  Note that
 * the header block has no number; block #0 is the first block after the
 * header.  Thus, to convert a block number to a file pointer:
 * int filePointer = (blockNumber + 1) * BLOCK_SIZE.
 * 
 * 
The BAT itself is discontinuous, however.  To find the blocks that 
 * comprise the BAT, you have to look in the header block.  The header block
 * contains an array of 109 pointers to the blocks that comprise the BAT.
 * If more than 109 BAT blocks are required (in other words, if the .doc
 * file is larger than ~6 megabytes), then something called the 
 * XBAT comes into play.
 * 
 * 
XBAT blocks contain pointers to the 110th BAT block and beyond.
 * The first XBAT block is stored at a file pointer listed in the header.
 * The other XBAT blocks are always stored in order after the first; the 
 * XBAT table is continuous.  One is inclined to wonder why the BAT itself
 * is not so stored, but oh well.
 * 
 * 
The BAT only tells you the next block for a given block.  To find the 
 * first block for a subfile, you have to look up that subfile's directory
 * entry.  Each directory entry is a 128 byte structure in the file, so four
 * of them fit in a block.  The number of the first block of the entry list
 * is stored in the header.  To find subsequent entry blocks, the BAT must
 * be used.
 * 
 * 
I'm telling you all this so that you understand the caching that this
 * class provides.
 * 
 * 
First, directory entries are not cached.  It's assumed that they will
 * be looked up at the beginning of a lengthy operation, and then forgotten
 * about.  This is certainly the case for {@link Doc#getText(BlockFileSystem, int)}. 
 * If you need to remember directory entries, you can manually store the Entry 
 * objects in a map or something, as they don't grow stale.
 * 
 * 
This class keeps all 512 bytes of the header block in memory at all 
 * times.  This prevents a potentially expensive file pointer repositioning
 * every time you're trying to figure out what comes next.
 * 
 * 
BAT and XBAT blocks are stored in a least-recently used cache.  The 
 * n most recent BAT and XBAT blocks are remembered, where n
 * is set at construction time.  The minimum value of n is 1.  For small
 * files, this can prevent file pointer repositioning for BAT look ups.
 * 
 * 
The BAT/XBAT cache only takes up memory as needed.  If the specified
 * cache size is 100 blocks, but the file only has 4 BAT blocks, then only 
 * 2048 bytes will be used by the cache.
 * 
 * Note this class only caches BAT and XBAT blocks.  It does not cache the
 * blocks that actually make up a subfile's contents.  It is assumed that those
 * blocks will only be accessed once per operation (again, this is what
 * {Doc.getText(BlockFileSystem)} typically requires.)
 * 
 * @author pjack
 * @see http://jakarta.apache.org/poi/poifs/fileformat.html
 */
public class DefaultBlockFileSystem implements BlockFileSystem {


    /**
     * Pointers per BAT block.
     */
    final private static int POINTERS_PER_BAT = 128;


    /**
     * Size of a BAT pointer in bytes.  (In other words, 4).
     */
    final private static int BAT_POINTER_SIZE = BLOCK_SIZE / POINTERS_PER_BAT;

    
    /**
     * The number of BAT pointers in the header block.  After this many 
     * BAT blocks, the XBAT blocks must be consulted.
     */
    final private static int HEADER_BAT_LIMIT = 109;
    
    
    /**
     * The size of an entry record in bytes.
     */
    final private static int ENTRY_SIZE = 128;
    
    
    /**
     * The number of entries that can fit in a block.
     */
    final private static int ENTRIES_PER_BLOCK = BLOCK_SIZE / ENTRY_SIZE;


    /**
     * The .doc file as a stream.
     */
    private SeekInputStream input;
    
    
    /**
     * The header block.
     */
    private HeaderBlock header;


    /**
     * Cache of BAT and XBAT blocks.
     */
    private Map cache;
    
    
    /**
     * Constructor.
     * 
     * @param input   the file to read from
     * @param batCacheSize  number of BAT and XBAT blocks to cache
     * @throws IOException  if an IO error occurs
     */
    public DefaultBlockFileSystem(SeekInputStream input, int batCacheSize)
    throws IOException {
        this.input = input;
        byte[] temp = new byte[BLOCK_SIZE];
        ArchiveUtils.readFully(input, temp);
        this.header = new HeaderBlock(ByteBuffer.wrap(temp));
        this.cache = new LRU(batCacheSize);
    }


    public Entry getRoot() throws IOException {
        // Position to the first block of the entry list.
        int block = header.getEntriesStart();
        input.position((block + 1) * BLOCK_SIZE);
        
        // The root entry is always entry #0.
        return new DefaultEntry(this, input, 0);
    }


    /**
     * Returns the entry with the given number.
     * 
     * @param entryNumber   the number of the entry to return
     * @return   that entry, or null if no such entry exists
     * @throws IOException  if an IO error occurs
     */
    protected Entry getEntry(int entryNumber) throws IOException {
        // Entry numbers < 0 typically indicate an end-of-stream.
        if (entryNumber < 0) {
            return null;
        }
        
        // It's impossible to check against the upper bound, because the
        // upper bound is not recorded anywhere.
        
        // Advance to the block containing the desired entry.
        int blockCount = entryNumber / ENTRIES_PER_BLOCK;
        int remainder = entryNumber % ENTRIES_PER_BLOCK;        
        int block = header.getEntriesStart();
        for (int i = 0; i < blockCount; i++) {
            block = getNextBlock(block);
        }
        
        if (block < 0) {
            // Given entry number exceeded the number of available entries.
            return null;
        }

        int filePos = (block + 1) * BLOCK_SIZE + remainder * ENTRY_SIZE;
        input.position(filePos);
        
        return new DefaultEntry(this, input, entryNumber);
    }


    public int getNextBlock(int block) throws IOException {
        if (block < 0) {
            return block;
        }
        
        // Index into the header array of BAT blocks.
        int headerIndex = block / POINTERS_PER_BAT;
        
        // Index within that BAT block of the block we're interested in.
        int batBlockIndex = block % POINTERS_PER_BAT;

        int batBlockNumber = batLookup(headerIndex);
        ByteBuffer batBlock = getBATBlock(batBlockNumber);
        return batBlock.getInt(batBlockIndex * BAT_POINTER_SIZE);
    }

    
    /**
     * Looks up the block number of a BAT block.
     * 
     * @param headerIndex  
     * @return
     * @throws IOException
     */
    private int batLookup(int headerIndex) throws IOException {
        if (headerIndex < HEADER_BAT_LIMIT + 1) {
            return header.getBATBlockNumber(headerIndex);
        }
        
        // Find the XBAT block of interest
        headerIndex -= HEADER_BAT_LIMIT;
        int xbatBlockNumber = headerIndex / POINTERS_PER_BAT;
        xbatBlockNumber += header.getExtendedBATStart();
        ByteBuffer xbat = getBATBlock(xbatBlockNumber);

        // Find the bat Block number inside the XBAT block
        int xbatBlockIndex = headerIndex % POINTERS_PER_BAT;
        return xbat.getInt(xbatBlockIndex * BAT_POINTER_SIZE);
    }

    
    /**
     * Returns the BAT block with the given block number.
     * If the BAT block were previously cached, then the cached version
     * is returned.  Otherwise, the file pointer is repositioned to 
     * the start of the given block, and the 512 bytes are read and
     * stored in the cache.
     * 
     * @param block   the block number of the BAT block to return
     * @return   the BAT block
     * @throws IOException
     */
    private ByteBuffer getBATBlock(int block) throws IOException {
        ByteBuffer r = cache.get(block);
        if (r != null) {
            return r;
        }

        byte[] buf = new byte[BLOCK_SIZE];
        input.position((block + 1) * BLOCK_SIZE);
        ArchiveUtils.readFully(input, buf);

        r = ByteBuffer.wrap(buf);
        r.order(ByteOrder.LITTLE_ENDIAN);
        cache.put(block, r);
        return r;
    }


    public SeekInputStream getRawInput() {
        return input;
    }
}