org.archive.util.ms.PieceTable Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of heritrix-commons Show documentation
The Archive Commons Code Libraries project contains general Java utility libraries, as used by the Heritrix crawler and other projects.
The newest version!
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.util.ms;

import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.io.BufferedSeekInputStream;
import org.archive.io.Endian;
import org.archive.io.OriginSeekInputStream;
import org.archive.io.SafeSeekInputStream;
import org.archive.io.SeekInputStream;


/**
 * The piece table of a .doc file.  
 * 
 * The piece table maps logical character positions of a document's text
 * stream to actual file stream positions.  The piece table is stored as two
 * parallel arrays.  The first array contains 32-bit integers representing
 * the logical character positions.  The second array contains 64-bit data
 * structures that are mostly mysterious to me, except that they contain a
 * 32-bit subfile offset.  The second array is stored immediately after the
 * first array.  I call the first array the charPos array and the 
 * second array the filePos array.
 * 
 * 
 * <p>The arrays are preceded by a special tag byte (2), followed by the
 * combined size of both arrays in bytes.  The number of piece table entries 
 * must be deduced from this byte size.  
 * 
 * 
 * <p>Because of this bizarre structure, caching piece table entries is 
 * something of a challenge.  A single piece table entry is actually located
 * in two different file locations.  If there are many piece table entries,
 * then the charPos and filePos information may be separated by many bytes,
 * potentially crossing block boundaries.  The approach I took was to use
 * two different buffered streams.  Up to n charPos offsets and n filePos
 * structures can be buffered in the two streams, preventing any file seeking
 * from occurring when looking up piece information.  (File seeking must 
 * still occur to jump from one piece to the next.)
 * 
 * 
 * <p>Note that the vast majority of .doc files in the world will have exactly
 * 1 piece table entry, representing the complete text of the document.  Only
 * those documents that were "fast-saved" should have multiple pieces.
 * 
 * Finally, the text contained in a .doc file can either contain 16-bit
 * unicode characters (charset UTF-16LE) or 8-bit CP1252 characters.  One
 * .doc file can contain both kinds of pieces.  Whether or not a piece is
 * Cp1252 is stored as a flag in the filePos value, bizarrely enough.  If
 * the flag is set, then the actual file position is the filePos with the
 * flag cleared, then divided by 2.
 * 
 * @author pjack
 */
class PieceTable {

    private static final Logger LOGGER
     = Logger.getLogger(PieceTable.class.getName());

    /** The bit that indicates if a piece uses Cp1252 or unicode. */
    protected static final int CP1252_INDICATOR = 1 << 30;
    
    /** The mask to use to clear the Cp1252 flag bit (clears the top two bits). */
    protected static final int CP1252_MASK = ~(3 << 30);

    /** The total number of pieces in the table. */
    private final int count;
    
    /** The total number of characters in the text stream. */
    private final int maxCharPos;

    /** The index of the next piece that {@link #next()} will read. */
    private int current;
    
    /** 
     * The most recently returned piece from this table, or null if no piece
     * has been returned yet or the end of the table was reached.
     */
    private Piece currentPiece;


    /** The buffered stream that provides character position information. */
    private final SeekInputStream charPos;
    
    /** The buffered stream that provides file pointer information. */
    private final SeekInputStream filePos;


    /**
     * Constructor.  Positions the table stream at the given offset, skips
     * any leading property records, reads the table's byte size and derives
     * the number of pieces from it, then sets up the two buffered views 
     * used by {@link #next()}.
     * 
     * @param tableStream   the stream containing the piece table
     * @param offset        the starting offset of the piece table
     * @param maxCharPos     the total number of characters in the document
     * @param cachedRecords  the number of piece table entries to cache
     * @throws IOException   if an IO error occurs, or if the size field
     *   yields a negative piece count
     * @throws IllegalStateException  if the piece table tag is not found
     */
    public PieceTable(SeekInputStream tableStream, int offset, 
            int maxCharPos, int cachedRecords) throws IOException {
        tableStream.position(offset);
        skipProperties(tableStream);
        int sizeInBytes = Endian.littleInt(tableStream);
        
        // Each piece contributes one 4-byte character position and one
        // 8-byte file position structure; the extra 4 bytes are the leading
        // character position that is skipped below.
        this.count = (sizeInBytes - 4) / 12;
        if (count < 0) {
            // A negative count cannot describe a table and would otherwise
            // be passed on as a negative buffer size.
            throw new IOException("Invalid piece table size: " + sizeInBytes);
        }
        cachedRecords = Math.min(cachedRecords, count);
        
        // Skip the first character position; piece limits start after it.
        long tp = tableStream.position() + 4;
        long charPosStart = tp;
        // The file position array follows the character positions; the 
        // additional 2 bytes step over the start of each 8-byte structure
        // so that reads land on the 32-bit offset inside it.
        long filePosStart = tp + count * 4 + 2;
        
        this.filePos = wrap(tableStream, filePosStart, cachedRecords * 8);
        this.charPos = wrap(tableStream, charPosStart, cachedRecords * 4);
        this.maxCharPos = maxCharPos;
        
        if (LOGGER.isLoggable(Level.FINEST)) {
            LOGGER.finest("Size in bytes: " + sizeInBytes);
            LOGGER.finest("Piece table count: " + count);
            for (Piece piece = next(); piece != null; piece = next()) {
                LOGGER.finest("#" + current + ": " + piece.toString());
            }
            // Rewind so the caller starts from the first piece.  The loop
            // above ended with next() returning null, which also cleared
            // currentPiece.
            current = 0;
        }
    }
    
    
    /**
     * Wraps the raw table stream.  This is used to create the charPos and
     * filePos streams.  The streams that this method returns are "safe",
     * meaning that the charPos and filePos position() fields never clobber
     * each other.  They are buffered, meaning that up to n elements
     * can be read before the disk is accessed again.  And they are "origined",
     * meaning result.position(0) actually positions the stream at the 
     * beginning of the piece table array, not the beginning of the file.
     * 
     * @param input   the stream to wrap
     * @param pos     the origin for the returned stream
     * @param cache   the number of bytes for the returned stream to buffer
     * @return   the wrapped stream
     * @throws IOException  if an IO error occurs
     */
    private SeekInputStream wrap(SeekInputStream input, long pos, int cache) 
    throws IOException {
        input.position(pos);
        SeekInputStream r = new SafeSeekInputStream(input);
        r = new OriginSeekInputStream(r, pos);
        r = new BufferedSeekInputStream(r, cache);
        return r;
    }
    
    
    /**
     * Skips over any property information that may precede a piece table.
     * These property structures contain stylesheet information that applies
     * to the piece table.  Since we're only interested in the text itself,
     * we just ignore this property stuff.  (I suppose a third buffered
     * stream could be used to add style information to {@link Piece}, but
     * we don't need it.)
     * 
     * <p>Each property record is a tag byte of 1 followed by a 16-bit 
     * little-endian size and that many bytes of data.  On return the stream
     * is positioned just after the piece table's tag byte (2).
     * 
     * @param input  the input stream containing the piece table
     * @throws IOException  if an IO error occurs, or if the stream ends in
     *   the middle of a property record
     * @throws IllegalStateException  if the byte following the property
     *   records is not the piece table tag
     */
    private static void skipProperties(SeekInputStream input) throws IOException {
        int tag = input.read();
        while (tag == 1) {
            int size = Endian.littleChar(input);
            while (size > 0) {
                long skipped = input.skip(size);
                if (skipped <= 0) {
                    // skip() is allowed to make no progress; without this
                    // fallback a truncated stream would loop forever.
                    if (input.read() < 0) {
                        throw new IOException("Unexpected end of stream "
                                + "while skipping piece table properties");
                    }
                    skipped = 1;
                }
                size -= skipped;
            }
            tag = input.read();
        }
        if (tag != 2) {
            throw new IllegalStateException(
                    "Expected piece table tag 2 but found " + tag);
        }
    }


    /**
     * Returns the maximum character position.  Put another way, returns the
     * total number of characters in the document.
     * 
     * @return  the maximum character position
     */
    public int getMaxCharPos() {
        return maxCharPos;
    }


    /**
     * Returns the next piece in the piece table.  The returned piece starts
     * where the previously returned piece ended (or at 0 if there is no
     * previous piece).  The last piece in the table always ends at 
     * {@link #getMaxCharPos()}.
     * 
     * @return  the next piece in the piece table, or null if there is no 
     *   next piece
     * @throws IOException  if an IO error occurs
     */
    public Piece next() throws IOException {
        if (current >= count) {
            currentPiece = null;
            return null;
        }
                
        int cp;
        if (current == count - 1) {
            // The final piece is bounded by the document length rather than
            // by the stored character position.
            cp = maxCharPos;
        } else {
            charPos.position(current * 4);
            cp = Endian.littleInt(charPos);
        }
        filePos.position(current * 8);
        int encoded = Endian.littleInt(filePos);

        if (LOGGER.isLoggable(Level.FINEST)) {
            // Left-pad to 32 binary digits so the flag bits line up.
            StringBuilder sb = new StringBuilder(Integer.toBinaryString(encoded));
            while (sb.length() < 32) {
                sb.insert(0, '0');
            }
            LOGGER.finest("Encoded offset: " + sb.toString());
        }
        
        // Only advance once both reads have succeeded.
        current++;

        int start = (currentPiece == null) ? 0 : currentPiece.getCharPosLimit();
        
        // A clear indicator bit means a unicode piece whose encoded value is
        // used as-is; otherwise the flag bits are cleared and the result
        // is halved to obtain the file position.
        boolean unicode = (encoded & CP1252_INDICATOR) == 0;
        int fileOffset = unicode ? encoded : (encoded & CP1252_MASK) / 2;
        currentPiece = new Piece(fileOffset, start, cp, unicode);
        return currentPiece;
    }

    
    /**
     * Returns the piece containing the given character position.  The most
     * recently returned piece is checked first; if it does not contain the
     * position (or there is no such piece), the table is rescanned from the
     * beginning.
     * 
     * @param charPos   the character position whose piece to return
     * @return   that piece, or null if no such piece exists (if charPos 
     *   is greater than getMaxCharPos())
     * @throws IOException   if an IO error occurs
     */
    public Piece pieceFor(int charPos) throws IOException {
        // currentPiece is null before the first next() call and after any
        // scan that ran off the end of the table, so it must be guarded.
        if (currentPiece != null && currentPiece.contains(charPos)) {
            return currentPiece;
        }
     
        // FIXME: Use binary search to find piece index
        
        current = 0;
        currentPiece = null;
        for (Piece piece = next(); piece != null; piece = next()) {
            if (piece.contains(charPos)) {
                return piece;
            }
        }
        
        return null;
    }

}