// org.archive.util.ms.PieceTable Maven / Gradle / Ivy
// Show all versions of heritrix-commons Show documentation
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.util.ms;
import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.archive.io.BufferedSeekInputStream;
import org.archive.io.Endian;
import org.archive.io.OriginSeekInputStream;
import org.archive.io.SafeSeekInputStream;
import org.archive.io.SeekInputStream;
/**
 * The piece table of a .doc file.
 *
 * <p>The piece table maps logical character positions of a document's text
 * stream to actual file stream positions. The piece table is stored as two
 * parallel arrays. The first array contains 32-bit integers representing
 * the logical character positions. The second array contains 64-bit data
 * structures that are mostly mysterious to me, except that they contain a
 * 32-bit subfile offset. The second array is stored immediately after the
 * first array. I call the first array the <i>charPos</i> array and the
 * second array the <i>filePos</i> array.
 *
 * <p>The arrays are preceded by a special tag byte (2), followed by the
 * combined size of both arrays in bytes. The number of piece table entries
 * must be deduced from this byte size.
 *
 * <p>Because of this bizarre structure, caching piece table entries is
 * something of a challenge. A single piece table entry is actually located
 * in two different file locations. If there are many piece table entries,
 * then the charPos and filePos information may be separated by many bytes,
 * potentially crossing block boundaries. The approach I took was to use
 * two different buffered streams. Up to <i>n</i> charPos offsets and
 * <i>n</i> filePos structures can be buffered in the two streams, preventing
 * any file seeking from occurring when looking up piece information. (File
 * seeking must still occur to jump from one piece to the next.)
 *
 * <p>Note that the vast majority of .doc files in the world will have exactly
 * 1 piece table entry, representing the complete text of the document. Only
 * those documents that were "fast-saved" should have multiple pieces.
 *
 * <p>Finally, the text contained in a .doc file can either contain 16-bit
 * unicode characters (charset UTF-16LE) or 8-bit CP1252 characters. One
 * .doc file can contain both kinds of pieces. Whether or not a piece is
 * Cp1252 is stored as a flag in the filePos value, bizarrely enough. If
 * the flag is set, then the actual file position is the filePos with the
 * flag cleared, then divided by 2.
 *
 * @author pjack
 */
class PieceTable {

    private static final Logger LOGGER
        = Logger.getLogger(PieceTable.class.getName());

    /** The bit that indicates if a piece uses Cp1252 or unicode. */
    protected static final int CP1252_INDICATOR = 1 << 30;

    /**
     * The mask to use to clear the Cp1252 flag bit. It clears the two
     * highest bits (bits 30 and 31) of the encoded value.
     */
    protected static final int CP1252_MASK = ~(3 << 30);

    /** The total number of pieces in the table. */
    private final int count;

    /** The total number of characters in the text stream. */
    private final int maxCharPos;

    /** The index of the current piece. */
    private int current;

    /** The most recently returned piece from this table. */
    private Piece currentPiece;

    /** The buffered stream that provides character position information. */
    private final SeekInputStream charPos;

    /** The buffered stream that provides file pointer information. */
    private final SeekInputStream filePos;

    /**
     * Constructor.
     *
     * @param tableStream   the stream containing the piece table
     * @param offset        the starting offset of the piece table
     * @param maxCharPos    the total number of characters in the document
     * @param cachedRecords the number of piece table entries to cache
     * @throws IOException  if an IO error occurs, or if the size field of
     *                      the piece table yields a negative entry count
     * @throws IllegalStateException if the piece table tag byte (2) is not
     *                      found where expected
     */
    public PieceTable(SeekInputStream tableStream, int offset,
            int maxCharPos, int cachedRecords) throws IOException {
        tableStream.position(offset);
        skipProperties(tableStream);
        int sizeInBytes = Endian.littleInt(tableStream);

        // Each entry occupies 4 bytes of charPos plus 8 bytes of filePos;
        // the remaining 4 bytes are the extra leading charPos value.
        this.count = (sizeInBytes - 4) / 12;
        if (count < 0) {
            // A negative count would otherwise flow into the buffer sizes
            // below; report the corrupt table as an IO problem instead.
            throw new IOException("Corrupt piece table: size in bytes is "
                    + sizeInBytes);
        }
        cachedRecords = Math.min(cachedRecords, count);

        // The first 4 bytes after the size field are skipped -- presumably
        // the leading character position of the first piece (TODO confirm
        // against the format specification).
        long tp = tableStream.position() + 4;
        long charPosStart = tp;
        // The extra 2 bytes position the origin on the 32-bit offset inside
        // each 8-byte filePos structure. Long arithmetic avoids int overflow.
        long filePosStart = tp + count * 4L + 2;

        // Keep this order: wrap() repositions tableStream each time.
        this.filePos = wrap(tableStream, filePosStart, cachedRecords * 8);
        this.charPos = wrap(tableStream, charPosStart, cachedRecords * 4);
        this.maxCharPos = maxCharPos;

        if (LOGGER.isLoggable(Level.FINEST)) {
            LOGGER.finest("Size in bytes: " + sizeInBytes);
            LOGGER.finest("Piece table count: " + count);
            for (Piece piece = next(); piece != null; piece = next()) {
                LOGGER.finest("#" + current + ": " + piece.toString());
            }
            // Rewind so that callers start iterating from the first piece.
            current = 0;
        }
    }

    /**
     * Wraps the raw table stream. This is used to create the charPos and
     * filePos streams. The streams that this method returns are "safe",
     * meaning that the charPos and filePos position() fields never clobber
     * each other. They are buffered, meaning that up to <i>n</i> elements
     * can be read before the disk is accessed again. And they are "origined",
     * meaning result.position(0) actually positions the stream at the
     * beginning of the piece table array, not the beginning of the file.
     *
     * @param input  the stream to wrap
     * @param pos    the origin for the returned stream
     * @param cache  the number of bytes for the returned stream to buffer
     * @return   the wrapped stream
     * @throws IOException  if an IO error occurs
     */
    private SeekInputStream wrap(SeekInputStream input, long pos, int cache)
            throws IOException {
        input.position(pos);
        SeekInputStream r = new SafeSeekInputStream(input);
        r = new OriginSeekInputStream(r, pos);
        r = new BufferedSeekInputStream(r, cache);
        return r;
    }

    /**
     * Skips over any property information that may precede a piece table.
     * These property structures contain stylesheet information that applies
     * to the piece table. Since we're only interested in the text itself,
     * we just ignore this property stuff. (I suppose a third buffered
     * stream could be used to add style information to {@link Piece}, but
     * we don't need it.)
     *
     * <p>On return the stream is positioned immediately after the piece
     * table tag byte (2).
     *
     * @param input  the input stream containing the piece table
     * @throws IOException  if an IO error occurs, or if the stream ends in
     *                      the middle of a property structure
     * @throws IllegalStateException if the byte following the property
     *                      structures is not the piece table tag (2)
     */
    private static void skipProperties(SeekInputStream input) throws IOException {
        int tag = input.read();
        while (tag == 1) {
            int size = Endian.littleChar(input);
            while (size > 0) {
                long skipped = input.skip(size);
                if (skipped <= 0) {
                    // skip() is allowed to make no progress; probe with a
                    // single read so a truncated stream cannot loop forever.
                    if (input.read() < 0) {
                        throw new IOException("Unexpected end of stream while"
                                + " skipping piece table properties");
                    }
                    skipped = 1;
                }
                size -= skipped;
            }
            tag = input.read();
        }
        if (tag != 2) {
            throw new IllegalStateException(
                    "Expected piece table tag 2 but found " + tag);
        }
    }

    /**
     * Returns the maximum character position. Put another way, returns the
     * total number of characters in the document.
     *
     * @return the maximum character position
     */
    public int getMaxCharPos() {
        return maxCharPos;
    }

    /**
     * Returns the next piece in the piece table.
     *
     * @return the next piece in the piece table, or null if there is no
     *         next piece
     * @throws IOException if an IO error occurs
     */
    public Piece next() throws IOException {
        if (current >= count) {
            currentPiece = null;
            return null;
        }

        // The last piece always ends at the end of the text stream, so its
        // limit is taken from maxCharPos rather than read from the table.
        int cp;
        if (current == count - 1) {
            cp = maxCharPos;
        } else {
            charPos.position(current * 4L);
            cp = Endian.littleInt(charPos);
        }

        filePos.position(current * 8L);
        int encoded = Endian.littleInt(filePos);

        if (LOGGER.isLoggable(Level.FINEST)) {
            StringBuilder sb = new StringBuilder(Integer.toBinaryString(encoded));
            while (sb.length() < 32) {
                sb.insert(0, '0');
            }
            LOGGER.finest("Encoded offset: " + sb.toString());
        }

        current++;

        // A piece starts where the previously returned piece ended.
        int start = (currentPiece == null) ? 0 : currentPiece.getCharPosLimit();

        boolean unicode = (encoded & CP1252_INDICATOR) == 0;
        // For Cp1252 pieces the real offset is the encoded value with the
        // flag bits cleared, halved.
        int fileOffset = unicode ? encoded : (encoded & CP1252_MASK) / 2;

        currentPiece = new Piece(fileOffset, start, cp, unicode);
        return currentPiece;
    }

    /**
     * Returns the piece containing the given character position.
     *
     * <p>The most recently returned piece is checked first; otherwise the
     * table is rescanned from its first entry. This method may be called
     * at any time, including before {@link #next()} has ever been called
     * and after a previous lookup returned null.
     *
     * @param charPos   the character position whose piece to return
     * @return   that piece, or null if no such piece exists (if charPos
     *   is greater than getMaxCharPos())
     * @throws IOException   if an IO error occurs
     */
    public Piece pieceFor(int charPos) throws IOException {
        // currentPiece is null before the first read and after the end of
        // the table has been reached, so it must be checked before use.
        if (currentPiece != null && currentPiece.contains(charPos)) {
            return currentPiece;
        }

        // FIXME: Use binary search to find piece index
        current = 0;
        currentPiece = null;
        for (Piece piece = next(); piece != null; piece = next()) {
            if (piece.contains(charPos)) {
                return piece;
            }
        }
        return null;
    }
}