All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.twelvemonkeys.io.ole2.CompoundDocument Maven / Gradle / Ivy

The newest version!
/*
 * Copyright (c) 2008, Harald Kuhr
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * * Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * * Neither the name of the copyright holder nor the names of its
 *   contributors may be used to endorse or promote products derived from
 *   this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

package com.twelvemonkeys.io.ole2;

import com.twelvemonkeys.io.*;
import com.twelvemonkeys.lang.StringUtil;

import javax.imageio.stream.ImageInputStream;
import java.io.*;
import java.nio.ByteOrder;
import java.util.Arrays;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.UUID;

import static com.twelvemonkeys.lang.Validate.notNull;

/**
 * Represents a read-only OLE2 compound document.
 * 

* * NOTE: This class is not synchronized. Accessing the document or its * entries from different threads, will need synchronization on the document * instance. *

* * @author Harald Kuhr * @author last modified by $Author: haku $ * @version $Id: //depot/branches/personal/haraldk/twelvemonkeys/release-2/twelvemonkeys-core/src/main/java/com/twelvemonkeys/io/ole2/CompoundDocument.java#4 $ */ public final class CompoundDocument implements AutoCloseable { // TODO: Write support... // TODO: Properties: http://support.microsoft.com/kb/186898 static final byte[] MAGIC = new byte[]{ (byte) 0xD0, (byte) 0xCF, (byte) 0x11, (byte) 0xE0, (byte) 0xA1, (byte) 0xB1, (byte) 0x1A, (byte) 0xE1, }; private static final int FREE_SID = -1; private static final int END_OF_CHAIN_SID = -2; private static final int SAT_SECTOR_SID = -3; // Sector used by SAT private static final int MSAT_SECTOR_SID = -4; // Sector used my Master SAT public static final int HEADER_SIZE = 512; /** The epoch offset of CompoundDocument time stamps */ public final static long EPOCH_OFFSET = -11644477200000L; private final DataInput input; private UUID uUID; private int sectorSize; private int shortSectorSize; private int directorySId; private int minStreamSize; private int shortSATSId; private int shortSATSize; // Master Sector Allocation Table private int[] masterSAT; private int[] SAT; private int[] shortSAT; private Entry rootEntry; private SIdChain shortStreamSIdChain; private SIdChain directorySIdChain; /** * Creates a (for now) read only {@code CompoundDocument}. *

* Warning! You must invoke {@link #close()} on the compound document * created from this constructor when done, to avoid leaking file * descriptors. *

* * @param file the file to read from * * @throws IOException if an I/O exception occurs while reading the header */ public CompoundDocument(final File file) throws IOException { // TODO: We need to close this (or it's underlying RAF)! Otherwise we're leaking file descriptors! input = new LittleEndianRandomAccessFile(FileUtil.resolve(file), "r"); // TODO: Might be better to read header on first read operation?! // OTOH: It's also good to be fail-fast, so at least we should make // sure we're reading a valid document readHeader(); } /** * Creates a read only {@code CompoundDocument}. * * @param pInput the input to read from. * * @throws IOException if an I/O exception occurs while reading the header */ public CompoundDocument(final InputStream pInput) throws IOException { this(new MemoryCacheSeekableStream(pInput)); } // For testing only, consider exposing later CompoundDocument(final SeekableInputStream stream) throws IOException { input = new SeekableLittleEndianDataInputStream(stream); // TODO: Might be better to read header on first read operation?! // OTOH: It's also good to be fail-fast, so at least we should make // sure we're reading a valid document readHeader(); } /** * Creates a read only {@code CompoundDocument}. * * @param input the input to read from * * @throws IOException if an I/O exception occurs while reading the header */ public CompoundDocument(final ImageInputStream input) throws IOException { this.input = notNull(input, "input"); // This implementation only supports little endian (Intel) CompoundDocuments input.setByteOrder(ByteOrder.LITTLE_ENDIAN); // TODO: Might be better to read header on first read operation?! // OTOH: It's also good to be fail-fast, so at least we should make // sure we're reading a valid document readHeader(); } /** * This method will close the underlying {@link RandomAccessFile} if any, * but will leave any stream created outside the document open. * * @see #CompoundDocument(File) * @see RandomAccessFile#close() * * @throws IOException if an I/O error occurs. */ @Override public void close() throws IOException { if (input instanceof RandomAccessFile) { ((RandomAccessFile) input).close(); } else if (input instanceof LittleEndianRandomAccessFile) { ((LittleEndianRandomAccessFile) input).close(); } // Other streams are left open } public static boolean canRead(final DataInput pInput) { return canRead(pInput, true); } // TODO: Refactor.. Figure out what we really need to expose to ImageIO for // easy reading of the Thumbs.db file // It's probably safer to create one version for InputStream and one for File private static boolean canRead(final DataInput pInput, final boolean pReset) { long pos = FREE_SID; if (pReset) { try { if (pInput instanceof InputStream && ((InputStream) pInput).markSupported()) { ((InputStream) pInput).mark(8); } else if (pInput instanceof ImageInputStream) { ((ImageInputStream) pInput).mark(); } else if (pInput instanceof RandomAccessFile) { pos = ((RandomAccessFile) pInput).getFilePointer(); } else if (pInput instanceof LittleEndianRandomAccessFile) { pos = ((LittleEndianRandomAccessFile) pInput).getFilePointer(); } else { return false; } } catch (IOException ignore) { return false; } } try { byte[] magic = new byte[8]; pInput.readFully(magic); return Arrays.equals(magic, MAGIC); } catch (IOException ignore) { // Ignore } finally { if (pReset) { try { if (pInput instanceof InputStream && ((InputStream) pInput).markSupported()) { ((InputStream) pInput).reset(); } else if (pInput instanceof ImageInputStream) { ((ImageInputStream) pInput).reset(); } else if (pInput instanceof RandomAccessFile) { ((RandomAccessFile) pInput).seek(pos); } else if (pInput instanceof LittleEndianRandomAccessFile) { ((LittleEndianRandomAccessFile) pInput).seek(pos); } } catch (IOException e) { // TODO: This isn't actually good enough... // Means something fucked up, and will fail... e.printStackTrace(); } } } return false; } private void readHeader() throws IOException { if (masterSAT != null) { return; } if (!canRead(input, false)) { throw new CorruptDocumentException("Not an OLE 2 Compound Document"); } // UID (seems to be all 0s) uUID = new UUID(input.readLong(), input.readLong()); // System.out.println("uUID: " + uUID); // int version = input.readUnsignedShort(); // System.out.println("version: " + version); // int revision = input.readUnsignedShort(); // System.out.println("revision: " + revision); int byteOrder = input.readUnsignedShort(); // System.out.printf("byteOrder: 0x%04x\n", byteOrder); if (byteOrder == 0xffff) { throw new CorruptDocumentException("Cannot read big endian OLE 2 Compound Documents"); } else if (byteOrder != 0xfffe) { // Reversed, as I'm already reading little-endian throw new CorruptDocumentException(String.format("Unknown byte order marker: 0x%04x, expected 0xfffe or 0xffff", byteOrder)); } sectorSize = 1 << input.readUnsignedShort(); // System.out.println("sectorSize: " + sectorSize + " bytes"); shortSectorSize = 1 << input.readUnsignedShort(); // System.out.println("shortSectorSize: " + shortSectorSize + " bytes"); // Reserved if (skipBytesFully(10) != 10) { throw new CorruptDocumentException(); } int SATSize = input.readInt(); // System.out.println("normalSATSize: " + SATSize); directorySId = input.readInt(); // System.out.println("directorySId: " + directorySId); // Reserved if (skipBytesFully(4) != 4) { throw new CorruptDocumentException(); } minStreamSize = input.readInt(); // System.out.println("minStreamSize: " + minStreamSize + " bytes"); shortSATSId = input.readInt(); // System.out.println("shortSATSId: " + shortSATSId); shortSATSize = input.readInt(); // System.out.println("shortSATSize: " + shortSATSize); int masterSATSId = input.readInt(); // System.out.println("masterSATSId: " + masterSATSId); int masterSATSize = input.readInt(); // System.out.println("masterSATSize: " + masterSATSize); // Read masterSAT: 436 bytes, containing up to 109 SIDs //System.out.println("MSAT:"); masterSAT = new int[SATSize]; final int headerSIds = Math.min(SATSize, 109); for (int i = 0; i < headerSIds; i++) { masterSAT[i] = input.readInt(); //System.out.println("\tSID(" + i + "): " + masterSAT[i]); } if (masterSATSId == END_OF_CHAIN_SID) { // End of chain int freeSIdLength = 436 - (SATSize * 4); if (skipBytesFully(freeSIdLength) != freeSIdLength) { throw new CorruptDocumentException(); } } else { // Parse the SIDs in the extended MasterSAT sectors... seekToSId(masterSATSId, FREE_SID); int index = headerSIds; for (int i = 0; i < masterSATSize; i++) { for (int j = 0; j < 127; j++) { int sid = input.readInt(); switch (sid) { case FREE_SID:// Free break; default: masterSAT[index++] = sid; break; } } int next = input.readInt(); if (next == END_OF_CHAIN_SID) {// End of chain break; } seekToSId(next, FREE_SID); } } } private int skipBytesFully(final int n) throws IOException { int toSkip = n; while (toSkip > 0) { int skipped = input.skipBytes(n); if (skipped <= 0) { break; } toSkip -= skipped; } return n - toSkip; } private void readSAT() throws IOException { if (SAT != null) { return; } final int intsPerSector = sectorSize / 4; // Read the Sector Allocation Table SAT = new int[masterSAT.length * intsPerSector]; for (int i = 0; i < masterSAT.length; i++) { seekToSId(masterSAT[i], FREE_SID); for (int j = 0; j < intsPerSector; j++) { int nextSID = input.readInt(); int index = (j + (i * intsPerSector)); SAT[index] = nextSID; } } // Read the short-stream Sector Allocation Table SIdChain chain = getSIdChain(shortSATSId, FREE_SID); shortSAT = new int[shortSATSize * intsPerSector]; for (int i = 0; i < shortSATSize; i++) { seekToSId(chain.get(i), FREE_SID); for (int j = 0; j < intsPerSector; j++) { int nextSID = input.readInt(); int index = (j + (i * intsPerSector)); shortSAT[index] = nextSID; } } } /** * Gets the SIdChain for the given stream Id * * @param pSId the stream Id * @param pStreamSize the size of the stream, or -1 for system control streams * @return the SIdChain for the given stream Id * @throws IOException if an I/O exception occurs */ private SIdChain getSIdChain(final int pSId, final long pStreamSize) throws IOException { SIdChain chain = new SIdChain(); int[] sat = isShortStream(pStreamSize) ? shortSAT : SAT; int sid = pSId; while (sid != END_OF_CHAIN_SID && sid != FREE_SID) { chain.addSID(sid); sid = sat[sid]; } return chain; } private boolean isShortStream(final long pStreamSize) { return pStreamSize != FREE_SID && pStreamSize < minStreamSize; } /** * Seeks to the start pos for the given stream Id * * @param pSId the stream Id * @param pStreamSize the size of the stream, or -1 for system control streams * @throws IOException if an I/O exception occurs */ private void seekToSId(final int pSId, final long pStreamSize) throws IOException { long pos; if (isShortStream(pStreamSize)) { // The short stream is not continuous... Entry root = getRootEntry(); if (shortStreamSIdChain == null) { shortStreamSIdChain = getSIdChain(root.startSId, root.streamSize); } // System.err.println("pSId: " + pSId); int shortPerSId = sectorSize / shortSectorSize; // System.err.println("shortPerSId: " + shortPerSId); int offset = pSId / shortPerSId; // System.err.println("offset: " + offset); int shortOffset = pSId - (offset * shortPerSId); // System.err.println("shortOffset: " + shortOffset); // System.err.println("shortStreamSIdChain.offset: " + shortStreamSIdChain.get(offset)); pos = HEADER_SIZE + (shortStreamSIdChain.get(offset) * (long) sectorSize) + (shortOffset * (long) shortSectorSize); // System.err.println("pos: " + pos); } else { pos = HEADER_SIZE + pSId * (long) sectorSize; } if (input instanceof LittleEndianRandomAccessFile) { ((LittleEndianRandomAccessFile) input).seek(pos); } else if (input instanceof ImageInputStream) { ((ImageInputStream) input).seek(pos); } else { ((SeekableLittleEndianDataInputStream) input).seek(pos); } } private void seekToDId(final int pDId) throws IOException { if (directorySIdChain == null) { directorySIdChain = getSIdChain(directorySId, FREE_SID); } int dIdsPerSId = sectorSize / Entry.LENGTH; int sIdOffset = pDId / dIdsPerSId; int dIdOffset = pDId - (sIdOffset * dIdsPerSId); int sId = directorySIdChain.get(sIdOffset); seekToSId(sId, FREE_SID); if (input instanceof LittleEndianRandomAccessFile) { LittleEndianRandomAccessFile input = (LittleEndianRandomAccessFile) this.input; input.seek(input.getFilePointer() + dIdOffset * Entry.LENGTH); } else if (input instanceof ImageInputStream) { ImageInputStream input = (ImageInputStream) this.input; input.seek(input.getStreamPosition() + dIdOffset * Entry.LENGTH); } else { SeekableLittleEndianDataInputStream input = (SeekableLittleEndianDataInputStream) this.input; input.seek(input.getStreamPosition() + dIdOffset * Entry.LENGTH); } } SeekableInputStream getInputStreamForSId(final int pStreamId, final int pStreamSize) throws IOException { SIdChain chain = getSIdChain(pStreamId, pStreamSize); // TODO: Detach? Means, we have to copy to a byte buffer, or keep track of // positions, and seek back and forth (would be cool, but difficult).. int sectorSize = pStreamSize < minStreamSize ? shortSectorSize : this.sectorSize; return new MemoryCacheSeekableStream(new Stream(chain, pStreamSize, sectorSize, this)); } private InputStream getDirectoryStreamForDId(final int pDirectoryId) throws IOException { // This is always exactly 128 bytes, so we'll just read it all, // and buffer (we might want to optimize this later). byte[] bytes = new byte[Entry.LENGTH]; seekToDId(pDirectoryId); input.readFully(bytes); return new ByteArrayInputStream(bytes); } Entry getEntry(final int pDirectoryId, Entry pParent) throws IOException { Entry entry = Entry.readEntry(new LittleEndianDataInputStream( getDirectoryStreamForDId(pDirectoryId) )); entry.parent = pParent; entry.document = this; return entry; } SortedSet getEntries(final int pDirectoryId, final Entry pParent) throws IOException { return getEntriesRecursive(pDirectoryId, pParent, new TreeSet()); } private SortedSet getEntriesRecursive(final int pDirectoryId, final Entry pParent, final SortedSet pEntries) throws IOException { //System.out.println("pDirectoryId: " + pDirectoryId); Entry entry = getEntry(pDirectoryId, pParent); //System.out.println("entry: " + entry); if (!pEntries.add(entry)) { // TODO: This occurs in some Thumbs.db files, and Windows will // still parse the file gracefully somehow... // Deleting and regenerating the file will remove the cyclic // references, but... How can Windows parse this file? throw new CorruptDocumentException("Cyclic chain reference for entry: " + pDirectoryId); } if (entry.prevDId != FREE_SID) { //System.out.println("prevDId: " + entry.prevDId); getEntriesRecursive(entry.prevDId, pParent, pEntries); } if (entry.nextDId != FREE_SID) { //System.out.println("nextDId: " + entry.nextDId); getEntriesRecursive(entry.nextDId, pParent, pEntries); } return pEntries; } /*public*/ Entry getEntry(String pPath) throws IOException { if (StringUtil.isEmpty(pPath) || !pPath.startsWith("/")) { throw new IllegalArgumentException("Path must be absolute, and contain a valid path: " + pPath); } Entry entry = getRootEntry(); if (pPath.equals("/")) { // '/' means root entry return entry; } else { // Otherwise get children recursively: String[] pathElements = StringUtil.toStringArray(pPath, "/"); for (String pathElement : pathElements) { entry = entry.getChildEntry(pathElement); // No such child... if (entry == null) { break;// TODO: FileNotFoundException? Should behave like Entry.getChildEntry!! } } return entry; } } public Entry getRootEntry() throws IOException { if (rootEntry == null) { readSAT(); rootEntry = getEntry(0, null); if (rootEntry.type != Entry.ROOT_STORAGE) { throw new CorruptDocumentException("Invalid root storage type: " + rootEntry.type); } } return rootEntry; } // This is useless, as most documents on file have all-zero UUIDs... // @Override // public int hashCode() { // return uUID.hashCode(); // } // // @Override // public boolean equals(final Object pOther) { // if (pOther == this) { // return true; // } // // if (pOther == null) { // return true; // } // // if (pOther.getClass() == getClass()) { // return uUID.equals(((CompoundDocument) pOther).uUID); // } // // return false; // } @Override public String toString() { return String.format( "%s[uuid: %s, sector size: %d/%d bytes, directory SID: %d, master SAT: %s entries]", getClass().getSimpleName(), uUID, sectorSize, shortSectorSize, directorySId, masterSAT.length ); } /** * Converts the given time stamp to standard Java time representation, * milliseconds since January 1, 1970. * The time stamp parameter is assumed to be in units of * 100 nano seconds since January 1, 1601. *

* If the timestamp is {@code 0L} (meaning not specified), no conversion * is done, to behave like {@code java.io.File}. *

* * @param pMSTime an unsigned long value representing the time stamp (in * units of 100 nano seconds since January 1, 1601). * * @return the time stamp converted to Java time stamp in milliseconds, * or {@code 0L} if {@code pMSTime == 0L} */ public static long toJavaTimeInMillis(final long pMSTime) { // NOTE: The time stamp field is an unsigned 64-bit integer value that // contains the time elapsed since 1601-Jan-01 00:00:00 (Gregorian // calendar). // One unit of this value is equal to 100 nanoseconds). // That means, each second the time stamp value will be increased by // 10 million units. if (pMSTime == 0L) { return 0L; // This is just less confusing... } // Convert to milliseconds (signed), // then convert to Java std epoch (1970-Jan-01 00:00:00) return ((pMSTime >> 1) / 5000) + EPOCH_OFFSET; } static class Stream extends InputStream { private final SIdChain chain; private final CompoundDocument document; private final long length; private long streamPos; private int nextSectorPos; private byte[] buffer; private int bufferPos; public Stream(SIdChain chain, int streamSize, int sectorSize, CompoundDocument document) { this.chain = chain; this.length = streamSize; this.buffer = new byte[sectorSize]; this.bufferPos = buffer.length; this.document = document; } @Override public int available() throws IOException { return (int) Math.min(buffer.length - bufferPos, length - streamPos); } public int read() throws IOException { if (available() <= 0) { if (!fillBuffer()) { return -1; } } streamPos++; return buffer[bufferPos++] & 0xff; } private boolean fillBuffer() throws IOException { if (streamPos < length && nextSectorPos < chain.length()) { // TODO: Sync on document.input here, and we are completely detached... :-) // TODO: Update: We also need to sync other places... :-P synchronized (document) { document.seekToSId(chain.get(nextSectorPos), length); document.input.readFully(buffer); } nextSectorPos++; bufferPos = 0; return true; } return false; } @Override public int read(byte b[], int off, int len) throws IOException { if (available() <= 0) { if (!fillBuffer()) { return -1; } } int toRead = Math.min(len, available()); System.arraycopy(buffer, bufferPos, b, off, toRead); bufferPos += toRead; streamPos += toRead; return toRead; } @Override public void close() throws IOException { buffer = null; } } static class SeekableLittleEndianDataInputStream extends LittleEndianDataInputStream implements Seekable { private final SeekableInputStream seekable; public SeekableLittleEndianDataInputStream(final SeekableInputStream pInput) { super(pInput); seekable = pInput; } public void seek(final long pPosition) throws IOException { seekable.seek(pPosition); } public boolean isCachedFile() { return seekable.isCachedFile(); } public boolean isCachedMemory() { return seekable.isCachedMemory(); } public boolean isCached() { return seekable.isCached(); } public long getStreamPosition() throws IOException { return seekable.getStreamPosition(); } public long getFlushedPosition() throws IOException { return seekable.getFlushedPosition(); } public void flushBefore(final long pPosition) throws IOException { seekable.flushBefore(pPosition); } public void flush() throws IOException { seekable.flush(); } @Override public void reset() throws IOException { seekable.reset(); } public void mark() { seekable.mark(); } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy