
org.apache.tika.parser.microsoft.onenote.OneNotePtr Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft.onenote;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.codec.binary.Hex;
import org.apache.commons.io.EndianUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.TikaMemoryLimitException;
/**
* This is the main class used during parsing. This will contain an offset position and end
* position for reading bytes from the byte stream.
*
* It contains all the deserialize methods used to read the different data elements from a one
* note file.
*
* You can construct a new one note pointer and it will reposition the byte channel and will
* read until the end position is reached.
*/
class OneNotePtr {
// Value the 8-byte footer must have; checked at the end of deserializeFileNodeListFragment.
public static final long FOOTER_CONST = 0x8BC215C38233BA4BL;
public static final String UNKNOWN = "unknown";
public static final int IFNDF_GUID_LENGTH = 38; // 36 char guid with a { and a } char.
// Number of bytes deserializeHeader reads (and discards) after the last parsed header field.
public static final int NUM_RESERVED_BYTES_AT_END_OF_HEADER = 728;
// NOTE(review): the logger is registered under OneNoteParser rather than OneNotePtr -
// confirm this is intentional before changing the category.
private static final Logger LOG = LoggerFactory.getLogger(OneNoteParser.class);
// The characters "<ifndf>" as UTF-16LE bytes (each character followed by a zero byte).
// Used as the expected prefix of a file data store reference string.
private static final byte[] IFNDF =
new byte[]{60, 0, 105, 0, 102, 0, 110, 0, 100, 0, 102, 0, 62, 0};
// When the header's file format GUID prints as this value, deserializeHeader flags the
// header as legacy/alternative packaging and stops reading.
private static final String PACKAGE_STORAGE_FILE_FORMAT_GUID =
"{638DE92F-A6D4-4BC1-9A36-B3FC2511A5B7}";
// Nesting depth of the FileNode currently being parsed; only used to indent log output.
int indentLevel = 0;
// Position in the underlying resource where this pointer reads next.
long offset;
// Upper bound of the region this pointer is currently reading.
long end;
// Document being populated while parsing.
OneNoteDocument document;
// Underlying file resource; shared between pointers created via the copy constructor.
OneNoteDirectFileResource dif;
/**
 * Creates a pointer that starts at the resource's current position and is bounded by the
 * resource's size.
 *
 * @param document                  the document being populated during parsing.
 * @param oneNoteDirectFileResource the resource the pointer reads from.
 * @throws IOException if the position or size of the resource cannot be obtained.
 */
public OneNotePtr(OneNoteDocument document, OneNoteDirectFileResource oneNoteDirectFileResource)
        throws IOException {
    this.document = document;
    this.dif = oneNoteDirectFileResource;
    this.offset = oneNoteDirectFileResource.position();
    this.end = oneNoteDirectFileResource.size();
}
/**
 * Copy constructor. The new pointer shares the document and the underlying resource with
 * the source pointer and starts with the same offset, end and indent level.
 *
 * @param oneNotePtr the pointer to copy.
 */
public OneNotePtr(OneNotePtr oneNotePtr) {
    // Shared references first, then the per-pointer cursor state.
    this.dif = oneNotePtr.dif;
    this.document = oneNotePtr.document;
    this.indentLevel = oneNotePtr.indentLevel;
    this.end = oneNotePtr.end;
    this.offset = oneNotePtr.offset;
}
/**
 * Reads the file header field by field, starting at the current offset.
 *
 * The order of the chained calls below is the order in which the fields are read from the
 * file, so it must not be rearranged.
 *
 * If the file format GUID matches {@link #PACKAGE_STORAGE_FILE_FORMAT_GUID} the header is
 * flagged as legacy/alternative packaging and returned immediately, without consuming the
 * trailing reserved bytes.
 *
 * @return the populated header.
 * @throws IOException   if reading from the underlying resource fails.
 * @throws TikaException declared for callers; not thrown directly by the code in this method.
 */
public OneNoteHeader deserializeHeader() throws IOException, TikaException {
OneNoteHeader data = new OneNoteHeader();
data.setGuidFileType(deserializeGUID()).setGuidFile(deserializeGUID())
.setGuidLegacyFileVersion(deserializeGUID()).setGuidFileFormat(deserializeGUID())
.setFfvLastCodeThatWroteToThisFile(deserializeLittleEndianInt())
.setFfvOldestCodeThatHasWrittenToThisFile(deserializeLittleEndianInt())
.setFfvNewestCodeThatHasWrittenToThisFile(deserializeLittleEndianInt())
.setFfvOldestCodeThatMayReadThisFile(deserializeLittleEndianInt())
.setFcrLegacyFreeChunkList(deserializeFileChunkReference64())
.setFcrLegacyTransactionLog(deserializeFileChunkReference64())
.setcTransactionsInLog(deserializeLittleEndianInt())
// NOTE(review): setCbExpectedFileLength is called here with a 4-byte value and again further
// down with an 8-byte value; presumably the later call overwrites this one - confirm whether
// a separate "legacy" setter was intended.
.setCbExpectedFileLength(deserializeLittleEndianInt())
.setRgbPlaceholder(deserializeLittleEndianLong())
.setFcrLegacyFileNodeListRoot(deserializeFileChunkReference64())
.setCbLegacyFreeSpaceInFreeChunkList(deserializeLittleEndianInt())
.setIgnoredZeroA(deserializeLittleEndianChar())
.setIgnoredZeroB(deserializeLittleEndianChar())
.setIgnoredZeroC(deserializeLittleEndianChar())
.setIgnoredZeroD(deserializeLittleEndianChar()).setGuidAncestor(deserializeGUID())
.setCrcName(deserializeLittleEndianInt())
.setFcrHashedChunkList(deserializeFileChunkReference64x32())
.setFcrTransactionLog(deserializeFileChunkReference64x32())
.setFcrFileNodeListRoot(deserializeFileChunkReference64x32())
.setFcrFreeChunkList(deserializeFileChunkReference64x32())
.setCbExpectedFileLength(deserializeLittleEndianLong())
.setCbFreeSpaceInFreeChunkList(deserializeLittleEndianLong())
.setGuidFileVersion(deserializeGUID())
.setnFileVersionGeneration(deserializeLittleEndianLong())
.setGuidDenyReadFileVersion(deserializeGUID())
.setGrfDebugLogFlags(deserializeLittleEndianInt())
.setFcrDebugLogA(deserializeFileChunkReference64x32())
.setFcrDebugLogB(deserializeFileChunkReference64x32())
.setBuildNumberCreated(deserializeLittleEndianInt())
.setBuildNumberLastWroteToFile(deserializeLittleEndianInt())
.setBuildNumberOldestWritten(deserializeLittleEndianInt())
.setBuildNumberNewestWritten(deserializeLittleEndianInt());
// Package-storage files are only flagged; the remaining header bytes are not read.
if (data.getGuidFileFormat().toString().equals(PACKAGE_STORAGE_FILE_FORMAT_GUID)) {
return data.setLegacyOrAlternativePackaging(true);
}
// The reserved bytes are read only to advance the offset; the buffer content is not used.
ByteBuffer reservedBytesAtEndOfHeader =
ByteBuffer.allocate(NUM_RESERVED_BYTES_AT_END_OF_HEADER);
deserializeBytes(reservedBytesAtEndOfHeader);
return data;
}
/**
 * Reads a 16 byte GUID at the current offset.
 *
 * The first three groups are byte-swapped: bytes 0-3 are reversed, and bytes 4-5 and 6-7
 * are each swapped pairwise. Bytes 8-15 are kept in file order.
 *
 * Like the other deserialize methods, the shared resource is first moved to this pointer's
 * {@code offset}. Without that step the GUID would be read from wherever the resource
 * happens to be positioned, which is wrong whenever {@code offset} was advanced
 * arithmetically or another pointer sharing the resource moved it.
 *
 * @return the GUID that was read.
 * @throws IOException if reading from the underlying resource fails.
 */
private GUID deserializeGUID() throws IOException {
    if (dif.position() != offset) {
        dif.position(offset);
    }
    int[] raw = new int[16];
    for (int i = 0; i < raw.length; ++i) {
        raw[i] = dif.read();
    }
    int[] ordered = new int[16];
    // re-order [0,1,2,3] to little endian
    ordered[0] = raw[3];
    ordered[1] = raw[2];
    ordered[2] = raw[1];
    ordered[3] = raw[0];
    // re-order [4,5] and [6,7] to little endian
    ordered[4] = raw[5];
    ordered[5] = raw[4];
    ordered[6] = raw[7];
    ordered[7] = raw[6];
    // the rest is already in right order.
    System.arraycopy(raw, 8, ordered, 8, 8);
    offset = dif.position();
    return new GUID(ordered);
}
/**
 * Reads the 728 reserved bytes found at the end of the header, starting at the current
 * offset.
 *
 * @return the bytes of the backing buffer that was read into.
 * @throws IOException if reading from the underlying resource fails.
 */
private byte[] deserializedReservedHeader() throws IOException {
    final long wanted = offset;
    if (wanted != dif.position()) {
        dif.position(wanted);
    }
    byte[] reserved = new byte[NUM_RESERVED_BYTES_AT_END_OF_HEADER];
    dif.read(ByteBuffer.wrap(reserved));
    offset = dif.position();
    return reserved;
}
/**
 * Reads a file chunk reference made of two consecutive 4-byte little endian values: the
 * location ({@code stp}) followed by the size ({@code cb}).
 *
 * @return the file chunk reference that was read.
 * @throws IOException if reading from the underlying resource fails.
 */
private FileChunkReference deserializeFileChunkReference64() throws IOException {
    final long location = deserializeLittleEndianInt();
    final long byteCount = deserializeLittleEndianInt();
    offset = dif.position();
    FileChunkReference reference = new FileChunkReference(location, byteCount);
    return reference;
}
/**
 * Reads a file chunk reference made of an 8-byte little endian location ({@code stp})
 * followed by a 4-byte little endian size ({@code cb}).
 *
 * @return the file chunk reference that was read.
 * @throws IOException if reading from the underlying resource fails.
 */
private FileChunkReference deserializeFileChunkReference64x32() throws IOException {
    final long location = deserializeLittleEndianLong();
    final long byteCount = deserializeLittleEndianInt();
    offset = dif.position();
    FileChunkReference reference = new FileChunkReference(location, byteCount);
    return reference;
}
/**
 * Reads a single value from the resource at the current offset and advances the offset by
 * one.
 *
 * @return the value returned by the resource, cast to {@code char}.
 * @throws IOException if reading from the underlying resource fails.
 */
private char deserializeLittleEndianChar() throws IOException {
    final long wanted = offset;
    if (wanted != dif.position()) {
        dif.position(wanted);
    }
    final char value = (char) dif.read();
    offset++;
    return value;
}
/**
 * Reads four bytes at the current offset and interprets them as an unsigned little endian
 * integer.
 *
 * @return the unsigned 32-bit value, widened to {@code long}.
 * @throws IOException if reading from the underlying resource fails.
 */
private long deserializeLittleEndianInt() throws IOException {
    final long wanted = offset;
    if (wanted != dif.position()) {
        dif.position(wanted);
    }
    byte[] raw = new byte[4];
    dif.read(ByteBuffer.wrap(raw));
    final long value = EndianUtils.readSwappedUnsignedInteger(raw, 0);
    offset = dif.position();
    return value;
}
/**
 * Reads eight bytes at the current offset and interprets them as a little endian long.
 *
 * @return the 64-bit value that was read.
 * @throws IOException if reading from the underlying resource fails.
 */
private long deserializeLittleEndianLong() throws IOException {
    final long wanted = offset;
    if (wanted != dif.position()) {
        dif.position(wanted);
    }
    byte[] raw = new byte[8];
    dif.read(ByteBuffer.wrap(raw));
    final long value = EndianUtils.readSwappedLong(raw, 0);
    offset = dif.position();
    return value;
}
/**
 * Reads two single values at the current offset and combines their low eight bits into a
 * little endian 16-bit number (first value is the low byte).
 *
 * @return the 16-bit value, widened to {@code long}.
 * @throws IOException if reading from the underlying resource fails.
 */
private long deserializeLittleEndianShort() throws IOException {
    final long wanted = offset;
    if (wanted != dif.position()) {
        dif.position(wanted);
    }
    final int low = dif.read();
    final int high = dif.read();
    offset = dif.position();
    // The two masked bytes never overlap, so OR-ing them is the same as adding them.
    return (low & 0xff) | ((high & 0xff) << 8);
}
/**
 * Builds the indentation prefix used in log messages.
 *
 * @return one padding string per current indent level, concatenated.
 */
private String getIndent() {
    StringBuilder padding = new StringBuilder();
    int remaining = indentLevel;
    while (remaining > 0) {
        padding.append(" ");
        --remaining;
    }
    return padding.toString();
}
/**
 * Moves this pointer (and the underlying resource) to the start of the given chunk and
 * sets the end of the readable region to the chunk's start plus its size.
 *
 * @param loc the chunk to position on.
 * @throws IOException if the underlying resource cannot be repositioned.
 */
public void reposition(FileChunkReference loc) throws IOException {
    // Same steps as reposition(long), written out inline.
    this.offset = loc.stp;
    dif.position(this.offset);
    this.end = this.offset + loc.cb;
}
/**
 * Moves this pointer and the underlying resource to the given absolute position. The end
 * of the readable region is left untouched.
 *
 * @param offset the absolute position to move to.
 * @throws IOException if the underlying resource cannot be repositioned.
 */
private void reposition(long offset) throws IOException {
    final long target = offset;
    this.offset = target;
    this.dif.position(target);
}
/**
 * Parses file node list fragments, one after another, until a fragment reports a nil
 * reference for its successor.
 *
 * A file node list can be split over several FileNodeListFragment structures. Each
 * fragment holds a sub-sequence of the list's FileNode structures and tells whether, and
 * where, another fragment follows.
 *
 * When specifying the structure of a specific file node list in this document, the division
 * of the list into fragments is ignored and FileNode structures with FileNode.FileNodeID
 * field values equal to 0x0FF ("ChunkTerminatorFND") are not specified.
 *
 * @param ptr          The pointer positioned on the first fragment.
 * @param fileNodeList The file node list to populate as we parse.
 * @param curPath      The current FileNodePtr.
 * @return The pointer that parsed the last fragment.
 * @throws IOException   if reading from the underlying resource fails.
 * @throws TikaException if a fragment cannot be parsed.
 */
public OneNotePtr internalDeserializeFileNodeList(OneNotePtr ptr, FileNodeList fileNodeList,
        FileNodePtr curPath) throws IOException, TikaException {
    // One extra pointer is reused for every fragment after the first.
    OneNotePtr continuation = new OneNotePtr(document, dif);
    FileNodePtrBackPush pathGuard = new FileNodePtrBackPush(curPath);
    OneNotePtr cursor = ptr;
    try {
        for (;;) {
            FileChunkReference nextFragment = FileChunkReference.nil();
            cursor.deserializeFileNodeListFragment(fileNodeList, nextFragment, curPath);
            boolean lastFragment = FileChunkReference.nil().equals(nextFragment);
            if (lastFragment) {
                return cursor;
            }
            continuation.reposition(nextFragment);
            cursor = continuation;
        }
    } finally {
        pathGuard.dec();
    }
}
/**
 * Parses a complete file node list starting at this pointer.
 *
 * @param fileNodeList The file node list to populate as we parse.
 * @param curPath      The current FileNodePtr.
 * @return The pointer that parsed the last fragment of the list.
 * @throws IOException   if reading from the underlying resource fails.
 * @throws TikaException if the list cannot be parsed.
 */
public OneNotePtr deserializeFileNodeList(FileNodeList fileNodeList, FileNodePtr curPath)
        throws IOException, TikaException {
    final OneNotePtr startingPoint = this;
    OneNotePtr lastPointer =
            internalDeserializeFileNodeList(startingPoint, fileNodeList, curPath);
    return lastPointer;
}
/**
 * Deserializes one FileNodeListFragment.
 *
 * A FileNodeListFragment holds a sequence of file nodes from a file node list. Its size is
 * given by the structure that references it, which is why the reference to the next
 * fragment and the footer are read relative to {@code end}.
 *
 * All fragments in the same file node list MUST have the same FileNodeListFragment.header
 * .FileNodeListID field.
 *
 * @param data    List of file nodes that we collect while deserializing.
 * @param next    Receives the location and size of the next fragment.
 * @param curPath The current FileNodePtr.
 * @throws IOException   if reading from the underlying resource fails.
 * @throws TikaException if a file node is invalid or the footer constant does not match.
 */
void deserializeFileNodeListFragment(FileNodeList data, FileChunkReference next,
        FileNodePtr curPath) throws IOException, TikaException {
    data.fileNodeListHeader = deserializeFileNodeListHeader();
    boolean sawTerminator = false;
    // Keep going while at least 24 bytes are free:
    // 24 = sizeof(nextFragment) [12 bytes] + sizeof(footer) [8 bytes]
    // + 4 bytes for the FileNode header
    while (end >= offset + 24) {
        CheckedFileNodePushBack pending = new CheckedFileNodePushBack(data);
        try {
            final long offsetBefore = offset;
            FileNode slot = data.children.get(data.children.size() - 1);
            FileNode parsed = deserializeFileNode(slot, curPath);
            if (offset == offsetBefore) {
                //nothing read; avoid an infinite loop
                break;
            }
            boolean isTerminator =
                    parsed.id == FndStructureConstants.ChunkTerminatorFND || parsed.id == 0;
            if (isTerminator) {
                sawTerminator = true;
                break;
            }
            pending.commit();
            FileNode resolved = curPath.dereference(document);
            FileNode newest = data.children.get(data.children.size() - 1);
            assert resolved.equals(newest); // is this correct? or should we be
            // checking the pointer?
            int lastPosition = curPath.nodeListPositions.size() - 1;
            Integer visited = curPath.nodeListPositions.get(lastPosition);
            curPath.nodeListPositions.set(lastPosition, visited + 1);
        } finally {
            pending.popBackIfNotCommitted();
        }
    }
    // The next-fragment reference (12 bytes) and the footer (8 bytes) close the fragment.
    reposition(end - 20);
    FileChunkReference nextChunkRef = deserializeFileChunkReference64x32();
    next.cb = nextChunkRef.cb;
    next.stp = nextChunkRef.stp;
    if (sawTerminator) {
        LOG.debug("{}Chunk terminator found NextChunkRef.cb={}, NextChunkRef.stp={}," +
                " Offset={}, End={}", getIndent(), nextChunkRef.cb, nextChunkRef.stp, offset,
                end);
        // TODO check that next is OK
    }
    final long footer = deserializeLittleEndianLong();
    if (footer == FOOTER_CONST) {
        return;
    }
    throw new TikaException(
            "Invalid footer constant. Expected " + FOOTER_CONST + " but was " + footer);
}
/**
 * Deserializes one FileNode at the current offset into {@code data}.
 *
 * The 4-byte node header is unpacked as follows: bits 0-9 are the node id, bits 10-22 the
 * node size, bits 23-24 the stp format, bits 25-26 the cb format, bits 27-30 the base type
 * and bit 31 a reserved bit that must be set. An id of 0 ends parsing of this node
 * immediately.
 *
 * While the node is parsed, {@code end} is narrowed to the end of the node. Before
 * returning, {@code offset} is set to the end of the node and {@code end} is restored.
 * Nodes with base type 2 reference a child file node list, which is parsed recursively.
 *
 * @param data    the node to populate.
 * @param curPath the path of the node currently being parsed.
 * @return {@code data}, populated.
 * @throws IOException   if reading from the underlying resource fails.
 * @throws TikaException if the node content is invalid or references data that is missing.
 */
private FileNode deserializeFileNode(FileNode data, FileNodePtr curPath)
        throws IOException, TikaException {
    OneNotePtr backup = new OneNotePtr(this);
    long reserved;
    data.isFileData = false;
    data.gosid = ExtendedGUID.nil();
    long fileNodeHeader = deserializeLittleEndianInt();
    data.id = fileNodeHeader & 0x3ff;
    if (data.id == 0) {
        return data;
    }
    LOG.debug("{}Start Node {} ({}) - Offset={}, End={}", getIndent(),
            FndStructureConstants.nameOf(data.id), data.id, offset, end);
    ++indentLevel;
    data.size = (fileNodeHeader >> 10) & 0x1fff;
    // reset the size to only be in scope of this FileNode
    end = backup.offset + data.size;
    long stpFormat = (fileNodeHeader >> 23) & 0x3;
    long cbFormat = (fileNodeHeader >> 25) & 0x3;
    data.baseType = (fileNodeHeader >> 27) & 0xf;
    reserved = (fileNodeHeader >> 31);
    data.ref = FileChunkReference.nil();
    if (data.baseType == 1 || data.baseType == 2) {
        data.ref = deserializeVarFileChunkReference(stpFormat, cbFormat);
    } // otherwise ignore the data ref, since we're a type 0
    if (data.baseType == 1 && !data.ref.equals(FileChunkReference.nil())) {
        OneNotePtr content = new OneNotePtr(this);
        content.reposition(data.ref);
        // would have thrown an error if invalid.
    }
    if (data.id == FndStructureConstants.ObjectGroupStartFND) {
        data.idDesc = "oid(group)";
        data.gosid = deserializeExtendedGUID();
    } else if (data.id == FndStructureConstants.ObjectGroupEndFND) {
        // no data
    } else if (data.id == FndStructureConstants.ObjectSpaceManifestRootFND ||
            data.id == FndStructureConstants.ObjectSpaceManifestListStartFND) {
        if (data.id == FndStructureConstants.ObjectSpaceManifestRootFND) {
            data.idDesc = "gosidRoot";
        } else {
            data.idDesc = "gosid";
        }
        // Specifies the identity of the object space being specified by this object
        // space manifest list. MUST match the ObjectSpaceManifestListReferenceFND.gosid
        // field of the FileNode structure that referenced
        // this file node list.
        data.gosid = deserializeExtendedGUID();
    } else if (data.id == FndStructureConstants.ObjectSpaceManifestListReferenceFND) {
        data.gosid = deserializeExtendedGUID();
        data.idDesc = "gosid";
        //children parsed in generic base_type 2 parser
    } else if (data.id == FndStructureConstants.RevisionManifestListStartFND) {
        data.gosid = deserializeExtendedGUID();
        data.idDesc = "gosid";
        FileNodePtr parentPath = new FileNodePtr(curPath);
        parentPath.nodeListPositions.remove(parentPath.nodeListPositions.size() - 1);
        document.registerRevisionManifestList(data.gosid, parentPath);
        data.subType.revisionManifestListStart.nInstanceIgnored = deserializeLittleEndianInt();
    } else if (data.id == FndStructureConstants.RevisionManifestStart4FND) {
        data.gosid = deserializeExtendedGUID(); // the rid
        data.idDesc = "rid";
        data.subType.revisionManifest.ridDependent = deserializeExtendedGUID(); // the rid
        LOG.debug("{}dependent gosid {}", getIndent(),
                data.subType.revisionManifest.ridDependent);
        data.subType.revisionManifest.timeCreation = deserializeLittleEndianLong();
        data.subType.revisionManifest.revisionRole = deserializeLittleEndianInt();
        data.subType.revisionManifest.odcsDefault = deserializeLittleEndianShort();
        data.gctxid = ExtendedGUID.nil();
        document.registerRevisionManifest(data);
    } else if (data.id == FndStructureConstants.RevisionManifestStart6FND ||
            data.id == FndStructureConstants.RevisionManifestStart7FND) {
        data.gosid = deserializeExtendedGUID(); // the rid
        data.idDesc = "rid";
        data.subType.revisionManifest.ridDependent = deserializeExtendedGUID(); // the rid
        LOG.debug("{}dependent gosid {}", getIndent(),
                data.subType.revisionManifest.ridDependent);
        data.subType.revisionManifest.revisionRole = deserializeLittleEndianInt();
        data.subType.revisionManifest.odcsDefault = deserializeLittleEndianShort();
        data.gctxid = ExtendedGUID.nil();
        if (data.id == FndStructureConstants.RevisionManifestStart7FND) {
            data.gctxid = deserializeExtendedGUID(); // the rid
        }
        document.registerAdditionalRevisionRole(data.gosid,
                data.subType.revisionManifest.revisionRole, data.gctxid);
        document.registerRevisionManifest(data);
    } else if (data.id == FndStructureConstants.GlobalIdTableStartFNDX) {
        data.subType.globalIdTableStartFNDX.reserved = deserializeLittleEndianChar();
    } else if (data.id == FndStructureConstants.GlobalIdTableEntryFNDX) {
        data.subType.globalIdTableEntryFNDX.index = deserializeLittleEndianInt();
        data.subType.globalIdTableEntryFNDX.guid = deserializeGUID();
        document.revisionMap.get(document.currentRevision).globalId.put(
                data.subType.globalIdTableEntryFNDX.index,
                data.subType.globalIdTableEntryFNDX.guid);
    } else if (data.id == FndStructureConstants.GlobalIdTableEntry2FNDX) {
        data.subType.globalIdTableEntry2FNDX.indexMapFrom = deserializeLittleEndianInt();
        data.subType.globalIdTableEntry2FNDX.indexMapTo = deserializeLittleEndianInt();
        ExtendedGUID dependentRevision =
                document.revisionMap.get(document.currentRevision).dependent;
        // Get the compactId from the revisionMap's globalId map.
        GUID compactId = document.revisionMap.get(dependentRevision).globalId.get(
                data.subType.globalIdTableEntry2FNDX.indexMapFrom);
        if (compactId == null) {
            throw new TikaException("COMPACT_ID_MISSING");
        }
        document.revisionMap.get(document.currentRevision).globalId.put(
                data.subType.globalIdTableEntry2FNDX.indexMapTo, compactId);
    } else if (data.id == FndStructureConstants.GlobalIdTableEntry3FNDX) {
        data.subType.globalIdTableEntry3FNDX.indexCopyFromStart = deserializeLittleEndianInt();
        data.subType.globalIdTableEntry3FNDX.entriesToCopy = deserializeLittleEndianInt();
        data.subType.globalIdTableEntry3FNDX.indexCopyToStart = deserializeLittleEndianInt();
        ExtendedGUID dependentRevision =
                document.revisionMap.get(document.currentRevision).dependent;
        for (int i = 0; i < data.subType.globalIdTableEntry3FNDX.entriesToCopy; ++i) {
            // Look the entry up directly on the typed map; a raw Map local would not
            // compile against the GUID assignment.
            GUID compactId = document.revisionMap.get(dependentRevision).globalId.get(
                    data.subType.globalIdTableEntry3FNDX.indexCopyFromStart + i);
            if (compactId == null) {
                throw new TikaException("COMPACT_ID_MISSING");
            }
            document.revisionMap.get(document.currentRevision).globalId.put(
                    data.subType.globalIdTableEntry3FNDX.indexCopyToStart + i, compactId);
        }
    } else if (data.id == FndStructureConstants.CanRevise.ObjectRevisionWithRefCountFNDX ||
            data.id == FndStructureConstants.CanRevise.ObjectRevisionWithRefCount2FNDX) {
        data.subType.objectRevisionWithRefCountFNDX.oid = deserializeCompactID(); // the oid
        if (data.id == FndStructureConstants.CanRevise.ObjectRevisionWithRefCountFNDX) {
            int ref = deserializeLittleEndianChar();
            data.subType.objectRevisionWithRefCountFNDX.hasOidReferences = ref & 1;
            data.subType.objectRevisionWithRefCountFNDX.hasOsidReferences = ref & 2;
            data.subType.objectRevisionWithRefCountFNDX.cRef = (ref >> 2);
        } else {
            long ref = deserializeLittleEndianInt();
            data.subType.objectRevisionWithRefCountFNDX.hasOidReferences = ref & 1;
            data.subType.objectRevisionWithRefCountFNDX.hasOsidReferences = ref & 2;
            if ((ref >> 2) != 0) {
                throw new TikaException("Reserved non-zero");
            }
            data.subType.objectRevisionWithRefCountFNDX.cRef = deserializeLittleEndianInt();
        }
    } else if (data.id == FndStructureConstants.RootObjectReference2FNDX) {
        data.subType.rootObjectReference.oidRoot = deserializeCompactID();
        data.idDesc = "oidRoot";
        data.gosid = data.subType.rootObjectReference.oidRoot.guid;
        data.subType.rootObjectReference.rootObjectReferenceBase.rootRole =
                deserializeLittleEndianInt();
        LOG.debug("{}Root role {}", getIndent(),
                data.subType.rootObjectReference.rootObjectReferenceBase.rootRole);
    } else if (data.id == FndStructureConstants.RootObjectReference3FND) {
        data.idDesc = "oidRoot";
        data.gosid = deserializeExtendedGUID();
        data.subType.rootObjectReference.rootObjectReferenceBase.rootRole =
                deserializeLittleEndianInt();
        LOG.debug("{}Root role {}", getIndent(),
                data.subType.rootObjectReference.rootObjectReferenceBase.rootRole);
    } else if (data.id == FndStructureConstants.RevisionRoleDeclarationFND ||
            data.id == FndStructureConstants.RevisionRoleAndContextDeclarationFND) {
        data.gosid = deserializeExtendedGUID();
        data.subType.revisionRoleDeclaration.revisionRole = deserializeLittleEndianInt();
        if (data.id == FndStructureConstants.RevisionRoleAndContextDeclarationFND) {
            data.gctxid = deserializeExtendedGUID();
        }
        document.registerAdditionalRevisionRole(data.gosid,
                data.subType.revisionRoleDeclaration.revisionRole, data.gctxid);
        // FIXME: deal with ObjectDataEncryptionKey
    } else if (data.id == FndStructureConstants.ObjectInfoDependencyOverridesFND) {
        OneNotePtr content = new OneNotePtr(this);
        if (!data.ref.equals(FileChunkReference.nil())) {
            content.reposition(data.ref); // otherwise it's positioned right at this node
        }
        data.subType.objectInfoDependencyOverrides.data =
                content.deserializeObjectInfoDependencyOverrideData();
    } else if (data.id == FndStructureConstants.FileDataStoreListReferenceFND) {
        // already processed this
    } else if (data.id == FndStructureConstants.FileDataStoreObjectReferenceFND) {
        FileChunkReference ref = deserializeFileChunkReference64();
        GUID guid = deserializeGUID();
        ExtendedGUID extendedGuid = new ExtendedGUID(guid, 0);
        LOG.trace("found extended guid {}", extendedGuid);
        document.guidToRef.put(extendedGuid, ref);
        OneNotePtr fileDataStorePtr = new OneNotePtr(this);
        fileDataStorePtr.reposition(data.ref);
        data.subType.fileDataStoreObjectReference.ref =
                fileDataStorePtr.deserializeFileDataStoreObject();
    } else if (data.id == FndStructureConstants.CanRevise.ObjectDeclarationWithRefCountFNDX ||
            data.id == FndStructureConstants.CanRevise.ObjectDeclarationWithRefCount2FNDX ||
            data.id == FndStructureConstants.CanRevise.ObjectDeclaration2RefCountFND ||
            data.id == FndStructureConstants.CanRevise.ObjectDeclaration2LargeRefCountFND ||
            data.id == FndStructureConstants.CanRevise.ReadOnlyObjectDeclaration2RefCountFND ||
            data.id ==
                    FndStructureConstants.CanRevise.ReadOnlyObjectDeclaration2LargeRefCountFND) {
        data.subType.objectDeclarationWithRefCount.body.file_data_store_reference = false;
        if (data.id == FndStructureConstants.CanRevise.ObjectDeclarationWithRefCountFNDX ||
                data.id == FndStructureConstants.CanRevise.ObjectDeclarationWithRefCount2FNDX) {
            data.subType.objectDeclarationWithRefCount.body =
                    deserializeObjectDeclarationWithRefCountBody();
        } else { // one of the other 4 that use the ObjectDeclaration2Body
            data.subType.objectDeclarationWithRefCount.body =
                    deserializeObjectDeclaration2Body();
        }
        if (data.id == FndStructureConstants.CanRevise.ObjectDeclarationWithRefCountFNDX ||
                data.id == FndStructureConstants.CanRevise.ObjectDeclaration2RefCountFND ||
                data.id ==
                        FndStructureConstants.CanRevise.ReadOnlyObjectDeclaration2RefCountFND) {
            data.subType.objectDeclarationWithRefCount.cRef = deserializeLittleEndianChar();
        } else {
            data.subType.objectDeclarationWithRefCount.cRef = deserializeLittleEndianInt();
        }
        if (data.id == FndStructureConstants.CanRevise.ReadOnlyObjectDeclaration2RefCountFND ||
                data.id ==
                        FndStructureConstants.CanRevise.ReadOnlyObjectDeclaration2LargeRefCountFND) {
            ByteBuffer md5Buffer = ByteBuffer.allocate(16);
            deserializeBytes(md5Buffer);
            data.subType.objectDeclarationWithRefCount.readOnly.md5 = md5Buffer.array();
        }
        data.idDesc = "oid";
        postprocessObjectDeclarationContents(data, curPath);
        LOG.debug("{}Ref Count JCID {}", getIndent(),
                data.subType.objectDeclarationWithRefCount.body.jcid);
    } else if (
            data.id == FndStructureConstants.CanRevise.ObjectDeclarationFileData3RefCountFND ||
                    data.id ==
                            FndStructureConstants.CanRevise.ObjectDeclarationFileData3LargeRefCountFND) {
        data.subType.objectDeclarationWithRefCount.body.oid = deserializeCompactID();
        long jcid = deserializeLittleEndianInt();
        data.subType.objectDeclarationWithRefCount.body.jcid.loadFrom32BitIndex(jcid);
        if (data.id == FndStructureConstants.CanRevise.ObjectDeclarationFileData3RefCountFND) {
            data.subType.objectDeclarationWithRefCount.cRef = deserializeLittleEndianChar();
        } else {
            data.subType.objectDeclarationWithRefCount.cRef = deserializeLittleEndianInt();
        }
        long cch = deserializeLittleEndianInt();
        long roomLeftLong = roomLeft();
        if (cch > roomLeftLong) { // not a valid guid
            throw new TikaException(
                    "Data out of bounds - cch " + cch + " is > room left = " + roomLeftLong);
        }
        if (cch > dif.size()) {
            throw new TikaMemoryLimitException(
                    "CCH=" + cch + " was found that was greater" + " than file size " +
                            dif.size());
        }
        // cch counts 2-byte characters. Compute the byte length as a long so that a large
        // cch cannot wrap around to a negative int before the buffer is allocated.
        long dataSpaceLength = cch * 2;
        if (dataSpaceLength > Integer.MAX_VALUE) {
            throw new TikaMemoryLimitException(
                    "CCH=" + cch + " needs a buffer larger than the maximum array size");
        }
        ByteBuffer dataSpaceBuffer = ByteBuffer.allocate((int) dataSpaceLength);
        dif.read(dataSpaceBuffer);
        byte[] dataSpaceBufferBytes = dataSpaceBuffer.array();
        offset += dataSpaceBufferBytes.length;
        if (dataSpaceBufferBytes.length == (IFNDF_GUID_LENGTH * 2 + IFNDF.length) &&
                Arrays.equals(IFNDF,
                        Arrays.copyOfRange(dataSpaceBufferBytes, 0, IFNDF.length))) {
            data.subType.objectDeclarationWithRefCount.body.file_data_store_reference = true;
            GUID guid = GUID.fromCurlyBraceUTF16Bytes(
                    Arrays.copyOfRange(dataSpaceBufferBytes, IFNDF.length,
                            dataSpaceBufferBytes.length));
            ExtendedGUID extendedGUID = new ExtendedGUID(guid, 0);
            FileChunkReference fileChunk = document.getAssocGuidToRef(extendedGUID);
            if (fileChunk == null) {
                LOG.debug("{} have not seen GUID {} yet", getIndent(), extendedGUID);
            } else {
                // TODO - call postprocessObjectDeclarationContents on this object?
            }
        } else {
            LOG.debug("{}Ignoring an external reference {}", getIndent(),
                    new String(dataSpaceBufferBytes, StandardCharsets.UTF_16LE));
        }
    } else if (data.id == FndStructureConstants.ObjectGroupListReferenceFND) {
        data.idDesc = "object_group_id";
        data.gosid = deserializeExtendedGUID(); // the object group id
        // the ref populates the FileNodeList children
    } else if (data.id == FndStructureConstants.DataSignatureGroupDefinitionFND) {
        data.idDesc = "data_sig";
        data.gosid = deserializeExtendedGUID(); // the DataSignatureGroup
    } else if (data.id == FndStructureConstants.RevisionManifestListReferenceFND) {
        document.revisionMap.putIfAbsent(document.currentRevision, new Revision());
        Revision currentRevision = document.revisionMap.get(document.currentRevision);
        currentRevision.manifestList.add(curPath);
    } else {
        LOG.debug("No fnd needed to be parsed for data.id=0x{} ({})",
                Long.toHexString(data.id), FndStructureConstants.nameOf(data.id));
    }
    if (data.baseType == 2) {
        // Generic baseType == 2 parser - means we have children to parse.
        OneNotePtr subList = new OneNotePtr(this);
        // position the subList pointer to the data.ref and deserialize recursively.
        subList.reposition(data.ref);
        subList.deserializeFileNodeList(data.childFileNodeList, curPath);
    }
    // Skip to the end of the node regardless of how much of it was consumed above, and
    // widen the readable region back to what it was before this node.
    offset = backup.offset + data.size;
    end = backup.end;
    // The top header bit has to be set; the exception text is kept as it was.
    if (reserved != 1) {
        throw new TikaException("RESERVED_NONZERO");
    }
    if (data.baseType == 1 && !(data.ref.equals(FileChunkReference.nil()))) {
        document.setAssocGuidToRef(data.gosid, data.ref);
        OneNotePtr content = new OneNotePtr(this);
        content.reposition(data.ref);
        if (data.hasGctxid()) {
            LOG.debug("{}gctxid {}", getIndent(), data.gctxid);
        }
    } else if (!data.gosid.equals(ExtendedGUID.nil())) {
        LOG.trace("Non base type == 1 guid {}", data.gosid);
    }
    --indentLevel;
    if (data.gosid.equals(ExtendedGUID.nil())) {
        LOG.debug("{}End Node {} ({}) - Offset={}, End={}", getIndent(),
                FndStructureConstants.nameOf(data.id), (int) data.id, offset, end);
    } else {
        LOG.debug("{}End Node {} ({}) {}:[{}] - Offset={}, End={}", getIndent(),
                FndStructureConstants.nameOf(data.id), (int) data.id, data.idDesc, data.gosid,
                offset, end);
    }
    return data;
}
/**
 * Reads from the resource into the given buffer, starting at the current offset, and
 * moves the offset to wherever the resource ends up.
 *
 * @param byteBuffer the buffer to read into.
 * @throws IOException if reading from the underlying resource fails.
 */
private void deserializeBytes(ByteBuffer byteBuffer) throws IOException {
    final long wanted = offset;
    if (wanted != dif.position()) {
        dif.position(wanted);
    }
    dif.read(byteBuffer);
    offset = dif.position();
}
/**
 * Reads an object declaration body consisting of a compact id, a packed 4-byte value and a
 * 2-byte reserved value.
 *
 * From the packed value, bits 0-9 become the jcid index, bit 16 the "has oid references"
 * flag and bit 17 the "has osid references" flag. Bits 10-15, every bit from 18 upwards
 * and the trailing 2-byte value all have to be zero.
 *
 * @return the body that was read.
 * @throws IOException   if reading from the underlying resource fails.
 * @throws TikaException if any of the bits that have to be zero is set.
 */
private ObjectDeclarationWithRefCountBody deserializeObjectDeclarationWithRefCountBody()
        throws IOException, TikaException {
    ObjectDeclarationWithRefCountBody body = new ObjectDeclarationWithRefCountBody();
    body.oid = deserializeCompactID();
    final long packed = deserializeLittleEndianInt();
    final long trailingReserved = deserializeLittleEndianShort();
    body.jcid.index = packed & 0x3ffL;
    body.fHasOidReferences = ((packed >> 16) & 0x1) != 0;
    body.hasOsidReferences = ((packed >> 17) & 0x1) != 0;
    final long bits10To13 = (packed >> 10) & 0xf;
    final long bits14To15 = (packed >> 14) & 0x3;
    final boolean upperBitsSet = (packed >> 18L) > 0;
    final boolean anyReservedSet =
            upperBitsSet || trailingReserved != 0 || bits14To15 != 0 || bits10To13 != 0;
    if (anyReservedSet) {
        throw new TikaException("RESERVED_NONZERO");
    }
    return body;
}
/**
 * Reads an object declaration body consisting of a compact id, a 4-byte jcid and a single
 * flags value. Bit 0 of the flags is "has oid references", bit 1 is "has osid references".
 *
 * @return the body that was read.
 * @throws IOException   if reading from the underlying resource fails.
 * @throws TikaException if the compact id cannot be resolved.
 */
private ObjectDeclarationWithRefCountBody deserializeObjectDeclaration2Body()
        throws IOException, TikaException {
    ObjectDeclarationWithRefCountBody body = new ObjectDeclarationWithRefCountBody();
    body.oid = deserializeCompactID();
    final long rawJcid = deserializeLittleEndianInt();
    body.jcid.loadFrom32BitIndex(rawJcid);
    final long flags = deserializeLittleEndianChar();
    final boolean oidBit = (flags & 0x1) != 0;
    final boolean osidBit = (flags & 0x2) != 0;
    body.fHasOidReferences = oidBit;
    body.hasOsidReferences = osidBit;
    return body;
}
/**
 * The FileDataStoreObject structure specifies the data for a file data object.
 *
 * The layout read here is: a 16 byte header GUID, an 8-byte data length, a 4-byte unused
 * value, an 8-byte reserved value, the data itself (padded to an 8-byte boundary) and a
 * 16 byte footer GUID. The data is not copied; only its location and length are recorded.
 *
 * @return the object describing where the file data is located.
 * @throws IOException   if reading from the underlying resource fails.
 * @throws TikaException if the length is negative or does not fit in the readable region,
 *                       or if the unused/reserved values are greater than zero.
 */
private FileDataStoreObject deserializeFileDataStoreObject() throws IOException, TikaException {
    FileDataStoreObject data = new FileDataStoreObject();
    // TODO - the expected header is different per version of one note, so the header GUID
    // is consumed but not validated.
    deserializeGUID();
    long len = deserializeLittleEndianLong();
    long unused = deserializeLittleEndianInt();
    long reserved = deserializeLittleEndianLong();
    // len is a signed value taken from the file. Reject negative lengths, and compare
    // against the remaining room by subtraction so a huge length cannot overflow the sum.
    if (len < 0 || len > end - offset - 16) {
        throw new TikaException("SEGV error");
    }
    if (unused > 0 || reserved > 0) {
        throw new TikaException("SEGV error");
    }
    data.fileData.stp = offset;
    data.fileData.cb = len;
    offset += len;
    while ((offset & 0x7) > 0) {
        // Padding is added to the end of the FileData stream to ensure that it
        // ends on an 8-byte boundary.
        ++offset;
    }
    // The offset was advanced arithmetically; move the resource there as well so the
    // footer is read from behind the data instead of from the start of the data.
    if (dif.position() != offset) {
        dif.position(offset);
    }
    // TODO - the expected footer is per version of one note, so the footer GUID is
    // consumed but not validated.
    deserializeGUID();
    return data;
}
/**
 * Reads the override data: two 4-byte counts and a 4-byte crc, followed by the single
 * value overrides and then the 4-byte overrides. The crc is read but not used.
 *
 * @return the override data that was read.
 * @throws IOException if reading from the underlying resource fails.
 */
private ObjectInfoDependencyOverrideData deserializeObjectInfoDependencyOverrideData()
        throws IOException {
    ObjectInfoDependencyOverrideData overrides = new ObjectInfoDependencyOverrideData();
    final long count8Bit = deserializeLittleEndianInt();
    final long count32Bit = deserializeLittleEndianInt();
    // The crc has to be consumed to keep the offset right, but its value is not checked.
    deserializeLittleEndianInt();
    int read8Bit = 0;
    while (read8Bit < count8Bit) {
        int value = deserializeLittleEndianChar();
        overrides.overrides1.add(value);
        ++read8Bit;
    }
    int read32Bit = 0;
    while (read32Bit < count32Bit) {
        long value = deserializeLittleEndianInt();
        overrides.overrides2.add(value);
        ++read32Bit;
    }
    return overrides;
}
/**
 * Reads a compact id (a single value {@code n} followed by a 24-bit guid index) and
 * resolves the index against the global id table of the document's current revision.
 *
 * The lookup is done directly on the revision's typed map; going through a raw
 * {@code Map} local would not compile against the {@code GUID} assignment.
 *
 * @return the compact id with its guid resolved.
 * @throws IOException   if reading from the underlying resource fails.
 * @throws TikaException if the current revision is unknown or has no entry for the index.
 */
private CompactID deserializeCompactID() throws IOException, TikaException {
    CompactID compactID = new CompactID();
    compactID.n = deserializeLittleEndianChar();
    compactID.guidIndex = deserializeInt24();
    compactID.guid = ExtendedGUID.nil();
    compactID.guid.n = compactID.n;
    long index = compactID.guidIndex;
    Revision revision = document.revisionMap.get(document.currentRevision);
    if (revision == null) {
        // Report a parse failure instead of failing with a NullPointerException.
        throw new TikaException("COMPACT ID MISSING");
    }
    GUID guid = revision.globalId.get(index);
    if (guid == null) {
        throw new TikaException("COMPACT ID MISSING");
    }
    compactID.guid.guid = guid;
    return compactID;
}
/**
 * Reads three single values in file order and combines them through {@code Int24}.
 *
 * @return the value reported by {@code Int24}.
 * @throws IOException if reading from the underlying resource fails.
 */
private long deserializeInt24() throws IOException {
    final int first = deserializeLittleEndianChar();
    final int second = deserializeLittleEndianChar();
    final int third = deserializeLittleEndianChar();
    Int24 combined = new Int24(first, second, third);
    return combined.value();
}
/**
 * Reads an ExtendedGUID: a GUID followed by a value read with
 * {@code deserializeLittleEndianInt()}.
 *
 * @return the extended guid built from both values.
 * @throws IOException if one of the underlying reads fails.
 */
private ExtendedGUID deserializeExtendedGUID() throws IOException {
    // Java evaluates arguments left to right, so the GUID is read before the integer.
    return new ExtendedGUID(deserializeGUID(), deserializeLittleEndianInt());
}
/**
 * Deserializes a FileChunkReference whose {@code stp} and {@code cb} fields have a size and
 * encoding selected by {@code stpFormat} and {@code cbFormat}. The {@code stp} field is read
 * first, then the {@code cb} field.
 *
 * @param stpFormat An unsigned integer that specifies the size and format of the
 *                  FileNodeChunkReference.stp field specified by the fnd field if the
 *                  FileNode structure has a BaseType field value equal to 1 or 2. MUST be
 *                  ignored if the BaseType field of the FileNode structure is equal to 0.
 *                  <pre>
 *                  Value  Meaning
 *                  0      8 bytes, uncompressed.
 *                  1      4 bytes, uncompressed.
 *                  2      2 bytes, compressed.
 *                  3      4 bytes, compressed.
 *                  </pre>
 *                  The value of an uncompressed file pointer specifies a location in the
 *                  file. To uncompress a compressed file pointer, multiply the value by 8.
 * @param cbFormat  An unsigned integer that specifies the size and format of the
 *                  FileNodeChunkReference.cb field specified by the fnd field if the
 *                  FileNode structure has a BaseType field value equal to 1 or 2. MUST be 0
 *                  and MUST be ignored if the BaseType of the FileNode structure is equal
 *                  to 0.
 *                  <pre>
 *                  Value  Meaning
 *                  0      4 bytes, uncompressed.
 *                  1      8 bytes, uncompressed.
 *                  2      1 byte, compressed.
 *                  3      2 bytes, compressed.
 *                  </pre>
 *                  The value of an uncompressed byte count specifies the size, in bytes, of
 *                  the data referenced by a FileNodeChunkReference structure. To uncompress
 *                  a compressed byte count, multiply the value by 8.
 * @return the file chunk reference with its {@code stp} and {@code cb} already uncompressed.
 * @throws IOException   if one of the underlying reads fails.
 * @throws TikaException if {@code stpFormat} or {@code cbFormat} is not one of the values
 *                       listed above.
 */
FileChunkReference deserializeVarFileChunkReference(long stpFormat, long cbFormat)
        throws IOException, TikaException {
    final FileChunkReference reference = new FileChunkReference(0, 0);

    final int stpKind = (int) stpFormat;
    if (stpKind == 0) {
        // 8 bytes, uncompressed
        reference.stp = deserializeLittleEndianLong();
    } else if (stpKind == 1) {
        // 4 bytes, uncompressed
        final long stp32 = deserializeLittleEndianInt();
        reference.stp = stp32;
    } else if (stpKind == 2) {
        // 2 bytes, compressed: shifting left by 3 multiplies by 8
        final long stp16 = deserializeLittleEndianShort();
        reference.stp = stp16 << 3;
    } else if (stpKind == 3) {
        // 4 bytes, compressed
        final long stp32 = deserializeLittleEndianInt();
        reference.stp = stp32 << 3;
    } else {
        throw new TikaException("Unknown STP file node format " + stpFormat);
    }

    final int cbKind = (int) cbFormat;
    if (cbKind == 0) {
        // 4 bytes, uncompressed
        final long cb32 = deserializeLittleEndianInt();
        reference.cb = cb32;
    } else if (cbKind == 1) {
        // 8 bytes, uncompressed
        reference.cb = deserializeLittleEndianLong();
    } else if (cbKind == 2) {
        // 1 byte, compressed
        final long cb8 = deserializeLittleEndianChar();
        reference.cb = cb8 << 3;
    } else if (cbKind == 3) {
        // 2 bytes, compressed
        final long cb16 = deserializeLittleEndianShort();
        reference.cb = cb16 << 3;
    } else {
        throw new TikaException("Unknown CB file node format " + cbFormat);
    }
    return reference;
}
/**
 * Reads a FileNodeListHeader from the current position.
 *
 * <p>The offset at which the header starts is captured before anything is read. A magic
 * value is read with {@code deserializeLittleEndianLong()}, followed by the file node list id
 * and the fragment sequence, both read with {@code deserializeLittleEndianInt()}.
 *
 * @return the header built from the start position and the three values read.
 * @throws TikaException declared by this method; not thrown directly by the code here.
 * @throws IOException   if one of the underlying reads fails.
 */
FileNodeListHeader deserializeFileNodeListHeader() throws TikaException, IOException {
    final long headerStart = offset;
    final long magic = deserializeLittleEndianLong();
    final long listId = deserializeLittleEndianInt();
    final long fragmentSequence = deserializeLittleEndianInt();
    return new FileNodeListHeader(headerStart, magic, listId, fragmentSequence);
}
/**
 * Processes the contents of an object declaration file node once all of its fnd variables
 * have been parsed.
 *
 * <p>The node's gosid is set from the declaration's oid and a copy of {@code curPtr} is
 * registered under it in the document's guid-to-object map. If the declaration's jcid marks
 * an object space object property set, the referenced data is parsed into
 * {@code objectRef} and {@code propertySet}. Otherwise the jcid must be file data, and the
 * node is flagged as such.
 *
 * @param data   The FileNode containing all the fnd variable's data.
 * @param curPtr The current pointer.
 * @throws IOException   if reading the referenced data fails.
 * @throws TikaException if the jcid is neither an object space object property set nor file
 *                       data, or if parsing the property set fails.
 */
private void postprocessObjectDeclarationContents(FileNode data, FileNodePtr curPtr)
        throws IOException, TikaException {
    data.gosid = data.subType.objectDeclarationWithRefCount.body.oid.guid;
    document.guidToObject.put(data.gosid, new FileNodePtr(curPtr));

    if (!data.subType.objectDeclarationWithRefCount.body.jcid.isObjectSpaceObjectPropSet()) {
        if (!data.subType.objectDeclarationWithRefCount.body.jcid.isFileData) {
            throw new TikaException("JCID must be file data when !isObjectSpaceObjectPropSet.");
        }
        // this is FileData
        data.isFileData = true;
        if (LOG.isDebugEnabled()) {
            final OneNotePtr rawContent = new OneNotePtr(this);
            rawContent.reposition(data.ref);
            LOG.debug("{}Raw:", getIndent());
            rawContent.dumpHex();
            LOG.debug("");
        }
        return;
    }

    // Parse through a separate pointer that is repositioned onto the referenced data.
    final OneNotePtr propSetPtr = new OneNotePtr(this);
    propSetPtr.reposition(data.ref);
    data.subType.objectDeclarationWithRefCount.objectRef =
            propSetPtr.deserializeObjectSpaceObjectPropSet();
    data.propertySet = propSetPtr.deserializePropertySet(new ObjectStreamCounters(),
            data.subType.objectDeclarationWithRefCount.objectRef);
}
/**
 * Reads a property set from the current position.
 *
 * <p>A count is read with {@code deserializeLittleEndianShort()}. All property ids are read
 * next, and only after that the value belonging to each property id, in the same order.
 *
 * @param counters the stream counters handed on to the property value parsing.
 * @param streams  the streams handed on to the property value parsing.
 * @return the property set holding one property value per property id.
 * @throws IOException   if one of the underlying reads fails.
 * @throws TikaException if parsing a property id or property value fails.
 */
private PropertySet deserializePropertySet(ObjectStreamCounters counters,
                                           ObjectSpaceObjectPropSet streams)
        throws IOException, TikaException {
    final PropertySet result = new PropertySet();
    final long count = deserializeLittleEndianShort();

    // Pre-fill the list with empty values so that entries can be addressed by index.
    final List<PropertyValue> values =
            Stream.generate(PropertyValue::new).limit((int) count).collect(Collectors.toList());
    result.rgPridsData = values;

    // First pass: the property ids.
    for (PropertyValue placeholder : values) {
        placeholder.propertyId = deserializePropertyID();
        LOG.debug("{}Property {}", getIndent(), placeholder.propertyId);
    }
    LOG.debug("{}{} elements in property set:", getIndent(), count);

    // Second pass: replace each placeholder with the value parsed for its property id.
    for (int slot = 0; slot < count; ++slot) {
        final OneNotePropertyId id = values.get(slot).propertyId;
        values.set(slot, deserializePropertyValueFromPropertyID(id, streams, counters));
    }
    LOG.debug("");
    return result;
}
/**
 * Reads the value that belongs to the given property id from the current position.
 *
 * <p>What is consumed, and which field of the returned {@link PropertyValue} is populated,
 * depends on {@code propertyID.type}:
 * <ul>
 * <li>0x1 - nothing is read.</li>
 * <li>0x2 - nothing is read; {@code scalar} is 1 or 0 according to
 * {@code propertyID.inlineBool}.</li>
 * <li>0x3, 0x4, 0x5, 0x6 - {@code scalar} is read with
 * {@code deserializeLittleEndianChar/Short/Int/Long()} respectively.</li>
 * <li>0x7 - a length is read with {@code deserializeLittleEndianInt()}; that many bytes are
 * skipped and only their location is recorded in {@code rawData}.</li>
 * <li>0x8, 0xa, 0xc - one compact id is taken from the OIDs, OSIDs or context id stream.</li>
 * <li>0x9, 0xb, 0xd - a count is read with {@code deserializeLittleEndianInt()} and that many
 * compact ids are taken from the OIDs, OSIDs or context id stream.</li>
 * <li>0x10 - a count and a property id are read, then that many values of that property
 * id.</li>
 * <li>0x11 - a nested property set is read.</li>
 * </ul>
 *
 * <p>Compact ids are taken from the position stored in {@code counters} for the respective
 * stream, and that position is advanced by the number of ids taken so that the next property
 * continues where this one stopped.
 *
 * @param propertyID the property id whose value is to be read.
 * @param streams    the OIDs, OSIDs and context id streams that compact ids are taken from.
 * @param counters   the current positions within {@code streams}; updated by this method.
 * @return the parsed property value.
 * @throws IOException   if one of the underlying reads fails.
 * @throws TikaException if the type is unknown, raw data extends past the end of this
 *                       pointer, or a stream does not hold enough compact ids.
 */
private PropertyValue deserializePropertyValueFromPropertyID(OneNotePropertyId propertyID,
                                                             ObjectSpaceObjectPropSet streams,
                                                             ObjectStreamCounters counters)
        throws IOException, TikaException {
    PropertyValue data = new PropertyValue();
    data.propertyId = propertyID;
    // Length or element count; shared by the fall-through compact id cases below.
    long val32 = 0;
    if (LOG.isDebugEnabled()) {
        LOG.debug("\n{}<{}", getIndent(), propertyID);
    }
    ++indentLevel;
    try {
        long type = propertyID.type;
        switch ((int) type) {
            case 0x1:
                LOG.debug(" [] ");
                return data;
            case 0x2:
                LOG.debug(" PropertyID bool({})", propertyID.inlineBool);
                data.scalar = propertyID.inlineBool ? 1 : 0;
                return data;
            case 0x3:
                data.scalar = deserializeLittleEndianChar();
                LOG.debug(" PropertyID byte({})", data.scalar);
                break;
            case 0x4:
                data.scalar = deserializeLittleEndianShort();
                LOG.debug(" uint16 PropertyID short({})", data.scalar);
                break;
            case 0x5:
                data.scalar = deserializeLittleEndianInt();
                LOG.debug(" PropertyID int({})", data.scalar);
                break;
            case 0x6:
                data.scalar = deserializeLittleEndianLong();
                LOG.debug(" PropertyID long({})", data.scalar);
                break;
            case 0x7:
                // If the value of the PropertyID.type element is "0x7" and the property
                // specifies an array of elements, the value of the
                // prtFourBytesOfLengthFollowedByData.cb element MUST be the sum of the
                // sizes, in bytes, of each element in the array.
                // Exceptions include:
                // * The RgOutlineIndentDistance element, where the value of the
                //   prtFourBytesOfLengthFollowedByData.cb element
                //   MUST be: 4 + (4 × RgOutlineIndentDistance.count).
                // * The TableColumnsLocked element, where the value of the
                //   prtFourBytesOfLengthFollowedByData.cb
                //   element MUST be: 1 + (TableColumnsLocked.cColumns + 7) / 8.
                // * The TableColumnWidths element, where the value of the
                //   prtFourBytesOfLengthFollowedByData.cb
                //   element MUST be: 1 + (4 × TableColumnWidths.cColumns).
                val32 = deserializeLittleEndianInt();
                LOG.debug(" raw data: ({})[", val32);
                data.rawData.stp = offset;
                data.rawData.cb = 0;
                if (offset + val32 > end) {
                    // Clamp to what is actually available before reporting the error.
                    data.rawData.cb = end - offset;
                    offset = end;
                    throw new TikaException("Offset is past end of file.");
                }
                data.rawData.cb = val32;
                // The raw bytes are skipped here; only their location is kept.
                offset += val32;
                if (LOG.isDebugEnabled()) {
                    OneNotePtr content = new OneNotePtr(this);
                    content.reposition(data.rawData);
                    content.dumpHex();
                }
                LOG.debug("]");
                break;
            case 0x9:
            case 0xb:
            case 0xd:
                // Array variants: an explicit element count precedes the ids.
                val32 = deserializeLittleEndianInt();
                // fallthrough
            case 0x8:
            case 0xa:
            case 0xc: {
                if (type == 0x8 || type == 0xa || type == 0xc) {
                    // Single-element variants always take exactly one id.
                    val32 = 1;
                }
                final boolean fromOids = type == 0x8 || type == 0x9;
                final boolean fromOsids = type == 0xa || type == 0xb;

                List<CompactID> stream = streams.contextIDs.data;
                String xtype = "contextID";
                long streamPosition = counters.context_ids_count;
                if (fromOids) {
                    stream = streams.oids.data;
                    streamPosition = counters.oids_count;
                    xtype = "OIDs";
                }
                if (fromOsids) {
                    stream = streams.osids.data;
                    streamPosition = counters.osids_count;
                    xtype = "OSIDS";
                }
                for (int i = 0; i < val32; ++i, ++streamPosition) {
                    int index = (int) streamPosition;
                    if (index < stream.size()) {
                        data.compactIDs.add(stream.get(index));
                        LOG.debug(" {}[{}]", xtype,
                                data.compactIDs.get(data.compactIDs.size() - 1));
                    } else {
                        throw new TikaException("SEGV");
                    }
                }
                // Store the advanced position back. Without this every property would
                // start again at the same stream index and resolve to the same ids.
                if (fromOids) {
                    counters.oids_count = streamPosition;
                } else if (fromOsids) {
                    counters.osids_count = streamPosition;
                } else {
                    counters.context_ids_count = streamPosition;
                }
                break;
            }
            case 0x10:
                val32 = deserializeLittleEndianInt();
                OneNotePropertyId propId = deserializePropertyID();
                LOG.debug(" UnifiedSubPropertySet {} {}", val32, propId);
                // NOTE(review): the list is pre-allocated from a count taken from the file
                // without any upper bound - consider validating it.
                data.propertySet.rgPridsData =
                        Stream.generate(PropertyValue::new).limit((int) val32)
                                .collect(Collectors.toList());
                for (int i = 0; i < val32; ++i) {
                    try {
                        data.propertySet.rgPridsData.set(i,
                                deserializePropertyValueFromPropertyID(propId, streams,
                                        counters));
                    } catch (IOException e) {
                        // Best effort: keep whatever was parsed so far. The remaining
                        // entries stay as empty placeholder values.
                        return data;
                    }
                }
                break;
            case 0x11:
                LOG.debug(" SubPropertySet");
                data.propertySet = deserializePropertySet(counters, streams);
                break;
            default:
                throw new TikaException("Invalid type: " + type);
        }
        LOG.debug(">");
        return data;
    } finally {
        // Runs on every exit path, so the indent level is always restored.
        --indentLevel;
    }
}
/**
 * Reads a property id: one value read with {@code deserializeLittleEndianInt()}, wrapped in
 * a {@link OneNotePropertyId}.
 *
 * @return the property id built from the value read.
 * @throws TikaException if the property id cannot be constructed from the value.
 * @throws IOException   if the underlying read fails.
 */
private OneNotePropertyId deserializePropertyID() throws TikaException, IOException {
    final long rawPropertyId = deserializeLittleEndianInt();
    final OneNotePropertyId propertyId = new OneNotePropertyId(rawPropertyId);
    return propertyId;
}
/**
 * Reads the streams of an ObjectSpaceObjectPropSet from the current position.
 *
 * <p>The OIDs stream is always read. The OSIDs stream is read only when the OIDs header has
 * {@code osidsStreamNotPresent == 0}, and the context ids stream only when the OIDs header
 * has {@code extendedStreamsPresent != 0}. Streams that are not read keep the default header
 * flags assigned at the start of this method.
 *
 * @return the property set streams.
 * @throws IOException   if one of the underlying reads fails.
 * @throws TikaException if one of the streams cannot be parsed.
 */
private ObjectSpaceObjectPropSet deserializeObjectSpaceObjectPropSet()
        throws IOException, TikaException {
    final ObjectSpaceObjectPropSet propSet = new ObjectSpaceObjectPropSet();

    // Header flags used for the optional streams when they are absent.
    propSet.osids.osidsStreamNotPresent = 1;
    propSet.osids.extendedStreamsPresent = 0;
    propSet.contextIDs.osidsStreamNotPresent = 0;
    propSet.contextIDs.extendedStreamsPresent = 0;

    propSet.oids = deserializeObjectSpaceObjectStreamOfOIDsOSIDsOrContextIDs();

    // Both decisions are driven by the header of the OIDs stream.
    final boolean readOsids = propSet.oids.osidsStreamNotPresent == 0;
    final boolean readContextIds = propSet.oids.extendedStreamsPresent != 0;

    if (readOsids) {
        propSet.osids = deserializeObjectSpaceObjectStreamOfOIDsOSIDsOrContextIDs();
    }
    if (readContextIds) {
        propSet.contextIDs = deserializeObjectSpaceObjectStreamOfOIDsOSIDsOrContextIDs();
    }
    return propSet;
}
/**
 * Reads one stream of compact ids from the current position.
 *
 * <p>A header is read with {@code deserializeLittleEndianInt()}. Its low 24 bits hold the
 * number of compact ids, bit 30 is stored as {@code extendedStreamsPresent} and bit 31 as
 * {@code osidsStreamNotPresent}. The compact ids follow the header.
 *
 * @return the stream with its header fields and compact ids.
 * @throws IOException   if one of the underlying reads fails.
 * @throws TikaException if a compact id cannot be resolved.
 */
private ObjectSpaceObjectStreamOfOIDsOSIDsOrContextIDs deserializeObjectSpaceObjectStreamOfOIDsOSIDsOrContextIDs()
        throws IOException, TikaException {
    final ObjectSpaceObjectStreamOfOIDsOSIDsOrContextIDs stream =
            new ObjectSpaceObjectStreamOfOIDsOSIDsOrContextIDs();

    final long header = deserializeLittleEndianInt();
    stream.count = header & 0xffffff;
    stream.extendedStreamsPresent = (header >> 30) & 0x1;
    stream.osidsStreamNotPresent = (header >> 31) & 0x1;

    if (LOG.isDebugEnabled()) {
        LOG.debug("{}Deserialized Stream Header count: {} OsidsNotPresent {} Extended {}",
                getIndent(), stream.count, stream.osidsStreamNotPresent,
                stream.extendedStreamsPresent);
    }

    int read = 0;
    while (read < stream.count) {
        stream.data.add(deserializeCompactID());
        ++read;
    }
    return stream;
}
/**
 * Returns the distance between the current offset and the end position of this pointer.
 *
 * @return {@code end - offset}.
 */
long roomLeft() {
    final long remaining = end - offset;
    return remaining;
}
/**
 * Logs, at debug level, a hex string for a buffer as large as the range between the current
 * offset and the end position of this pointer.
 *
 * <p>NOTE(review): nothing is read into the buffer before it is encoded, so the logged hex is
 * that of a freshly allocated, zero-filled array. Confirm whether a read from {@code dif}
 * was intended here.
 *
 * @throws TikaMemoryLimitException if the range is larger than {@code dif.size()}.
 * @throws IOException              declared by this method; presumably raised by
 *                                  {@code dif.size()} - verify against its declaration.
 */
public void dumpHex() throws TikaMemoryLimitException, IOException {
    final long length = end - offset;
    if (length > dif.size()) {
        throw new TikaMemoryLimitException(
                "Exceeded memory limit when trying to dumpHex - " + length + " > " + dif.size());
    }
    final byte[] bytes = ByteBuffer.allocate((int) length).array();
    LOG.debug(Hex.encodeHexString(bytes));
}
/**
 * Returns the distance between the current offset and the end position of this pointer,
 * narrowed to an {@code int}.
 *
 * @return {@code end - offset} cast to {@code int}.
 */
public int size() {
    final long remaining = end - offset;
    return (int) remaining;
}
}