All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.parser.microsoft.onenote.OneNoteParser Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tika.parser.microsoft.onenote;

import java.io.IOException;
import java.io.InputStream;
import java.time.Instant;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.onenote.fsshttpb.MSOneStorePackage;
import org.apache.tika.parser.microsoft.onenote.fsshttpb.MSOneStoreParser;
import org.apache.tika.parser.microsoft.onenote.fsshttpb.streamobj.basic.AlternativePackaging;
import org.apache.tika.sax.XHTMLContentHandler;

/**
 * OneNote tika parser capable of parsing Microsoft OneNote files.
 * 

* Based on the Microsoft specs MS-ONE and MS-ONESTORE. */ public class OneNoteParser implements Parser { public static final String ONE_NOTE_PREFIX = "onenote:"; private static final Map> TYPES_MAP = new HashMap<>(); /** * Serial version UID */ private static final long serialVersionUID = -5504243905998074168L; private static final Set SUPPORTED_TYPES = Collections.unmodifiableSet(TYPES_MAP.keySet()); static { // All types should be 4 bytes long, space padded as needed TYPES_MAP.put(MediaType.application("onenote; format=one"), Collections.singletonList("ONE ")); // TODO - add onetoc and other onenote mime types } private final OneNoteTreeWalkerOptions options = new OneNoteTreeWalkerOptions(); @Override public Set getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; } @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { byte[] oneStoreFileBytes = IOUtils.toByteArray(stream); try (TemporaryResources temporaryResources = new TemporaryResources(); TikaInputStream tikaInputStream = TikaInputStream.get(oneStoreFileBytes); OneNoteDirectFileResource oneNoteDirectFileResource = new OneNoteDirectFileResource( tikaInputStream.getFile())) { XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); temporaryResources.addResource(oneNoteDirectFileResource); OneNoteDocument oneNoteDocument = createOneNoteDocumentFromDirectFileResource(oneNoteDirectFileResource); OneNoteHeader header = oneNoteDocument.header; if (header.isMsOneStoreFormat()) { metadata.set(ONE_NOTE_PREFIX + "buildNumberCreated", "0x" + Long.toHexString(oneNoteDocument.header.buildNumberCreated)); metadata.set(ONE_NOTE_PREFIX + "buildNumberLastWroteToFile", "0x" + Long.toHexString(oneNoteDocument.header.buildNumberLastWroteToFile)); metadata.set(ONE_NOTE_PREFIX + "buildNumberNewestWritten", "0x" + Long.toHexString(oneNoteDocument.header.buildNumberNewestWritten)); metadata.set(ONE_NOTE_PREFIX + "buildNumberOldestWritten", "0x" + Long.toHexString(oneNoteDocument.header.buildNumberOldestWritten)); metadata.set(ONE_NOTE_PREFIX + "cbExpectedFileLength", "0x" + Long.toHexString(oneNoteDocument.header.cbExpectedFileLength)); metadata.set(ONE_NOTE_PREFIX + "cbFreeSpaceInFreeChunkList", "0x" + Long.toHexString(oneNoteDocument.header.cbFreeSpaceInFreeChunkList)); metadata.set(ONE_NOTE_PREFIX + "cbLegacyExpectedFileLength", "0x" + Long.toHexString(oneNoteDocument.header.cbLegacyExpectedFileLength)); metadata.set(ONE_NOTE_PREFIX + "cbLegacyFreeSpaceInFreeChunkList", "0x" + Long.toHexString(oneNoteDocument.header.cbLegacyFreeSpaceInFreeChunkList)); metadata.set(ONE_NOTE_PREFIX + "crcName", "0x" + Long.toHexString(oneNoteDocument.header.crcName)); metadata.set(ONE_NOTE_PREFIX + "cTransactionsInLog", "0x" + Long.toHexString(oneNoteDocument.header.cTransactionsInLog)); metadata.set(ONE_NOTE_PREFIX + "ffvLastCodeThatWroteToThisFile", "0x" + Long.toHexString(oneNoteDocument.header.ffvLastCodeThatWroteToThisFile)); metadata.set(ONE_NOTE_PREFIX + "ffvNewestCodeThatHasWrittenToThisFile", "0x" + Long.toHexString( oneNoteDocument.header.ffvNewestCodeThatHasWrittenToThisFile)); metadata.set(ONE_NOTE_PREFIX + "ffvOldestCodeThatMayReadThisFile", "0x" + Long.toHexString(oneNoteDocument.header.ffvOldestCodeThatMayReadThisFile)); metadata.set(ONE_NOTE_PREFIX + "ffvOldestCodeThatHasWrittenToThisFile", "0x" + Long.toHexString( oneNoteDocument.header.ffvOldestCodeThatHasWrittenToThisFile)); metadata.set(ONE_NOTE_PREFIX + "grfDebugLogFlags", "0x" + Long.toHexString(oneNoteDocument.header.grfDebugLogFlags)); metadata.set(ONE_NOTE_PREFIX + "nFileVersionGeneration", "0x" + Long.toHexString(oneNoteDocument.header.nFileVersionGeneration)); metadata.set(ONE_NOTE_PREFIX + "rgbPlaceholder", "0x" + Long.toHexString(oneNoteDocument.header.rgbPlaceholder)); Pair roleAndContext = Pair.of(1L, ExtendedGUID.nil()); OneNoteTreeWalker oneNoteTreeWalker = new OneNoteTreeWalker(options, oneNoteDocument, oneNoteDirectFileResource, xhtml, metadata, context, roleAndContext); oneNoteTreeWalker.walkTree(); if (!oneNoteTreeWalker.getAuthors().isEmpty()) { metadata.set(TikaCoreProperties.CREATOR, oneNoteTreeWalker.getAuthors().toArray(new String[]{})); } if (!oneNoteTreeWalker.getMostRecentAuthors().isEmpty()) { metadata.set(Property.externalTextBag(ONE_NOTE_PREFIX + "mostRecentAuthors"), oneNoteTreeWalker.getMostRecentAuthors().toArray(new String[]{})); } if (!oneNoteTreeWalker.getOriginalAuthors().isEmpty()) { metadata.set(Property.externalTextBag(ONE_NOTE_PREFIX + "originalAuthors"), oneNoteTreeWalker.getOriginalAuthors().toArray(new String[]{})); } if (!Instant.MAX.equals( Instant.ofEpochMilli(oneNoteTreeWalker.getCreationTimestamp()))) { metadata.set(ONE_NOTE_PREFIX + "creationTimestamp", String.valueOf(oneNoteTreeWalker.getCreationTimestamp())); } if (!Instant.MIN.equals(oneNoteTreeWalker.getLastModifiedTimestamp())) { metadata.set(ONE_NOTE_PREFIX + "lastModifiedTimestamp", String.valueOf( oneNoteTreeWalker.getLastModifiedTimestamp().toEpochMilli())); } if (oneNoteTreeWalker.getLastModified() > Long.MIN_VALUE) { metadata.set(TikaCoreProperties.MODIFIED, String.valueOf(oneNoteTreeWalker.getLastModified())); } } else if (header.isLegacyOrAlternativePackaging()) { try { AlternativePackaging alternatePackageOneStoreFile = new AlternativePackaging(); alternatePackageOneStoreFile.doDeserializeFromByteArray(oneStoreFileBytes, 0); MSOneStoreParser onenoteParser = new MSOneStoreParser(); MSOneStorePackage pkg = onenoteParser.parse(alternatePackageOneStoreFile.dataElementPackage); pkg.walkTree(options, metadata, xhtml); } catch (Exception e) { OneNoteLegacyDumpStrings dumpStrings = new OneNoteLegacyDumpStrings(oneNoteDirectFileResource, xhtml); dumpStrings.dump(); } } else { throw new TikaException("Invalid OneStore document - could not parse headers"); } xhtml.endDocument(); } } /** * Create a OneNoteDocument object. *

* This won't actually have the binary data of the sections, but it's more of a * metadata structure that contains * the general structure of the container and contains offset positions of where to find the * binary data we care about. *

* OneNote files are of format: *

* The header (section 2.3.1 in MS-ONESTORE) is the first 1024 bytes of the file. It contains * references to the other structures in the * file as well as metadata about the file. * The free chunk list (section 2.3.2 in MS-ONESTORE) defines where there are free spaces in * the file where data can be written. * The transaction log (section 2.3.3 in MS-ONESTORE) stores the state and length of each * file node list (section 2.4 in MS-ONESTORE) * in the file. * The hashed chunk list (section 2.3.4 in MS-ONESTORE) stores read-only objects in the file * that can be referenced by multiple * revisions (section 2.1.8 in MS-ONESTORE). * The root file node list (section 2.1.14 in MS-ONESTORE) is the file node list that is the * root of the tree of all file node lists in * the file. *

* In this method we first parse the header. *

* After parsing the header, this results in header.fcrFileNodeListRoot that points to the first * * @param oneNoteDirectFileResource A random access file resource used as the source of the * content. * @return A parsed one note document. This document does not contain any of the binary data, * rather it just contains * the data pointers and metadata. * @throws IOException Will throw IOException in typical IO issue situations. */ public OneNoteDocument createOneNoteDocumentFromDirectFileResource( OneNoteDirectFileResource oneNoteDirectFileResource) throws IOException, TikaException { OneNoteDocument oneNoteDocument = new OneNoteDocument(); OneNotePtr oneNotePtr = new OneNotePtr(oneNoteDocument, oneNoteDirectFileResource); // First parse out the header. oneNoteDocument.header = oneNotePtr.deserializeHeader(); if (oneNoteDocument.header.isMsOneStoreFormat()) { // Now that we parsed the header, the "root file node list" oneNotePtr.reposition(oneNoteDocument.header.fcrFileNodeListRoot); FileNodePtr curPath = new FileNodePtr(); oneNotePtr.deserializeFileNodeList(oneNoteDocument.root, curPath); } return oneNoteDocument; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy