Many resources are needed to download a project. Please understand that we have to cover our server costs. Thank you in advance. The project price is only $1.
You can buy this project and download/modify it as often as you want.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft.onenote;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.time.Instant;
import java.time.LocalDateTime;
import java.time.Month;
import java.time.ZoneOffset;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.TikaMemoryLimitException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
/**
* Walks the OneNote tree and builds a Map of the document as it goes.
* Also writes user input text to the XHTML content handler as it parses.
*/
class OneNoteTreeWalker {
// The literal "p"; presumably the XHTML paragraph element name - confirm against its usages.
private static final String P = "p";
/**
* See spec MS-ONE - 2.3.1 - TIME32 - epoch of Jan 1 1980 UTC.
* Holds the number of whole seconds from Instant.EPOCH to that 1980 epoch (a positive value).
* Assigned once in a static initializer of this class.
*/
private static final long TIME32_EPOCH_DIFF_1980;
/**
* See spec MS-DTYP - 2.3.3 - DATETIME dates are based on an epoch of Jan 1 1601 UTC.
* Holds the number of whole seconds from Instant.EPOCH to that 1601 epoch (a negative value,
* because 1601 precedes 1970). Assigned once in a static initializer of this class.
*/
private static final long DATETIME_EPOCH_DIFF_1601;
/**
* Matches text that ends with a hyperlink field: the character U+FDDF immediately followed by
* "HYPERLINK", at least one whitespace character, a double-quoted value (group 1), and then
* one or more non-quote characters running to the end of the input (group 2).
* NOTE(review): group 1 is presumably the link target and group 2 the display text - confirm
* against the code that consumes the matcher.
*/
private static final Pattern HYPERLINK_PATTERN =
Pattern.compile("\uFDDFHYPERLINK\\s+\"([^\"]+)\"([^\"]+)$");
static {
    // Whole seconds from Instant.EPOCH (1970-01-01T00:00:00Z) to the TIME32 epoch,
    // 1980-01-01T00:00:00Z. Midnight UTC has no sub-second part, so asking for the epoch
    // second directly yields the same value as dividing the epoch millis by 1000.
    TIME32_EPOCH_DIFF_1980 =
            LocalDateTime.of(1980, Month.JANUARY, 1, 0, 0).toEpochSecond(ZoneOffset.UTC);
}
static {
    // Whole seconds from Instant.EPOCH (1970-01-01T00:00:00Z) to the DATETIME epoch,
    // 1601-01-01T00:00:00Z. The result is negative because 1601 lies before 1970; the instant
    // is an exact number of seconds, so no rounding is involved.
    DATETIME_EPOCH_DIFF_1601 =
            LocalDateTime.of(1601, Month.JANUARY, 1, 0, 0).toEpochSecond(ZoneOffset.UTC);
}
// NOTE(review): several declarations below appear to have lost their generic type arguments
// (raw "Set" / "Pair", and the malformed "Set>" on textAlreadyFetched). Restore them from the
// upstream source before compiling - the intended type arguments cannot be determined from here.

// Metadata object handed to the constructor.
private final Metadata parentMetadata;
// Obtained in the constructor from EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(parseContext).
private final EmbeddedDocumentExtractor embeddedDocumentExtractor;
// Author collections; presumably filled with author names found while walking - confirm.
private final Set authors = new HashSet<>();
private final Set mostRecentAuthors = new HashSet<>();
private final Set originalAuthors = new HashSet<>();
// Constructor arguments, stored unchanged.
private final OneNoteTreeWalkerOptions options;
private final OneNoteDocument oneNoteDocument;
private final OneNoteDirectFileResource dif;
private final XHTMLContentHandler xhtml;
// May be null: the constructor documentation says null means "crawl all root file nodes".
private final Pair roleAndContext;
// Timestamp trackers start at extreme sentinel values (Instant.MIN, Long.MAX_VALUE,
// Long.MIN_VALUE); presumably so the first value seen replaces them in a min/max
// comparison - confirm against the code that updates them.
private Instant lastModifiedTimestamp = Instant.MIN;
private long creationTimestamp = Long.MAX_VALUE;
private long lastModified = Long.MIN_VALUE;
// Flags, initially false; presumably record that the corresponding author property was
// encountered - confirm.
private boolean mostRecentAuthorProp = false;
private boolean originalAuthorProp = false;
/**
* Contains pairs of {Offset,Length} that we have added to the text stream already.
*/
private final Set> textAlreadyFetched = new HashSet<>();
/**
* Create a one tree walker.
*
* @param options The options for how to walk this tree.
* @param oneNoteDocument The one note document we want to walk.
* @param dif The rando file access structure we read and reposition while
* extracting the content.
* @param xhtml The XHTMLContentHandler to populate as you walk the tree.
* @param roleAndContext The role nd context value we want to use when crawling. Set this
* to null if you are
* crawling all root file nodes, and don't care about revisions.
*/
public OneNoteTreeWalker(OneNoteTreeWalkerOptions options, OneNoteDocument oneNoteDocument,
OneNoteDirectFileResource dif, XHTMLContentHandler xhtml,
Metadata parentMetadata, ParseContext parseContext,
Pair roleAndContext) {
this.options = options;
this.oneNoteDocument = oneNoteDocument;
this.dif = dif;
this.roleAndContext = roleAndContext;
this.xhtml = xhtml;
this.parentMetadata = parentMetadata;
this.embeddedDocumentExtractor =
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(parseContext);
}
/**
 * Walks the document and returns what was parsed.
 *
 * @return Map with two entries: "header" (the document's header) and "rootFileNodes" (the
 *         result of walking the root file nodes).
 * @throws IOException   Can be thrown when manipulating the seekable byte channel.
 * @throws TikaException If raised while walking the root file nodes.
 * @throws SAXException  If raised while walking the root file nodes.
 */
public Map walkTree() throws IOException, TikaException, SAXException {
    // Read the header first, then walk, matching the original evaluation order.
    final Object header = oneNoteDocument.header;
    final Object rootFileNodes = walkRootFileNodes();

    final Map<String, Object> tree = new HashMap<>();
    tree.put("header", header);
    tree.put("rootFileNodes", rootFileNodes);
    return tree;
}
/**
* Walk the root file nodes, depending on the options will crawl revisions or the entire
* revision tree.
*
* @return List of the root file nodes.
* @throws IOException Can throw these when manipulating the seekable byte channel.
*/
public List