All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.detect.ole.MiscOLEDetector Maven / Gradle / Ivy

There is a newer version: 3.0.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.detect.ole;

import static org.apache.tika.mime.MediaType.application;

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;

import org.apache.tika.config.Field;
import org.apache.tika.detect.Detector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
/**
 * TODO: refactor this copy/paste from POIFSContainerDetector
 */

/**
 * A detector that works on a POIFS OLE2 document
 * to figure out exactly what the file is.
 * This should work for all OLE2 documents, whether
 * they are ones supported by POI or not.
 */
public class MiscOLEDetector implements Detector {

    /**
     * The OLE base file format
     */
    public static final MediaType OLE = application("x-tika-msoffice");


    /**
     * Hangul Word Processor (Korean)
     */
    public static final MediaType HWP = application("x-hwp-v5");

    /**
     * Base QuattroPro mime
     */
    public static final MediaType QUATTROPRO = application("x-quattro-pro");


    @Field
    private int markLimit = 16 * 1024 * 1024;

    /**
     * Internal detection of the specific kind of OLE2 document, based on the
     * names of the top level streams within the file.
     *
     * @deprecated Use {@link #detect(Set, DirectoryEntry)} and pass the root
     * entry of the filesystem whose type is to be detected, as a
     * second argument.
     */
    @Deprecated
    protected static MediaType detect(Set names) {
        return detect(names, null);
    }

    /**
     * Internal detection of the specific kind of OLE2 document, based on the
     * names of the top-level streams within the file. In some cases the
     * detection may need access to the root {@link DirectoryEntry} of that file
     * for best results. The entry can be given as a second, optional argument.
     *
     * @param names
     * @param root
     * @return
     */
    protected static MediaType detect(Set names, DirectoryEntry root) {
        if (names == null || names.isEmpty()) {
            return OLE;
        } else if (names.contains("\u0005HwpSummaryInformation")) {
            // Hangul Word Processor v5+ (previous aren't OLE2-based)
            return HWP;
        } else if (names.contains("PerfectOffice_MAIN")) {
            if (names.contains("SlideShow")) {
                return MediaType.application("x-corelpresentations"); // .shw
            } else if (names.contains("PerfectOffice_OBJECTS")) {
                return new MediaType(QUATTROPRO, "version", "7-8"); // .wb?
            }
        } else if (names.contains("NativeContent_MAIN")) {
            return new MediaType(QUATTROPRO, "version", "9"); // .qpw
            // Couldn't detect a more specific type
        }
        return OLE;
    }

    private static Set getTopLevelNames(DirectoryNode root) {
        Set names = new HashSet<>();
        for (Entry entry : root) {
            names.add(entry.getName());
        }
        return names;
    }

    /**
     * If a TikaInputStream is passed in to {@link #detect(InputStream, Metadata)},
     * and there is not an underlying file, this detector will spool up to {@link #markLimit}
     * to disk.  If the stream was read in entirety (e.g. the spooled file is not truncated),
     * this detector will open the file with POI and perform detection.
     * If the spooled file is truncated, the detector will return {@link #OLE} (or
     * {@link MediaType#OCTET_STREAM} if there's no OLE header).
     * 

* As of Tika 1.21, this detector respects the legacy behavior of not performing detection * on a non-TikaInputStream. * * @param markLimit */ public void setMarkLimit(int markLimit) { this.markLimit = markLimit; } private Set getTopLevelNames(TikaInputStream stream) throws IOException { // Force the document stream to a (possibly temporary) file // so we don't modify the current position of the stream. //If the markLimit is < 0, this will spool the entire file //to disk if there is not an underlying file. Path file = stream.getPath(markLimit); //if the stream was longer than markLimit, don't detect if (file == null) { return Collections.emptySet(); } try { POIFSFileSystem fs = new POIFSFileSystem(file.toFile(), true); // Optimize a possible later parsing process by keeping // a reference to the already opened POI file system stream.setOpenContainer(fs); return getTopLevelNames(fs.getRoot()); } catch (IOException e) { // Parse error in POI, so we don't know the file type return Collections.emptySet(); } catch (RuntimeException e) { // Another problem in POI return Collections.emptySet(); } } @Override public MediaType detect(InputStream input, Metadata metadata) throws IOException { // Check if we have access to the document if (input == null) { return MediaType.OCTET_STREAM; } // If this is a TikaInputStream wrapping an already // parsed NPOIFileSystem/DirectoryNode, just get the // names from the root: TikaInputStream tis = TikaInputStream.cast(input); Set names = null; if (tis != null) { Object container = tis.getOpenContainer(); if (container instanceof POIFSFileSystem) { names = getTopLevelNames(((POIFSFileSystem) container).getRoot()); } else if (container instanceof DirectoryNode) { names = getTopLevelNames((DirectoryNode) container); } } if (names == null) { // Check if the document starts with the OLE header input.mark(8); try { if (input.read() != 0xd0 || input.read() != 0xcf || input.read() != 0x11 || input.read() != 0xe0 || input.read() != 0xa1 || input.read() != 0xb1 || input.read() != 0x1a || input.read() != 0xe1) { return MediaType.OCTET_STREAM; } } catch (IOException e) { return MediaType.OCTET_STREAM; } finally { input.reset(); } } // We can only detect the exact type when given a TikaInputStream if (names == null && tis != null) { // Look for known top level entry names to detect the document type names = getTopLevelNames(tis); } // Detect based on the names (as available) if (tis != null && tis.getOpenContainer() != null && tis.getOpenContainer() instanceof POIFSFileSystem) { return detect(names, ((POIFSFileSystem) tis.getOpenContainer()).getRoot()); } else { return detect(names, null); } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy