All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.detect.zip.DefaultZipContainerDetector Maven / Gradle / Ivy

There is a newer version: 3.0.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.detect.zip;

import java.io.BufferedInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.compress.archivers.ArchiveException;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipFile;
import org.apache.commons.compress.compressors.CompressorException;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.tika.config.Field;
import org.apache.tika.config.LoadErrorHandler;
import org.apache.tika.config.ServiceLoader;
import org.apache.tika.detect.Detector;
import org.apache.tika.io.BoundedInputStream;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;

public class DefaultZipContainerDetector implements Detector {

    //Regrettably, some tiff files can be incorrectly identified
    //as tar files.  We need this ugly workaround to rule out TIFF.
    //If commons-compress ever chooses to take over TIFF detection
    //we can remove all of this. See TIKA-2591.
    final static MediaType TIFF = MediaType.image("tiff");
    final static byte[][] TIFF_SIGNATURES = new byte[3][];
    /**
     * Serial version UID
     */
    private static final long serialVersionUID = 2891763938430295453L;

    private static final Logger LOG = LoggerFactory.getLogger(DefaultZipContainerDetector.class);

    static {
        TIFF_SIGNATURES[0] = new byte[]{'M', 'M', 0x00, 0x2a};
        TIFF_SIGNATURES[1] = new byte[]{'I', 'I', 0x2a, 0x00};
        TIFF_SIGNATURES[2] = new byte[]{'M', 'M', 0x00, 0x2b};
    }

    //this has to be > 100,000 to handle some of the iworks files
    //in our unit tests
    @Field
    int markLimit = 16 * 1024 * 1024;

    private transient ServiceLoader loader;

    private List staticZipDetectors;

    public DefaultZipContainerDetector() {
        this(new ServiceLoader(DefaultZipContainerDetector.class.getClassLoader(),
                LoadErrorHandler.WARN, false));
    }

    public DefaultZipContainerDetector(ServiceLoader loader) {
        this.loader = loader;
        staticZipDetectors = loader.loadStaticServiceProviders(ZipContainerDetector.class);
    }

    public DefaultZipContainerDetector(List zipDetectors) {
        staticZipDetectors = zipDetectors;
    }

    static boolean isZipArchive(MediaType type) {
        return type.equals(PackageConstants.ZIP) || type.equals(PackageConstants.JAR);
    }

    private static boolean isTiff(byte[] prefix) {
        for (byte[] sig : TIFF_SIGNATURES) {
            if (arrayStartWith(sig, prefix)) {
                return true;
            }
        }
        return false;
    }

    private static boolean arrayStartWith(byte[] needle, byte[] haystack) {
        if (haystack.length < needle.length) {
            return false;
        }
        for (int i = 0; i < needle.length; i++) {
            if (haystack[i] != needle[i]) {
                return false;
            }
        }
        return true;
    }

    static MediaType detectArchiveFormat(byte[] prefix, int length) {
        if (isTiff(prefix)) {
            return TIFF;
        }
        try {
            String name = ArchiveStreamFactory.detect(new UnsynchronizedByteArrayInputStream(prefix, 0, length));
            return PackageConstants.getMediaType(name);
        } catch (ArchiveException e) {
            return MediaType.OCTET_STREAM;
        }
    }

    static MediaType detectCompressorFormat(byte[] prefix, int length) {
        try {
            String type =
                    CompressorStreamFactory.detect(new UnsynchronizedByteArrayInputStream(prefix, 0, length));
            return CompressorConstants.getMediaType(type);
        } catch (CompressorException e) {
            return MediaType.OCTET_STREAM;
        }
    }

    /**
     * If this is less than 0, the file will be spooled to disk,
     * and detection will run on the full file.
     * If this is greater than 0, the {@link DeprecatedStreamingZipContainerDetector}
     * will be called only up to the markLimit.
     *
     * @param markLimit mark limit for streaming detection
     */
    @Field
    public void setMarkLimit(int markLimit) {
        this.markLimit = markLimit;
    }

    public int getMarkLimit() {
        return markLimit;
    }

    @Override
    public MediaType detect(InputStream input, Metadata metadata) throws IOException {
        // Check if we have access to the document
        if (input == null) {
            return MediaType.OCTET_STREAM;
        }

        byte[] prefix = new byte[1024]; // enough for all known archive formats
        input.mark(1024);
        int length = -1;
        try {
            length = IOUtils.read(input, prefix, 0, 1024);
        } finally {
            input.reset();
        }

        MediaType type = detectArchiveFormat(prefix, length);

        if (type == TIFF) {
            return TIFF;
        } else if (isZipArchive(type)) {

            if (TikaInputStream.isTikaInputStream(input)) {
                TikaInputStream tis = TikaInputStream.cast(input);
                if (markLimit < 0) {
                    tis.getFile();
                }
                if (tis.hasFile()) {
                    return detectZipFormatOnFile(tis, metadata);
                }
            }
            return detectStreaming(input, metadata);
        } else if (!type.equals(MediaType.OCTET_STREAM)) {
            return type;
        } else {
            return detectCompressorFormat(prefix, length);
        }
    }

    /**
     * This will call TikaInputStream's getFile(). If there are no exceptions,
     * it will place the ZipFile in TikaInputStream's openContainer and leave it
     * open.
     *
     * @param tis
     * @return
     */
    private MediaType detectZipFormatOnFile(TikaInputStream tis, Metadata metadata) {
        ZipFile zip = null;
        try {
            zip = new ZipFile(tis.getFile()); // TODO: hasFile()?

            for (ZipContainerDetector zipDetector : getDetectors()) {
                MediaType type = zipDetector.detect(zip, tis);
                if (type != null) {
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("{} detected {}", zipDetector.getClass(),
                                type.toString());
                    }
                    //e.g. if OPCPackage has already been set
                    //don't overwrite it with the zip
                    if (tis.getOpenContainer() == null) {
                        tis.setOpenContainer(zip);
                    } else {
                        tis.addCloseableResource(zip);
                    }
                    return type;
                } else {
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("{} detected null", zipDetector.getClass());
                    }
                }
            }
        } catch (IOException e) {
            //do nothing
        }
        // Fallback: it's still a zip file, we just don't know what kind of one
        if (zip != null) {
            IOUtils.closeQuietly(zip);
            return MediaType.APPLICATION_ZIP;
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug("zip file failed to open; attempting streaming detect");
        }
        if (zip == null) {
            //problem opening zip file (truncated?)
            try (InputStream is = new BufferedInputStream(Files.newInputStream(tis.getPath()))) {
                return detectStreaming(is, metadata);
            } catch (IOException e) {
                //swallow
            }
        }
        return MediaType.APPLICATION_ZIP;

    }

    MediaType detectStreaming(InputStream input, Metadata metadata) throws IOException {
        BoundedInputStream boundedInputStream = new BoundedInputStream(markLimit, input);
        boundedInputStream.mark(markLimit);
        try {
            return detectStreaming(boundedInputStream, metadata, false);
        } finally {
            boundedInputStream.reset();
        }
    }

    MediaType detectStreaming(InputStream input, Metadata metadata, boolean allowStoredEntries)
            throws IOException {
        StreamingDetectContext detectContext = new StreamingDetectContext();
        try (ZipArchiveInputStream zis = new ZipArchiveInputStream(
                CloseShieldInputStream.wrap(input), "UTF8", false, allowStoredEntries)) {
            ZipArchiveEntry zae = zis.getNextZipEntry();
            while (zae != null) {
                MediaType mt = detect(zae, zis, detectContext);
                if (mt != null) {
                    return mt;
                }
                zae = zis.getNextZipEntry();
            }
        } catch (UnsupportedZipFeatureException zfe) {
            if (allowStoredEntries == false &&
                    zfe.getFeature() == UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) {
                input.reset();
                return detectStreaming(input, metadata, true);
            }
        } catch (SecurityException e) {
            throw e;
        } catch (EOFException e) {
            //truncated zip -- swallow
        } catch (IOException e) {
            //another option for a truncated zip
        }

        return finalDetect(detectContext);
    }


    private MediaType detect(ZipArchiveEntry zae, ZipArchiveInputStream zis,
                             StreamingDetectContext detectContext) throws IOException {
        for (ZipContainerDetector d : getDetectors()) {
            MediaType mt = d.streamingDetectUpdate(zae, zis, detectContext);
            if (mt != null) {
                return mt;
            }
        }
        return null;
    }

    private MediaType finalDetect(StreamingDetectContext detectContext) {
        for (ZipContainerDetector d : getDetectors()) {
            MediaType mt = d.streamingDetectFinal(detectContext);
            if (mt != null) {
                return mt;
            }
        }
        return MediaType.APPLICATION_ZIP;
    }

    private List getDetectors() {
        if (loader != null && loader.isDynamic()) {
            List dynamicDetectors =
                    loader.loadDynamicServiceProviders(ZipContainerDetector.class);
            if (dynamicDetectors.size() > 0) {
                List zipDetectors = new ArrayList<>(staticZipDetectors);
                zipDetectors.addAll(dynamicDetectors);
                return zipDetectors;
            } else {
                return staticZipDetectors;
            }
        }
        return staticZipDetectors;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy