All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.parser.pkg.ZipContainerDetector Maven / Gradle / Ivy

There is a newer version: 3.0.0-BETA2
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.pkg;

import org.apache.commons.compress.archivers.ArchiveException;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipFile;
import org.apache.commons.compress.compressors.CompressorException;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.commons.io.IOUtils;
import org.apache.poi.UnsupportedFileFormatException;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
import org.apache.poi.openxml4j.util.ZipEntrySource;
import org.apache.poi.openxml4j.util.ZipFileZipEntrySource;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.iwork.IWorkPackageParser;
import org.apache.tika.parser.iwork.IWorkPackageParser.IWORKDocumentType;
import org.apache.tika.parser.iwork.iwana.IWork13PackageParser;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;

import static java.nio.charset.StandardCharsets.UTF_8;

/**
 * A detector that works on Zip documents and other archive and compression
 * formats to figure out exactly what the file is.
 */
public class ZipContainerDetector implements Detector {

    //Regrettably, some tiff files can be incorrectly identified
    //as tar files.  We need this ugly workaround to rule out TIFF.
    //If commons-compress ever chooses to take over TIFF detection
    //we can remove all of this. See TIKA-2591.
    private final static MediaType TIFF = MediaType.image("tiff");
    private final static byte[][] TIFF_SIGNATURES = new byte[3][];
    static {
        TIFF_SIGNATURES[0] = new byte[]{'M','M',0x00,0x2a};
        TIFF_SIGNATURES[1] = new byte[]{'I','I',0x2a, 0x00};
        TIFF_SIGNATURES[2] = new byte[]{'M','M', 0x00, 0x2b};
    }

    private static final Pattern MACRO_TEMPLATE_PATTERN = Pattern.compile("macroenabledtemplate$", Pattern.CASE_INSENSITIVE);

    // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes 
    private static final String VISIO_DOCUMENT =
            "http://schemas.microsoft.com/visio/2010/relationships/document";
    // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes 
    private static final String STRICT_CORE_DOCUMENT = 
            "http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument";

    private static final String XPS_DOCUMENT =
            "http://schemas.microsoft.com/xps/2005/06/fixedrepresentation";
    
    /** Serial version UID */
    private static final long serialVersionUID = 2891763938430295453L;

    /**
     * Detects the media type of an archive, zip-based container or
     * compressed stream by sniffing its first bytes.
     *
     * @param input    the stream to inspect; may be {@code null}
     * @param metadata accompanying metadata (not consulted by this detector)
     * @return the detected type, or {@link MediaType#OCTET_STREAM} when the
     *         stream is {@code null} or nothing more specific is recognised
     * @throws IOException if the stream cannot be read
     */
    public MediaType detect(InputStream input, Metadata metadata)
            throws IOException {
        // Without a stream there is nothing to sniff
        if (input == null) {
            return MediaType.OCTET_STREAM;
        }

        TemporaryResources tmp = new TemporaryResources();
        try {
            TikaInputStream tis = TikaInputStream.get(input, tmp);

            // 1024 bytes is enough for all known formats
            byte[] prefix = new byte[1024];
            int length = tis.peek(prefix);

            MediaType archiveType = detectArchiveFormat(prefix, length);

            if (archiveType == TIFF) {
                return TIFF;
            }
            // Zip containers are only opened for closer inspection when the
            // caller handed us a TikaInputStream.
            if (PackageParser.isZipArchive(archiveType)
                    && TikaInputStream.isTikaInputStream(input)) {
                return detectZipFormat(tis);
            }
            // Not an archive at all: try the compressor formats instead
            if (archiveType.equals(MediaType.OCTET_STREAM)) {
                return detectCompressorFormat(prefix, length);
            }
            return archiveType;
        } finally {
            try {
                tmp.dispose();
            } catch (TikaException e) {
                // ignore
            }
        }
    }

    /**
     * Asks commons-compress which compressor format, if any, the sniffed
     * bytes belong to.
     *
     * @param prefix buffer holding the leading bytes of the stream
     * @param length number of valid bytes in {@code prefix}
     * @return the matching media type, or {@link MediaType#OCTET_STREAM}
     *         when no compressor format is recognised
     */
    private static MediaType detectCompressorFormat(byte[] prefix, int length) {
        InputStream head = new ByteArrayInputStream(prefix, 0, length);
        try {
            String compressorName = CompressorStreamFactory.detect(head);
            return CompressorParser.getMediaType(compressorName);
        } catch (CompressorException e) {
            // Not a compressor format known to commons-compress
            return MediaType.OCTET_STREAM;
        }
    }

    /**
     * Tells whether the buffer begins with one of the known TIFF signatures.
     *
     * @param prefix buffer holding the leading bytes of the stream
     * @return {@code true} if any entry of {@code TIFF_SIGNATURES} is a
     *         prefix of the buffer
     */
    private static boolean isTiff(byte[] prefix) {
        for (int i = 0; i < TIFF_SIGNATURES.length; i++) {
            if (arrayStartWith(TIFF_SIGNATURES[i], prefix)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Checks whether {@code haystack} begins with the bytes of {@code needle}.
     *
     * @param needle   the expected leading bytes
     * @param haystack the buffer to examine
     * @return {@code true} if {@code haystack} is at least as long as
     *         {@code needle} and matches it byte for byte from index 0
     */
    private static boolean arrayStartWith(byte[] needle, byte[] haystack) {
        final int wanted = needle.length;
        if (wanted > haystack.length) {
            return false;
        }
        // Walk forward while the bytes agree; a full walk means a match
        int pos = 0;
        while (pos < wanted && needle[pos] == haystack[pos]) {
            pos++;
        }
        return pos == wanted;
    }

    /**
     * Determines the archive format of the sniffed bytes, ruling out TIFF
     * first.
     *
     * @param prefix buffer holding the leading bytes of the stream
     * @param length number of valid bytes in {@code prefix}
     * @return {@code TIFF} for a TIFF signature, the archive's media type
     *         when commons-compress recognises one, otherwise
     *         {@link MediaType#OCTET_STREAM}
     */
    private static MediaType detectArchiveFormat(byte[] prefix, int length) {
        if (!isTiff(prefix)) {
            try {
                String archiveName = ArchiveStreamFactory.detect(
                        new ByteArrayInputStream(prefix, 0, length));
                return PackageParser.getMediaType(archiveName);
            } catch (ArchiveException e) {
                // Not an archive format known to commons-compress
                return MediaType.OCTET_STREAM;
            }
        }
        return TIFF;
    }

    /**
     * Works out which zip-based container format the stream holds.
     * The sub-detectors are tried in a fixed order and the first
     * non-null answer wins.
     *
     * @param tis the stream whose backing file is opened as a zip
     * @return the specific container type, or
     *         {@link MediaType#APPLICATION_ZIP} when none matches or the
     *         file cannot be read
     */
    private static MediaType detectZipFormat(TikaInputStream tis) {
        try {
            // OPC is tried first because opening a package will not
            // necessarily throw an exception for truncated files.
            MediaType opcType = detectOPCBased(tis);
            if (opcType != null) {
                return opcType;
            }

            ZipFile zip = new ZipFile(tis.getFile()); // TODO: hasFile()?
            try {
                MediaType detected = detectOpenDocument(zip);
                if (detected != null) {
                    return detected;
                }
                detected = detectIWork13(zip);
                if (detected != null) {
                    return detected;
                }
                detected = detectIWork(zip);
                if (detected != null) {
                    return detected;
                }
                detected = detectJar(zip);
                if (detected != null) {
                    return detected;
                }
                detected = detectKmz(zip);
                if (detected != null) {
                    return detected;
                }
                detected = detectIpa(zip);
                if (detected != null) {
                    return detected;
                }
            } finally {
                // TODO: shouldn't we record the open
                // container so it can be later
                // reused...?
                // tis.setOpenContainer(zip);
                try {
                    zip.close();
                } catch (IOException e) {
                    // ignore
                }
            }
        } catch (IOException e) {
            // ignore
        }
        // Fallback: it's still a zip file, we just don't know what kind of one
        return MediaType.APPLICATION_ZIP;
    }

    /**
     * OpenDocument files, along with EPub files and ASiC ones, carry a
     *  "mimetype" entry in the root of their Zip file. That entry holds
     *  the mimetype of the overall file, stored as a single string.
     *
     * @return the media type parsed from the "mimetype" entry, or
     *         {@code null} if the entry is absent or cannot be read
     */
    private static MediaType detectOpenDocument(ZipFile zip) {
        ZipArchiveEntry mimetype = zip.getEntry("mimetype");
        if (mimetype == null) {
            return null;
        }
        try (InputStream stream = zip.getInputStream(mimetype)) {
            String declared = IOUtils.toString(stream, UTF_8);
            return MediaType.parse(declared);
        } catch (IOException e) {
            return null;
        }
    }

    /**
     * Tries to open the stream's backing file as an OPC package and to
     * identify it as OOXML, XPS or AutoCAD (in that order).
     * <p>
     * When the package opens and the sub-detectors complete without
     * throwing, the opened package is stored on the stream via
     * {@code setOpenContainer}, whether or not a type was recognised.
     * On every failure path the zip entry source is closed instead.
     *
     * @param stream the stream whose file is inspected
     * @return the detected type, or {@code null} if the file cannot be
     *         opened as an OPC package or none of the sub-detectors
     *         recognise it
     * @throws SecurityException rethrown unchanged from POI or the
     *         sub-detectors (see TIKA-2571)
     */
    private static MediaType detectOPCBased(TikaInputStream stream) {

        ZipEntrySource zipEntrySource = null;
        try {
            zipEntrySource = new ZipFileZipEntrySource(new ZipFile(stream.getFile()));
        } catch (IOException e) {
            // The file cannot even be opened as a zip
            return null;
        }

        //if (zip.getEntry("_rels/.rels") != null
        //  || zip.getEntry("[Content_Types].xml") != null) {
        // Use POI to open and investigate it for us
        //Unfortunately, POI can throw a RuntimeException...so we
        //have to catch that.
        OPCPackage pkg = null;
        try {
            pkg = OPCPackage.open(zipEntrySource);
        } catch (SecurityException e) {
            // Must be caught before RuntimeException so it is rethrown
            // rather than swallowed
            closeQuietly(zipEntrySource);
            //TIKA-2571
            throw e;
        } catch (InvalidFormatException|RuntimeException e) {
            // Not something POI can open as an OPC package
            closeQuietly(zipEntrySource);
            return null;
        }

        MediaType type = null;
        try {

            // Is it an OOXML format?
            type = detectOfficeOpenXML(pkg);
            if (type == null) {
                // Is it XPS format?
                type = detectXPSOPC(pkg);
            }
            if (type == null) {
                // Is it an AutoCAD format?
                type = detectAutoCADOPC(pkg);
            }

        } catch (SecurityException e) {
            closeQuietly(zipEntrySource);
            //TIKA-2571
            throw e;
        } catch (RuntimeException e) {
            // A sub-detector failed on this package; treat as "not detected"
            closeQuietly(zipEntrySource);
            return null;
        }
        //only set the open container if we made it here
        stream.setOpenContainer(pkg);
        // May still be null here if none of the OPC sub-detectors matched
        return type;
    }

    /**
     * Closes the given entry source, ignoring a {@code null} argument and
     * any {@link IOException} raised while closing.
     */
    private static void closeQuietly(ZipEntrySource zipEntrySource) {
        if (zipEntrySource != null) {
            try {
                zipEntrySource.close();
            } catch (IOException e) {
                //swallow
            }
        }
    }
    /**
     * Detects the type of an OfficeOpenXML (OOXML) file from an
     *  opened Package.
     *
     * @param pkg the opened OPC package to inspect
     * @return the media type derived from the content type of the single
     *         core document part, or {@code null} if the package has no
     *         unique core document, the core part cannot be resolved, or
     *         its content type cannot be turned into a document type
     */
    public static MediaType detectOfficeOpenXML(OPCPackage pkg) {
        // Check for the normal Office core document
        PackageRelationshipCollection core =
               pkg.getRelationshipsByType(PackageRelationshipTypes.CORE_DOCUMENT);
        // Otherwise check for some other Office core document types
        if (core.size() == 0) {
            core = pkg.getRelationshipsByType(STRICT_CORE_DOCUMENT);
        }
        if (core.size() == 0) {
            core = pkg.getRelationshipsByType(VISIO_DOCUMENT);
        }

        // If we didn't find a single core document of any type, skip detection
        if (core.size() != 1) {
            // Invalid OOXML Package received
            return null;
        }

        // Get the type of the core document part. The relationship may
        // point at a part that is missing from the package, in which case
        // there is nothing to inspect.
        PackagePart corePart = pkg.getPart(core.getRelationship(0));
        if (corePart == null) {
            return null;
        }
        String coreType = corePart.getContentType();
        if (coreType == null) {
            return null;
        }

        if (coreType.contains(".xps")) {
            return MediaType.application("vnd.ms-package.xps");
        }

        // Turn that into the type of the overall document by dropping the
        // last dot-separated segment. A content type without any '.' cannot
        // be mapped, so report "not detected" instead of failing on substring.
        int lastDot = coreType.lastIndexOf('.');
        if (lastDot < 0) {
            return null;
        }
        String docType = coreType.substring(0, lastDot);

        // The Macro Enabled formats are a little special
        String lowerDocType = docType.toLowerCase(Locale.ROOT);
        if (lowerDocType.endsWith("macroenabled")) {
            docType = lowerDocType + ".12";
        } else if (lowerDocType.endsWith("macroenabledtemplate")) {
            docType = MACRO_TEMPLATE_PATTERN.matcher(docType).replaceAll("macroenabled.12");
        }

        // Build the MediaType object and return
        return MediaType.parse(docType);
    }
    /**
     * Detects Open XML Paper Specification (XPS).
     *
     * @param pkg the opened OPC package to inspect
     * @return the XPS document media type if the package has exactly one
     *         fixed-representation relationship, otherwise {@code null}
     */
    public static MediaType detectXPSOPC(OPCPackage pkg) {
        // Use the shared constant rather than repeating the URI literal
        PackageRelationshipCollection xps =
                pkg.getRelationshipsByType(XPS_DOCUMENT);
        if (xps.size() == 1) {
            return MediaType.application("vnd.ms-xpsdocument");
        } else {
            // Non-XPS Package received
            return null;
        }
    }
    /**
     * Detects AutoCAD formats that live in OPC packaging.
     *
     * @return the DWFx media type if the package has exactly one DWFx
     *         document-sequence relationship, otherwise {@code null}
     */
    private static MediaType detectAutoCADOPC(OPCPackage pkg) {
        PackageRelationshipCollection documentSequences =
                pkg.getRelationshipsByType("http://schemas.autodesk.com/dwfx/2007/relationships/documentsequence");
        // Anything other than exactly one sequence is a non-AutoCAD package
        if (documentSequences.size() != 1) {
            return null;
        }
        return MediaType.parse("model/vnd.dwfx+xps");
    }

    /**
     * Detects iWork '13 documents: when the common iWork '13 entry is
     * present, the decision is delegated to
     * {@code IWork13DocumentType.detect}.
     *
     * @return the delegated result, or {@code null} if the common entry
     *         is missing
     */
    private static MediaType detectIWork13(ZipFile zip) {
        boolean hasCommonEntry =
                zip.getEntry(IWork13PackageParser.IWORK13_COMMON_ENTRY) != null;
        if (!hasCommonEntry) {
            return null;
        }
        return IWork13PackageParser.IWork13DocumentType.detect(zip);
    }

    /**
     * Detects older iWork containers.
     *
     * @return the type reported for the first content entry that
     *         {@code IWORKDocumentType.detectType} recognises, the generic
     *         iWork container type if the common entry is present but no
     *         content entry is recognised, or {@code null} if the common
     *         entry is missing
     */
    private static MediaType detectIWork(ZipFile zip) {
        if (zip.getEntry(IWorkPackageParser.IWORK_COMMON_ENTRY) == null) {
            return null;
        }

        // Try each candidate index entry in turn; detectType inspects it
        // to identify the concrete kind of iWork document.
        for (String entryName : IWorkPackageParser.IWORK_CONTENT_ENTRIES) {
            ZipArchiveEntry candidate = zip.getEntry(entryName);
            IWORKDocumentType documentType = IWORKDocumentType.detectType(candidate, zip);
            if (documentType != null) {
                return documentType.getType();
            }
        }

        // Not sure, fallback to the container type
        return MediaType.application("vnd.apple.iwork");
    }
    
    /**
     * Detects Jar files and the formats built on them (APK, WAR, EAR).
     *
     * @return the matching media type, or {@code null} if the zip has
     *         neither a jar manifest nor an Android manifest
     */
    private static MediaType detectJar(ZipFile zip) {
        // An AndroidManifest.xml marks an APK whether or not the default
        // jar manifest is present (some Android APKs miss it).
        if (zip.getEntry("AndroidManifest.xml") != null) {
            return MediaType.application("vnd.android.package-archive");
        }

        // Without the manifest it is not a Jar, nor something based on Jar
        if (zip.getEntry("META-INF/MANIFEST.MF") == null) {
            return null;
        }

        // Check for WAR and EAR
        if (zip.getEntry("WEB-INF/") != null) {
            return MediaType.application("x-tika-java-web-archive");
        }
        if (zip.getEntry("META-INF/application.xml") != null) {
            return MediaType.application("x-tika-java-enterprise-archive");
        }

        // Looks like a regular Jar Archive
        return MediaType.application("java-archive");
    }

    /**
     * Detects KMZ files: the root of the zip must contain exactly one
     * file, and its name must end in ".kml". Directories and entries
     * below the root are ignored.
     *
     * @return the KMZ media type, or {@code null} if the root holds no
     *         ".kml" file, more than one, or any other file
     */
    private static MediaType detectKmz(ZipFile zip) {
        boolean kmlFound = false;

        // The element type must be spelled out: with a raw Enumeration,
        // nextElement() yields Object and the assignment does not compile.
        Enumeration<ZipArchiveEntry> entries = zip.getEntries();
        while (entries.hasMoreElements()) {
            ZipArchiveEntry entry = entries.nextElement();
            String name = entry.getName();
            // Only files sitting directly in the root are of interest
            if (!entry.isDirectory()
                    && name.indexOf('/') == -1 && name.indexOf('\\') == -1) {
                if (name.endsWith(".kml") && !kmlFound) {
                    kmlFound = true;
                } else {
                    // A second .kml or any other root-level file rules out KMZ
                    return null;
                }
            }
        }

        if (kmlFound) {
            return MediaType.application("vnd.google-earth.kmz");
        } else {
            return null;
        }
    }

    /**
     * To be considered as an IPA file, it needs to match all of these
     */
    private static HashSet ipaEntryPatterns = new HashSet() {
        private static final long serialVersionUID = 6545295886322115362L;
        {
           add(Pattern.compile("^Payload/$"));
           add(Pattern.compile("^Payload/.*\\.app/$"));
           add(Pattern.compile("^Payload/.*\\.app/_CodeSignature/$"));
           add(Pattern.compile("^Payload/.*\\.app/_CodeSignature/CodeResources$"));
           add(Pattern.compile("^Payload/.*\\.app/Info\\.plist$"));
           add(Pattern.compile("^Payload/.*\\.app/PkgInfo$"));
    }};
    @SuppressWarnings("unchecked")
    private static MediaType detectIpa(ZipFile zip) {
        // Note - consider generalising this logic, if another format needs many regexp matching
        Set tmpPatterns = (Set)ipaEntryPatterns.clone();
        
        Enumeration entries = zip.getEntries();
        while (entries.hasMoreElements()) {
            ZipArchiveEntry entry = entries.nextElement();
            String name = entry.getName();
            
            Iterator ip = tmpPatterns.iterator();
            while (ip.hasNext()) {
                if (ip.next().matcher(name).matches()) {
                    ip.remove();
                }
            }
            if (tmpPatterns.isEmpty()) {
                // We've found everything we need to find
                return MediaType.application("x-itunes-ipa");
            }
        }
        
        // If we get here, not all required entries were found
        return null;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy