// org.apache.tika.detect.zip.DefaultZipContainerDetector (Maven / Gradle / Ivy)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.detect.zip;
import java.io.BufferedInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.compress.archivers.ArchiveException;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipFile;
import org.apache.commons.compress.compressors.CompressorException;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.tika.config.Field;
import org.apache.tika.config.LoadErrorHandler;
import org.apache.tika.config.ServiceLoader;
import org.apache.tika.detect.Detector;
import org.apache.tika.io.BoundedInputStream;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
/**
 * Detects the specific container format hiding inside a zip (or other archive /
 * compressor) stream: OOXML, ODF, iWorks, JAR, etc.
 * <p>
 * Detection strategy: a 1024-byte prefix is sniffed to classify the stream as a
 * TIFF (workaround, see TIKA-2591), an archive, or a compressor format. For zip
 * archives, if the stream is a {@link TikaInputStream} backed by a file (or
 * {@link #markLimit} is negative, forcing a spool to disk), detection runs on the
 * random-access {@link ZipFile}; otherwise it runs in streaming mode over at most
 * {@code markLimit} bytes.
 */
public class DefaultZipContainerDetector implements Detector {

    //Regrettably, some tiff files can be incorrectly identified
    //as tar files. We need this ugly workaround to rule out TIFF.
    //If commons-compress ever chooses to take over TIFF detection
    //we can remove all of this. See TIKA-2591.
    final static MediaType TIFF = MediaType.image("tiff");
    final static byte[][] TIFF_SIGNATURES = new byte[3][];

    /**
     * Serial version UID
     */
    private static final long serialVersionUID = 2891763938430295453L;

    private static final Logger LOG = LoggerFactory.getLogger(DefaultZipContainerDetector.class);

    static {
        TIFF_SIGNATURES[0] = new byte[]{'M', 'M', 0x00, 0x2a}; // big-endian classic TIFF
        TIFF_SIGNATURES[1] = new byte[]{'I', 'I', 0x2a, 0x00}; // little-endian classic TIFF
        TIFF_SIGNATURES[2] = new byte[]{'M', 'M', 0x00, 0x2b}; // big-endian BigTIFF
    }

    //this has to be > 100,000 to handle some of the iworks files
    //in our unit tests
    @Field
    int markLimit = 16 * 1024 * 1024;

    private transient ServiceLoader loader;

    // detectors discovered at construction time; see getDetectors() for the
    // merge with dynamically loaded ones
    private List<ZipContainerDetector> staticZipDetectors;

    public DefaultZipContainerDetector() {
        this(new ServiceLoader(DefaultZipContainerDetector.class.getClassLoader(),
                LoadErrorHandler.WARN, false));
    }

    public DefaultZipContainerDetector(ServiceLoader loader) {
        this.loader = loader;
        staticZipDetectors = loader.loadStaticServiceProviders(ZipContainerDetector.class);
    }

    public DefaultZipContainerDetector(List<ZipContainerDetector> zipDetectors) {
        staticZipDetectors = zipDetectors;
    }

    /**
     * @return whether {@code type} is a zip-based archive (plain zip or jar)
     */
    static boolean isZipArchive(MediaType type) {
        return type.equals(PackageConstants.ZIP) || type.equals(PackageConstants.JAR);
    }

    private static boolean isTiff(byte[] prefix) {
        for (byte[] sig : TIFF_SIGNATURES) {
            if (arrayStartWith(sig, prefix)) {
                return true;
            }
        }
        return false;
    }

    private static boolean arrayStartWith(byte[] needle, byte[] haystack) {
        if (haystack.length < needle.length) {
            return false;
        }
        for (int i = 0; i < needle.length; i++) {
            if (haystack[i] != needle[i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Detects the archive format of the first {@code length} bytes of
     * {@code prefix}, ruling out TIFF first (TIKA-2591).
     *
     * @param prefix leading bytes of the stream
     * @param length number of valid bytes in {@code prefix}
     * @return the detected archive media type, or {@code application/octet-stream}
     */
    static MediaType detectArchiveFormat(byte[] prefix, int length) {
        if (isTiff(prefix)) {
            return TIFF;
        }
        try {
            String name = ArchiveStreamFactory.detect(new UnsynchronizedByteArrayInputStream(prefix, 0, length));
            return PackageConstants.getMediaType(name);
        } catch (ArchiveException e) {
            // not a recognized archive; caller will try compressor detection
            return MediaType.OCTET_STREAM;
        }
    }

    /**
     * Detects the compressor format (gzip, bzip2, xz, ...) of the first
     * {@code length} bytes of {@code prefix}.
     *
     * @param prefix leading bytes of the stream
     * @param length number of valid bytes in {@code prefix}
     * @return the detected compressor media type, or {@code application/octet-stream}
     */
    static MediaType detectCompressorFormat(byte[] prefix, int length) {
        try {
            String type =
                    CompressorStreamFactory.detect(new UnsynchronizedByteArrayInputStream(prefix, 0, length));
            return CompressorConstants.getMediaType(type);
        } catch (CompressorException e) {
            return MediaType.OCTET_STREAM;
        }
    }

    /**
     * If this is less than 0, the file will be spooled to disk,
     * and detection will run on the full file.
     * If this is greater than 0, the {@link DeprecatedStreamingZipContainerDetector}
     * will be called only up to the markLimit.
     *
     * @param markLimit mark limit for streaming detection
     */
    @Field
    public void setMarkLimit(int markLimit) {
        this.markLimit = markLimit;
    }

    public int getMarkLimit() {
        return markLimit;
    }

    @Override
    public MediaType detect(InputStream input, Metadata metadata) throws IOException {
        // Check if we have access to the document
        if (input == null) {
            return MediaType.OCTET_STREAM;
        }

        byte[] prefix = new byte[1024]; // enough for all known archive formats
        input.mark(1024);
        int length = -1;
        try {
            length = IOUtils.read(input, prefix, 0, 1024);
        } finally {
            // always rewind so downstream consumers see the whole stream
            input.reset();
        }

        MediaType type = detectArchiveFormat(prefix, length);
        if (type == TIFF) {
            return TIFF;
        } else if (isZipArchive(type)) {
            if (TikaInputStream.isTikaInputStream(input)) {
                TikaInputStream tis = TikaInputStream.cast(input);
                if (markLimit < 0) {
                    // negative markLimit means "always spool to disk"
                    tis.getFile();
                }
                if (tis.hasFile()) {
                    return detectZipFormatOnFile(tis, metadata);
                }
            }
            return detectStreaming(input, metadata);
        } else if (!type.equals(MediaType.OCTET_STREAM)) {
            return type;
        } else {
            return detectCompressorFormat(prefix, length);
        }
    }

    /**
     * This will call TikaInputStream's getFile(). If there are no exceptions,
     * it will place the ZipFile in TikaInputStream's openContainer and leave it
     * open.
     *
     * @param tis a TikaInputStream known to be backed by a file
     * @return the detected media type; {@code application/zip} if no detector matched
     */
    private MediaType detectZipFormatOnFile(TikaInputStream tis, Metadata metadata) {
        ZipFile zip = null;
        try {
            zip = new ZipFile(tis.getFile());
            for (ZipContainerDetector zipDetector : getDetectors()) {
                MediaType type = zipDetector.detect(zip, tis);
                if (type != null) {
                    LOG.debug("{} detected {}", zipDetector.getClass(), type);
                    //e.g. if OPCPackage has already been set
                    //don't overwrite it with the zip
                    if (tis.getOpenContainer() == null) {
                        tis.setOpenContainer(zip);
                    } else {
                        tis.addCloseableResource(zip);
                    }
                    return type;
                }
                LOG.debug("{} detected null", zipDetector.getClass());
            }
        } catch (IOException e) {
            // couldn't open or fully read the zip; handled by the fallbacks below
        }
        // Fallback: it's still a zip file, we just don't know what kind of one
        if (zip != null) {
            IOUtils.closeQuietly(zip);
            return MediaType.APPLICATION_ZIP;
        }
        // zip is null here: the file could not be opened as a zip at all
        LOG.debug("zip file failed to open; attempting streaming detect");
        //problem opening zip file (truncated?)
        try (InputStream is = new BufferedInputStream(Files.newInputStream(tis.getPath()))) {
            return detectStreaming(is, metadata);
        } catch (IOException e) {
            // truncated/corrupt beyond streaming recovery -- fall through
        }
        return MediaType.APPLICATION_ZIP;
    }

    /**
     * Streaming detection bounded to {@link #markLimit} bytes; the stream is
     * reset afterwards so the caller can still consume it.
     */
    MediaType detectStreaming(InputStream input, Metadata metadata) throws IOException {
        BoundedInputStream boundedInputStream = new BoundedInputStream(markLimit, input);
        boundedInputStream.mark(markLimit);
        try {
            return detectStreaming(boundedInputStream, metadata, false);
        } finally {
            boundedInputStream.reset();
        }
    }

    /**
     * Walks zip entries in stream order, giving each detector a chance per entry.
     *
     * @param allowStoredEntries whether to allow STORED entries with data
     *                           descriptors; retried with {@code true} on the
     *                           corresponding UnsupportedZipFeatureException
     */
    MediaType detectStreaming(InputStream input, Metadata metadata, boolean allowStoredEntries)
            throws IOException {
        StreamingDetectContext detectContext = new StreamingDetectContext();
        try (ZipArchiveInputStream zis = new ZipArchiveInputStream(
                CloseShieldInputStream.wrap(input), "UTF8", false, allowStoredEntries)) {
            ZipArchiveEntry zae = zis.getNextZipEntry();
            while (zae != null) {
                MediaType mt = detect(zae, zis, detectContext);
                if (mt != null) {
                    return mt;
                }
                zae = zis.getNextZipEntry();
            }
        } catch (UnsupportedZipFeatureException zfe) {
            // retry once, allowing stored entries with data descriptors
            if (!allowStoredEntries &&
                    zfe.getFeature() == UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) {
                input.reset();
                return detectStreaming(input, metadata, true);
            }
        } catch (SecurityException e) {
            throw e;
        } catch (EOFException e) {
            //truncated zip -- swallow
        } catch (IOException e) {
            //another option for a truncated zip -- swallow
        }
        return finalDetect(detectContext);
    }

    // per-entry detection: first non-null answer wins
    private MediaType detect(ZipArchiveEntry zae, ZipArchiveInputStream zis,
                             StreamingDetectContext detectContext) throws IOException {
        for (ZipContainerDetector d : getDetectors()) {
            MediaType mt = d.streamingDetectUpdate(zae, zis, detectContext);
            if (mt != null) {
                return mt;
            }
        }
        return null;
    }

    // last chance after all entries have been seen
    private MediaType finalDetect(StreamingDetectContext detectContext) {
        for (ZipContainerDetector d : getDetectors()) {
            MediaType mt = d.streamingDetectFinal(detectContext);
            if (mt != null) {
                return mt;
            }
        }
        return MediaType.APPLICATION_ZIP;
    }

    /**
     * @return the static detectors, plus any dynamically discovered ones when
     *         the service loader is dynamic
     */
    private List<ZipContainerDetector> getDetectors() {
        if (loader != null && loader.isDynamic()) {
            List<ZipContainerDetector> dynamicDetectors =
                    loader.loadDynamicServiceProviders(ZipContainerDetector.class);
            if (!dynamicDetectors.isEmpty()) {
                List<ZipContainerDetector> zipDetectors = new ArrayList<>(staticZipDetectors);
                zipDetectors.addAll(dynamicDetectors);
                return zipDetectors;
            }
        }
        return staticZipDetectors;
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy