// org.apache.tika.mime.MimeTypes Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.mime;
// JDK imports
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import javax.xml.namespace.QName;
import org.apache.tika.Tika;
import org.apache.tika.detect.Detector;
import org.apache.tika.detect.TextDetector;
import org.apache.tika.detect.XmlRootExtractor;
import org.apache.tika.metadata.Metadata;
/**
* This class is a MimeType repository. It gathers a set of MimeTypes and
* makes it possible to retrieve a content type from its name, from a file
* name, or from a magic character sequence.
*
* The MIME type detection methods that take an {@link InputStream} as
* an argument will never read more than {@link #getMinLength()} bytes
* from the stream. Also the given stream is never
* {@link InputStream#close() closed}, {@link InputStream#mark(int) marked},
* or {@link InputStream#reset() reset} by the methods. Thus a client can
* use the {@link InputStream#markSupported() mark feature} of the stream
* (if available) to restore the stream back to the state it was before type
* detection if it wants to process the stream based on the detected type.
*/
public final class MimeTypes implements Detector, Serializable {
/**
 * Serial version UID.
 */
private static final long serialVersionUID = -1350863170146349036L;

/**
 * Name of the {@link #rootMimeType root} type, application/octet-stream.
 */
public static final String OCTET_STREAM = "application/octet-stream";

/**
 * Name of the {@link #textMimeType text} type, text/plain.
 */
public static final String PLAIN_TEXT = "text/plain";

/**
 * Name of the {@link #xmlMimeType xml} type, application/xml.
 */
public static final String XML = "application/xml";

/**
 * Root type, application/octet-stream.
 */
private final MimeType rootMimeType;

/**
 * Single-element list holding only the {@link #rootMimeType root} type.
 * It is built once in the constructor with
 * {@link Collections#singletonList(Object)} and handed out when no
 * better match is available (for example for empty input data).
 */
private final List<MimeType> rootMimeTypeL;

/**
 * Text type, text/plain.
 */
private final MimeType textMimeType;

/**
 * html type, text/html
 */
private final MimeType htmlMimeType;

/**
 * xml type, application/xml
 */
private final MimeType xmlMimeType;

/**
 * Registered media types and their aliases.
 */
private final MediaTypeRegistry registry = new MediaTypeRegistry();

/** All the registered MimeTypes indexed on their canonical names */
private final Map<MediaType, MimeType> types =
        new HashMap<MediaType, MimeType>();

/** The patterns matcher */
private Patterns patterns = new Patterns(registry);

/** Sorted list of all registered magics */
private final List<Magic> magics = new ArrayList<Magic>();

/** Sorted list of all registered rootXML */
private final List<MimeType> xmls = new ArrayList<MimeType>();
public MimeTypes() {
rootMimeType = new MimeType(MediaType.OCTET_STREAM);
textMimeType = new MimeType(MediaType.TEXT_PLAIN);
htmlMimeType = new MimeType(MediaType.TEXT_HTML);
xmlMimeType = new MimeType(MediaType.APPLICATION_XML);
rootMimeTypeL = Collections.singletonList(rootMimeType);
add(rootMimeType);
add(textMimeType);
add(xmlMimeType);
}
/**
 * Looks up the Mime Content Type of a document using only its name.
 * The name is matched against the registered patterns as given; if that
 * finds nothing, it is matched again after being lower-cased with the
 * English locale. Returns application/octet-stream if no better match
 * is found.
 *
 * @deprecated Use {@link Tika#detect(String)} instead
 * @param name name of the document to analyze.
 * @return the Mime Content Type of the specified document name
 */
public MimeType getMimeType(String name) {
    MimeType match = patterns.matches(name);
    if (match == null) {
        // Second chance: retry with a lower-cased name
        match = patterns.matches(name.toLowerCase(Locale.ENGLISH));
    }
    return (match != null) ? match : rootMimeType;
}
/**
 * Determines the Mime Content Type of a document stored in the given file.
 * Detection is delegated to a {@link Tika} facade built around this
 * repository, and the detected type name is then resolved with
 * {@link #forName(String)}.
 *
 * @deprecated Use {@link Tika#detect(File)} instead
 * @param file file to analyze
 * @return the Mime Content Type of the specified document
 * @throws MimeTypeException if the type can't be detected
 * @throws IOException if the file can't be read
 */
public MimeType getMimeType(File file)
        throws MimeTypeException, IOException {
    Tika facade = new Tika(this);
    String detectedName = facade.detect(file);
    return forName(detectedName);
}
/**
* Returns the MIME type that best matches the given first few bytes
* of a document stream. Returns application/octet-stream if no better
* match is found.
*
* If multiple matches are found, the best (highest priority) matching
* type is returned. If multiple matches are found with the same priority,
* then all of these are returned.
*
* The given byte array is expected to be at least {@link #getMinLength()}
* long, or shorter only if the document stream itself is shorter.
*
* @param data first few bytes of a document stream
* @return matching MIME type
*/
List getMimeType(byte[] data) {
if (data == null) {
throw new IllegalArgumentException("Data is missing");
} else if (data.length == 0) {
// See https://issues.apache.org/jira/browse/TIKA-483
return rootMimeTypeL;
}
// Then, check for magic bytes
List result = new ArrayList(1);
int currentPriority = -1;
for (Magic magic : magics) {
if (currentPriority > 0 && currentPriority > magic.getPriority()) {
break;
}
if (magic.eval(data)) {
result.add(magic.getType());
currentPriority = magic.getPriority();
}
}
if (!result.isEmpty()) {
for (int i=0; i
* The given stream is never {@link InputStream#close() closed},
* {@link InputStream#mark(int) marked}, or
* {@link InputStream#reset() reset} by this method.
*
* @param stream stream to be read
* @return first {@link #getMinLength()} (or fewer) bytes of the stream
* @throws IOException if the stream can not be read
*/
byte[] readMagicHeader(InputStream stream) throws IOException {
    if (stream == null) {
        throw new IllegalArgumentException("InputStream is missing");
    }

    final byte[] buffer = new byte[getMinLength()];
    int filled = 0;

    // Keep reading until the buffer is full or the stream signals its end
    for (int count = stream.read(buffer);
            count != -1;
            count = stream.read(buffer, filled, buffer.length - filled)) {
        filled += count;
        if (filled == buffer.length) {
            return buffer;
        }
    }

    // The stream ended early: return only the bytes that were actually read
    final byte[] truncated = new byte[filled];
    System.arraycopy(buffer, 0, truncated, 0, filled);
    return truncated;
}
/**
 * Returns the registered media type with the given name (or alias).
 * If no such type is registered yet, a new one is created from the
 * parsed name, registered, and returned.
 *
 * @param name media type name (case-insensitive)
 * @return the registered media type with the given name or alias
 * @throws MimeTypeException if the given media type name is invalid
 */
public MimeType forName(String name) throws MimeTypeException {
    final MediaType parsed = MediaType.parse(name);
    if (parsed == null) {
        throw new MimeTypeException("Invalid media type name: " + name);
    }

    final MediaType canonical = registry.normalize(parsed);
    MimeType found = types.get(canonical);
    if (found != null) {
        return found;
    }

    synchronized (this) {
        // Look again now that the lock is held: the type may have been
        // added while this thread was waiting
        found = types.get(canonical);
        if (found == null) {
            found = new MimeType(parsed);
            add(found);
            types.put(parsed, found);
        }
        return found;
    }
}
/**
 * Returns the registered, normalised media type with the given name (or alias).
 *
 * Unlike {@link #forName(String)}, this function will not create a
 * new MimeType and register it. Instead, {@code null} will be returned if
 * there is no definition available for the given name.
 *
 * Also, unlike {@link #forName(String)}, this function may return a
 * mime type that has fewer parameters than were included in the supplied name.
 * If the registered mime type has parameters (e.g.
 * {@code application/dita+xml;format=map}), then those will be maintained.
 * However, if the supplied name has parameters that the registered mime
 * type does not (e.g. {@code application/xml; charset=UTF-8} as a name,
 * compared to just {@code application/xml} for the type in the registry),
 * then those parameters will not be included in the returned type.
 *
 * @param name media type name (case-insensitive)
 * @return the registered media type with the given name or alias, or null if not found
 * @throws MimeTypeException if the given media type name is invalid
 */
public MimeType getRegisteredMimeType(String name) throws MimeTypeException {
    final MediaType parsed = MediaType.parse(name);
    if (parsed == null) {
        throw new MimeTypeException("Invalid media type name: " + name);
    }

    final MediaType canonical = registry.normalize(parsed);
    final MimeType exact = types.get(canonical);
    if (exact == null && canonical.hasParameters()) {
        // No exact entry: fall back to the same type without its parameters
        return types.get(canonical.getBaseType());
    }
    return exact;
}
/**
 * Records the given parent as the super type of the given media type
 * in the media type registry.
 *
 * @param type media type whose super type is being set
 * @param parent the super type
 */
public synchronized void setSuperType(MimeType type, MediaType parent) {
    final MediaType child = type.getType();
    registry.addSuperType(child, parent);
}
/**
 * Registers an alias for the given media type in the media type
 * registry. This method should only be called from
 * {@link MimeType#addAlias(String)}.
 *
 * @param type media type
 * @param alias media type alias (normalized to lower case)
 */
synchronized void addAlias(MimeType type, MediaType alias) {
    final MediaType canonical = type.getType();
    registry.addAlias(canonical, alias);
}
/**
 * Adds a file name pattern for the given media type. The pattern is
 * treated as not being a JDK standard regular expression; this is a
 * shorthand for {@link #addPattern(MimeType, String, boolean)} with
 * {@code isRegex} set to {@code false}.
 *
 * @param type
 *            media type
 * @param pattern
 *            file name pattern
 * @throws MimeTypeException
 *             if the pattern conflicts with existing ones
 */
public void addPattern(MimeType type, String pattern)
        throws MimeTypeException {
    final boolean treatAsRegex = false;
    addPattern(type, pattern, treatAsRegex);
}
/**
 * Adds a file name pattern for the given media type. The caller states
 * through the {@code isRegex} parameter whether the pattern is a JDK
 * standard regular expression. If the value is {@code true}, a JDK
 * standard regex is assumed; otherwise the freedesktop glob type is
 * assumed.
 *
 * @param type
 *            media type
 * @param pattern
 *            file name pattern
 * @param isRegex
 *            set to true if JDK std regexs are desired, otherwise set to
 *            false.
 * @throws MimeTypeException
 *             if the pattern conflicts with existing ones.
 */
public void addPattern(MimeType type, String pattern, boolean isRegex)
        throws MimeTypeException {
    // Hand the pattern over to the shared patterns matcher
    this.patterns.add(pattern, isRegex, type);
}
/**
 * Returns the media type registry held by this repository.
 *
 * @return the registry of media types and their aliases
 */
public MediaTypeRegistry getMediaTypeRegistry() {
    return this.registry;
}
/**
 * Returns the minimum amount of document data, in bytes, that should be
 * handed to the content-based analyzing methods so that all the known
 * MimeTypes can be checked.
 *
 * @return the minimum length of data to provide.
 * @see #getMimeType(byte[])
 * @see #getMimeType(String, byte[])
 */
public int getMinLength() {
    // Deliberately generous: XML root elements may only show up after
    // leading comments and DTDs, so a short prefix would miss them
    final int bytesPerKilobyte = 1024;
    final int kilobytes = 64;
    return kilobytes * bytesPerKilobyte;
}
/**
 * Adds the specified mime-type to the repository: its media type is
 * registered in the media type registry, the mime-type is stored in the
 * type index, and the magic and root-XML indexes are extended when the
 * mime-type carries such definitions.
 *
 * @param type
 *            is the mime-type to add.
 */
void add(MimeType type) {
    registry.addType(type.getType());
    types.put(type.getType(), type);

    // Extend the magics index when the type defines magics
    final boolean definesMagic = type.hasMagic();
    if (definesMagic) {
        magics.addAll(type.getMagics());
    }

    // Extend the xml (xmlRoot) index when the type defines a root XML
    final boolean definesRootXml = type.hasRootXML();
    if (definesRootXml) {
        xmls.add(type);
    }
}
/**
 * Called after all configured types have been loaded.
 * Appends the magics and root-XML definitions of every registered type
 * to the magics and xmls lists, then sorts both lists.
 */
void init() {
    // NOTE(review): add(MimeType) also appends to these two lists, and they
    // are not cleared here, so a type that already had magics or a root XML
    // when it was added would be indexed twice - confirm this cannot happen.
    for (MimeType registered : types.values()) {
        magics.addAll(registered.getMagics());
        if (registered.hasRootXML()) {
            xmls.add(registered);
        }
    }

    Collections.sort(magics);
    Collections.sort(xmls);
}
/**
 * Automatically detects the MIME type of a document based on magic
 * markers in the stream prefix and any given metadata hints.
 *
 * Three sources are consulted in order: the magic bytes at the start of
 * the stream (when a stream is given), the resource name hint, and the
 * content type hint. Each hint is combined with the types found so far
 * through {@code applyHint}.
 *
 * The given stream is expected to support marks, so that this method
 * can reset the stream to the position it was in before this method
 * was called.
 *
 * @param input document stream, or {@code null}
 * @param metadata metadata hints
 * @return MIME type of the document, or application/octet-stream when
 *         no type could be determined
 * @throws IOException if the document stream could not be read
 */
public MediaType detect(InputStream input, Metadata metadata)
        throws IOException {
    // Typed list: the element's getType() is called on the result below
    List<MimeType> possibleTypes = null;

    // Get type based on magic prefix
    if (input != null) {
        input.mark(getMinLength());
        try {
            byte[] prefix = readMagicHeader(input);
            possibleTypes = getMimeType(prefix);
        } finally {
            // Always rewind, even when reading the prefix failed
            input.reset();
        }
    }

    // Get type based on resourceName hint (if available)
    String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
    if (resourceName != null) {
        String name = null;

        // Deal with a URI or a path name in as the resource name
        try {
            URI uri = new URI(resourceName);
            String path = uri.getPath();
            if (path != null) {
                // Keep only the last path segment; a path that ends in
                // '/' has no file name and leaves the name unset
                int slash = path.lastIndexOf('/');
                if (slash + 1 < path.length()) {
                    name = path.substring(slash + 1);
                }
            }
        } catch (URISyntaxException e) {
            // Not a valid URI: use the resource name as it is
            name = resourceName;
        }

        if (name != null) {
            MimeType hint = getMimeType(name);

            // If we have some types based on mime magic, try to specialise
            // and/or select the type based on that
            // Otherwise, use the type identified from the name
            possibleTypes = applyHint(possibleTypes, hint);
        }
    }

    // Get type based on metadata hint (if available)
    String typeName = metadata.get(Metadata.CONTENT_TYPE);
    if (typeName != null) {
        try {
            MimeType hint = forName(typeName);
            possibleTypes = applyHint(possibleTypes, hint);
        } catch (MimeTypeException e) {
            // Malformed type name, ignore
        }
    }

    if (possibleTypes == null || possibleTypes.isEmpty()) {
        // Report that we don't know what it is
        return MediaType.OCTET_STREAM;
    } else {
        return possibleTypes.get(0).getType();
    }
}
/**
* Use the MimeType hint to try to clarify or specialise the current
* possible types list.
* If the hint is a specialised form, use that instead
* If there are multiple possible types, use the hint to select one
*/
private List applyHint(List possibleTypes, MimeType hint) {
if (possibleTypes == null || possibleTypes.isEmpty()) {
return Collections.singletonList(hint);
} else {
for (int i=0; i CLASSLOADER_SPECIFIC_DEFAULT_TYPES =
new HashMap();
/**
 * Get the default MimeTypes. This includes all the built-in
 * media types, and any custom override ones present. No specific
 * class loader is requested: the call is forwarded to
 * {@link #getDefaultMimeTypes(ClassLoader)} with a {@code null} argument.
 *
 * @return MimeTypes default type registry
 */
public static synchronized MimeTypes getDefaultMimeTypes() {
    final ClassLoader noSpecificLoader = null;
    return getDefaultMimeTypes(noSpecificLoader);
}
/**
 * Get the default MimeTypes. This includes all the built-in
 * media types, and any custom override ones present.
 *
 * The registry is created on first use and cached afterwards: in the
 * shared default slot when no class loader is given, or in the
 * per-class-loader map otherwise.
 *
 * @param classLoader to use, if not the default
 * @return MimeTypes default type registry
 */
public static synchronized MimeTypes getDefaultMimeTypes(ClassLoader classLoader) {
    final boolean useSharedDefault = (classLoader == null);

    // Look in the cache slot that belongs to this class loader
    MimeTypes cached = DEFAULT_TYPES;
    if (!useSharedDefault) {
        cached = CLASSLOADER_SPECIFIC_DEFAULT_TYPES.get(classLoader);
    }
    if (cached != null) {
        return cached;
    }

    // Nothing cached yet: build the registry from the bundled definitions
    MimeTypes loaded;
    try {
        loaded = MimeTypesFactory.create(
                "tika-mimetypes.xml", "custom-mimetypes.xml", classLoader);
    } catch (MimeTypeException e) {
        throw new RuntimeException(
                "Unable to parse the default media type registry", e);
    } catch (IOException e) {
        throw new RuntimeException(
                "Unable to read the default media type registry", e);
    }

    // Remember the result for later calls
    if (useSharedDefault) {
        DEFAULT_TYPES = loaded;
    } else {
        CLASSLOADER_SPECIFIC_DEFAULT_TYPES.put(classLoader, loaded);
    }
    return loaded;
}
}