All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.mime.MediaTypeRegistry Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.mime;

import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeSet;

/**
 * Registry of known Internet media types.
 */
public class MediaTypeRegistry implements Serializable {

    /** Serial version UID */
    private static final long serialVersionUID = 4710974869988895410L;

    /**
     * Returns the built-in media type registry included in Tika.
     *
     * @since Apache Tika 0.8
     * @return default media type registry
     */
    public static MediaTypeRegistry getDefaultRegistry() {
        return MimeTypes.getDefaultMimeTypes().getMediaTypeRegistry();
    }

    /**
     * Registry of known media types, including type aliases. A canonical
     * media type is handled as an identity mapping, while an alias is stored
     * as a mapping from the alias to the corresponding canonical type.
     */
    private final Map registry =
        new HashMap();

    /**
     * Known type inheritance relationships. The mapping is from a media type
     * to the closest supertype.
     */
    private final Map inheritance =
        new HashMap();

    /**
     * Returns the set of all known canonical media types. Type aliases are
     * not included in the returned set.
     *
     * @since Apache Tika 0.8
     * @return canonical media types
     */
    public SortedSet getTypes() {
        return new TreeSet(registry.values());
    }

    /**
     * Returns the set of known aliases of the given canonical media type.
     *
     * @since Apache Tika 0.8
     * @param type canonical media type
     * @return known aliases
     */
    public SortedSet getAliases(MediaType type) {
        SortedSet aliases = new TreeSet();
        for (Map.Entry entry : registry.entrySet()) {
            if (entry.getValue().equals(type) && !entry.getKey().equals(type)) {
                aliases.add(entry.getKey());
            }
        }
        return aliases;
    }
    
    /**
     * Returns the set of known children of the given canonical media type
     * 
     * @since Apache Tika 1.8
     * @param type canonical media type
     * @return known children
     */
    public SortedSet getChildTypes(MediaType type) {
        SortedSet children = new TreeSet();
        for (Map.Entry entry : inheritance.entrySet()) {
            if (entry.getValue().equals(type)) {
                children.add(entry.getKey());
            }
        }
        return children;
    }

    public void addType(MediaType type) {
        registry.put(type, type);
    }

    public void addAlias(MediaType type, MediaType alias) {
        registry.put(alias, type);
    }

    public void addSuperType(MediaType type, MediaType supertype) {
        inheritance.put(type, supertype);
    }

    public MediaType normalize(MediaType type) {
        if (type == null) {
            return null;
        }
        MediaType canonical = registry.get(type.getBaseType());
        if (canonical == null) {
            return type;
        } else if (type.hasParameters()) {
            return new MediaType(canonical, type.getParameters());
        } else {
            return canonical;
        }
    }

    /**
     * Checks whether the given media type a is a specialization of a more
     * generic type b. Both types should be already normalised.
     *
     * @since Apache Tika 0.8
     * @param a media type, normalised
     * @param b suspected supertype, normalised
     * @return true if b is a supertype of a,
     *         false otherwise
     */
    public boolean isSpecializationOf(MediaType a, MediaType b) {
        return isInstanceOf(getSupertype(a), b);
    }

    /**
     * Checks whether the given media type equals the given base type or
     * is a specialization of it. Both types should be already normalised.
     *
     * @since Apache Tika 1.2
     * @param a media type, normalised
     * @param b base type, normalised
     * @return true if b equals a or is a specialization of it,
     *         false otherwise
     */
    public boolean isInstanceOf(MediaType a, MediaType b) {
        return a != null && (a.equals(b) || isSpecializationOf(a, b));
    }

    /**
     * Parses and normalises the given media type string and checks whether
     * the result equals the given base type or is a specialization of it.
     * The given base type should already be normalised.
     *
     * @since Apache Tika 1.2
     * @param a media type
     * @param b base type, normalised
     * @return true if b equals a or is a specialization of it,
     *         false otherwise
     */
    public boolean isInstanceOf(String a, MediaType b) {
        return isInstanceOf(normalize(MediaType.parse(a)), b);
    }

    /**
     * Returns the supertype of the given type. If the media type database
     * has an explicit inheritance rule for the type, then that is used. 
     * Next, if the given type has any parameters, then the respective base 
     * type (parameter-less) is returned. Otherwise built-in heuristics like 
     * text/... -> text/plain and .../...+xml -> application/xml are used. 
     * Finally application/octet-stream is returned for all types for which no other
     * supertype is known, and the return value for application/octet-stream
     * is null.
     *
     * @since Apache Tika 0.8
     * @param type media type
     * @return supertype, or null for application/octet-stream
     */
    public MediaType getSupertype(MediaType type) {
        if (type == null) {
            return null;
        } else if (inheritance.containsKey(type)) {
            return inheritance.get(type);
        } else if (type.hasParameters()) {
            return type.getBaseType();
        } else if (type.getSubtype().endsWith("+xml")) {
            return MediaType.APPLICATION_XML;
        } else if (type.getSubtype().endsWith("+zip")) {
            return MediaType.APPLICATION_ZIP;
        } else if ("text".equals(type.getType())
                && !MediaType.TEXT_PLAIN.equals(type)) {
            return MediaType.TEXT_PLAIN;
        } else if(type.getType().contains("empty") && !MediaType.EMPTY.equals(type)){
            return MediaType.EMPTY;
        } else if (!MediaType.OCTET_STREAM.equals(type)) {
            return MediaType.OCTET_STREAM;
        } else {
            return null;
        }
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy