All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.tika.mime.Patterns Maven / Gradle / Ivy

Go to download

This is the core Apache Tika™ toolkit library from which all other modules inherit functionality. It also includes the core facades for the Tika API.

There is a newer version: 1.0.18
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.mime;

import java.io.Serializable;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;

/**
 * Registry of file name patterns and the {@link MimeType} each one maps to.
 * <p>
 * Patterns are kept in three separate indexes so that lookups can proceed
 * from the cheapest and most specific kind to the most expensive one:
 * <ol>
 *   <li>exact names (for example {@code Makefile}),</li>
 *   <li>simple extensions of the form {@code *extension}
 *       (for example {@code *.tar.gz}, stored as {@code .tar.gz}),</li>
 *   <li>regular expressions, either supplied directly or compiled from
 *       a generic glob.</li>
 * </ol>
 */
class Patterns implements Serializable {

    /**
     * Serial version UID.
     */
    private static final long serialVersionUID = -5778015347278111140L;

    /** Registry consulted to decide which of two competing types wins. */
    private final MediaTypeRegistry registry;

    /**
     * Index of exact name patterns.
     */
    private final Map<String, MimeType> names =
        new HashMap<String, MimeType>();

    /**
     * Index of extension patterns of the form "*extension". The key is the
     * pattern without its leading '*'.
     */
    private final Map<String, MimeType> extensions =
        new HashMap<String, MimeType>();

    /**
     * Length of the shortest registered extension. Starts at
     * {@link Integer#MAX_VALUE} so that the extension loop in
     * {@link #matches(String)} is skipped while no extension is registered.
     */
    private int minExtensionLength = Integer.MAX_VALUE;

    /** Length of the longest registered extension. */
    private int maxExtensionLength = 0;

    /**
     * Index of generic glob patterns (stored as regular expressions),
     * sorted by length, longest first.
     */
    private final SortedMap<String, MimeType> globs =
        new TreeMap<String, MimeType>(new LengthComparator());

    /**
     * Orders strings by decreasing length; strings of equal length are
     * ordered by their natural ordering so that distinct keys never
     * compare as equal.
     */
    private static final class LengthComparator
            implements Comparator<String>, Serializable {

        /**
         * Serial version UID.
         */
        private static final long serialVersionUID = 8468289702915532359L;

        @Override
        public int compare(String a, String b) {
            // String lengths are never negative, so this subtraction
            // cannot overflow.
            int diff = b.length() - a.length();
            if (diff == 0) {
                diff = a.compareTo(b);
            }
            return diff;
        }

    }

    /**
     * Creates an empty pattern registry.
     *
     * @param registry media type registry used to resolve conflicts between
     *                 types registered for the same pattern
     */
    public Patterns(MediaTypeRegistry registry) {
        this.registry = registry;
    }

    /**
     * Registers a glob (non-regex) pattern for the given type. Equivalent to
     * {@code add(pattern, false, type)}.
     *
     * @param pattern the glob pattern, must not be {@code null}
     * @param type the type to associate, must not be {@code null}
     * @throws MimeTypeException if the pattern conflicts with an existing one
     * @throws IllegalArgumentException if either argument is {@code null}
     */
    public void add(String pattern, MimeType type) throws MimeTypeException {
        this.add(pattern, false, type);
    }

    /**
     * Registers a pattern for the given type.
     * <p>
     * When {@code isJavaRegex} is {@code true} the pattern is stored as-is in
     * the glob index. Otherwise it is classified:
     * <ul>
     *   <li>no {@code *}, {@code ?} or {@code [} at all: exact name;</li>
     *   <li>a single leading {@code *} and no other special character:
     *       extension (also recorded on the type itself);</li>
     *   <li>anything else: generic glob, translated to a regular
     *       expression.</li>
     * </ul>
     *
     * @param pattern the pattern, must not be {@code null}
     * @param isJavaRegex whether {@code pattern} already is a Java regular
     *                    expression
     * @param type the type to associate, must not be {@code null}
     * @throws MimeTypeException if the pattern conflicts with an existing one
     * @throws IllegalArgumentException if {@code pattern} or {@code type}
     *                                  is {@code null}
     */
    public void add(String pattern, boolean isJavaRegex, MimeType type)
            throws MimeTypeException {
        if (pattern == null || type == null) {
            throw new IllegalArgumentException(
                    "Pattern and/or mime type is missing");
        }

        if (isJavaRegex) {
            // in this case, we don't need to build a regex pattern
            // it's already there for us, so just add the pattern as is
            addGlob(pattern, type);
            return;
        }

        boolean hasQuestionOrBracket =
                pattern.indexOf('?') != -1 || pattern.indexOf('[') != -1;
        if (pattern.indexOf('*') == -1 && !hasQuestionOrBracket) {
            addName(pattern, type);
        } else if (pattern.startsWith("*") && pattern.indexOf('*', 1) == -1
                && !hasQuestionOrBracket) {
            String extension = pattern.substring(1);
            addExtension(extension, type);
            type.addExtension(extension);
        } else {
            addGlob(compile(pattern), type);
        }
    }

    /**
     * Decides whether {@code type} should take over a pattern that is
     * currently mapped to {@code previous}.
     *
     * @param previous the type currently registered, or {@code null} if none
     * @param type the type being registered
     * @param conflictMessage message of the exception thrown on conflict
     * @return {@code true} if the mapping must be (re)written with
     *         {@code type}; {@code false} if the existing mapping is kept
     * @throws MimeTypeException if the two types differ and the registry
     *         reports neither as a specialization of the other
     */
    private boolean shouldReplace(
            MimeType previous, MimeType type, String conflictMessage)
            throws MimeTypeException {
        if (previous == null
                || registry.isSpecializationOf(previous.getType(), type.getType())) {
            return true;
        }
        if (previous == type
                || registry.isSpecializationOf(type.getType(), previous.getType())) {
            return false;
        }
        throw new MimeTypeException(conflictMessage);
    }

    /** Registers an exact file name. */
    private void addName(String name, MimeType type) throws MimeTypeException {
        if (shouldReplace(names.get(name), type,
                "Conflicting name pattern: " + name)) {
            names.put(name, type);
        }
    }

    /**
     * Registers an extension (without the leading '*') and widens the
     * tracked minimum/maximum extension lengths accordingly.
     */
    private void addExtension(String extension, MimeType type)
            throws MimeTypeException {
        if (shouldReplace(extensions.get(extension), type,
                "Conflicting extension pattern: " + extension)) {
            extensions.put(extension, type);
            int length = extension.length();
            minExtensionLength = Math.min(minExtensionLength, length);
            maxExtensionLength = Math.max(maxExtensionLength, length);
        }
    }

    /** Registers a regular expression in the glob index. */
    private void addGlob(String glob, MimeType type)
            throws MimeTypeException {
        if (shouldReplace(globs.get(glob), type,
                "Conflicting glob pattern: " + glob)) {
            globs.put(glob, type);
        }
    }

    /**
     * Find the MimeType corresponding to a resource name.
     * <p>
     * The lookup order follows the recommendations of the FreeDesktop Shared
     * MIME-info Database: literal patterns (eg, 'Makefile') are matched
     * before all others. Patterns beginning with `*' and containing no other
     * special characters (`*?[') are matched next, the longest matching
     * suffix winning, so that files with multiple extensions (such as
     * Data.tar.gz) match the longest sequence of extensions (eg '*.tar.gz' in
     * preference to '*.gz'). Generic patterns are tried last, longest
     * regular expression first.
     * <p>
     * The name is matched exactly as given (case-sensitive). This method
     * does not retry with a lower-cased name; a caller that wants such a
     * fallback has to invoke it again itself.
     *
     * @param name the resource name, must not be {@code null}
     * @return the matching type, or {@code null} if no pattern matches
     * @throws IllegalArgumentException if {@code name} is {@code null}
     */
    public MimeType matches(String name) {
        if (name == null) {
            throw new IllegalArgumentException("Name is missing");
        }

        // First, try exact match of the provided resource name.
        // Registered values are never null, so a single get() suffices.
        MimeType type = names.get(name);
        if (type != null) {
            return type;
        }

        // Then try "extension" (*.xxx) matching, longest suffix first
        int maxLength = Math.min(maxExtensionLength, name.length());
        for (int n = maxLength; n >= minExtensionLength; n--) {
            type = extensions.get(name.substring(name.length() - n));
            if (type != null) {
                return type;
            }
        }

        // And finally, try complex regexp matching
        for (Map.Entry<String, MimeType> entry : globs.entrySet()) {
            if (name.matches(entry.getKey())) {
                return entry.getValue();
            }
        }

        return null;
    }

    /**
     * Translates a glob into an anchored Java regular expression:
     * {@code ?} becomes {@code .}, {@code *} becomes {@code .*}, and every
     * regex metacharacter listed below is escaped so it matches literally.
     * Note that {@code [} and {@code ]} are escaped as well, i.e. they are
     * matched as literal brackets.
     *
     * @param glob the glob pattern
     * @return the equivalent regular expression, wrapped in
     *         {@code \A ... \z}
     */
    private static String compile(String glob) {
        StringBuilder pattern = new StringBuilder();
        pattern.append("\\A");
        for (int i = 0; i < glob.length(); i++) {
            char ch = glob.charAt(i);
            if (ch == '?') {
                pattern.append('.');
            } else if (ch == '*') {
                pattern.append(".*");
            } else if ("\\[]^.-$+(){}|".indexOf(ch) != -1) {
                pattern.append('\\');
                pattern.append(ch);
            } else {
                pattern.append(ch);
            }
        }
        pattern.append("\\z");
        return pattern.toString();
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy