// org.apache.tika.detect.MagicDetector (listing header: Maven / Gradle / Ivy)

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.detect;

import java.io.CharArrayWriter;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;

import static java.nio.charset.StandardCharsets.ISO_8859_1;
import static java.nio.charset.StandardCharsets.UTF_8;

/**
 * Content type detection based on magic bytes, i.e. type-specific patterns
 * near the beginning of the document input stream.
 *
 * Because this works on bytes, not characters, by default any string
 *  matching is done as ISO_8859_1. To use an explicit different
 *  encoding, supply a type other than "string" / "stringignorecase"
 *
 * @since Apache Tika 0.3
 */
public class MagicDetector implements Detector {

    /** Digits used by {@link #toHex(byte[])}. */
    private static final char[] HEX_DIGITS = "0123456789abcdef".toCharArray();

    /**
     * Builds a detector from the textual description of a magic match.
     *
     * @param mediaType media type reported when the magic matches
     * @param type      kind of value: one of {@code string}, {@code regex},
     *                  {@code stringignorecase}, {@code unicodeLE},
     *                  {@code unicodeBE}, {@code byte}, {@code host16},
     *                  {@code little16}, {@code big16}, {@code host32},
     *                  {@code little32} or {@code big32}
     * @param offset    either a single offset ({@code "4"}) or an inclusive
     *                  range ({@code "0:128"}); {@code null} means offset 0
     * @param value     the value to look for, encoded according to
     *                  {@code type}
     * @param mask      optional bit mask, encoded like {@code value};
     *                  may be {@code null}
     * @return the detector for the described magic
     * @throws NumberFormatException    if the offset or a numeric value
     *                                  cannot be parsed
     * @throws IllegalArgumentException if the value cannot be decoded (for
     *                                  example because {@code type} or
     *                                  {@code value} is null or the type is
     *                                  unknown) or the offset range is invalid
     */
    public static MagicDetector parse(
            MediaType mediaType,
            String type, String offset, String value, String mask) {
        int start = 0;
        int end = 0;
        if (offset != null) {
            int colon = offset.indexOf(':');
            if (colon == -1) {
                start = Integer.parseInt(offset);
                end = start;
            } else {
                start = Integer.parseInt(offset.substring(0, colon));
                end = Integer.parseInt(offset.substring(colon + 1));
            }
        }

        byte[] patternBytes = decodeValue(value, type);
        byte[] maskBytes = null;
        if (mask != null) {
            maskBytes = decodeValue(mask, type);
        }

        // Constant-first comparison: a null type must reach the constructor,
        // which rejects the (then null) pattern with an
        // IllegalArgumentException, rather than fail here with a
        // NullPointerException.
        return new MagicDetector(
                mediaType, patternBytes, maskBytes,
                "regex".equals(type), "stringignorecase".equals(type),
                start, end);
    }

    /**
     * Turns the textual {@code value} into the bytes to compare against.
     *
     * @param value textual value, may be {@code null}
     * @param type  value type, may be {@code null}
     * @return the decoded bytes, or {@code null} if either argument is
     *         {@code null} or the type is not recognised
     */
    private static byte[] decodeValue(String value, String type) {
        if ((value == null) || (type == null)) {
            return null;
        }

        // Numeric values are hexadecimal with a "0x" prefix, otherwise they
        // are parsed as octal.
        final String digits;
        final int radix;
        if (value.startsWith("0x")) {
            digits = value.substring(2);
            radix = 16;
        } else {
            digits = value;
            radix = 8;
        }

        byte[] decoded = null;
        if (type.equals("string")
                || type.equals("regex")
                || type.equals("unicodeLE")
                || type.equals("unicodeBE")) {
            decoded = decodeString(value, type);
        } else if (type.equals("stringignorecase")) {
            decoded = decodeString(value.toLowerCase(Locale.ROOT), type);
        } else if (type.equals("byte")) {
            decoded = digits.getBytes(UTF_8);
        } else if (type.equals("host16") || type.equals("little16")) {
            int i = Integer.parseInt(digits, radix);
            decoded = new byte[] { (byte) (i & 0x00FF), (byte) (i >> 8) };
        } else if (type.equals("big16")) {
            int i = Integer.parseInt(digits, radix);
            decoded = new byte[] { (byte) (i >> 8), (byte) (i & 0x00FF) };
        } else if (type.equals("host32") || type.equals("little32")) {
            long i = Long.parseLong(digits, radix);
            decoded = new byte[] {
                    (byte) i,
                    (byte) (i >> 8),
                    (byte) (i >> 16),
                    (byte) (i >> 24) };
        } else if (type.equals("big32")) {
            long i = Long.parseLong(digits, radix);
            decoded = new byte[] {
                    (byte) (i >> 24),
                    (byte) (i >> 16),
                    (byte) (i >> 8),
                    (byte) i };
        }
        return decoded;
    }

    /**
     * Decodes a string-like value. A value starting with {@code 0x} is read
     * as a sequence of hex byte pairs. Otherwise the escapes {@code \\},
     * {@code \xHH}, {@code \r}, {@code \n} and octal {@code \NNN} are
     * resolved and the resulting characters are converted to bytes: two
     * bytes per character for {@code unicodeLE} / {@code unicodeBE}, and the
     * low byte of each character for every other type.
     *
     * @param value the textual value, not {@code null}
     * @param type  the value type, used to pick the byte layout
     * @return the decoded bytes
     */
    private static byte[] decodeString(String value, String type) {
        if (value.startsWith("0x")) {
            byte[] vals = new byte[(value.length() - 2) / 2];
            for (int i = 0; i < vals.length; i++) {
                vals[i] = (byte)
                Integer.parseInt(value.substring(2 + i * 2, 4 + i * 2), 16);
            }
            return vals;
        }

        CharArrayWriter decoded = new CharArrayWriter();

        for (int i = 0; i < value.length(); i++) {
            if (value.charAt(i) == '\\') {
                if (value.charAt(i + 1) == '\\') {
                    decoded.write('\\');
                    i++;
                } else if (value.charAt(i + 1) == 'x') {
                    decoded.write(Integer.parseInt(
                            value.substring(i + 2, i + 4), 16));
                    i += 3;
                } else if (value.charAt(i + 1) == 'r') {
                    decoded.write((int) '\r');
                    i++;
                } else if (value.charAt(i + 1) == 'n') {
                    decoded.write((int) '\n');
                    i++;
                } else {
                    // Octal escape of up to three digits
                    int j = i + 1;
                    while ((j < i + 4) && (j < value.length())
                            && (Character.isDigit(value.charAt(j)))) {
                        j++;
                    }
                    // Keep only the low eight bits as an unsigned value.
                    // Going through byteValue() would sign-extend, so that
                    // "\377" became the char 0xFFFF instead of 0x00FF and
                    // produced FF FF rather than FF 00 for the unicode types
                    // (unlike the equivalent "\xff").
                    decoded.write(Short.decode(
                            "0" + value.substring(i + 1, j)).intValue() & 0xFF);
                    i = j - 1;
                }
            } else {
                decoded.write(value.charAt(i));
            }
        }

        // Now turn the chars into bytes
        char[] chars = decoded.toCharArray();
        byte[] bytes;
        if ("unicodeLE".equals(type)) {
            bytes = new byte[chars.length * 2];
            for (int i = 0; i < chars.length; i++) {
                bytes[i * 2] = (byte) (chars[i] & 0xff);
                bytes[i * 2 + 1] = (byte) (chars[i] >> 8);
            }
        } else if ("unicodeBE".equals(type)) {
            bytes = new byte[chars.length * 2];
            for (int i = 0; i < chars.length; i++) {
                bytes[i * 2] = (byte) (chars[i] >> 8);
                bytes[i * 2 + 1] = (byte) (chars[i] & 0xff);
            }
        } else {
            // Copy with truncation
            bytes = new byte[chars.length];
            for (int i = 0; i < bytes.length; i++) {
                bytes[i] = (byte) chars[i];
            }
        }
        return bytes;
    }

    /**
     * Renders the given bytes as lower-case hex, two digits per byte.
     */
    private static String toHex(byte[] bytes) {
        StringBuilder hex = new StringBuilder(bytes.length * 2);
        for (byte b : bytes) {
            hex.append(HEX_DIGITS[(b >> 4) & 0x0F]);
            hex.append(HEX_DIGITS[b & 0x0F]);
        }
        return hex.toString();
    }

    /**
     * The matching media type. Returned by the
     * {@link #detect(InputStream, Metadata)} method if a match is found.
     */
    private final MediaType type;

    /**
     * Length of the comparison window.
     */
    private final int length;

    /**
     * The magic match pattern. If this byte pattern is equal to the
     * possibly bit-masked bytes from the input stream, then the type
     * detection succeeds and the configured {@link #type} is returned.
     * Stored with the mask already applied and, for case-insensitive
     * (non-regex) matches, folded to lower case.
     */
    private final byte[] pattern;

    /**
     * Length of the pattern, which in the case of regular expressions will
     * not be the same as the comparison window length.
     */
    private final int patternLength;

    /**
     * True if pattern is a regular expression, false otherwise.
     */
    private final boolean isRegex;

    /**
     * True if we're doing a case-insensitive string match, false otherwise.
     */
    private final boolean isStringIgnoreCase;

    /**
     * Bit mask that is applied to the source bytes before pattern matching.
     */
    private final byte[] mask;

    /**
     * First offset (inclusive) of the comparison window within the
     * document input stream. Greater than or equal to zero.
     */
    private final int offsetRangeBegin;

    /**
     * Last offset (inclusive) of the comparison window within the document
     * input stream. Greater than or equal to the
     * {@link #offsetRangeBegin first offset}.
     *
     * Note that this is not the offset of the last byte read from
     * the document stream. Instead, the last window of bytes to be compared
     * starts at this offset.
     */
    private final int offsetRangeEnd;

    /**
     * Lazily compiled form of {@link #pattern} for regex detectors, so the
     * expression is not recompiled on every detection. It is derived data
     * and rebuilt on demand.
     * NOTE(review): kept transient so it stays out of any serialized form
     * should Detector be Serializable - confirm.
     */
    private transient volatile Pattern compiledRegex;

    /**
     * Creates a detector for input documents that have the exact given byte
     * pattern at the beginning of the document stream.
     *
     * @param type matching media type
     * @param pattern magic match pattern
     */
    public MagicDetector(MediaType type, byte[] pattern) {
        this(type, pattern, 0);
    }

    /**
     * Creates a detector for input documents that have the exact given byte
     * pattern at the given offset of the document stream.
     *
     * @param type matching media type
     * @param pattern magic match pattern
     * @param offset offset of the pattern match
     */
    public MagicDetector(MediaType type, byte[] pattern, int offset) {
        this(type, pattern, null, offset, offset);
    }

    /**
     * Creates a detector for input documents that meet the specified magic
     * match.  {@code pattern} must NOT be a regular expression.
     * Constructor maintained for legacy reasons.
     */
    public MagicDetector(
        MediaType type, byte[] pattern, byte[] mask,
        int offsetRangeBegin, int offsetRangeEnd) {
        this(type, pattern, mask, false, offsetRangeBegin, offsetRangeEnd);
    }

    /**
     * Creates a detector for input documents that meet the specified
     * magic match.
     */
    public MagicDetector(
            MediaType type, byte[] pattern, byte[] mask,
            boolean isRegex,
            int offsetRangeBegin, int offsetRangeEnd) {
        this(type, pattern, mask, isRegex, false, offsetRangeBegin, offsetRangeEnd);
    }

    /**
     * Creates a detector for input documents that meet the specified
     * magic match.
     *
     * @param type               matching media type, not {@code null}
     * @param pattern            magic match pattern (raw bytes, or the bytes
     *                           of a regular expression), not {@code null}
     * @param mask               bit mask applied to the input before
     *                           comparison, or {@code null} for no masking
     * @param isRegex            whether {@code pattern} is a regular
     *                           expression
     * @param isStringIgnoreCase whether matching ignores case
     * @param offsetRangeBegin   first offset (inclusive) at which a match
     *                           may start
     * @param offsetRangeEnd     last offset (inclusive) at which a match
     *                           may start
     * @throws IllegalArgumentException if {@code type} or {@code pattern} is
     *                                  {@code null} or the offset range is
     *                                  invalid
     */
    public MagicDetector(
            MediaType type, byte[] pattern, byte[] mask,
            boolean isRegex, boolean isStringIgnoreCase,
            int offsetRangeBegin, int offsetRangeEnd) {
        if (type == null) {
            throw new IllegalArgumentException("Matching media type is null");
        } else if (pattern == null) {
            throw new IllegalArgumentException("Magic match pattern is null");
        } else if (offsetRangeBegin < 0
                || offsetRangeEnd < offsetRangeBegin) {
            throw new IllegalArgumentException(
                    "Invalid offset range: ["
                    + offsetRangeBegin + "," + offsetRangeEnd + "]");
        }

        this.type = type;

        this.isRegex = isRegex;
        this.isStringIgnoreCase = isStringIgnoreCase;

        this.patternLength = Math.max(pattern.length, mask != null ? mask.length : 0);

        if (this.isRegex) {
            // 8K buffer should cope with most regex patterns
            this.length = 8 * 1024;
        } else {
            this.length = patternLength;
        }

        this.mask = new byte[this.patternLength];
        this.pattern = new byte[this.patternLength];

        // The input is lower-cased before comparison, so the stored pattern
        // must be lower-cased as well; otherwise an upper-case pattern byte
        // (passed directly, or written as an escape such as \x41) could
        // never match. A regex is left alone: its case handling is done by
        // Pattern.CASE_INSENSITIVE and folding would corrupt e.g. "\D".
        final boolean foldCase = isStringIgnoreCase && !isRegex;

        for (int i = 0; i < this.patternLength; i++) {
            if (mask != null && i < mask.length) {
                this.mask[i] = mask[i];
            } else {
                this.mask[i] = -1;
            }

            if (i < pattern.length) {
                int masked = pattern[i] & this.mask[i] & 0xFF;
                if (foldCase) {
                    masked = Character.toLowerCase(masked);
                }
                this.pattern[i] = (byte) masked;
            } else {
                this.pattern[i] = 0;
            }
        }

        this.offsetRangeBegin = offsetRangeBegin;
        this.offsetRangeEnd = offsetRangeEnd;
    }

    /**
     * Returns the compiled regular expression, compiling it on first use.
     * Concurrent first calls may each compile the pattern; the results are
     * equivalent, so whichever one ends up cached is fine.
     * <p>
     * The pattern bytes are decoded as ISO-8859-1, the same charset that
     * {@link #detect(InputStream, Metadata)} uses for the input window, so
     * that every byte value maps to the same character on both sides.
     */
    private Pattern regex() {
        Pattern compiled = compiledRegex;
        if (compiled == null) {
            int flags = isStringIgnoreCase ? Pattern.CASE_INSENSITIVE : 0;
            compiled = Pattern.compile(
                    new String(this.pattern, ISO_8859_1), flags);
            compiledRegex = compiled;
        }
        return compiled;
    }

    /**
     * Checks whether the magic pattern occurs in the given stream at any
     * start offset within the configured offset range. The stream is marked
     * before reading and reset afterwards.
     *
     * @param input document input stream, or null
     * @param metadata ignored
     * @return the configured media type if the magic matches, otherwise
     *         {@code MediaType.OCTET_STREAM}
     * @throws IOException if reading from or resetting the stream fails
     */
    public MediaType detect(InputStream input, Metadata metadata)
            throws IOException {
        if (input == null) {
            return MediaType.OCTET_STREAM;
        }

        input.mark(offsetRangeEnd + length);
        try {
            int offset = 0;

            // Skip bytes at the beginning, using skip() or read()
            while (offset < offsetRangeBegin) {
                long n = input.skip(offsetRangeBegin - offset);
                if (n > 0) {
                    offset += n;
                } else if (input.read() != -1) {
                    offset += 1;
                } else {
                    return MediaType.OCTET_STREAM;
                }
            }

            // Fill in the comparison window
            byte[] buffer =
                new byte[length + (offsetRangeEnd - offsetRangeBegin)];
            int n = input.read(buffer);
            if (n > 0) {
                offset += n;
            }
            while (n != -1 && offset < offsetRangeEnd + length) {
                int bufferOffset = offset - offsetRangeBegin;
                n = input.read(
                        buffer, bufferOffset, buffer.length - bufferOffset);
                // increment offset - in case not all read (see testDetectStreamReadProblems)
                if (n > 0) {
                    offset += n;
                }
            }

            // Number of bytes actually read into the buffer. Anything past
            // this index is zero padding, not document content, and must
            // not take part in matching.
            final int available = offset - offsetRangeBegin;
            final int lastStart = offsetRangeEnd - offsetRangeBegin;

            if (this.isRegex) {
                CharBuffer text =
                        ISO_8859_1.decode(ByteBuffer.wrap(buffer, 0, available));
                Matcher m = regex().matcher(text);

                // Try every start offset in the range that lies within the
                // data that was read
                for (int i = 0; i <= lastStart && i <= available; i++) {
                    m.region(i, Math.min(i + length, available));
                    if (m.lookingAt()) { // match regex from start of region
                        return type;
                    }
                }
            } else {
                if (available < length) {
                    return MediaType.OCTET_STREAM;
                }
                // Try every start offset in the range for which a complete
                // window of real data is available
                for (int i = 0; i <= lastStart && i + length <= available; i++) {
                    boolean match = true;
                    for (int j = 0; match && j < length; j++) {
                        // Work on the unsigned byte value so that
                        // ISO-8859-1 letters above 0x7F are lower-cased too
                        int masked = buffer[i + j] & mask[j] & 0xFF;
                        if (this.isStringIgnoreCase) {
                            masked = Character.toLowerCase(masked);
                        }
                        match = ((byte) masked == pattern[j]);
                    }
                    if (match) {
                        return type;
                    }
                }
            }

            return MediaType.OCTET_STREAM;
        } finally {
            input.reset();
        }
    }

    /**
     * Returns the length of the pattern (not of the comparison window).
     */
    public int getLength() {
        return this.patternLength;
    }

    /**
     * Returns a string representation of the Detection Rule.
     * Should sort nicely by type and details, as we sometimes
     *  compare these.
     */
    @Override
    public String toString() {
        // Built from the rule's content (hex dump of pattern and mask, plus
        // offsets and flags) so that it is deterministic and differs
        // whenever two rules differ. Concatenating the arrays directly
        // would only print their identity hashes.
        return "Magic Detection for " + type +
                " looking for " + pattern.length +
                " bytes = " + toHex(this.pattern) +
                " mask = " + toHex(this.mask) +
                " offset = " + offsetRangeBegin + ":" + offsetRangeEnd +
                " regex = " + isRegex +
                " ignorecase = " + isStringIgnoreCase;
    }
}