/*
* Copyright 2007-2009 Medsea Business Solutions S.L.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package eu.medsea.util;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.*;
/**
* This class contains a list of known encodings used by TextMimeType.
* It is used by the TextMimeDetector but can also be used as a stand-alone utility class
* in other parts of your program if you wish.
*
* The getPossibleEncodings() method takes a byte [] as its source; the larger the
* array, the better the detection ratio will be.
*
*
* The class is initialised with an empty list of encodings, so it is effectively disabled by
* default. You can set the supported encodings to ALL of the encodings supported by your JVM at
* any point during your program's execution using the following method:
* EncodingGuesser.setSupportedEncodings(EncodingGuesser.getCanonicalEncodingNamesSupportedByJVM());
* You can also clear the encodings and disable the detector at any point by calling
* EncodingGuesser.setSupportedEncodings(new ArrayList()). If you later add more encodings
* to your JVM dynamically they will NOT be detected automatically by this class, but you can
* call the above method again.
*
*
* As the JVM can support a large number of encodings, and each one is checked against the
* byte array, it may be wise to remove any encodings you are sure you will not use
* in order to reduce the number of tests. Detection does not stop at the first match; it tries to
* match as many encodings as possible and returns them as a Collection.
*
*
* A common scenario is an application that can handle only a small set of text encodings, such as UTF-8
* and windows-1252. If this is your case you can use the setSupportedEncodings() method so that
* these are the only encodings in the supported encodings Collection.
* This will dramatically improve the performance of this class.
*
*
* It is possible for small byte arrays that actually contain binary data to be reported as
* possible text matches, but in general binary data, such as images, should return no matches.
*
*
* Some optimisations apply to text files containing a BOM (Byte Order Mark), such as
* UTF-8, UTF-16LE, UTF-16BE, UTF-32LE and UTF-32BE. A BOM is not required, but if present it will
* greatly improve the quality of the possible matches returned from the getPossibleEncodings() method.
*
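* A minimal usage sketch (the sample data and variable names are illustrative only and
* exception handling is omitted):
*
* <pre>
* // Enable detection for every encoding the current JVM supports.
* EncodingGuesser.setSupportedEncodings(EncodingGuesser.getCanonicalEncodingNamesSupportedByJVM());
*
* // In practice these bytes come from the file or stream you want to identify.
* byte[] data = "Some sample text".getBytes();
* Collection possibleEncodings = EncodingGuesser.getPossibleEncodings(data);
*
* // possibleEncodings now holds every supported encoding that can round-trip the data,
* // e.g. for plain ASCII text this typically includes US-ASCII, UTF-8 and windows-1252.
* </pre>
*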
*/
public class EncodingGuesser {
private static Logger log = LoggerFactory.getLogger(EncodingGuesser.class);
// We want the CANONICAL name of the default Charset for the JVM. The OutputStreamWriter reports
// the JVM's default encoding name and Charset.forName(...).name() maps it to its canonical form.
private static String defaultJVMEncoding = Charset.forName(
new java.io.OutputStreamWriter(new java.io.ByteArrayOutputStream()).getEncoding()).name();
private static Collection supportedEncodings = new TreeSet();
private static Map boms = new HashMap();
/**
* Initialise the class. By default the supported encodings Collection is left empty and
* detection is disabled; un-comment the line indicated below to initialise it with every
* encoding supported by the JVM. The Collection will NOT be updated should you later add
* encodings dynamically to your running JVM.
*
* You can also remove some of these encodings later if you know they will not be used.
* The more you remove, the more performant the detection will be.
*/
static {
// This is switched off by default. If you want to initialise with all encodings
// supported by your JVM then just un-comment the following line
// EncodingGuesser.supportedEncodings = getCanonicalEncodingNamesSupportedByJVM();
// Initialise some known BOMs keyed by their canonical encoding name.
boms.put("UTF-32BE", new byte[]{(byte) 0x00, (byte) 0x00, (byte) 0xFE, (byte) 0xFF});
boms.put("UTF-32LE", new byte[]{(byte) 0xFF, (byte) 0xFE, (byte) 0x00, (byte) 0x00});
boms.put("UTF-16BE", new byte[]{(byte) 0xFE, (byte) 0xFF});
boms.put("UTF-16LE", new byte[]{(byte) 0xFF, (byte) 0xFE});
boms.put("UTF-8", new byte[]{(byte) 0xEF, (byte) 0xBB, (byte) 0xBF});
boms.put("UTF-7", new byte[]{(byte) 0x2B, (byte) 0x2F, (byte) 0x76}); // We may need to cater for the next char as well which can be one of [38 | 39 | 2B | 2F]
boms.put("UTF-1", new byte[]{(byte) 0xF7, (byte) 0x64, (byte) 0x4C});
boms.put("UTF-EBCDIC", new byte[]{(byte) 0xDD, (byte) 0x73, (byte) 0x66, (byte) 0x73});
boms.put("SCSU", new byte[]{(byte) 0x0E, (byte) 0xFE, (byte) 0xFF});
boms.put("BOCU-1", new byte[]{(byte) 0xFB, (byte) 0xEE, (byte) 0x28}); // optionally followed by 0xFF
}
/**
* Check if the encoding String is one of the encodings supported.
*
* @param encoding
* @return true if encoding is understood by this class
*/
public static boolean isKnownEncoding(String encoding) {
return supportedEncodings.contains(encoding);
}
/**
* Get a Collection of all the possible encodings this byte array could represent.
*
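* For example, assuming UTF-8 is among the supported encodings, a byte array that starts
* with the UTF-8 BOM and decodes cleanly will produce a single entry result:
*
* <pre>
* // 0xEF 0xBB 0xBF is the UTF-8 BOM, followed by the ASCII bytes for "Hi"
* byte[] data = new byte[] {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF, (byte) 0x48, (byte) 0x69};
* Collection encodings = EncodingGuesser.getPossibleEncodings(data);
* // encodings contains only "UTF-8" because the BOM gives an exact match
* </pre>
*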
* @param data
* @return the Collection of possible encodings from the supported encodings
*/
public static Collection getPossibleEncodings(byte[] data) {
Collection possibleEncodings = new TreeSet();
if (data == null || data.length == 0) {
return possibleEncodings;
}
// We may have to take account of a BOM (Byte Order Mark) that could be present at the beginning of
// the source byte array. A BOM sequence may coincidentally match bytes at the beginning of binary data,
// but such data shouldn't match any encodings anyway.
String encoding = null;
for (Iterator it = supportedEncodings.iterator(); it.hasNext(); ) {
// This eliminates encodings the data can't possibly be in, by converting the source byte array
// to a String using each supported encoding in turn, getting the resultant byte array back from
// that String and checking it against the passed in data.
try {
// One problem to overcome is that the passed in data may be terminated by an
// incomplete character for the current encoding, so we remove the last character,
// then get the resulting bytes and match only these against the source byte array.
encoding = (String) it.next();
// Check if this encoding has a known BOM and, if so, whether it matches the beginning of the data array.
// getLengthBOM() returns either 0 or the length of the BOM.
int lengthBOM = getLengthBOM(encoding, data);
// Don't use the BOM when constructing the String
String test = new String(getByteArraySubArray(data, lengthBOM, data.length - lengthBOM), encoding);
// Only remove the last character if the String is more than 1 character long
if (test.length() > 1) {
// Remove the last character from the test string as it may be an incomplete character.
test = test.substring(0, test.length() - 1);
}
// This is the byte array we will compare with the passed in source array copy
byte[] compare = null;
try {
compare = test.getBytes(encoding);
} catch (UnsupportedOperationException ignore) {
continue;
}
// Check if source and destination byte arrays are equal
if (!compareByteArrays(data, lengthBOM, compare, 0, compare.length)) {
// doesn't match, so ignore this encoding as it is unlikely to be correct
// even if it does contain valid text data.
continue;
}
// If we get this far and the lengthBOM is not 0 then we have a match for this encoding.
if (lengthBOM != 0) {
// We know we have a perfect match for this encoding so ditch the rest and return just this one
possibleEncodings.clear();
possibleEncodings.add(encoding);
return possibleEncodings;
}
// This is a possible match.
possibleEncodings.add(encoding);
} catch (UnsupportedEncodingException uee) {
log.error("The encoding [" + encoding + "] is not supported by your JVM.");
} catch (Exception e) {
// Log the error but carry on with the next encoding
log.error(e.getLocalizedMessage(), e);
}
}
return possibleEncodings;
}
/**
* Get a Collection containing the entries that appear in both the supported encodings
* and the passed in String [] of encodings.
*
* This is used by TextMimeDetector to get a valid list of the preferred encodings.
*
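* For example (the values are illustrative only), if the supported encodings contain
* UTF-8 but not UTF-16BE:
*
* <pre>
* Collection valid = EncodingGuesser.getValidEncodings(new String[] {"UTF-8", "UTF-16BE"});
* // valid contains only "UTF-8"
* </pre>
*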
* @param encodings
* @return a Collection containing all valid encodings contained in the passed in encodings array
*/
public static Collection getValidEncodings(String[] encodings) {
Collection c = new ArrayList();
for (int i = 0; i < encodings.length; i++) {
if (supportedEncodings.contains(encodings[i])) {
c.add(encodings[i]);
}
}
return c;
}
/**
* Get the JVM default canonical encoding. For instance, the canonical name for cp1252 is windows-1252.
*
* @return the default canonical encoding name for the JVM
*/
public static String getDefaultEncoding() {
return EncodingGuesser.defaultJVMEncoding;
}
/**
* Get the Collection of currently supported encodings
*
* @return the supported encodings.
*/
public static Collection getSupportedEncodings() {
return supportedEncodings;
}
/**
* Set the supported encodings
*
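* A typical sketch for an application that only handles a couple of encodings
* (the encoding names shown are just examples):
*
* <pre>
* // Restrict detection to just the encodings the application can handle.
* Collection encodings = new ArrayList();
* encodings.add("UTF-8");
* encodings.add("windows-1252");
* EncodingGuesser.setSupportedEncodings(encodings);
*
* // Passing an empty Collection clears the supported encodings and disables detection.
* EncodingGuesser.setSupportedEncodings(new ArrayList());
* </pre>
*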
* @param encodings If this is null the supported encodings are left unchanged.
* @return a copy of the currently supported encodings
*/
public static Collection setSupportedEncodings(Collection encodings) {
Collection current = new TreeSet();
for (Iterator it = supportedEncodings.iterator(); it.hasNext(); ) {
current.add(it.next());
}
if (encodings != null) {
supportedEncodings.clear();
for (Iterator it = encodings.iterator(); it.hasNext(); ) {
supportedEncodings.add(it.next());
}
}
return current;
}
/**
* Get the length of the BOM for this encoding and byte array.
*
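* For example, an array beginning with 0xEF 0xBB 0xBF returns 3 when checked against
* "UTF-8", and 0 when checked against any encoding with no registered BOM:
*
* <pre>
* byte[] data = new byte[] {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF, (byte) 0x41};
* int length = EncodingGuesser.getLengthBOM("UTF-8", data); // returns 3
* </pre>
*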
* @param encoding
* @param data
* @return length of BOM if the data contains a BOM else returns 0
*/
public static int getLengthBOM(String encoding, byte[] data) {
if (!boms.containsKey(encoding)) {
return 0;
}
byte[] bom = (byte[]) boms.get(encoding);
if (compareByteArrays(bom, 0, data, 0, bom.length)) {
return bom.length;
} else {
return 0;
}
}
/**
* Get a sub array of the given byte array, starting at offset and containing length bytes.
*
* @param a
* @param offset
* @param length
* @return a new byte array containing the requested region, or the original array if the requested region would extend beyond its end
*/
public static byte[] getByteArraySubArray(byte[] a, int offset, int length) {
if ((offset + length > a.length)) {
return a;
}
byte[] data = new byte[length];
for (int i = 0; i < length; i++) {
data[i] = a[offset + i];
}
return data;
}
/**
* Utility method to compare a region of two byte arrays for equality
*
* @param a
* @param aOffset
* @param b
* @param bOffset
* @param length
* @return true if the two regions contain the same byte values, else false
*/
public static boolean compareByteArrays(byte[] a, int aOffset, byte[] b, int bOffset, int length) {
if ((a.length < aOffset + length) || (b.length < bOffset + length)) {
// would match beyond one of the arrays
return false;
}
for (int i = 0; i < length; i++) {
if (a[aOffset + i] != b[bOffset + i]) {
return false;
}
}
return true;
}
}