src.main.java.com.dd.plist.ByteOrderMarkReader Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of dd-plist Show documentation

This library enables Java applications to work with property lists in various formats. Supported formats for reading and writing are OS X/iOS binary and XML property lists. ASCII property lists are also supported. The library also provides access to basic functions of NeXTSTEP/Cocoa classes like NSDictionary, NSArray, etc.

The newest version!

/*
 * plist - An open source library to parse and generate property lists
 * Copyright (C) 2022 Daniel Dreibrodt
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

package com.dd.plist;

/**
 * Reads Byte Order Marks for various Unicode encodings.
 * <p>
 * Two usage styles are offered: an instance can be fed one byte at a time through
 * {@link #readByte(int)} and queried with {@link #getDetectedCharset()}, or a complete
 * byte array can be inspected in one call with {@link #detect(byte[])}.
 * <p>
 * Note that the two styles report different names: the incremental reader reports the
 * byte-order-specific charset (e.g. "UTF-16LE"), whereas {@link #detect(byte[])} reports
 * the generic one (e.g. "UTF-16").
 * @author Daniel Dreibrodt
 */
class ByteOrderMarkReader {
    /**
     * The known BOM byte sequences, stored as unsigned byte values (0-255).
     * The entries are parallel to {@link #STREAM_CHARSETS} and {@link #ARRAY_CHARSETS}.
     */
    private static final int[][] BOMS = {
        { 0xEF, 0xBB, 0xBF },
        { 0xFE, 0xFF },
        { 0xFF, 0xFE },
        { 0x00, 0x00, 0xFE, 0xFF },
        { 0xFF, 0xFE, 0x00, 0x00 },
    };

    /** Charset names reported by the incremental reader, one per entry of {@link #BOMS}. */
    private static final String[] STREAM_CHARSETS = {
        "UTF-8",
        "UTF-16BE",
        "UTF-16LE",
        "UTF-32BE",
        "UTF-32LE"
    };

    /**
     * Charset names reported by {@link #detect(byte[])}, one per entry of {@link #BOMS}.
     * Both byte orders map to the same generic name.
     * NOTE(review): presumably the generic names are used so that the decoder consumes
     * the BOM itself - confirm against the callers.
     */
    private static final String[] ARRAY_CHARSETS = {
        "UTF-8",
        "UTF-16",
        "UTF-16",
        "UTF-32",
        "UTF-32"
    };

    /** Marks the BOMs that the bytes read so far have ruled out; all false initially. */
    private final boolean[] ruledOut = new boolean[BOMS.length];

    /** Number of bytes that have been passed to {@link #readByte(int)} so far. */
    private int offset;

    /** Charset of the longest BOM that has been read completely so far, or null. */
    private String charset;

    /**
     * Gets the charset that was detected.
     * @return The name of the detected charset, or null if no charset was detected.
     */
    public String getDetectedCharset() {
        return this.charset;
    }

    /**
     * Processes a byte that was read from the input.
     * <p>
     * The byte is compared, at the current position, against every BOM that has not been
     * ruled out yet. BOMs that do not match (or that are already exhausted) are ruled out.
     * When the byte completes a BOM, the corresponding charset becomes the detected charset;
     * a longer BOM that is completed later (UTF-32LE after UTF-16LE) replaces it.
     * @param b The byte to process, as an unsigned value (0-255). Any other value matches no BOM.
     * @return true if the byte matched at least one BOM that was still possible, i.e. the
     *         input so far could potentially be a BOM; otherwise, false.
     */
    public boolean readByte(int b) {
        boolean stillCandidate = false;
        for (int c = 0; c < BOMS.length; c++) {
            if (this.ruledOut[c]) {
                continue;
            }

            int[] bom = BOMS[c];
            if (this.offset < bom.length && bom[this.offset] == b) {
                stillCandidate = true;
                if (this.offset + 1 == bom.length) {
                    // This byte was the last one of the BOM: it has been read completely.
                    this.charset = STREAM_CHARSETS[c];
                }
            }
            else {
                this.ruledOut[c] = true;
            }
        }

        this.offset++;
        return stillCandidate;
    }

    /**
     * Detects the encoding of input data that is available as a complete byte array.
     * <p>
     * The longest BOM that the data starts with decides the result, so data beginning with
     * FF FE 00 00 is reported as "UTF-32" although it also starts with the UTF-16LE BOM.
     * A BOM is also recognized when the data consists of nothing but the BOM.
     * @param bytes The input data.
     * @return The name of the detected charset ("UTF-8", "UTF-16" or "UTF-32"),
     *         or null if no BOM was detected.
     * @throws NullPointerException if bytes is null.
     */
    public static String detect(byte[] bytes) {
        String detected = null;
        int longestMatch = 0;
        for (int c = 0; c < BOMS.length; c++) {
            int[] bom = BOMS[c];
            // Only a strictly longer BOM may override an earlier match (UTF-32LE over UTF-16LE).
            if (bom.length > longestMatch && startsWith(bytes, bom)) {
                longestMatch = bom.length;
                detected = ARRAY_CHARSETS[c];
            }
        }

        return detected;
    }

    /**
     * Checks whether the data starts with the given byte sequence.
     * @param bytes The data to inspect.
     * @param prefix The expected leading bytes, as unsigned values (0-255).
     * @return true if the data is at least as long as the prefix and starts with it.
     */
    private static boolean startsWith(byte[] bytes, int[] prefix) {
        if (bytes.length < prefix.length) {
            return false;
        }

        for (int i = 0; i < prefix.length; i++) {
            // Mask to compare the signed byte against the unsigned table value.
            if ((bytes[i] & 0xFF) != prefix[i]) {
                return false;
            }
        }

        return true;
    }
}