All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.abdera.i18n.text.io.CharsetSniffingInputStream Maven / Gradle / Ivy

There is a newer version: 2024.11.18751.20241128T090041Z-241100
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  The ASF licenses this file to You
 * under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.  For additional information regarding
 * copyright in this work, please see the NOTICE file in the top level
 * directory of this distribution.
 */
package org.apache.abdera.i18n.text.io;

import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UncheckedIOException;

/**
 * Input stream that attempts to autodetect the character encoding of the wrapped stream by peeking at its
 * leading bytes. By default the byte order mark (BOM), if one is present, is left in the stream.
 *
 * @deprecated This API is deprecated as Apache Abdera is a retired project since 2017.
 */
@Deprecated(since = "2021-07-29")
public class CharsetSniffingInputStream extends FilterInputStream {

    /** Length in bytes of the longest signature declared in {@link Encoding}. */
    private static final int MAX_SIGNATURE_LENGTH = 4;

    /**
     * Leading-byte signatures recognised by {@link #detectEncoding()}.
     *
     * <p>Declaration order is significant: detection walks {@link #values()} and stops at the first match,
     * so every four-byte mark must be declared before the two-byte mark it starts with (for example
     * {@code FF FE 00 00} before {@code FF FE}).
     */
    public enum Encoding {

        /** U+FEFF in big-endian four-byte form. */
        UTF32be("UTF-32", true, new byte[] { 0x00, 0x00, (byte) 0xFE, (byte) 0xFF }),
        /** U+FEFF in little-endian four-byte form. */
        UTF32le("UTF-32", true, new byte[] { (byte) 0xFF, (byte) 0xFE, 0x00, 0x00 }),
        /** U+FEFF in either of the two mixed four-byte orders; no charset name is reported for these. */
        INVALID(null, true, new byte[] { (byte) 0xFE, (byte) 0xFF, 0x00, 0x00 },
            new byte[] { 0x00, 0x00, (byte) 0xFF, (byte) 0xFE }),
        /** U+FEFF in big-endian two-byte form. */
        UTF16be("UTF-16", true, new byte[] { (byte) 0xFE, (byte) 0xFF }),
        /** U+FEFF in little-endian two-byte form. */
        UTF16le("UTF-16", true, new byte[] { (byte) 0xFF, (byte) 0xFE }),
        /** U+FEFF encoded as UTF-8. */
        UTF8("UTF-8", true, new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF }),
        /** No BOM: the character {@code <} as a big-endian four-byte unit. */
        UTF32be2("UTF-32be", false, new byte[] { 0x00, 0x00, 0x00, 0x3C }),
        /** No BOM: the character {@code <} as a little-endian four-byte unit. */
        UTF32le2("UTF-32le", false, new byte[] { 0x3C, 0x00, 0x00, 0x00 }),
        /** No BOM: the characters {@code <?} as big-endian two-byte units. */
        UTF16be2("UTF-16be", false, new byte[] { 0x00, 0x3C, 0x00, 0x3F }),
        /** No BOM: the characters {@code <?} as little-endian two-byte units. */
        UTF16le2("UTF-16le", false, new byte[] { 0x3C, 0x00, 0x3F, 0x00 });

        private final String enc;

        private final byte[][] checks;

        private final boolean bom;

        Encoding(String name, boolean bom, byte[]... checks) {
            this.enc = name;
            this.checks = checks;
            this.bom = bom;
        }

        /**
         * @return the charset name reported for this signature, or {@code null} for {@link #INVALID}
         */
        public String getEncoding() {
            return enc;
        }

        /**
         * @return {@code true} if this signature is a byte order mark, {@code false} if it is ordinary content
         */
        public boolean getBom() {
            return bom;
        }

        /**
         * Tests whether the given bytes start with one of this constant's signatures. Despite its name this
         * overload is unrelated to {@link Object#equals(Object)}.
         *
         * @param bom the leading bytes of a stream; may be {@code null} or shorter than a signature, in which
         *            case the signatures that do not fit are simply not matched
         * @return the length of the first matching signature, or {@code 0} if none matches
         */
        public int equals(byte[] bom) {
            for (byte[] check : checks) {
                if (startsWith(bom, check)) {
                    return check.length;
                }
            }
            return 0;
        }
    }

    /** Charset name found by {@link #detectEncoding()}, or {@code null} if none was recognised. */
    protected String encoding;

    /** {@code true} once a byte order mark has been recognised at the start of the stream. */
    protected boolean bomset = false;

    /** Whether a recognised byte order mark is left in the stream rather than consumed. */
    protected final boolean preserve;

    /**
     * Wraps the stream and detects its encoding, leaving any byte order mark in place.
     *
     * @param in the stream to inspect
     */
    public CharsetSniffingInputStream(InputStream in) {
        this(in, true);
    }

    /**
     * Wraps the stream and immediately detects its encoding. A stream that is not already a
     * {@code PeekAheadInputStream} is wrapped in one first.
     *
     * @param in          the stream to inspect
     * @param preserveBom {@code true} to leave a detected byte order mark in the stream, {@code false} to
     *                    consume it
     * @throws UncheckedIOException if reading the leading bytes fails
     */
    public CharsetSniffingInputStream(InputStream in, boolean preserveBom) {
        super(in instanceof PeekAheadInputStream ? in : new PeekAheadInputStream(in, MAX_SIGNATURE_LENGTH));
        this.preserve = preserveBom;
        try {
            encoding = detectEncoding();
        } catch (IOException e) {
            // Still a RuntimeException for existing callers, but the I/O cause is now explicit in the type.
            throw new UncheckedIOException(e);
        }
    }

    /**
     * @return {@code true} if the stream started with a byte order mark
     */
    public boolean isBomSet() {
        return bomset;
    }

    /**
     * @return the detected charset name, or {@code null} if no known signature was found
     */
    public String getEncoding() {
        return encoding;
    }

    /**
     * @return the underlying stream, cast to the peek-capable type it is guaranteed to have by the constructor
     */
    protected PeekAheadInputStream getInternal() {
        return (PeekAheadInputStream) in;
    }

    /**
     * Tests whether {@code data} begins with every byte of {@code prefix}.
     *
     * @return {@code false} if either array is {@code null} or {@code data} is shorter than {@code prefix}
     */
    private static boolean startsWith(byte[] data, byte[] prefix) {
        if (data == null || prefix == null || data.length < prefix.length) {
            return false;
        }
        for (int i = 0; i < prefix.length; i++) {
            if (data[i] != prefix[i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Reads and throws away up to {@code count} bytes, stopping early only if the stream stops delivering data.
     */
    private static void discard(InputStream stream, int count) throws IOException {
        byte[] sink = new byte[count];
        int remaining = count;
        while (remaining > 0) {
            int read = stream.read(sink, 0, remaining);
            if (read <= 0) {
                break;
            }
            remaining -= read;
        }
    }

    /**
     * Peeks at the first bytes of the stream and matches them against the {@link Encoding} signatures in
     * declaration order. Updates {@link #bomset} and, when a byte order mark is found and BOM preservation is
     * off, consumes the mark.
     *
     * @return the charset name of the first matching signature, or {@code null} if nothing matches (or the
     *         match is {@link Encoding#INVALID})
     * @throws IOException if the underlying stream cannot be read
     */
    protected String detectEncoding() throws IOException {
        PeekAheadInputStream pin = (PeekAheadInputStream) this.in;
        byte[] buffer = new byte[MAX_SIGNATURE_LENGTH];
        // NOTE(review): assumes peek returns the number of bytes placed in the buffer (negative at end of
        // stream), like InputStream.read - confirm against PeekAheadInputStream.
        int peeked = pin.peek(buffer);
        // Compare only the bytes that really came from the stream; otherwise the zero padding of a short
        // stream can complete a longer signature (e.g. "FE FF" + padding would look like FE FF 00 00).
        byte[] head = new byte[Math.min(Math.max(peeked, 0), buffer.length)];
        System.arraycopy(buffer, 0, head, 0, head.length);
        bomset = false;
        for (Encoding candidate : Encoding.values()) {
            int bomlen = candidate.equals(head);
            if (bomlen > 0) {
                bomset = candidate.getBom();
                if (bomset && !preserve) {
                    // consume the bom
                    discard(pin, bomlen);
                }
                return candidate.getEncoding();
            }
        }
        return null;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy