All Downloads are FREE. Search and download functionalities are using the official Maven repository.

jlibs.core.io.EncodingDetector Maven / Gradle / Ivy

The newest version!
/**
 * Copyright 2015 Santhosh Kumar Tekuri
 *
 * The JLibs authors license this file to you under the Apache License,
 * version 2.0 (the "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at:
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 */

package jlibs.core.io;

import java.nio.ByteBuffer;
import java.util.Arrays;

/**
 * Strategy for guessing the character encoding of the bytes held in a {@link ByteBuffer}.
 * Three ready-made detectors are provided: {@link #DEFAULT}, {@link #XML} and {@link #JSON}.
 *
 * @author Santhosh Kumar T
 */
public interface EncodingDetector{
    /**
     * Guesses the encoding of the content in {@code buffer}.
     *
     * @param buffer the bytes to inspect
     * @return the name of the detected encoding; {@link #DEFAULT} returns {@code null}
     *         when {@code BOM.detect} finds nothing
     */
    public String detect(ByteBuffer buffer);

    /**
     * Detector that relies solely on {@code BOM.detect(buffer)}: it yields the encoding of
     * the BOM that was found, or {@code null} when no BOM was reported.
     */
    public static final EncodingDetector DEFAULT = new EncodingDetector(){
        @Override
        public String detect(ByteBuffer buffer){
            BOM detected = BOM.detect(buffer);
            if(detected==null)
                return null;
            return detected.encoding();
        }
    };

    /**
     * Detector for XML content. It first defers to {@link #DEFAULT}; when that gives no
     * answer and at least four bytes remain, the next four bytes are matched against the
     * signatures below. Anything unrecognised, or a buffer with fewer than four bytes
     * remaining, is reported as {@code IOUtil.UTF_8}.
     */
    // see http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
    public static final EncodingDetector XML = new EncodingDetector(){
        @Override
        public String detect(ByteBuffer buffer){
            String fromBom = DEFAULT.detect(buffer);
            if(fromBom!=null)
                return fromBom;

            if(buffer.remaining()<4)
                return IOUtil.UTF_8.name();

            // Peek at the next four bytes with absolute reads so the position is left
            // untouched, packing them first-byte-highest into a single int.
            int start = buffer.position();
            int head = 0;
            for(int offset=0; offset<4; offset++)
                head = (head<<8) | (buffer.get(start+offset)&0xFF);

            switch(head){
                case 0x3C000000: // 3C 00 00 00
                    return IOUtil.UTF_32LE.name();
                case 0x0000003C: // 00 00 00 3C
                    return IOUtil.UTF_32BE.name();
                case 0x3C003F00: // 3C 00 3F 00
                    return IOUtil.UTF_16LE.name();
                case 0x003C003F: // 00 3C 00 3F
                    return IOUtil.UTF_16BE.name();
                case 0x3C3F786D: // 3C 3F 78 6D, i.e. "<?xm" in ASCII
                    return IOUtil.UTF_8.name();
                case 0x4C6FA794: // 4C 6F A7 94
                    return "Cp037";
                default:
                    return IOUtil.UTF_8.name();
            }
        }
    };

    /**
     * Detector for JSON content. It first defers to {@link #DEFAULT}; when that gives no
     * answer and at least four bytes remain, the pattern of zero and non-zero bytes among
     * the next four bytes selects the encoding. Every other case is reported as
     * {@code IOUtil.UTF_8}.
     */
    // see http://www.ietf.org/rfc/rfc4627 section 3
    // 00 00 00 xx  UTF-32BE
    // 00 xx 00 xx  UTF-16BE
    // xx 00 00 00  UTF-32LE
    // xx 00 xx 00  UTF-16LE
    // xx xx xx xx  UTF-8
    public static final EncodingDetector JSON = new EncodingDetector(){
        @Override
        public String detect(ByteBuffer buffer){
            String fromBom = DEFAULT.detect(buffer);
            if(fromBom!=null)
                return fromBom;

            if(buffer.remaining()<4)
                return IOUtil.UTF_8.name();

            // Build a mask in which bit i is set when the i-th upcoming byte is non-zero.
            // Absolute reads leave the buffer position where it was.
            int start = buffer.position();
            int nonZeroMask = 0;
            for(int offset=3; offset>=0; offset--){
                nonZeroMask <<= 1;
                if(buffer.get(start+offset)!=0x00)
                    nonZeroMask |= 1;
            }

            if(nonZeroMask==0x1)        // xx 00 00 00
                return IOUtil.UTF_32LE.name();
            else if(nonZeroMask==0x5)   // xx 00 xx 00
                return IOUtil.UTF_16LE.name();
            else if(nonZeroMask==0x8)   // 00 00 00 xx
                return IOUtil.UTF_32BE.name();
            else if(nonZeroMask==0xA)   // 00 xx 00 xx
                return IOUtil.UTF_16BE.name();
            else
                return IOUtil.UTF_8.name();
        }
    };
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy