All downloads are FREE. The search and download functionality uses the official Maven repository.

com.xiaoleilu.hutool.io.BOMInputStream Maven / Gradle / Ivy

There is a newer version: 3.3.2
Show newest version
package com.xiaoleilu.hutool.io;

import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;

import com.xiaoleilu.hutool.util.CharsetUtil;

/**
 * 读取带BOM头的流内容,getCharset()方法调用后会得到BOM头的编码,且会去除BOM头
* BOM定义:http://www.unicode.org/unicode/faq/utf_bom.html
*
    *
  • 00 00 FE FF = UTF-32, big-endian
  • *
  • FF FE 00 00 = UTF-32, little-endian
  • *
  • EF BB BF = UTF-8
  • *
  • FE FF = UTF-16, big-endian
  • *
  • FF FE = UTF-16, little-endian
  • *
* 使用:
* * String enc = "UTF-8"; // or NULL to use systemdefault
* FileInputStream fis = new FileInputStream(file);
* UnicodeInputStream uin = new UnicodeInputStream(fis, enc);
* enc = uin.getCharset(); // check and skip possible BOM bytes *
*

* 参考: http://akini.mbnet.fi/java/unicodereader/UnicodeInputStream.java.txt */ public class BOMInputStream extends InputStream { PushbackInputStream in; boolean isInited = false; String defaultCharset; String charset; private static final int BOM_SIZE = 4; // ----------------------------------------------------------------- Constructor start public BOMInputStream(InputStream in) { this(in, CharsetUtil.UTF_8); } public BOMInputStream(InputStream in, String defaultCharset) { in = new PushbackInputStream(in, BOM_SIZE); this.defaultCharset = defaultCharset; } // ----------------------------------------------------------------- Constructor end public String getDefaultCharset() { return defaultCharset; } public String getCharset() { if (!isInited) { try { init(); } catch (IOException ex) { throw new IORuntimeException(ex); } } return charset; } @Override public void close() throws IOException { isInited = true; in.close(); } @Override public int read() throws IOException { isInited = true; return in.read(); } /** * Read-ahead four bytes and check for BOM marks.
* Extra bytes are unread back to the stream, only BOM bytes are skipped. * @throws IOException 读取引起的异常 */ protected void init() throws IOException { if (isInited) return; byte bom[] = new byte[BOM_SIZE]; int n, unread; n = in.read(bom, 0, bom.length); if ((bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00) && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)) { charset = "UTF-32BE"; unread = n - 4; } else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE) && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00)) { charset = "UTF-32LE"; unread = n - 4; } else if ((bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB) && (bom[2] == (byte) 0xBF)) { charset = "UTF-8"; unread = n - 3; } else if ((bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) { charset = "UTF-16BE"; unread = n - 2; } else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) { charset = "UTF-16LE"; unread = n - 2; } else { // Unicode BOM mark not found, unread all bytes charset = defaultCharset; unread = n; } // System.out.println("read=" + n + ", unread=" + unread); if (unread > 0) in.unread(bom, (n - unread), unread); isInited = true; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy