com.dahuatech.hutool.core.io.BOMInputStream Maven / Gradle / Ivy
package com.dahuatech.hutool.core.io;
import com.dahuatech.hutool.core.util.CharsetUtil;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
/**
 * Input stream wrapper that can detect and strip a leading Unicode byte order mark (BOM).
 *
 * <p>Calling {@link #getCharset()} before any byte has been read inspects the first bytes of the
 * stream, reports the charset indicated by the BOM and removes the BOM bytes from the stream.
 * BOM definition: http://www.unicode.org/unicode/faq/utf_bom.html
 *
 * <ul>
 *   <li>00 00 FE FF = UTF-32, big-endian
 *   <li>FF FE 00 00 = UTF-32, little-endian
 *   <li>EF BB BF = UTF-8
 *   <li>FE FF = UTF-16, big-endian
 *   <li>FF FE = UTF-16, little-endian
 * </ul>
 *
 * <p>Usage:
 *
 * <pre>
 * String enc = "UTF-8"; // or NULL to use systemdefault
 * FileInputStream fis = new FileInputStream(file);
 * BOMInputStream uin = new BOMInputStream(fis, enc);
 * enc = uin.getCharset(); // check and skip possible BOM bytes
 * </pre>
 *
 * <p>Reference: http://akini.mbnet.fi/java/unicodereader/UnicodeInputStream.java.txt
 */
public class BOMInputStream extends InputStream {
  /** Length in bytes of the longest supported BOM; also the push-back buffer capacity. */
  private static final int BOM_SIZE = 4;

  /** Wrapped stream; the push-back buffer lets non-BOM bytes be returned after the look-ahead. */
  PushbackInputStream in;

  /** Set once BOM detection has run, or once the stream has been read from or closed. */
  boolean isInited = false;

  /** Charset name reported when no BOM is found. */
  String defaultCharset;

  /** Charset name determined by {@link #init()}; {@code null} until detection has run. */
  String charset;

  // ----------------------------------------------------------------- Constructor start
  /**
   * Wraps the given stream, using {@code CharsetUtil.UTF_8} as the charset reported when no BOM
   * is present.
   *
   * @param in the stream to wrap
   */
  public BOMInputStream(InputStream in) {
    this(in, CharsetUtil.UTF_8);
  }

  /**
   * Wraps the given stream.
   *
   * @param in the stream to wrap
   * @param defaultCharset charset name reported by {@link #getCharset()} when no BOM is present
   */
  public BOMInputStream(InputStream in, String defaultCharset) {
    this.in = new PushbackInputStream(in, BOM_SIZE);
    this.defaultCharset = defaultCharset;
  }
  // ----------------------------------------------------------------- Constructor end

  /**
   * Returns the charset name that is reported when the stream carries no BOM.
   *
   * @return the default charset name passed to the constructor
   */
  public String getDefaultCharset() {
    return defaultCharset;
  }

  /**
   * Returns the charset of the stream, running BOM detection first if it has not run yet.
   *
   * <p>Detection consumes the BOM bytes. If the stream was already read from or closed before
   * the first call, detection is skipped and the stored value (possibly {@code null}) is
   * returned.
   *
   * @return the detected charset name, or the default charset name when no BOM was found
   * @throws IORuntimeException if reading the leading bytes fails
   */
  public String getCharset() {
    if (!isInited) {
      try {
        init();
      } catch (IOException ex) {
        throw new IORuntimeException(ex);
      }
    }
    return charset;
  }

  /**
   * Closes the wrapped stream and disables any later BOM detection.
   *
   * @throws IOException if closing the wrapped stream fails
   */
  @Override
  public void close() throws IOException {
    isInited = true;
    in.close();
  }

  /**
   * Reads the next byte from the wrapped stream.
   *
   * <p>Reading marks the stream as initialised, so BOM detection will not run afterwards; call
   * {@link #getCharset()} first if the BOM should be stripped.
   *
   * @return the next byte, or {@code -1} at end of stream
   * @throws IOException if the wrapped stream fails
   */
  @Override
  public int read() throws IOException {
    isInited = true;
    return in.read();
  }

  /**
   * Read-ahead four bytes and check for BOM marks. Extra bytes are unread back to the stream,
   * only BOM bytes are skipped.
   *
   * <p>Only the bytes actually read take part in the comparison, so a stream shorter than four
   * bytes (for example one holding just {@code FF FE}) is not mistaken for a longer BOM through
   * the zero padding of the look-ahead buffer.
   *
   * @throws IOException if reading from or pushing back to the wrapped stream fails
   */
  protected void init() throws IOException {
    if (isInited) {
      return;
    }

    byte[] bom = new byte[BOM_SIZE];
    // A single read() may return fewer bytes than requested even when more are available,
    // so keep reading until the buffer is full or the stream ends.
    int n = 0;
    while (n < bom.length) {
      int count = in.read(bom, n, bom.length - n);
      if (count <= 0) {
        break;
      }
      n += count;
    }

    // Longer signatures are tested first: the UTF-32LE BOM starts with the UTF-16LE BOM.
    int bomLength;
    if (startsWith(bom, n, 0x00, 0x00, 0xFE, 0xFF)) {
      charset = "UTF-32BE";
      bomLength = 4;
    } else if (startsWith(bom, n, 0xFF, 0xFE, 0x00, 0x00)) {
      charset = "UTF-32LE";
      bomLength = 4;
    } else if (startsWith(bom, n, 0xEF, 0xBB, 0xBF)) {
      charset = "UTF-8";
      bomLength = 3;
    } else if (startsWith(bom, n, 0xFE, 0xFF)) {
      charset = "UTF-16BE";
      bomLength = 2;
    } else if (startsWith(bom, n, 0xFF, 0xFE)) {
      charset = "UTF-16LE";
      bomLength = 2;
    } else {
      // Unicode BOM mark not found, unread all bytes
      charset = defaultCharset;
      bomLength = 0;
    }

    // Push back everything that was read beyond the BOM itself.
    int unread = n - bomLength;
    if (unread > 0) {
      in.unread(bom, bomLength, unread);
    }
    isInited = true;
  }

  /**
   * Tells whether the first {@code available} bytes of {@code buf} begin with the given
   * signature.
   *
   * @param buf look-ahead buffer
   * @param available number of valid bytes in {@code buf}
   * @param signature expected leading bytes, each given as an unsigned value (0-255)
   * @return {@code true} if enough bytes are available and all of them match
   */
  private static boolean startsWith(byte[] buf, int available, int... signature) {
    if (available < signature.length) {
      return false;
    }
    for (int i = 0; i < signature.length; i++) {
      if ((buf[i] & 0xFF) != signature[i]) {
        return false;
      }
    }
    return true;
  }
}