cn.hutool.core.io.BOMInputStream Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of hutool-core Show documentation
Hutool核心，包括集合、字符串、Bean等工具
There is a newer version: 5.8.33
package cn.hutool.core.io;

import cn.hutool.core.util.CharsetUtil;

import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;

/**
 * Input stream wrapper that detects a leading Unicode byte order mark (BOM).
 * Calling {@link #getCharset()} before the first read reports the charset named by the BOM
 * and removes the BOM bytes from the stream.
 *
 * <p>BOM definition: http://www.unicode.org/unicode/faq/utf_bom.html
 *
 * <pre>
 * 00 00 FE FF = UTF-32, big-endian
 * FF FE 00 00 = UTF-32, little-endian
 * EF BB BF    = UTF-8
 * FE FF       = UTF-16, big-endian
 * FF FE       = UTF-16, little-endian
 * </pre>
 *
 * <p>Usage:
 *
 * <pre>
 * String enc = "UTF-8"; // charset to report when no BOM is found
 * FileInputStream fis = new FileInputStream(file);
 * BOMInputStream uin = new BOMInputStream(fis, enc);
 * enc = uin.getCharset(); // check and skip possible BOM bytes
 * </pre>
 *
 * <p>Reference: http://akini.mbnet.fi/java/unicodereader/UnicodeInputStream.java.txt
 *
 * @author looly
 */
public class BOMInputStream extends InputStream {

	/** Wrapped stream; the pushback buffer lets non-BOM look-ahead bytes be returned to it. */
	private final PushbackInputStream in;
	/** Set once detection has run, or once the stream has been read or closed. */
	private boolean isInited = false;
	/** Charset name reported when no BOM is found. */
	private final String defaultCharset;
	/** Detection result; stays {@code null} until {@link #init()} has completed. */
	private String charset;

	/** Length of the longest recognised BOM, and therefore the look-ahead / pushback size. */
	private static final int BOM_SIZE = 4;

	// ----------------------------------------------------------------- Constructor start

	/**
	 * Creates a stream that reports {@code CharsetUtil.UTF_8} when no BOM is present.
	 *
	 * @param in the stream to wrap
	 */
	public BOMInputStream(InputStream in) {
		this(in, CharsetUtil.UTF_8);
	}

	/**
	 * Creates a stream with an explicit fallback charset.
	 *
	 * @param in the stream to wrap
	 * @param defaultCharset charset name reported when no BOM is present
	 */
	public BOMInputStream(InputStream in, String defaultCharset) {
		this.in = new PushbackInputStream(in, BOM_SIZE);
		this.defaultCharset = defaultCharset;
	}
	// ----------------------------------------------------------------- Constructor end

	/**
	 * Returns the fallback charset supplied at construction.
	 *
	 * @return the default charset name
	 */
	public String getDefaultCharset() {
		return defaultCharset;
	}

	/**
	 * Returns the charset indicated by the BOM, running detection (and skipping the BOM bytes)
	 * on the first call.
	 *
	 * <p>If {@link #read()} or {@link #close()} was called before detection ran, detection is
	 * skipped and this method returns {@code null}.
	 *
	 * @return the detected charset name, or the default charset when the stream has no BOM
	 * @throws IORuntimeException if reading the look-ahead bytes fails
	 */
	public String getCharset() {
		if (!isInited) {
			try {
				init();
			} catch (IOException ex) {
				throw new IORuntimeException(ex);
			}
		}
		return charset;
	}

	/**
	 * Closes the wrapped stream and disables any later BOM detection.
	 *
	 * @throws IOException if closing the wrapped stream fails
	 */
	@Override
	public void close() throws IOException {
		isInited = true;
		in.close();
	}

	/**
	 * Reads the next byte from the wrapped stream.
	 *
	 * <p>Reading disables BOM detection: unless {@link #getCharset()} (or {@link #init()}) ran
	 * first, BOM bytes are returned to the caller like any other data.
	 *
	 * @return the next byte, or {@code -1} at end of stream
	 * @throws IOException if the wrapped stream fails
	 */
	@Override
	public int read() throws IOException {
		isInited = true;
		return in.read();
	}

	/**
	 * Reads ahead up to four bytes and checks them for a BOM.
	 * Only the BOM bytes are consumed; every other look-ahead byte is pushed back.
	 *
	 * @throws IOException if reading from or pushing back to the wrapped stream fails
	 */
	protected void init() throws IOException {
		if (isInited) {
			return;
		}

		final byte[] bom = new byte[BOM_SIZE];
		final int n = readLookAhead(bom);

		// Longer signatures are tested first: FF FE is a prefix of the UTF-32LE mark FF FE 00 00.
		final int bomLength;
		if (startsWith(bom, n, 0x00, 0x00, 0xFE, 0xFF)) {
			charset = "UTF-32BE";
			bomLength = 4;
		} else if (startsWith(bom, n, 0xFF, 0xFE, 0x00, 0x00)) {
			charset = "UTF-32LE";
			bomLength = 4;
		} else if (startsWith(bom, n, 0xEF, 0xBB, 0xBF)) {
			charset = "UTF-8";
			bomLength = 3;
		} else if (startsWith(bom, n, 0xFE, 0xFF)) {
			charset = "UTF-16BE";
			bomLength = 2;
		} else if (startsWith(bom, n, 0xFF, 0xFE)) {
			charset = "UTF-16LE";
			bomLength = 2;
		} else {
			// Unicode BOM mark not found, unread all bytes
			charset = defaultCharset;
			bomLength = 0;
		}

		if (n > bomLength) {
			in.unread(bom, bomLength, n - bomLength);
		}

		isInited = true;
	}

	/**
	 * Fills {@code buffer} from the wrapped stream, repeating the read because a single
	 * {@code read} call may return fewer bytes than requested even before end of stream.
	 *
	 * @param buffer destination for the look-ahead bytes
	 * @return the number of bytes actually stored, {@code 0} for an empty stream
	 * @throws IOException if the wrapped stream fails
	 */
	private int readLookAhead(byte[] buffer) throws IOException {
		int total = 0;
		while (total < buffer.length) {
			final int count = in.read(buffer, total, buffer.length - total);
			if (count <= 0) {
				// -1 is end of stream; 0 is treated the same way so a misbehaving stream cannot spin forever.
				break;
			}
			total += count;
		}
		return total;
	}

	/**
	 * Tells whether the bytes actually read begin with the given signature.
	 * The {@code available} bound keeps the zero padding of a short read from being
	 * mistaken for real {@code 00} bytes.
	 *
	 * @param data look-ahead buffer
	 * @param available number of valid bytes in {@code data}
	 * @param signature expected leading byte values (0x00-0xFF)
	 * @return {@code true} if at least {@code signature.length} bytes were read and they all match
	 */
	private static boolean startsWith(byte[] data, int available, int... signature) {
		if (available < signature.length) {
			return false;
		}
		for (int i = 0; i < signature.length; i++) {
			if (data[i] != (byte) signature[i]) {
				return false;
			}
		}
		return true;
	}
}