All Downloads are FREE. Search and download functionalities are using the official Maven repository.

cn.geektool.core.io.BOMInputStream Maven / Gradle / Ivy

package cn.geektool.core.io;

import cn.geektool.core.util.CharsetUtil;

import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;

/**
 * 读取带BOM头的流内容,{@code getCharset()}方法调用后会得到BOM头的编码,且会去除BOM头
* BOM定义:http://www.unicode.org/unicode/faq/utf_bom.html
*
    *
  • 00 00 FE FF = UTF-32, big-endian
  • *
  • FF FE 00 00 = UTF-32, little-endian
  • *
  • EF BB BF = UTF-8
  • *
  • FE FF = UTF-16, big-endian
  • *
  • FF FE = UTF-16, little-endian
  • *
* 使用:
* * String enc = "UTF-8"; // or NULL to use systemdefault
* FileInputStream fis = new FileInputStream(file);
* BOMInputStream uin = new BOMInputStream(fis, enc);
* enc = uin.getCharset(); // check and skip possible BOM bytes *
*

* 参考: http://akini.mbnet.fi/java/unicodereader/UnicodeInputStream.java.txt */ public class BOMInputStream extends InputStream { private final PushbackInputStream in; private boolean isInited = false; private final String defaultCharset; private String charset; private static final int BOM_SIZE = 4; // ----------------------------------------------------------------- Constructor start public BOMInputStream(InputStream in) { this(in, CharsetUtil.UTF_8); } public BOMInputStream(InputStream in, String defaultCharset) { this.in = new PushbackInputStream(in, BOM_SIZE); this.defaultCharset = defaultCharset; } // ----------------------------------------------------------------- Constructor end /** * 获取默认编码 * * @return 默认编码 */ public String getDefaultCharset() { return defaultCharset; } /** * 获取BOM头中的编码 * * @return 编码 */ public String getCharset() { if (!isInited) { try { init(); } catch (IOException ex) { throw new IORuntimeException(ex); } } return charset; } @Override public void close() throws IOException { isInited = true; in.close(); } @Override public int read() throws IOException { isInited = true; return in.read(); } /** * Read-ahead four bytes and check for BOM marks.
* Extra bytes are unread back to the stream, only BOM bytes are skipped. * @throws IOException 读取引起的异常 */ protected void init() throws IOException { if (isInited) { return; } byte[] bom = new byte[BOM_SIZE]; int n, unread; n = in.read(bom, 0, bom.length); if ((bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00) && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)) { charset = "UTF-32BE"; unread = n - 4; } else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE) && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00)) { charset = "UTF-32LE"; unread = n - 4; } else if ((bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB) && (bom[2] == (byte) 0xBF)) { charset = "UTF-8"; unread = n - 3; } else if ((bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) { charset = "UTF-16BE"; unread = n - 2; } else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) { charset = "UTF-16LE"; unread = n - 2; } else { // Unicode BOM mark not found, unread all bytes charset = defaultCharset; unread = n; } // System.out.println("read=" + n + ", unread=" + unread); if (unread > 0) { in.unread(bom, (n - unread), unread); } isInited = true; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy