// com.dahuatech.hutool.core.io.BOMInputStream Maven / Gradle / Ivy
//
// Go to download
package com.dahuatech.hutool.core.io;

import com.dahuatech.hutool.core.util.CharsetUtil;

import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;

/**
 * Input stream wrapper that detects a leading Unicode byte order mark (BOM).
 *
 * <p>Calling {@link #getCharset()} before any byte has been read inspects the first bytes of the
 * stream, reports the charset named by the BOM and removes the BOM bytes from the stream. When no
 * BOM is present, all inspected bytes are pushed back and the default charset is reported.
 *
 * <p>BOM definition: http://www.unicode.org/unicode/faq/utf_bom.html
 *
 * <pre>
 *   00 00 FE FF = UTF-32, big-endian
 *   FF FE 00 00 = UTF-32, little-endian
 *   EF BB BF    = UTF-8
 *   FE FF       = UTF-16, big-endian
 *   FF FE       = UTF-16, little-endian
 * </pre>
 *
 * <p>Usage:
 *
 * <pre>
 * String enc = "UTF-8"; // charset reported when no BOM is found
 * FileInputStream fis = new FileInputStream(file);
 * BOMInputStream uin = new BOMInputStream(fis, enc);
 * enc = uin.getCharset(); // check and skip possible BOM bytes
 * </pre>
 *
 * <p>Note: {@link #read()} and {@link #close()} mark the stream as initialized without looking
 * for a BOM. If either is called before {@link #getCharset()}, the BOM (if any) is not skipped and
 * {@link #getCharset()} returns {@code null}.
 *
 * <p>Reference: http://akini.mbnet.fi/java/unicodereader/UnicodeInputStream.java.txt
 */
public class BOMInputStream extends InputStream {
  /** Number of bytes read ahead; equals the length of the longest supported BOM (UTF-32). */
  private static final int BOM_SIZE = 4;
  /** Wrapped stream; the push-back buffer holds the non-BOM bytes read ahead by {@link #init()}. */
  PushbackInputStream in;
  /** Set once BOM detection has run, or once the stream has been read from or closed. */
  boolean isInited = false;
  /** Charset name reported when the stream does not start with a known BOM. */
  String defaultCharset;
  /** Charset name determined by {@link #init()}; {@code null} until detection has run. */
  String charset;

  // ----------------------------------------------------------------- Constructor start
  /**
   * Wraps the given stream, using {@code CharsetUtil.UTF_8} as the default charset.
   *
   * @param in the stream to inspect
   */
  public BOMInputStream(InputStream in) {
    this(in, CharsetUtil.UTF_8);
  }

  /**
   * Wraps the given stream.
   *
   * @param in the stream to inspect
   * @param defaultCharset charset name reported by {@link #getCharset()} when no BOM is found
   */
  public BOMInputStream(InputStream in, String defaultCharset) {
    this.in = new PushbackInputStream(in, BOM_SIZE);
    this.defaultCharset = defaultCharset;
  }
  // ----------------------------------------------------------------- Constructor end

  /**
   * Returns the default charset name passed to the constructor.
   *
   * @return the default charset name
   */
  public String getDefaultCharset() {
    return defaultCharset;
  }

  /**
   * Returns the charset of the stream, running BOM detection first if it has not run yet.
   *
   * <p>Detection consumes the BOM bytes, so subsequent reads start after the BOM.
   *
   * @return the charset named by the BOM, the default charset when there is no BOM, or {@code
   *     null} when the stream was already read from or closed before detection ran
   * @throws IORuntimeException if reading the leading bytes fails
   */
  public String getCharset() {
    if (!isInited) {
      try {
        init();
      } catch (IOException ex) {
        throw new IORuntimeException(ex);
      }
    }
    return charset;
  }

  /**
   * Closes the wrapped stream. BOM detection can no longer run afterwards.
   *
   * @throws IOException if closing the wrapped stream fails
   */
  @Override
  public void close() throws IOException {
    isInited = true;
    in.close();
  }

  /**
   * Reads the next byte from the wrapped stream.
   *
   * <p>This marks the stream as initialized: if {@link #getCharset()} was not called beforehand,
   * the BOM is not skipped and is returned like any other data.
   *
   * @return the next byte, or {@code -1} at end of stream
   * @throws IOException if the read fails
   */
  @Override
  public int read() throws IOException {
    isInited = true;
    return in.read();
  }

  /**
   * Reads ahead up to four bytes and checks for BOM marks.
   *
   * <p>Only BOM bytes are skipped; any extra bytes read ahead are pushed back onto the stream.
   * Does nothing when the stream is already initialized.
   *
   * @throws IOException if reading from or pushing back to the stream fails
   */
  protected void init() throws IOException {
    if (isInited) {
      return;
    }

    final byte[] bom = new byte[BOM_SIZE];

    // A single read() may legally return fewer bytes than requested, so keep reading until the
    // look-ahead buffer is full or the stream ends. n is the number of valid bytes in bom.
    int n = 0;
    while (n < bom.length) {
      final int count = in.read(bom, n, bom.length - n);
      if (count <= 0) {
        break;
      }
      n += count;
    }

    // Every pattern is only accepted when that many bytes were really read. Without the length
    // guard the zero-filled tail of the buffer makes a short stream "FF FE" look like the
    // UTF-32LE mark "FF FE 00 00".
    final int bomLength;
    if (n >= 4
        && (bom[0] == (byte) 0x00)
        && (bom[1] == (byte) 0x00)
        && (bom[2] == (byte) 0xFE)
        && (bom[3] == (byte) 0xFF)) {
      charset = "UTF-32BE";
      bomLength = 4;
    } else if (n >= 4
        && (bom[0] == (byte) 0xFF)
        && (bom[1] == (byte) 0xFE)
        && (bom[2] == (byte) 0x00)
        && (bom[3] == (byte) 0x00)) {
      // Must be tested before UTF-16LE, whose mark is a prefix of this one.
      charset = "UTF-32LE";
      bomLength = 4;
    } else if (n >= 3
        && (bom[0] == (byte) 0xEF)
        && (bom[1] == (byte) 0xBB)
        && (bom[2] == (byte) 0xBF)) {
      charset = "UTF-8";
      bomLength = 3;
    } else if (n >= 2 && (bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) {
      charset = "UTF-16BE";
      bomLength = 2;
    } else if (n >= 2 && (bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) {
      charset = "UTF-16LE";
      bomLength = 2;
    } else {
      // Unicode BOM mark not found, unread all bytes
      charset = defaultCharset;
      bomLength = 0;
    }

    // Push back everything that was read ahead but is not part of the BOM.
    final int unread = n - bomLength;
    if (unread > 0) {
      in.unread(bom, bomLength, unread);
    }

    isInited = true;
  }
}