All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.abdera.i18n.text.io.CharsetSniffingInputStream Maven / Gradle / Ivy

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  The ASF licenses this file to You
* under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.  For additional information regarding
* copyright in this work, please see the NOTICE file in the top level
* directory of this distribution.
*/
package org.apache.abdera.i18n.text.io;

import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;

/**
 * Will attempt to autodetect the character encoding from the stream
 * By default, this will preserve the BOM if it exists
 */
public class CharsetSniffingInputStream 
  extends FilterInputStream {
  
  public static enum Encoding {
    UTF32be(
      "UTF-32",
      true, 
      new byte[] {0x00,0x00,0xFFFFFFFE,0xFFFFFFFF}),
    UTF32le(
      "UTF-32",
      true, 
      new byte[] {0xFFFFFFFF,0xFFFFFFFE,0x00,0x00}),
    INVALID(
      null,
      true,
      new byte[] {0xFFFFFFFE,0xFFFFFFFF,0x00,0x00},
      new byte[] {0x00,0x00,0xFFFFFFFF,0xFFFFFFFE}),
    UTF16be(
      "UTF-16",
      true,
      new byte[] {0xFFFFFFFE,0xFFFFFFFF}),
    UTF16le(
      "UTF-16",
      true,
      new byte[] {0xFFFFFFFF,0xFFFFFFFE}),
    UTF8(
      "UTF-8",
      true,
      new byte[] {0xFFFFFFEF,0xFFFFFFBB,0xFFFFFFBF}),
    UTF32be2(
      "UTF-32be",
      false,
      new byte[] {0x00,0x00,0x00,0x3C}),
    UTF32le2(
      "UTF-32le",
      false,
      new byte[] {0x3C,0x00,0x00,0x00}),
    UTF16be2(
      "UTF-16be",
      false,
      new byte[] {0x00,0x3C,0x00,0x3F}),
    UTF16le2(
      "UTF-16le",
      false,
      new byte[] {0x3C,0x00,0x3F,0x00})
    ;
    
    private final String enc;
    private final byte[][] checks;
    private final boolean bom;
    Encoding(
      String name, 
      boolean bom, 
      byte[]... checks) {
      this.enc = name;
      this.checks = checks;
      this.bom = bom;
    }
    public String getEncoding() {
      return enc;
    }
    public boolean getBom() {
      return bom;
    }
    public int equals(byte[] bom) {
      for (byte[] check : checks) {
        if (CharsetSniffingInputStream.equals(bom, check.length, check)) 
          return check.length;
      }
      return 0;
    }
  }
  
  protected String encoding;
  protected boolean bomset = false;
  protected final boolean preserve;
  
  public CharsetSniffingInputStream(InputStream in) {
    this(in,true);
  }
  
  public CharsetSniffingInputStream(
    InputStream in, 
    boolean preserveBom) {
      super(
        !(in instanceof PeekAheadInputStream) ? 
          new PeekAheadInputStream(in,4) : in);
      this.preserve = preserveBom;
      try {
        encoding = detectEncoding();
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
  }

  public boolean isBomSet() {
    return bomset;
  }
  
  public String getEncoding() {
    return encoding;
  }
  
  protected PeekAheadInputStream getInternal() {
    return (PeekAheadInputStream)in;
  }
  
  private static boolean equals(byte[] a1, int len, byte[] a2) {
    for (int n = 0, i = 0; n < len; n++, i++) {  
      if (a1[n] != a2[i]) return false;
    }
    return true;
  }
  
  protected String detectEncoding() throws IOException {
    PeekAheadInputStream pin = (PeekAheadInputStream) this.in;
    byte[] bom = new byte[4];
    pin.peek(bom);
    bomset = false;
    for (Encoding enc : Encoding.values()) {
      int bomlen = enc.equals(bom);
      if (bomlen > 0) {
        bomset = enc.getBom();
        if (bomset && !preserve) // consume the bom 
          pin.read(new byte[bomlen]);
        return enc.getEncoding();
      }
    }
    return null;
  }
  
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy