// All Downloads are FREE. Search and download functionalities are using the official Maven repository.

// org.mozilla.universalchardet.UnicodeBOMInputStream Maven / Gradle / Ivy

// There is a newer version: 2.5.0
// Show newest version
// (‑●‑●)> released under the WTFPL v2 license, by Gregory Pakosz (@gpakosz)

package org.mozilla.universalchardet;

import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;

/**
 * The {@code UnicodeBOMInputStream} class wraps any {@code InputStream} and
 * detects the presence of any Unicode BOM (Byte Order Mark) at its beginning,
 * as defined by RFC 3629 - UTF-8, a transformation format of ISO 10646.
 *
 * <p>The Unicode FAQ defines 5 types of BOMs:</p>
 * <ul>
 *   <li>{@code 00 00 FE FF} = UTF-32, big-endian</li>
 *   <li>{@code FF FE 00 00} = UTF-32, little-endian</li>
 *   <li>{@code FE FF} = UTF-16, big-endian</li>
 *   <li>{@code FF FE} = UTF-16, little-endian</li>
 *   <li>{@code EF BB BF} = UTF-8</li>
 * </ul>
 *
 * <p>Use the {@link #getBOM()} method to know whether a BOM has been detected
 * or not.</p>
 *
 * <p>Use the {@link #skipBOM()} method to remove the detected BOM from the
 * wrapped {@code InputStream} object.</p>
 *
 * @author Gregory Pakosz
 * @version 1.0
 */
public class UnicodeBOMInputStream extends InputStream {

  /**
   * Type safe enumeration class that describes the different types of Unicode
   * BOMs.
   */
  public static final class BOM {

    /** Raw signature bytes; read by the enclosing stream, never handed out directly. */
    final byte[] bytes;

    private final String description;

    /**
     * NONE.
     */
    public static final BOM NONE = new BOM(new byte[] {}, "NONE");

    /**
     * UTF-8 BOM (EF BB BF).
     */
    public static final BOM UTF_8 = new BOM(
        new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF },
        "UTF-8");

    /**
     * UTF-16, little-endian (FF FE).
     */
    public static final BOM UTF_16_LE = new BOM(
        new byte[] { (byte) 0xFF, (byte) 0xFE },
        "UTF-16 little-endian");

    /**
     * UTF-16, big-endian (FE FF).
     */
    public static final BOM UTF_16_BE = new BOM(
        new byte[] { (byte) 0xFE, (byte) 0xFF },
        "UTF-16 big-endian");

    /**
     * UTF-32, little-endian (FF FE 00 00).
     */
    public static final BOM UTF_32_LE = new BOM(
        new byte[] { (byte) 0xFF, (byte) 0xFE, (byte) 0x00, (byte) 0x00 },
        "UTF-32 little-endian");

    /**
     * UTF-32, big-endian (00 00 FE FF).
     */
    public static final BOM UTF_32_BE = new BOM(
        new byte[] { (byte) 0x00, (byte) 0x00, (byte) 0xFE, (byte) 0xFF },
        "UTF-32 big-endian");

    /**
     * Returns a {@code String} representation of this {@code BOM} value.
     */
    @Override
    public final String toString() {
      return description;
    }

    /**
     * Returns the bytes corresponding to this {@code BOM} value.
     *
     * @return a fresh copy of the signature bytes; mutating it does not
     *         affect this {@code BOM}.
     */
    public final byte[] getBytes() {
      final int length = bytes.length;
      final byte[] result = new byte[length];

      // make a defensive copy
      System.arraycopy(bytes, 0, result, 0, length);

      return result;
    }

    private BOM(final byte[] bom, final String description) {
      assert (bom != null) : "invalid BOM: null is not allowed";
      assert (description != null) : "invalid description: null is not allowed";
      assert (description.length() != 0) : "invalid description: empty string is not allowed";

      this.bytes = bom;
      this.description = description;
    }

  } // BOM

  /** Length of the longest BOM (UTF-32); also the push-back capacity needed. */
  private static final int MAX_BOM_LENGTH = 4;

  /**
   * Signatures to test, longest first: the UTF-32 LE signature begins with the
   * UTF-16 LE one, so the four-byte forms must be tried before the two-byte
   * forms.
   */
  private static final BOM[] CANDIDATES = {
    BOM.UTF_32_LE, BOM.UTF_32_BE, BOM.UTF_8, BOM.UTF_16_LE, BOM.UTF_16_BE
  };

  private final PushbackInputStream in;
  private final BOM bom;
  private boolean skipped = false;

  /**
   * Constructs a new {@code UnicodeBOMInputStream} that wraps the specified
   * {@code InputStream}. By default skip BOM bytes.
   *
   * @param inputStream an {@code InputStream}.
   *
   * @throws NullPointerException when {@code inputStream} is {@code null}.
   * @throws IOException on reading from the specified {@code InputStream}
   *         when trying to detect the Unicode BOM.
   */
  public UnicodeBOMInputStream(final InputStream inputStream) throws IOException {
    this(inputStream, true);
  }

  /**
   * Constructs a new {@code UnicodeBOMInputStream} that wraps the specified
   * {@code InputStream}.
   *
   * @param inputStream an {@code InputStream}.
   * @param skipIfFound to automatically skip BOM bytes if found
   *
   * @throws NullPointerException when {@code inputStream} is {@code null}.
   * @throws IOException on reading from the specified {@code InputStream}
   *         when trying to detect the Unicode BOM.
   */
  public UnicodeBOMInputStream(final InputStream inputStream, boolean skipIfFound) throws IOException {
    if (inputStream == null) {
      throw new NullPointerException("invalid input stream: null is not allowed");
    }

    in = new PushbackInputStream(inputStream, MAX_BOM_LENGTH);

    final byte[] head = new byte[MAX_BOM_LENGTH];
    // A single read() may legally return fewer bytes than requested even when
    // more are coming, so keep reading until the buffer is full or EOF.
    final int count = readHead(in, head);

    this.bom = detect(head, count);

    // Put everything back; the BOM is only dropped on request.
    if (count > 0) {
      in.unread(head, 0, count);
    }

    if (skipIfFound) {
      this.skipBOM();
    }
  }

  /**
   * Fills {@code buffer} from {@code source}, stopping early only at end of
   * stream.
   *
   * @return the number of bytes actually stored in {@code buffer}.
   */
  private static int readHead(final InputStream source, final byte[] buffer) throws IOException {
    int filled = 0;
    while (filled < buffer.length) {
      final int n = source.read(buffer, filled, buffer.length - filled);
      if (n < 0) {
        break;
      }
      if (n == 0) {
        // Guard against a stream that returns 0 for a non-empty request:
        // the single-byte read must block or report EOF, so no busy loop.
        final int single = source.read();
        if (single < 0) {
          break;
        }
        buffer[filled++] = (byte) single;
      } else {
        filled += n;
      }
    }
    return filled;
  }

  /** Returns the BOM whose signature prefixes the first {@code count} bytes of {@code head}. */
  private static BOM detect(final byte[] head, final int count) {
    for (final BOM candidate : CANDIDATES) {
      if (startsWith(head, count, candidate.bytes)) {
        return candidate;
      }
    }
    return BOM.NONE;
  }

  /** Tells whether the first {@code count} bytes of {@code head} begin with {@code signature}. */
  private static boolean startsWith(final byte[] head, final int count, final byte[] signature) {
    if (count < signature.length) {
      return false;
    }
    for (int i = 0; i < signature.length; i++) {
      if (head[i] != signature[i]) {
        return false;
      }
    }
    return true;
  }

  /**
   * Returns the {@code BOM} that was detected in the wrapped
   * {@code InputStream} object.
   *
   * @return a {@code BOM} value.
   */
  public final BOM getBOM() {
    // BOM type is immutable.
    return bom;
  }

  /**
   * Skips the {@code BOM} that was found in the wrapped {@code InputStream}
   * object. Has no effect once the BOM has been skipped or any data has been
   * read or skipped through this stream.
   *
   * @return this {@code UnicodeBOMInputStream}.
   *
   * @throws IOException when trying to skip the BOM from the wrapped
   *         {@code InputStream} object.
   */
  public final synchronized UnicodeBOMInputStream skipBOM() throws IOException {
    if (!skipped) {
      final long bytesToSkip = bom.bytes.length;
      final long bytesSkipped = in.skip(bytesToSkip);
      // skip() may skip fewer bytes than asked; consume the rest one by one.
      for (long i = bytesSkipped; i < bytesToSkip; i++) {
        in.read();
      }
      skipped = true;
    }
    return this;
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public int read() throws IOException {
    // Once data has been consumed, a later skipBOM() must not eat payload bytes.
    this.skipped = true;
    return in.read();
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public int read(final byte[] b) throws IOException {
    this.skipped = true;
    return in.read(b, 0, b.length);
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public int read(final byte[] b, final int off, final int len) throws IOException {
    this.skipped = true;
    return in.read(b, off, len);
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public long skip(final long n) throws IOException {
    this.skipped = true;
    return in.skip(n);
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public int available() throws IOException {
    return in.available();
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public void close() throws IOException {
    in.close();
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public synchronized void mark(final int readlimit) {
    in.mark(readlimit);
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public synchronized void reset() throws IOException {
    in.reset();
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public boolean markSupported() {
    return in.markSupported();
  }

}




// © 2015 - 2024 Weber Informatics LLC | Privacy Policy