org.mozilla.universalchardet.UnicodeBOMInputStream Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of juniversalchardet Show documentation
Show all versions of juniversalchardet Show documentation
JUniversalChardet is a Java encoding detector library
// (‑●‑●)> released under the WTFPL v2 license, by Gregory Pakosz (@gpakosz)
package org.mozilla.universalchardet;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
/**
 * The {@code UnicodeBOMInputStream} class wraps any {@code InputStream} and
 * detects the presence of any Unicode BOM (Byte Order Mark) at its beginning,
 * as defined by RFC 3629 - UTF-8, a transformation format of ISO 10646.
 *
 * <p>
 * The Unicode FAQ defines 5 types of BOMs:
 * <ul>
 * <li>{@code 00 00 FE FF} = UTF-32, big-endian</li>
 * <li>{@code FF FE 00 00} = UTF-32, little-endian</li>
 * <li>{@code FE FF} = UTF-16, big-endian</li>
 * <li>{@code FF FE} = UTF-16, little-endian</li>
 * <li>{@code EF BB BF} = UTF-8</li>
 * </ul>
 *
 * <p>
 * Use the {@link #getBOM()} method to know whether a BOM has been detected
 * or not.
 *
 * <p>
 * Use the {@link #skipBOM()} method to remove the detected BOM from the
 * wrapped {@code InputStream} object.
 *
 * @author Gregory Pakosz
 * @version 1.0
 */
public class UnicodeBOMInputStream extends InputStream {

    /**
     * Type safe enumeration class that describes the different types of Unicode
     * BOMs.
     */
    public static final class BOM {

        /** Raw marker bytes. Callers outside this file should use {@link #getBytes()}. */
        final byte bytes[];

        /** Human readable name returned by {@link #toString()}. */
        private final String description;

        /**
         * NONE.
         */
        public static final BOM NONE = new BOM(new byte[] {}, "NONE");

        /**
         * UTF-8 BOM (EF BB BF).
         */
        public static final BOM UTF_8 = new BOM(new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF }, "UTF-8");

        /**
         * UTF-16, little-endian (FF FE).
         */
        public static final BOM UTF_16_LE = new BOM(new byte[] { (byte) 0xFF, (byte) 0xFE }, "UTF-16 little-endian");

        /**
         * UTF-16, big-endian (FE FF).
         */
        public static final BOM UTF_16_BE = new BOM(new byte[] { (byte) 0xFE, (byte) 0xFF }, "UTF-16 big-endian");

        /**
         * UTF-32, little-endian (FF FE 00 00).
         */
        public static final BOM UTF_32_LE = new BOM(new byte[] { (byte) 0xFF, (byte) 0xFE, (byte) 0x00, (byte) 0x00 },
                "UTF-32 little-endian");

        /**
         * UTF-32, big-endian (00 00 FE FF).
         */
        public static final BOM UTF_32_BE = new BOM(new byte[] { (byte) 0x00, (byte) 0x00, (byte) 0xFE, (byte) 0xFF },
                "UTF-32 big-endian");

        /**
         * Returns a {@code String} representation of this {@code BOM} value.
         *
         * @return the description given at construction, e.g. {@code "UTF-8"}.
         */
        @Override
        public final String toString() {
            return description;
        }

        /**
         * Returns the bytes corresponding to this {@code BOM} value.
         *
         * @return a fresh copy of the marker bytes (empty for {@link #NONE});
         *         modifying it does not affect this {@code BOM}.
         */
        public final byte[] getBytes() {
            // make a defensive copy
            return bytes.clone();
        }

        private BOM(final byte bom[], final String description) {
            assert (bom != null) : "invalid BOM: null is not allowed";
            assert (description != null) : "invalid description: null is not allowed";
            assert (description.length() != 0) : "invalid description: empty string is not allowed";
            this.bytes = bom;
            this.description = description;
        }
    } // BOM

    /** Length of the longest BOM; also the capacity of the pushback buffer. */
    private static final int MAX_BOM_LENGTH = 4;

    /**
     * Detectable BOMs, longest first: UTF-32 LE (FF FE 00 00) starts with the
     * UTF-16 LE marker (FF FE) and therefore has to be tried before it.
     */
    private static final BOM[] CANDIDATES = {
        BOM.UTF_32_LE, BOM.UTF_32_BE, BOM.UTF_8, BOM.UTF_16_LE, BOM.UTF_16_BE
    };

    private final PushbackInputStream in;
    private final BOM bom;

    /**
     * Set once the BOM has been skipped or any byte has been consumed through
     * this stream; from then on {@link #skipBOM()} is a no-op so that it can
     * never discard payload bytes.
     */
    private boolean skipped = false;

    /**
     * Constructs a new {@code UnicodeBOMInputStream} that wraps the specified
     * {@code InputStream}. By default skip BOM bytes.
     *
     * @param inputStream an {@code InputStream}.
     *
     * @throws NullPointerException when {@code inputStream} is {@code null}.
     * @throws IOException on reading from the specified {@code InputStream}
     *         when trying to detect the Unicode BOM.
     */
    public UnicodeBOMInputStream(final InputStream inputStream) throws IOException {
        this(inputStream, true);
    }

    /**
     * Constructs a new {@code UnicodeBOMInputStream} that wraps the specified
     * {@code InputStream}.
     *
     * <p>
     * Detection reads at most 4 bytes from the wrapped stream and pushes them
     * back afterwards. Reading continues across short reads, but only while
     * the bytes seen so far can still turn out to be a (longer) BOM, so the
     * constructor does not wait for more input than it needs to decide.
     *
     * @param inputStream an {@code InputStream}.
     * @param skipIfFound to automatically skip BOM bytes if found
     *
     * @throws NullPointerException when {@code inputStream} is {@code null}.
     * @throws IOException on reading from the specified {@code InputStream}
     *         when trying to detect the Unicode BOM.
     */
    public UnicodeBOMInputStream(final InputStream inputStream,
            boolean skipIfFound) throws IOException {
        if (inputStream == null) {
            throw new NullPointerException(
                    "invalid input stream: null is not allowed");
        }
        in = new PushbackInputStream(inputStream, MAX_BOM_LENGTH);

        final byte[] head = new byte[MAX_BOM_LENGTH];
        final int count = readHead(in, head);
        bom = detect(head, count);

        // Give every inspected byte back; skipBOM() decides what is dropped.
        if (count > 0) {
            in.unread(head, 0, count);
        }
        if (skipIfFound) {
            this.skipBOM();
        }
    }

    /**
     * Fills {@code head} from {@code source}, tolerating short reads.
     *
     * <p>
     * A single {@code read} call may return fewer bytes than requested even
     * though more will follow, so one call is not enough to conclude that no
     * BOM is present. Reading stops when the buffer is full, at end of stream,
     * or as soon as the bytes read so far cannot be the start of a longer BOM.
     *
     * @return the number of bytes stored in {@code head}, 0 at end of stream.
     */
    private static int readHead(final InputStream source, final byte[] head) throws IOException {
        int count = 0;
        do {
            final int n = source.read(head, count, head.length - count);
            if (n <= 0) {
                // -1 is end of stream; 0 would violate the InputStream contract,
                // bail out rather than spin on a misbehaving stream.
                break;
            }
            count += n;
        } while (count < head.length && mayGrowIntoBom(head, count));
        return count;
    }

    /**
     * Tells whether the first {@code count} bytes of {@code head} are a strict
     * prefix of at least one known BOM, i.e. whether reading more bytes could
     * still change the detection result.
     */
    private static boolean mayGrowIntoBom(final byte[] head, final int count) {
        for (int i = 0; i < CANDIDATES.length; i++) {
            final byte[] marker = CANDIDATES[i].bytes;
            if (marker.length > count && sameBytes(head, marker, count)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns the first candidate, in longest-first order, whose marker is
     * fully contained at the start of {@code head}, or {@link BOM#NONE}.
     */
    private static BOM detect(final byte[] head, final int count) {
        for (int i = 0; i < CANDIDATES.length; i++) {
            final byte[] marker = CANDIDATES[i].bytes;
            if (marker.length <= count && sameBytes(head, marker, marker.length)) {
                return CANDIDATES[i];
            }
        }
        return BOM.NONE;
    }

    /** Compares the first {@code length} bytes of both arrays. */
    private static boolean sameBytes(final byte[] left, final byte[] right, final int length) {
        for (int i = 0; i < length; i++) {
            if (left[i] != right[i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Returns the {@code BOM} that was detected in the wrapped
     * {@code InputStream} object.
     *
     * @return a {@code BOM} value; {@link BOM#NONE} when no BOM was found.
     */
    public final BOM getBOM() {
        // BOM type is immutable.
        return bom;
    }

    /**
     * Skips the {@code BOM} that was found in the wrapped {@code InputStream}
     * object. Has no effect when the BOM was already skipped or when bytes
     * have already been consumed through this stream.
     *
     * @return this {@code UnicodeBOMInputStream}.
     *
     * @throws IOException when trying to skip the BOM from the wrapped
     *         {@code InputStream} object.
     */
    public final synchronized UnicodeBOMInputStream skipBOM()
            throws IOException {
        if (!skipped) {
            final long bytesToSkip = bom.bytes.length;
            final long bytesSkipped = in.skip(bytesToSkip);
            // skip() may skip fewer bytes than asked; consume the rest one by one.
            for (long i = bytesSkipped; i < bytesToSkip; i++) {
                in.read();
            }
            skipped = true;
        }
        return this;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public int read() throws IOException {
        // Data has been consumed: a later skipBOM() must not drop anything.
        this.skipped = true;
        return in.read();
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public int read(final byte b[]) throws IOException {
        this.skipped = true;
        return in.read(b, 0, b.length);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public int read(final byte b[], final int off, final int len) throws IOException {
        this.skipped = true;
        return in.read(b, off, len);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public long skip(final long n) throws IOException {
        this.skipped = true;
        return in.skip(n);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public int available() throws IOException {
        return in.available();
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public void close() throws IOException {
        in.close();
    }

    /**
     * {@inheritDoc}
     *
     * <p>
     * Delegates to the internal {@code PushbackInputStream}.
     */
    @Override
    public synchronized void mark(final int readlimit) {
        in.mark(readlimit);
    }

    /**
     * {@inheritDoc}
     *
     * <p>
     * Delegates to the internal {@code PushbackInputStream}.
     */
    @Override
    public synchronized void reset() throws IOException {
        in.reset();
    }

    /**
     * {@inheritDoc}
     *
     * <p>
     * Delegates to the internal {@code PushbackInputStream}.
     */
    @Override
    public boolean markSupported() {
        return in.markSupported();
    }
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy