// All downloads are free. Search and download functionality uses the official Maven repository.
//
// panda.io.stream.BOMInputStream (Maven / Gradle / Ivy)

package panda.io.stream;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;

import panda.io.ByteOrderMark;

/**
 * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first
 * bytes. This class detects these bytes and, if required, can automatically skip them and return
 * the subsequent byte as the first byte in the stream. The {@link ByteOrderMark} implementation has
 * the following pre-defined BOMs:
 * 
    *
  • UTF-8 - {@link ByteOrderMark#UTF_8}
  • *
  • UTF-16BE - {@link ByteOrderMark#UTF_16LE}
  • *
  • UTF-16LE - {@link ByteOrderMark#UTF_16BE}
  • *
  • UTF-32BE - {@link ByteOrderMark#UTF_32LE}
  • *
  • UTF-32LE - {@link ByteOrderMark#UTF_32BE}
  • *
*

Example 1 - Detect and exclude a UTF-8 BOM

* *
 * BOMInputStream bomIn = new BOMInputStream(in, ByteOrderMark.UTF_8);
 * if (bomIn.hasBOM()) {
 * 	// has a UTF-8 BOM
 * }
 * 
* *

Example 2 - Detect a UTF-8 BOM (but don't exclude it)

* *
 * boolean include = true;
 * BOMInputStream bomIn = new BOMInputStream(in, include, ByteOrderMark.UTF_8);
 * if (bomIn.hasBOM()) {
 * 	// has a UTF-8 BOM
 * }
 * 
* *

Example 3 - Detect Multiple BOMs

* *
 * BOMInputStream bomIn = new BOMInputStream(in, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE,
 * 	ByteOrderMark.UTF_32BE);
 * if (bomIn.hasBOM() == false) {
 * 	// No BOM found
 * }
 * else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
 * 	// has a UTF-16LE BOM
 * }
 * else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
 * 	// has a UTF-16BE BOM
 * }
 * else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) {
 * 	// has a UTF-32LE BOM
 * }
 * else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) {
 * 	// has a UTF-32BE BOM
 * }
 * 
* * @see Wikipedia - Byte Order Mark */ public class BOMInputStream extends ProxyInputStream { private final boolean include; /** * BOMs are sorted from longest to shortest. */ private final List boms; private ByteOrderMark byteOrderMark; private int[] firstBytes; private int fbLength; private int fbIndex; private int markFbIndex; private boolean markedAtStart; /** * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM. * * @param delegate the InputStream to delegate to */ public BOMInputStream(final InputStream delegate) { this(delegate, false, ByteOrderMark.ALL); } /** * Constructs a new BOM InputStream that detects a a {@link ByteOrderMark#UTF_8} and optionally * includes it. * * @param delegate the InputStream to delegate to * @param include true to include the UTF-8 BOM or false to exclude it */ public BOMInputStream(final InputStream delegate, final boolean include) { this(delegate, include, ByteOrderMark.ALL); } /** * Constructs a new BOM InputStream that excludes the specified BOMs. * * @param delegate the InputStream to delegate to * @param boms The BOMs to detect and exclude */ public BOMInputStream(final InputStream delegate, final ByteOrderMark... boms) { this(delegate, false, boms); } /** * Compares ByteOrderMark objects in descending length order. */ private static final Comparator ByteOrderMarkLengthComparator = new Comparator() { public int compare(final ByteOrderMark bom1, final ByteOrderMark bom2) { final int len1 = bom1.length(); final int len2 = bom2.length(); if (len1 > len2) { return -1; } if (len2 > len1) { return 1; } return 0; } }; /** * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes * them. * * @param delegate the InputStream to delegate to * @param include true to include the specified BOMs or false to exclude them * @param boms The BOMs to detect and optionally exclude */ public BOMInputStream(final InputStream delegate, final boolean include, final ByteOrderMark... 
boms) { super(delegate); if (boms == null || boms.length == 0) { throw new IllegalArgumentException("No BOMs specified"); } this.include = include; // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two // bytes. Arrays.sort(boms, ByteOrderMarkLengthComparator); this.boms = Arrays.asList(boms); } /** * Indicates whether the stream contains one of the specified BOMs. * * @return true if the stream has one of the specified BOMs, otherwise false if it does not * @throws IOException if an error reading the first bytes of the stream occurs */ public boolean hasBOM() throws IOException { return getBOM() != null; } /** * Indicates whether the stream contains the specified BOM. * * @param bom The BOM to check for * @return true if the stream has the specified BOM, otherwise false if it does not * @throws IllegalArgumentException if the BOM is not one the stream is configured to detect * @throws IOException if an error reading the first bytes of the stream occurs */ public boolean hasBOM(final ByteOrderMark bom) throws IOException { if (!boms.contains(bom)) { throw new IllegalArgumentException("Stream not configure to detect " + bom); } return byteOrderMark != null && getBOM().equals(bom); } /** * Return the BOM (Byte Order Mark). 
* * @return The BOM or null if none * @throws IOException if an error reading the first bytes of the stream occurs */ public ByteOrderMark getBOM() throws IOException { if (firstBytes == null) { fbLength = 0; // BOMs are sorted from longest to shortest final int maxBomSize = boms.get(0).length(); firstBytes = new int[maxBomSize]; // Read first maxBomSize bytes for (int i = 0; i < firstBytes.length; i++) { firstBytes[i] = in.read(); fbLength++; if (firstBytes[i] < 0) { break; } } // match BOM in firstBytes byteOrderMark = find(); if (byteOrderMark != null) { if (!include) { if (byteOrderMark.length() < firstBytes.length) { fbIndex = byteOrderMark.length(); } else { fbLength = 0; } } } } return byteOrderMark; } /** * Return the BOM charset Name - {@link ByteOrderMark#getCharsetName()}. * * @return The BOM charset Name or null if no BOM found * @throws IOException if an error reading the first bytes of the stream occurs */ public String getBOMCharsetName() throws IOException { getBOM(); return byteOrderMark == null ? null : byteOrderMark.getCharsetName(); } /** * Return the BOM charset - {@link ByteOrderMark#getCharset()}. * * @return The BOM charset or null if no BOM found * @throws IOException if an error reading the first bytes of the stream occurs */ public Charset getBOMCharset() throws IOException { getBOM(); return byteOrderMark == null ? null : byteOrderMark.getCharset(); } /** * This method reads and either preserves or skips the first bytes in the stream. It behaves * like the single-byte read() method, either returning a valid byte or -1 to * indicate that the initial bytes have been processed already. * * @return the byte read (excluding BOM) or -1 if the end of stream * @throws IOException if an I/O error occurs */ private int readFirstBytes() throws IOException { getBOM(); return fbIndex < fbLength ? firstBytes[fbIndex++] : -1; } /** * Find a BOM with the specified bytes. 
* * @return The matched BOM or null if none matched */ private ByteOrderMark find() { for (final ByteOrderMark bom : boms) { if (matches(bom)) { return bom; } } return null; } /** * Check if the bytes match a BOM. * * @param bom The BOM * @return true if the bytes match the bom, otherwise false */ private boolean matches(final ByteOrderMark bom) { // if (bom.length() != fbLength) { // return false; // } // firstBytes may be bigger than the BOM bytes for (int i = 0; i < bom.length(); i++) { if (bom.get(i) != firstBytes[i]) { return false; } } return true; } // ---------------------------------------------------------------------------- // Implementation of InputStream // ---------------------------------------------------------------------------- /** * Invokes the delegate's read() method, detecting and optionally skipping BOM. * * @return the byte read (excluding BOM) or -1 if the end of stream * @throws IOException if an I/O error occurs */ @Override public int read() throws IOException { final int b = readFirstBytes(); return b >= 0 ? b : in.read(); } /** * Invokes the delegate's read(byte[], int, int) method, detecting and optionally * skipping BOM. * * @param buf the buffer to read the bytes into * @param off The start offset * @param len The number of bytes to read (excluding BOM) * @return the number of bytes read or -1 if the end of stream * @throws IOException if an I/O error occurs */ @Override public int read(final byte[] buf, int off, int len) throws IOException { int firstCount = 0; int b = 0; while (len > 0 && b >= 0) { b = readFirstBytes(); if (b >= 0) { buf[off++] = (byte)(b & 0xFF); len--; firstCount++; } } final int secondCount = in.read(buf, off, len); return secondCount < 0 ? firstCount > 0 ? firstCount : -1 : firstCount + secondCount; } /** * Invokes the delegate's read(byte[]) method, detecting and optionally skipping * BOM. 
* * @param buf the buffer to read the bytes into * @return the number of bytes read (excluding BOM) or -1 if the end of stream * @throws IOException if an I/O error occurs */ @Override public int read(final byte[] buf) throws IOException { return read(buf, 0, buf.length); } /** * Invokes the delegate's mark(int) method. * * @param readlimit read ahead limit */ @Override public synchronized void mark(final int readlimit) { markFbIndex = fbIndex; markedAtStart = firstBytes == null; in.mark(readlimit); } /** * Invokes the delegate's reset() method. * * @throws IOException if an I/O error occurs */ @Override public synchronized void reset() throws IOException { fbIndex = markFbIndex; if (markedAtStart) { firstBytes = null; } in.reset(); } /** * Invokes the delegate's skip(long) method, detecting and optionallyskipping BOM. * * @param n the number of bytes to skip * @return the number of bytes to skipped or -1 if the end of stream * @throws IOException if an I/O error occurs */ @Override public long skip(long n) throws IOException { while (n > 0 && readFirstBytes() >= 0) { n--; } return in.skip(n); } }




// © 2015 - 2024 Weber Informatics LLC | Privacy Policy