jodd.io.UnicodeInputStream Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of jodd-util Show documentation
Jodd Util
There is a newer version: 6.3.0
// Copyright (c) 2003-present, Jodd Team (http://jodd.org)
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.

package jodd.io;

import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

/**
 * Unicode input stream for detecting UTF encodings and reading BOM characters.
 * Detects following BOMs:
 * 
 * UTF-8
 * UTF-16BE
 * UTF-16LE
 * UTF-32BE
 * UTF-32LE
 * 
 */
public class UnicodeInputStream extends InputStream {

	public static final int MAX_BOM_SIZE = 4;

	private final PushbackInputStream internalInputStream;
	private boolean initialized;
	private int BOMSize = -1;
	private Charset encoding;
	private final Charset targetEncoding;

	/**
	 * Creates new unicode stream. It works in two modes: detect mode and read mode.
	 * 
	 * Detect mode is active when target encoding is not specified.
	 * In detect mode, it tries to detect encoding from BOM if exist.
	 * If BOM doesn't exist, encoding is not detected.
	 * 
	 * Read mode is active when target encoding is set. Then this stream reads
	 * optional BOM for given encoding. If BOM doesn't exist, nothing is skipped.
	 */
	public UnicodeInputStream(final InputStream in, final Charset targetEncoding) {
		internalInputStream = new PushbackInputStream(in, MAX_BOM_SIZE);
		this.targetEncoding = targetEncoding;
	}

	/**
	 * Returns detected UTF encoding or {@code null} if no UTF encoding has been detected (i.e. no BOM).
	 * If stream is not read yet, it will be {@link #init() initalized} first.
	 */
	public Charset getDetectedEncoding() {
		if (!initialized) {
			try {
				init();
			} catch (final IOException ioex) {
				throw new IllegalStateException(ioex);
			}
		}
		return encoding;
	}

	public static final byte[] BOM_UTF32_BE = new byte[]{(byte) 0x00, (byte) 0x00, (byte) 0xFE, (byte) 0xFF};
	public static final byte[] BOM_UTF32_LE = new byte[]{(byte) 0xFF, (byte) 0xFE, (byte) 0x00, (byte) 0x00};
	public static final byte[] BOM_UTF8 = new byte[]{(byte) 0xEF, (byte) 0xBB, (byte) 0xBF};
	public static final byte[] BOM_UTF16_BE = new byte[]{(byte) 0xFE, (byte) 0xFF};
	public static final byte[] BOM_UTF16_LE = new byte[]{(byte) 0xFF, (byte) 0xFE};

	/**
	 * Detects and decodes encoding from BOM character.
	 * Reads ahead four bytes and check for BOM marks.
	 * Extra bytes are unread back to the stream, so only
	 * BOM bytes are skipped.
	 */
	protected void init() throws IOException {
		if (initialized) {
			return;
		}

		if (targetEncoding == null) {

			// DETECT MODE

			final byte[] bom = new byte[MAX_BOM_SIZE];
			final int n = internalInputStream.read(bom, 0, bom.length);
			final int unread;

			if ((bom[0] == BOM_UTF32_BE[0]) && (bom[1] == BOM_UTF32_BE[1]) && (bom[2] == BOM_UTF32_BE[2]) && (bom[3] == BOM_UTF32_BE[3])) {
				encoding = Charset.forName("UTF-32BE");
				unread = n - 4;
			} else if ((bom[0] == BOM_UTF32_LE[0]) && (bom[1] == BOM_UTF32_LE[1]) && (bom[2] == BOM_UTF32_LE[2]) && (bom[3] == BOM_UTF32_LE[3])) {
				encoding = Charset.forName("UTF-32LE");
				unread = n - 4;
			} else if ((bom[0] == BOM_UTF8[0]) && (bom[1] == BOM_UTF8[1]) && (bom[2] == BOM_UTF8[2])) {
				encoding = StandardCharsets.UTF_8;
				unread = n - 3;
			} else if ((bom[0] == BOM_UTF16_BE[0]) && (bom[1] == BOM_UTF16_BE[1])) {
				encoding = StandardCharsets.UTF_16BE;
				unread = n - 2;
			} else if ((bom[0] == BOM_UTF16_LE[0]) && (bom[1] == BOM_UTF16_LE[1])) {
				encoding = StandardCharsets.UTF_16LE;
				unread = n - 2;
			} else {
				// BOM not found, unread all bytes
				unread = n;
			}

			BOMSize = MAX_BOM_SIZE - unread;

			if (unread > 0) {
				internalInputStream.unread(bom, (n - unread), unread);
			}
		} else {

			// READ MODE

			byte[] bom = null;

			final String targetEncodingName = targetEncoding.name();

			switch (targetEncodingName) {
				case "UTF-8":
					bom = BOM_UTF8;
					break;
				case "UTF-16LE":
					bom = BOM_UTF16_LE;
					break;
				case "UTF-16BE":
				case "UTF-16":
					bom = BOM_UTF16_BE;
					break;
				case "UTF-32LE":
					bom = BOM_UTF32_LE;
					break;
				case "UTF-32BE":
				case "UTF-32":
					bom = BOM_UTF32_BE;
					break;
				default:
					// no UTF encoding, no BOM
					break;
			}

			if (bom != null) {
				final byte[] fileBom = new byte[bom.length];
				final int n = internalInputStream.read(fileBom, 0, bom.length);

				boolean bomDetected = true;
				for (int i = 0; i < n; i++) {
					if (fileBom[i] != bom[i]) {
						bomDetected = false;
						break;
					}
				}

				if (!bomDetected) {
					internalInputStream.unread(fileBom, 0, fileBom.length);
				}
			}
		}

		initialized = true;
	}

	/**
	 * Closes input stream. If stream was not used, encoding
	 * will be unavailable.
	 */
	@Override
	public void close() throws IOException {
		internalInputStream.close();
	}

	/**
	 * Reads byte from the stream.
	 */
	@Override
	public int read() throws IOException {
		init();
		return internalInputStream.read();
	}

	/**
	 * Returns BOM size in bytes.
	 * Returns -1 if BOM not found.
	 */
	public int getBOMSize() {
		return BOMSize;
	}

}