org.fife.io.UnicodeReader Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of rsyntaxtextarea Show documentation
RSyntaxTextArea is the syntax highlighting text editor for Swing applications. Features include syntax highlighting for 40+ languages, code folding, code completion, regex find and replace, macros, code templates, undo/redo, line numbering and bracket matching.
There is a newer version: 3.5.1
Show newest version
/*
 * 09/23/2004
 *
 * UnicodeReader.java - A reader for Unicode input streams that is capable of
 * discerning which particular encoding is being used via the BOM.
 *
 * This library is distributed under a modified BSD license.  See the included
 * LICENSE file for details.
 */
package org.fife.io;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackInputStream;
import java.io.Reader;


/**
 * A reader capable of identifying Unicode streams by their BOMs.  This class
 * will recognize the following encodings:
 * <ul>
 *   <li>UTF-8</li>
 *   <li>UTF-16LE</li>
 *   <li>UTF-16BE</li>
 *   <li>UTF-32LE</li>
 *   <li>UTF-32BE</li>
 * </ul>
 * If the stream is not found to be any of the above, then a default encoding
 * is used for reading.  The user can specify this default encoding, or a system
 * default will be used.
 *
 * <p>For optimum performance, it is recommended that you wrap all instances of
 * <code>UnicodeReader</code> with a <code>java.io.BufferedReader</code>.
 *
 * <p>This class is mostly ripped off from the workaround in the description of
 * Java Bug 4508058.
 *
 * @author Robert Futrell
 * @version 0.9
 */
@SuppressWarnings({ "checkstyle:magicnumber" })
public class UnicodeReader extends Reader {

	/**
	 * The reader that performs the actual decoding.  It is created by
	 * {@link #init(InputStream, String)} once the BOM has been examined.
	 */
	private InputStreamReader internalIn;

	/**
	 * The encoding being used.  We keep our own instead of using the string
	 * returned by <code>java.io.InputStreamReader</code> since that class
	 * does not return user-friendly names.
	 */
	private String encoding;

	/**
	 * The maximum size of a BOM, in bytes (the UTF-32 BOMs are the longest).
	 */
	private static final int BOM_SIZE = 4;


	/**
	 * This utility constructor is here because you will usually use a
	 * <code>UnicodeReader</code> on files.
	 *
	 * <p>Creates a reader using the encoding specified by the BOM in the file;
	 * if there is no recognized BOM, then a system default encoding is used.
	 *
	 * @param file The path of the file from which you want to read.
	 * @throws IOException If an error occurs when checking for/reading the
	 *         BOM.
	 * @throws SecurityException If a security manager exists and its
	 *         checkRead method denies read access to the file.
	 */
	public UnicodeReader(String file) throws IOException {
		this(new File(file));
	}


	/**
	 * This utility constructor is here because you will usually use a
	 * <code>UnicodeReader</code> on files.
	 *
	 * <p>Creates a reader using the encoding specified by the BOM in the file;
	 * if there is no recognized BOM, then a system default encoding is used.
	 *
	 * @param file The file from which you want to read.
	 * @throws IOException If an error occurs when checking for/reading the
	 *         BOM.
	 * @throws SecurityException If a security manager exists and its
	 *         checkRead method denies read access to the file.
	 */
	public UnicodeReader(File file) throws IOException {
		this(file, null);
	}


	/**
	 * This utility constructor is here because you will usually use a
	 * <code>UnicodeReader</code> on files.
	 *
	 * <p>Creates a reader using the encoding specified by the BOM in the file;
	 * if there is no recognized BOM, then a specified default encoding is
	 * used.
	 *
	 * <p>The file stream is opened by this constructor, so it is also closed
	 * here if examining the BOM or creating the decoder fails; otherwise the
	 * file handle would leak because the caller never receives a reader to
	 * close.
	 *
	 * @param file The file from which you want to read.
	 * @param defaultEncoding The encoding to use if no BOM is found.  If
	 *        this value is <code>null</code>, a system default is used.
	 * @throws IOException If an error occurs when checking for/reading the
	 *         BOM.
	 * @throws SecurityException If a security manager exists and its
	 *         checkRead method denies read access to the file.
	 */
	public UnicodeReader(File file, String defaultEncoding) throws IOException {
		FileInputStream fileIn = new FileInputStream(file);
		boolean initialized = false;
		try {
			init(fileIn, defaultEncoding);
			initialized = true;
		} finally {
			if (!initialized) {
				closeQuietly(fileIn);
			}
		}
	}


	/**
	 * Creates a reader using the encoding specified by the BOM in the stream;
	 * if there is no recognized BOM, then a system default encoding is used.
	 *
	 * @param in The input stream from which to read.
	 * @throws IOException If an error occurs when checking for/reading the
	 *         BOM.
	 */
	public UnicodeReader(InputStream in) throws IOException {
		this(in, null);
	}


	/**
	 * Creates a reader using the encoding specified by the BOM in the stream;
	 * if there is no recognized BOM, then <code>defaultEncoding</code> is
	 * used.
	 *
	 * @param in The input stream from which to read.  It is not closed by
	 *        this constructor if initialization fails.
	 * @param defaultEncoding The encoding to use if no recognized BOM is
	 *        found.  If this value is <code>null</code>, a system default
	 *        is used.
	 * @throws IOException If an error occurs when checking for/reading the
	 *         BOM.
	 */
	public UnicodeReader(InputStream in, String defaultEncoding)
									throws IOException {
		init(in, defaultEncoding);
	}


	/**
	 * Closes this reader, which also closes the wrapped input stream.
	 *
	 * @throws IOException If an I/O error occurs while closing.
	 */
	@Override
	public void close() throws IOException {
		internalIn.close();
	}


	/**
	 * Closes a stream while suppressing any failure, so that the exception
	 * that triggered the cleanup is the one that reaches the caller.
	 *
	 * @param in The stream to close.
	 */
	private static void closeQuietly(InputStream in) {
		try {
			in.close();
		} catch (IOException ignored) {
			// Deliberately ignored: we are already propagating the original
			// failure and a secondary close error must not mask it.
		}
	}


	/**
	 * Returns the encoding being used to read this input stream (i.e., the
	 * encoding of the file).  If a BOM was recognized, then the specific
	 * Unicode type is returned; otherwise, either the default encoding passed
	 * into the constructor or the system default is returned.
	 *
	 * @return The encoding of the stream.
	 */
	public String getEncoding() {
		return encoding;
	}


	/**
	 * Read-ahead four bytes and check for BOM marks. Extra bytes are
	 * unread back to the stream, only BOM bytes are skipped.
	 *
	 * <p>Reading continues until four bytes have been obtained or the end of
	 * the stream is reached, because a single <code>read</code> call is
	 * allowed to return fewer bytes than requested even when more data
	 * follows.
	 *
	 * @param in The input stream from which to read.
	 * @param defaultEncoding The encoding to use if no BOM was recognized.  If
	 *        this value is <code>null</code>, then a system default is used.
	 * @throws IOException If an error occurs when trying to read a BOM.
	 */
	protected void init(InputStream in, String defaultEncoding)
											throws IOException {

		PushbackInputStream tempIn = new PushbackInputStream(in, BOM_SIZE);

		byte[] bom = new byte[BOM_SIZE];
		int n = readBomCandidate(tempIn, bom);

		// Number of leading bytes that belong to the BOM and must be skipped.
		int bomLength;

		// The UTF-32 checks must precede the UTF-16 ones: the UTF-32LE BOM
		// (FF FE 00 00) starts with the UTF-16LE BOM (FF FE).
		if (n >= 4 &&
				(bom[0]==(byte)0x00) && (bom[1]==(byte)0x00) &&
				(bom[2]==(byte)0xFE) && (bom[3]==(byte)0xFF)) {
			encoding = "UTF-32BE";
			bomLength = 4;
		}

		else if (n >= 4 &&
				(bom[0]==(byte)0xFF) && (bom[1]==(byte)0xFE) &&
				(bom[2]==(byte)0x00) && (bom[3]==(byte)0x00)) {
			encoding = "UTF-32LE";
			bomLength = 4;
		}

		else if (n >= 3 &&
				(bom[0]==(byte)0xEF) &&
				(bom[1]==(byte)0xBB) &&
				(bom[2]==(byte)0xBF)) {
			encoding = "UTF-8";
			bomLength = 3;
		}

		else if (n >= 2 &&
				(bom[0]==(byte)0xFE) && (bom[1]==(byte)0xFF)) {
			encoding = "UTF-16BE";
			bomLength = 2;
		}

		else if (n >= 2 &&
				(bom[0]==(byte)0xFF) && (bom[1]==(byte)0xFE)) {
			encoding = "UTF-16LE";
			bomLength = 2;
		}

		else {
			// Unicode BOM mark not found, unread all bytes
			encoding = defaultEncoding;
			bomLength = 0;
		}

		// Push back everything read past the BOM so the decoder sees it.
		if (n > bomLength) {
			tempIn.unread(bom, bomLength, n - bomLength);
		}

		// Use given encoding
		if (encoding == null) {
			internalIn = new InputStreamReader(tempIn);
			encoding = internalIn.getEncoding(); // Get the default.
		}
		else {
			internalIn = new InputStreamReader(tempIn, encoding);
		}

	}


	/**
	 * Read characters into a portion of an array. This method will block until
	 * some input is available, an I/O error occurs, or the end of the stream
	 * is reached.
	 *
	 * @param cbuf The buffer into which to read.
	 * @param off The offset at which to start storing characters.
	 * @param len The maximum number of characters to read.
	 *
	 * @return The number of characters read, or <code>-1</code> if the end
	 *         of the stream has been reached.
	 * @throws IOException If an I/O error occurs.
	 */
	@Override
	public int read(char[] cbuf, int off, int len) throws IOException {
		return internalIn.read(cbuf, off, len);
	}


	/**
	 * Fills <code>buf</code> from the stream, repeating short reads until the
	 * buffer is full or the end of the stream is reached.
	 *
	 * @param in The stream to read from.
	 * @param buf The buffer to fill.
	 * @return The number of bytes actually stored in <code>buf</code>; this is
	 *         <code>0</code> for an empty stream, never negative.
	 * @throws IOException If an I/O error occurs while reading.
	 */
	private static int readBomCandidate(InputStream in, byte[] buf)
											throws IOException {
		int total = 0;
		while (total < buf.length) {
			int count = in.read(buf, total, buf.length - total);
			if (count < 0) {
				break; // End of stream before the buffer was filled.
			}
			total += count;
		}
		return total;
	}


}