// Source listing of org.fife.io.UnicodeReader, as published in the ui4j-ide
// artifact ("Web Automation for Java"), newest version.
// (Repository page navigation omitted: Maven / Gradle / Ivy snippets, download,
// other versions, documentation.)
/*
 * 09/23/2004
 *
 * UnicodeReader.java - A reader for Unicode input streams that is capable of
 * discerning which particular encoding is being used via the BOM.
 * 
 * This library is distributed under a modified BSD license.  See the included
 * RSyntaxTextArea.License.txt file for details.
 */
package org.fife.io;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.PushbackInputStream;
import java.io.Reader;


/**
 * A reader capable of identifying Unicode streams by their BOMs.  This class
 * will recognize the following encodings:
 * <ul>
 *   <li><code>UTF-8</code></li>
 *   <li><code>UTF-16LE</code></li>
 *   <li><code>UTF-16BE</code></li>
 *   <li><code>UTF-32LE</code></li>
 *   <li><code>UTF-32BE</code></li>
 * </ul>
 * If the stream is not found to be any of the above, then a default encoding
 * is used for reading.  The user can specify this default encoding, or a system
 * default will be used.<p>
 *
 * For optimum performance, it is recommended that you wrap all instances of
 * <code>UnicodeReader</code> with a <code>java.io.BufferedReader</code>.<p>
 *
 * This class is mostly ripped off from the workaround in the description of
 * Java Bug 4508058.
 *
 * @author Robert Futrell
 * @version 0.9
 */
public class UnicodeReader extends Reader {

	/**
	 * The input stream from which we're really reading.
	 */
	private InputStreamReader internalIn = null;

	/**
	 * The encoding being used.  We keep our own instead of using the string
	 * returned by <code>java.io.InputStreamReader</code> since that class
	 * does not return user-friendly names.
	 */
	private String encoding;

	/**
	 * The size of the longest BOM we recognize (the UTF-32 ones), and thus
	 * the number of bytes we read ahead.
	 */
	private static final int BOM_SIZE = 4;


	/**
	 * This utility constructor is here because you will usually use a
	 * <code>UnicodeReader</code> on files.<p>
	 *
	 * Creates a reader using the encoding specified by the BOM in the file;
	 * if there is no recognized BOM, then a system default encoding is used.
	 *
	 * @param file The file from which you want to read.
	 * @throws IOException If an error occurs when checking for/reading the
	 *         BOM.
	 * @throws FileNotFoundException If the file does not exist, is a
	 *         directory, or cannot be opened for reading.
	 * @throws SecurityException If a security manager exists and its
	 *         checkRead method denies read access to the file.
	 */
	public UnicodeReader(String file) throws IOException,
							FileNotFoundException, SecurityException {
		this(new File(file));
	}


	/**
	 * This utility constructor is here because you will usually use a
	 * <code>UnicodeReader</code> on files.<p>
	 *
	 * Creates a reader using the encoding specified by the BOM in the file;
	 * if there is no recognized BOM, then a system default encoding is used.
	 *
	 * @param file The file from which you want to read.
	 * @throws IOException If an error occurs when checking for/reading the
	 *         BOM.
	 * @throws FileNotFoundException If the file does not exist, is a
	 *         directory, or cannot be opened for reading.
	 * @throws SecurityException If a security manager exists and its
	 *         checkRead method denies read access to the file.
	 */
	public UnicodeReader(File file) throws IOException, FileNotFoundException,
									SecurityException {
		this(file, null);
	}


	/**
	 * This utility constructor is here because you will usually use a
	 * <code>UnicodeReader</code> on files.<p>
	 *
	 * Creates a reader using the encoding specified by the BOM in the file;
	 * if there is no recognized BOM, then a specified default encoding is
	 * used.<p>
	 *
	 * The file stream opened here is closed again if initialization fails,
	 * so a failed construction does not leak a file handle.
	 *
	 * @param file The file from which you want to read.
	 * @param defaultEncoding The encoding to use if no BOM is found.  If
	 *        this value is <code>null</code>, a system default is used.
	 * @throws IOException If an error occurs when checking for/reading the
	 *         BOM.
	 * @throws FileNotFoundException If the file does not exist, is a
	 *         directory, or cannot be opened for reading.
	 * @throws SecurityException If a security manager exists and its
	 *         checkRead method denies read access to the file.
	 */
	public UnicodeReader(File file, String defaultEncoding)
						throws IOException, FileNotFoundException,
								SecurityException {
		FileInputStream fin = new FileInputStream(file);
		boolean initialized = false;
		try {
			init(fin, defaultEncoding);
			initialized = true;
		} finally {
			if (!initialized) {
				// We opened this stream, so nobody else can close it if
				// construction fails.
				try {
					fin.close();
				} catch (IOException ignored) {
					// Let the original failure propagate instead.
				}
			}
		}
	}


	/**
	 * Creates a reader using the encoding specified by the BOM in the file;
	 * if there is no recognized BOM, then a system default encoding is used.
	 *
	 * @param in The input stream from which to read.
	 * @throws IOException If an error occurs when checking for/reading the
	 *         BOM.
	 */
	public UnicodeReader(InputStream in) throws IOException {
		this(in, null);
	}


	/**
	 * Creates a reader using the encoding specified by the BOM in the file;
	 * if there is no recognized BOM, then <code>defaultEncoding</code> is
	 * used.
	 *
	 * @param in The input stream from which to read.
	 * @param defaultEncoding The encoding to use if no recognized BOM is
	 *        found.  If this value is <code>null</code>, a system default
	 *        is used.
	 * @throws IOException If an error occurs when checking for/reading the
	 *         BOM.
	 */
	public UnicodeReader(InputStream in, String defaultEncoding)
									throws IOException {
		init(in, defaultEncoding);
	}


	/**
	 * Closes this reader, which closes the underlying input stream.
	 *
	 * @throws IOException If an I/O error occurs.
	 */
	@Override
	public void close() throws IOException {
		internalIn.close();
	}


	/**
	 * Returns the encoding being used to read this input stream (i.e., the
	 * encoding of the file).  If a BOM was recognized, then the specific
	 * Unicode type is returned; otherwise, either the default encoding passed
	 * into the constructor or the system default is returned.
	 *
	 * @return The encoding of the stream.
	 */
	public String getEncoding() {
		return encoding;
	}


	/**
	 * Read-ahead four bytes and check for BOM marks. Extra bytes are
	 * unread back to the stream, only BOM bytes are skipped.
	 *
	 * @param in The input stream from which to read.
	 * @param defaultEncoding The encoding to use if no BOM was recognized.  If
	 *        this value is <code>null</code>, then a system default is used.
	 * @throws IOException If an error occurs when trying to read a BOM, or
	 *         if the encoding to use is not supported.
	 */
	protected void init(InputStream in, String defaultEncoding)
											throws IOException {

		PushbackInputStream tempIn = new PushbackInputStream(in, BOM_SIZE);

		byte[] bom = new byte[BOM_SIZE];
		int n = readAhead(tempIn, bom);
		int bomLength; // Number of leading bytes to swallow as the BOM.

		if (n>=4 && (bom[0]==(byte)0x00) && (bom[1]==(byte)0x00) &&
				(bom[2]==(byte)0xFE) && (bom[3]==(byte)0xFF)) {
			encoding = "UTF-32BE";
			bomLength = 4;
		}

		// Must be tested before UTF-16LE, whose BOM (FF FE) is a prefix of
		// this one.  Requiring 4 real bytes keeps a stream consisting only of
		// a UTF-16LE BOM from being mistaken for UTF-32LE.
		else if (n>=4 && (bom[0]==(byte)0xFF) && (bom[1]==(byte)0xFE) &&
				(bom[2]==(byte)0x00) && (bom[3]==(byte)0x00)) {
			encoding = "UTF-32LE";
			bomLength = 4;
		}

		else if (n>=3 && (bom[0]==(byte)0xEF) &&
				(bom[1]==(byte)0xBB) &&
				(bom[2]==(byte)0xBF)) {
			encoding = "UTF-8";
			bomLength = 3;
		}

		else if (n>=2 && (bom[0]==(byte)0xFE) && (bom[1]==(byte)0xFF)) {
			encoding = "UTF-16BE";
			bomLength = 2;
		}

		else if (n>=2 && (bom[0]==(byte)0xFF) && (bom[1]==(byte)0xFE)) {
			encoding = "UTF-16LE";
			bomLength = 2;
		}

		else {
			// Unicode BOM mark not found, unread all bytes
			encoding = defaultEncoding;
			bomLength = 0;
		}

		// Push back everything we read that was not part of the BOM.
		if (n > bomLength) {
			tempIn.unread(bom, bomLength, n - bomLength);
		}

		// Use given encoding
		if (encoding == null) {
			internalIn = new InputStreamReader(tempIn);
			encoding = internalIn.getEncoding(); // Get the default.
		}
		else {
			internalIn = new InputStreamReader(tempIn, encoding);
		}

	}


	/**
	 * Fills <code>buf</code> from <code>in</code>, reading repeatedly until
	 * the buffer is full or the end of the stream is reached.  A single
	 * <code>read</code> call is allowed to return fewer bytes than requested
	 * even when more are coming, which would otherwise make BOM detection
	 * depend on how the underlying stream happens to chunk its data.
	 *
	 * @param in The stream to read from.
	 * @param buf The buffer to fill.
	 * @return The number of bytes actually stored in <code>buf</code>; this
	 *         is less than <code>buf.length</code> only if the stream ended.
	 * @throws IOException If an I/O error occurs.
	 */
	private static int readAhead(InputStream in, byte[] buf)
											throws IOException {
		int total = 0;
		while (total < buf.length) {
			int count = in.read(buf, total, buf.length - total);
			if (count <= 0) { // EOF (or a misbehaving stream); stop.
				break;
			}
			total += count;
		}
		return total;
	}


	/**
	 * Read characters into a portion of an array. This method will block until
	 * some input is available, an I/O error occurs, or the end of the stream
	 * is reached.
	 *
	 * @param cbuf The buffer into which to read.
	 * @param off The offset at which to start storing characters.
	 * @param len The maximum number of characters to read.
	 *
	 * @return The number of characters read, or <code>-1</code> if the end
	 *         of the stream has been reached.
	 * @throws IOException If an I/O error occurs.
	 */
	@Override
	public int read(char[] cbuf, int off, int len) throws IOException {
		return internalIn.read(cbuf, off, len);
	}


}