// org.apache.velocity.io.UnicodeInputStream Maven / Gradle / Ivy
package org.apache.velocity.io;
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
import org.apache.velocity.util.ExceptionUtils;
/**
* This is an input stream that is unicode BOM aware. This allows you to e.g. read
* Windows Notepad Unicode files as Velocity templates.
*
* It allows you to check the actual encoding of a file by calling {@link #getEncodingFromStream()} on
* the input stream reader.
*
* This class is not thread safe! When more than one thread wants to use an instance of UnicodeInputStream,
* the caller must provide synchronization.
*
* @author Aki Nieminen
* @author Henning P. Schmiedehausen
* @version $Id: UnicodeInputStream.java 685685 2008-08-13 21:43:27Z nbubna $
* @since 1.5
*/
public class UnicodeInputStream
    extends InputStream
{
    /** BOM Marker for UTF 8. See http://www.unicode.org/unicode/faq/utf_bom.html */
    public static final UnicodeBOM UTF8_BOM = new UnicodeBOM("UTF-8", new byte [] { (byte)0xef, (byte)0xbb, (byte)0xbf });

    /** BOM Marker for UTF 16, little endian. See http://www.unicode.org/unicode/faq/utf_bom.html */
    public static final UnicodeBOM UTF16LE_BOM = new UnicodeBOM("UTF-16LE", new byte [] { (byte)0xff, (byte)0xfe });

    /** BOM Marker for UTF 16, big endian. See http://www.unicode.org/unicode/faq/utf_bom.html */
    public static final UnicodeBOM UTF16BE_BOM = new UnicodeBOM("UTF-16BE", new byte [] { (byte)0xfe, (byte)0xff });

    /**
     * BOM Marker for UTF 32, little endian. See http://www.unicode.org/unicode/faq/utf_bom.html
     *
     * TODO: Does Java actually support this? (UTF-32 is not one of the charsets every Java
     * platform is required to provide, so callers should verify the charset is available.)
     */
    public static final UnicodeBOM UTF32LE_BOM = new UnicodeBOM("UTF-32LE", new byte [] { (byte)0xff, (byte)0xfe, (byte)0x00, (byte)0x00 });

    /**
     * BOM Marker for UTF 32, big endian. See http://www.unicode.org/unicode/faq/utf_bom.html
     *
     * TODO: Does Java actually support this? (UTF-32 is not one of the charsets every Java
     * platform is required to provide, so callers should verify the charset is available.)
     */
    public static final UnicodeBOM UTF32BE_BOM = new UnicodeBOM("UTF-32BE", new byte [] { (byte)0x00, (byte)0x00, (byte)0xfe, (byte)0xff });

    /** The maximum amount of bytes to read for a BOM */
    private static final int MAX_BOM_SIZE = 4;

    /** Buffer for BOM reading; holds the bytes consumed from the stream while probing. */
    private final byte [] buf = new byte[MAX_BOM_SIZE];

    /** Buffer pointer: number of valid bytes currently held in {@link #buf}. */
    private int pos = 0;

    /** The stream encoding as read from the BOM or null. */
    private final String encoding;

    /** True if the BOM itself should be skipped and not read. */
    private final boolean skipBOM;

    /** Wrapped stream; the pushback capacity lets probed bytes be returned to the stream. */
    private final PushbackInputStream inputStream;

    /**
     * Creates a new UnicodeInputStream object. Skips a BOM which defines the file encoding.
     *
     * @param inputStream The input stream to use for reading.
     *
     * @throws IllegalStateException When a problem reading the BOM occurred.
     * @throws IOException Declared for backward compatibility with existing callers.
     */
    public UnicodeInputStream(final InputStream inputStream)
        throws IllegalStateException, IOException
    {
        this(inputStream, true);
    }

    /**
     * Creates a new UnicodeInputStream object. The first bytes of the stream are probed
     * for a BOM immediately, during construction.
     *
     * @param inputStream The input stream to use for reading.
     * @param skipBOM If this is set to true, a BOM read from the stream is discarded. This parameter should normally be true.
     *
     * @throws IllegalStateException When a problem reading the BOM occurred; the underlying
     *         {@link IOException} is available as the cause.
     * @throws IOException Declared for backward compatibility with existing callers.
     */
    public UnicodeInputStream(final InputStream inputStream, boolean skipBOM)
        throws IllegalStateException, IOException
    {
        super();

        this.skipBOM = skipBOM;
        this.inputStream = new PushbackInputStream(inputStream, MAX_BOM_SIZE);

        try
        {
            // NOTE: readEncoding() is protected and overridable; it runs before any
            // subclass constructor body has executed.
            this.encoding = readEncoding();
        }
        catch (IOException ioe)
        {
            // Chain the cause with the standard constructor so it is never lost.
            throw new IllegalStateException("Could not read BOM from Stream", ioe);
        }
    }

    /**
     * Returns true if the input stream discards the BOM.
     *
     * @return True if the input stream discards the BOM.
     */
    public boolean isSkipBOM()
    {
        return skipBOM;
    }

    /**
     * Returns the encoding that was determined from the BOM when this stream was constructed.
     *
     * @return The encoding based on the BOM, or null if no BOM was found.
     */
    public String getEncodingFromStream()
    {
        return encoding;
    }

    /**
     * This method gets the encoding from the stream contents if a BOM exists. If no BOM exists, the encoding
     * is undefined. All bytes that were read while probing and that are not part of a skipped BOM
     * are pushed back onto the stream before this method returns.
     *
     * @return The encoding of this streams contents as decided by the BOM or null if no BOM was found.
     *
     * @throws IOException When reading from or pushing back to the underlying stream fails.
     */
    protected String readEncoding()
        throws IOException
    {
        pos = 0;

        UnicodeBOM bom = null;

        // read first byte.
        if (readByte())
        {
            // Build a list of matches
            //
            // 00 00 FE FF --> UTF 32 BE
            // EF BB BF    --> UTF 8
            // FE FF       --> UTF 16 BE
            // FF FE       --> UTF 16 LE
            // FF FE 00 00 --> UTF 32 LE
            switch (buf[0])
            {
            case (byte)0x00: // UTF32 BE
                bom = match(UTF32BE_BOM, null);
                break;
            case (byte)0xef: // UTF8
                bom = match(UTF8_BOM, null);
                break;
            case (byte)0xfe: // UTF16 BE
                bom = match(UTF16BE_BOM, null);
                break;
            case (byte)0xff: // UTF16/32 LE
                bom = match(UTF16LE_BOM, null);

                if (bom != null)
                {
                    // The UTF-32LE BOM starts with the UTF-16LE BOM; try the longer
                    // marker and fall back to UTF-16LE when it does not match.
                    bom = match(UTF32LE_BOM, bom);
                }
                break;

            default:
                bom = null;
                break;
            }
        }

        pushback(bom);

        return (bom != null) ? bom.getEncoding() : null;
    }

    /**
     * Compares the stream start against a BOM, reading further bytes into the buffer as needed.
     *
     * @param matchEncoding The BOM to test for.
     * @param noMatchEncoding The value to return when the BOM does not match or the stream ends early.
     *
     * @return matchEncoding if every BOM byte matched, noMatchEncoding otherwise.
     */
    private UnicodeBOM match(final UnicodeBOM matchEncoding, final UnicodeBOM noMatchEncoding)
        throws IOException
    {
        byte [] bom = matchEncoding.getBytes();

        for (int i = 0; i < bom.length; i++)
        {
            if (pos <= i) // Byte has not yet been read
            {
                if (!readByte())
                {
                    return noMatchEncoding;
                }
            }

            if (bom[i] != buf[i])
            {
                return noMatchEncoding;
            }
        }

        return matchEncoding;
    }

    /**
     * Reads one byte from the stream into the probe buffer.
     *
     * @return false if the end of the stream was reached, true if a byte was stored.
     */
    private boolean readByte()
        throws IOException
    {
        int res = inputStream.read();
        if (res == -1)
        {
            return false;
        }

        // Defensive guard: the buffer has room for MAX_BOM_SIZE bytes only.
        if (pos >= buf.length)
        {
            throw new IOException("BOM read error");
        }

        buf[pos++] = (byte) res;
        return true;
    }

    /**
     * Returns the probed bytes to the stream. When a BOM matched and skipBOM is set,
     * only the bytes following the BOM are pushed back.
     *
     * @param matchBOM The BOM that matched, or null if none did.
     */
    private void pushback(final UnicodeBOM matchBOM)
        throws IOException
    {
        int count = pos; // By default, all bytes are pushed back.
        int start = 0;

        if (matchBOM != null && skipBOM)
        {
            // We have a match (some bytes are part of the BOM)
            // and we want to skip the BOM. Push back only the bytes
            // after the BOM.
            start = matchBOM.getBytes().length;
            count = (pos - start);

            if (count < 0)
            {
                throw new IllegalStateException("Match has more bytes than available!");
            }
        }

        inputStream.unread(buf, start, count);
    }

    /**
     * @see java.io.InputStream#close()
     */
    @Override
    public void close()
        throws IOException
    {
        inputStream.close();
    }

    /**
     * @see java.io.InputStream#available()
     */
    @Override
    public int available()
        throws IOException
    {
        return inputStream.available();
    }

    /**
     * @see java.io.InputStream#mark(int)
     */
    @Override
    public void mark(final int readlimit)
    {
        inputStream.mark(readlimit);
    }

    /**
     * @see java.io.InputStream#markSupported()
     */
    @Override
    public boolean markSupported()
    {
        return inputStream.markSupported();
    }

    /**
     * @see java.io.InputStream#read()
     */
    @Override
    public int read()
        throws IOException
    {
        return inputStream.read();
    }

    /**
     * @see java.io.InputStream#read(byte[])
     */
    @Override
    public int read(final byte [] b)
        throws IOException
    {
        return inputStream.read(b);
    }

    /**
     * @see java.io.InputStream#read(byte[], int, int)
     */
    @Override
    public int read(final byte [] b, final int off, final int len)
        throws IOException
    {
        return inputStream.read(b, off, len);
    }

    /**
     * @see java.io.InputStream#reset()
     */
    @Override
    public void reset()
        throws IOException
    {
        inputStream.reset();
    }

    /**
     * @see java.io.InputStream#skip(long)
     */
    @Override
    public long skip(final long n)
        throws IOException
    {
        return inputStream.skip(n);
    }

    /**
     * Helper class to bundle encoding and BOM marker.
     *
     * @author Henning P. Schmiedehausen
     * @version $Id: UnicodeInputStream.java 685685 2008-08-13 21:43:27Z nbubna $
     */
    static final class UnicodeBOM
    {
        /** Name of the encoding signalled by this BOM. */
        private final String encoding;

        /** The byte sequence of the BOM. */
        private final byte [] bytes;

        private UnicodeBOM(final String encoding, final byte [] bytes)
        {
            this.encoding = encoding;
            this.bytes = bytes;
        }

        /** @return The name of the encoding signalled by this BOM. */
        String getEncoding()
        {
            return encoding;
        }

        /** @return The BOM bytes; this is the internal array, callers must not modify it. */
        byte [] getBytes()
        {
            return bytes;
        }
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy