w3c.css.util.UnicodeInputStream Maven / Gradle / Ivy
package org.w3c.css.util;
/*
* This file is a strict copy of the org.apache.velocity.ioUnicodeInputStream
* source from the https://svn.apache.org/repos/asf/velocity/engine repo, with
* the only modifications being this notice and a change to the package name.
*/
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
/**
* This is an input stream that is unicode BOM aware. This allows you to e.g. read
* Windows Notepad Unicode files as Velocity templates.
*
* It allows you to check the actual encoding of a file by calling {@link #getEncodingFromStream()} on
* the input stream reader.
*
* This class is not thread safe! When more than one thread wants to use an instance of UnicodeInputStream,
* the caller must provide synchronization.
*
* @author Aki Nieminen
* @author Henning P. Schmiedehausen
* @version $Id: UnicodeInputStream.java 1780734 2017-01-28 19:21:08Z cbrisson $
* @since 1.5
*/
public class UnicodeInputStream
extends InputStream {
/**
* BOM Marker for UTF 8. See http://www.unicode.org/unicode/faq/utf_bom.html
*/
public static final UnicodeBOM UTF8_BOM = new UnicodeBOM("UTF-8", new byte[]{(byte) 0xef, (byte) 0xbb, (byte) 0xbf});
/**
* BOM Marker for UTF 16, little endian. See http://www.unicode.org/unicode/faq/utf_bom.html
*/
public static final UnicodeBOM UTF16LE_BOM = new UnicodeBOM("UTF-16LE", new byte[]{(byte) 0xff, (byte) 0xfe});
/**
* BOM Marker for UTF 16, big endian. See http://www.unicode.org/unicode/faq/utf_bom.html
*/
public static final UnicodeBOM UTF16BE_BOM = new UnicodeBOM("UTF-16BE", new byte[]{(byte) 0xfe, (byte) 0xff});
/**
* BOM Marker for UTF 32, little endian. See http://www.unicode.org/unicode/faq/utf_bom.html
*/
public static final UnicodeBOM UTF32LE_BOM = new UnicodeBOM("UTF-32LE", new byte[]{(byte) 0xff, (byte) 0xfe, (byte) 0x00, (byte) 0x00});
/**
* BOM Marker for UTF 32, big endian. See http://www.unicode.org/unicode/faq/utf_bom.html
*/
public static final UnicodeBOM UTF32BE_BOM = new UnicodeBOM("UTF-32BE", new byte[]{(byte) 0x00, (byte) 0x00, (byte) 0xfe, (byte) 0xff});
/**
* The maximum amount of bytes to read for a BOM
*/
private static final int MAX_BOM_SIZE = 4;
/**
* Buffer for BOM reading
*/
private byte[] buf = new byte[MAX_BOM_SIZE];
/**
* Buffer pointer.
*/
private int pos = 0;
/**
* The stream encoding as read from the BOM or null.
*/
private final String encoding;
/**
* True if the BOM itself should be skipped and not read.
*/
private final boolean skipBOM;
private final PushbackInputStream inputStream;
/**
* Creates a new UnicodeInputStream object. Skips a BOM which defines the file encoding.
*
* @param inputStream The input stream to use for reading.
*/
public UnicodeInputStream(final InputStream inputStream)
throws IllegalStateException, IOException {
this(inputStream, true);
}
/**
* Creates a new UnicodeInputStream object.
*
* @param inputStream The input stream to use for reading.
* @param skipBOM If this is set to true, a BOM read from the stream is discarded. This parameter should normally be true.
*/
public UnicodeInputStream(final InputStream inputStream, boolean skipBOM)
throws IllegalStateException, IOException {
super();
this.skipBOM = skipBOM;
this.inputStream = new PushbackInputStream(inputStream, MAX_BOM_SIZE);
try {
this.encoding = readEncoding();
} catch (IOException ioe) {
throw new IllegalStateException("Could not read BOM from Stream", ioe);
}
}
/**
* Returns true if the input stream discards the BOM.
*
* @return True if the input stream discards the BOM.
*/
public boolean isSkipBOM() {
return skipBOM;
}
/**
* Read encoding based on BOM.
*
* @return The encoding based on the BOM.
* @throws IllegalStateException When a problem reading the BOM occured.
*/
public String getEncodingFromStream() {
return encoding;
}
/**
* This method gets the encoding from the stream contents if a BOM exists. If no BOM exists, the encoding
* is undefined.
*
* @return The encoding of this streams contents as decided by the BOM or null if no BOM was found.
*/
protected String readEncoding()
throws IOException {
pos = 0;
UnicodeBOM encoding = null;
// read first byte.
if (readByte()) {
// Build a list of matches
//
// 00 00 FE FF --> UTF 32 BE
// EF BB BF --> UTF 8
// FE FF --> UTF 16 BE
// FF FE --> UTF 16 LE
// FF FE 00 00 --> UTF 32 LE
switch (buf[0]) {
case (byte) 0x00: // UTF32 BE
encoding = match(UTF32BE_BOM, null);
break;
case (byte) 0xef: // UTF8
encoding = match(UTF8_BOM, null);
break;
case (byte) 0xfe: // UTF16 BE
encoding = match(UTF16BE_BOM, null);
break;
case (byte) 0xff: // UTF16/32 LE
encoding = match(UTF16LE_BOM, null);
if (encoding != null) {
encoding = match(UTF32LE_BOM, encoding);
}
break;
default:
encoding = null;
break;
}
}
pushback(encoding);
return (encoding != null) ? encoding.getEncoding() : null;
}
private UnicodeBOM match(final UnicodeBOM matchEncoding, final UnicodeBOM noMatchEncoding)
throws IOException {
byte[] bom = matchEncoding.getBytes();
for (int i = 0; i < bom.length; i++) {
if (pos <= i) // Byte has not yet been read
{
if (!readByte()) {
return noMatchEncoding;
}
}
if (bom[i] != buf[i]) {
return noMatchEncoding;
}
}
return matchEncoding;
}
private boolean readByte()
throws IOException {
int res = inputStream.read();
if (res == -1) {
return false;
}
if (pos >= buf.length) {
throw new IOException("BOM read error");
}
buf[pos++] = (byte) res;
return true;
}
private void pushback(final UnicodeBOM matchBOM)
throws IOException {
int count = pos; // By default, all bytes are pushed back.
int start = 0;
if (matchBOM != null && skipBOM) {
// We have a match (some bytes are part of the BOM)
// and we want to skip the BOM. Push back only the bytes
// after the BOM.
start = matchBOM.getBytes().length;
count = (pos - start);
if (count < 0) {
throw new IllegalStateException("Match has more bytes than available!");
}
}
inputStream.unread(buf, start, count);
}
/**
* @see java.io.InputStream#close()
*/
public void close()
throws IOException {
inputStream.close();
}
/**
* @see java.io.InputStream#available()
*/
public int available()
throws IOException {
return inputStream.available();
}
/**
* @see java.io.InputStream#mark(int)
*/
public void mark(final int readlimit) {
inputStream.mark(readlimit);
}
/**
* @see java.io.InputStream#markSupported()
*/
public boolean markSupported() {
return inputStream.markSupported();
}
/**
* @see java.io.InputStream#read()
*/
public int read()
throws IOException {
return inputStream.read();
}
/**
* @see java.io.InputStream#read(byte[])
*/
public int read(final byte[] b)
throws IOException {
return inputStream.read(b);
}
/**
* @see java.io.InputStream#read(byte[], int, int)
*/
public int read(final byte[] b, final int off, final int len)
throws IOException {
return inputStream.read(b, off, len);
}
/**
* @see java.io.InputStream#reset()
*/
public void reset()
throws IOException {
inputStream.reset();
}
/**
* @see java.io.InputStream#skip(long)
*/
public long skip(final long n)
throws IOException {
return inputStream.skip(n);
}
/**
* Helper function to compare encodings
*/
public static boolean sameEncoding(String left, String right) {
left = left.toUpperCase().replace("-", "").replace("_", "");
right = right.toUpperCase().replace("-", "").replace("_", "");
return left.equals(right);
}
/**
* Helper class to bundle encoding and BOM marker.
*
* @author Henning P. Schmiedehausen
* @version $Id: UnicodeInputStream.java 1780734 2017-01-28 19:21:08Z cbrisson $
*/
static final class UnicodeBOM {
private final String encoding;
private final byte[] bytes;
private UnicodeBOM(final String encoding, final byte[] bytes) {
this.encoding = encoding;
this.bytes = bytes;
}
String getEncoding() {
return encoding;
}
byte[] getBytes() {
return bytes;
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy