com.sun.xml.dtdparser.XmlReader Maven / Gradle / Ivy
/**
Copyright (c) 2009, Sun Microsystems
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the Sun Microsystems nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package com.sun.xml.dtdparser;
import java.io.ByteArrayInputStream;
import java.io.CharConversionException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackInputStream;
import java.io.Reader;
import java.util.Hashtable;
import java.util.Locale;
// NOTE: Add I18N support to this class when JDK gets the ability to
// defer selection of locale for exception messages ... use the same
// technique for both.
/**
* This handles several XML-related tasks that normal java.io Readers
* don't support, including use of IETF standard encoding names and
* automatic detection of most XML encodings. The former is needed
* for interoperability; the latter is needed to conform with the XML
* spec. This class also optimizes reading some common encodings by
* providing low-overhead unsynchronized Reader support.
*
* Note that the autodetection facility should be used only on
* data streams which have an unknown character encoding. For example,
* it should never be used on MIME text/xml entities.
*
* Note that XML processors are only required to support UTF-8 and
* UTF-16 character encodings. Autodetection permits the underlying Java
* implementation to provide support for many other encodings, such as
* US-ASCII, ISO-8859-5, Shift_JIS, EUC-JP, and ISO-2022-JP.
*
* @author David Brownell
* @author Janet Koenig
* @version 1.3 00/02/24
*/
// package private
final class XmlReader extends Reader {
private static final int MAXPUSHBACK = 512;
private Reader in;
private String assignedEncoding;
private boolean closed;
//
// This class always delegates I/O to a reader, which gets
// its data from the very beginning of the XML text. It needs
// to use a pushback stream since (a) autodetection can read
// partial UTF-8 characters which need to be fully processed,
// (b) the "Unicode" readers swallow characters that they think
// are byte order marks, so tests fail if they don't see the
// real byte order mark.
//
// It's got to do this efficiently: character I/O is solidly on the
// critical path. (So keep buffer length over 2 Kbytes to avoid
// excess buffering. Many URL handlers stuff a BufferedInputStream
// between here and the real data source, and larger buffers keep
// that from slowing you down.)
//
/**
 * Builds a reader over the given byte stream, letting the reader work out
 * the character encoding itself using the autodetection heuristic from the
 * XML 1.0 recommendation.
 *
 * @param in the input stream supplying the raw bytes of the XML text
 * @return a reader that decodes {@code in} with the detected encoding
 * @throws IOException on error, such as unrecognized encoding
 */
public static Reader createReader(InputStream in) throws IOException {
    // Autodetection happens inside the XmlReader constructor.
    final Reader autodetecting = new XmlReader(in);
    return autodetecting;
}
/**
 * Builds a reader that decodes the stream with the named encoding,
 * translating standard encoding names into ones that are understood
 * by Java where necessary.
 *
 * UTF-8, US-ASCII and ISO-8859-1 (matched case-insensitively) are served
 * by the dedicated reader classes; every other name is handed to an
 * {@link InputStreamReader} after being passed through {@code std2java}.
 *
 * @param in the input stream from which the reader is constructed
 * @param encoding the IETF standard name of the encoding to use;
 *        if null, autodetection is used.
 * @return a reader decoding {@code in} with the requested encoding
 * @throws IOException on error, including unrecognized encoding
 */
public static Reader createReader(InputStream in, String encoding)
        throws IOException {
    // No encoding supplied: fall back to autodetection.
    if (encoding == null) {
        return new XmlReader(in);
    }

    final Reader reader;
    if (encoding.equalsIgnoreCase("UTF-8")
            || encoding.equalsIgnoreCase("UTF8")) {
        reader = new Utf8Reader(in);
    } else if (encoding.equalsIgnoreCase("US-ASCII")
            || encoding.equalsIgnoreCase("ASCII")) {
        reader = new AsciiReader(in);
    } else if (encoding.equalsIgnoreCase("ISO-8859-1")) {
        // ISO-8859-1 has numerous aliases that are not recognized here.
        reader = new Iso8859_1Reader(in);
    } else {
        //
        // What we really want is an administerable resource mapping
        // encoding names/aliases to classnames: for example a property
        // file resource such as "readers/mapping.props", together with
        // a set of readers in that (sub)package ... defaulting to this
        // call only if no better choice is available.
        //
        reader = new InputStreamReader(in, std2java(encoding));
    }
    return reader;
}
//
// Lookup table from standard (IANA/IETF) encoding names, spelled in
// upper case, to the encoding names the JDK accepts. It exists because
// the JDK doesn't know all of the standard encoding names, and in
// particular none of the EBCDIC ones IANA defines (and which IBM
// encourages).
//
private static final Hashtable charsets = new Hashtable(31);

static {
    // Each row is { standard name, JDK name }.
    final String[][] aliases = {
        { "UTF-16", "Unicode" },
        { "ISO-10646-UCS-2", "Unicode" },
        // NOTE: no support for ISO-10646-UCS-4 yet.
        { "EBCDIC-CP-US", "cp037" },
        { "EBCDIC-CP-CA", "cp037" },
        { "EBCDIC-CP-NL", "cp037" },
        { "EBCDIC-CP-WT", "cp037" },
        { "EBCDIC-CP-DK", "cp277" },
        { "EBCDIC-CP-NO", "cp277" },
        { "EBCDIC-CP-FI", "cp278" },
        { "EBCDIC-CP-SE", "cp278" },
        { "EBCDIC-CP-IT", "cp280" },
        { "EBCDIC-CP-ES", "cp284" },
        { "EBCDIC-CP-GB", "cp285" },
        { "EBCDIC-CP-FR", "cp297" },
        { "EBCDIC-CP-AR1", "cp420" },
        { "EBCDIC-CP-HE", "cp424" },
        { "EBCDIC-CP-BE", "cp500" },
        { "EBCDIC-CP-CH", "cp500" },
        { "EBCDIC-CP-ROECE", "cp870" },
        { "EBCDIC-CP-YU", "cp870" },
        { "EBCDIC-CP-IS", "cp871" },
        { "EBCDIC-CP-AR2", "cp918" },
        // IANA also defines two that JDK 1.2 doesn't handle:
        //   EBCDIC-CP-GR --> CP423
        //   EBCDIC-CP-TR --> CP905
    };
    for (int i = 0; i < aliases.length; i++) {
        charsets.put(aliases[i][0], aliases[i][1]);
    }
}
// Returns an encoding name supported by JDK >= 1.1.6 for some cases
// required by the XML spec: the given standard name is upper-cased and
// looked up in the charsets table, and is returned unchanged when the
// table has no entry for it.
//
// The upper-casing uses a fixed locale on purpose. The no-argument
// String.toUpperCase() follows the JVM's default locale, and under
// Turkish/Azeri rules 'i' maps to a dotted capital I (U+0130), so a
// name such as "iso-10646-ucs-2" would silently miss its table entry.
private static String std2java(String encoding) {
    String key = encoding.toUpperCase(Locale.ENGLISH);
    String mapped = (String) charsets.get(key);
    return mapped != null ? mapped : encoding;
}
/**
 * Reports the standard name of the encoding in use, as recorded in the
 * {@code assignedEncoding} field.
 *
 * NOTE(review): the field is assigned outside this method; whether it
 * can still be null when this is called should be confirmed.
 *
 * @return the value of {@code assignedEncoding}
 */
public String getEncoding() {
    return this.assignedEncoding;
}
private XmlReader(InputStream stream) throws IOException {
super(stream);
PushbackInputStream pb;
byte buf [];
int len;
if (stream instanceof PushbackInputStream)
pb = (PushbackInputStream) stream;
else
pb = new PushbackInputStream(stream, MAXPUSHBACK);
//
// See if we can figure out the character encoding used
// in this file by peeking at the first few bytes.
//
buf = new byte[4];
len = pb.read(buf);
if (len > 0)
pb.unread(buf, 0, len);
if (len == 4)
switch (buf[0] & 0x0ff) {
case 0:
// 00 3c 00 3f == illegal UTF-16 big-endian
if (buf[1] == 0x3c && buf[2] == 0x00 && buf[3] == 0x3f) {
setEncoding(pb, "UnicodeBig");
return;
}
// else it's probably UCS-4
break;
case '<': // 0x3c: the most common cases!
switch (buf[1] & 0x0ff) {
// First character is '<'; could be XML without
// an XML directive such as "", "