// com.sun.xml.dtdparser.XmlReader Maven / Gradle / Ivy
/*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
*
* Copyright (c) 1998-2012 Oracle and/or its affiliates. All rights reserved.
*
* The contents of this file are subject to the terms of either the GNU
* General Public License Version 2 only ("GPL") or the Common Development
* and Distribution License("CDDL") (collectively, the "License"). You
* may not use this file except in compliance with the License. You can
* obtain a copy of the License at
* http://glassfish.java.net/public/CDDL+GPL_1_1.html
* or packager/legal/LICENSE.txt. See the License for the specific
* language governing permissions and limitations under the License.
*
* When distributing the software, include this License Header Notice in each
* file and include the License file at packager/legal/LICENSE.txt.
*
* GPL Classpath Exception:
* Oracle designates this particular file as subject to the "Classpath"
* exception as provided by Oracle in the GPL Version 2 section of the License
* file that accompanied this code.
*
* Modifications:
* If applicable, add the following below the License Header, with the fields
* enclosed by brackets [] replaced by your own identifying information:
* "Portions Copyright [year] [name of copyright owner]"
*
* Contributor(s):
* If you wish your version of this file to be governed by only the CDDL or
* only the GPL Version 2, indicate your decision by adding "[Contributor]
* elects to include this software in this distribution under the [CDDL or GPL
* Version 2] license." If you don't indicate a single choice of license, a
* recipient has the option to distribute your version of this file under
* either the CDDL, the GPL Version 2 or to extend the choice of license to
* its licensees as provided above. However, if you add GPL Version 2 code
* and therefore, elected the GPL Version 2 license, then the option applies
* only if the new code is made subject to such option by the copyright
* holder.
*/
package com.sun.xml.dtdparser;
import java.io.ByteArrayInputStream;
import java.io.CharConversionException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackInputStream;
import java.io.Reader;
import java.util.Hashtable;
import java.util.Locale;
// NOTE: Add I18N support to this class when JDK gets the ability to
// defer selection of locale for exception messages ... use the same
// technique for both.
/**
* This handles several XML-related tasks that normal java.io Readers
* don't support, including use of IETF standard encoding names and
* automatic detection of most XML encodings. The former is needed
* for interoperability; the latter is needed to conform with the XML
* spec. This class also optimizes reading some common encodings by
* providing low-overhead unsynchronized Reader support.
*
* Note that the autodetection facility should be used only on
* data streams which have an unknown character encoding. For example,
* it should never be used on MIME text/xml entities.
*
* Note that XML processors are only required to support UTF-8 and
* UTF-16 character encodings. Autodetection permits the underlying Java
* implementation to provide support for many other encodings, such as
* US-ASCII, ISO-8859-5, Shift_JIS, EUC-JP, and ISO-2022-JP.
*
* @author David Brownell
* @author Janet Koenig
* @version 1.3 00/02/24
*/
// package private
final class XmlReader extends Reader {
// Pushback capacity, in bytes, used when the constructor has to wrap the
// caller's stream in a PushbackInputStream in order to peek at its first bytes.
private static final int MAXPUSHBACK = 512;
// Reader to which this class delegates its character I/O.
private Reader in;
// Encoding name handed back by getEncoding().
private String assignedEncoding;
// NOTE(review): presumably records that this reader has been closed; the code
// that maintains the flag is outside the visible part of the file -- confirm.
private boolean closed;
//
// This class always delegates I/O to a reader, which gets
// its data from the very beginning of the XML text. It needs
// to use a pushback stream since (a) autodetection can read
// partial UTF-8 characters which need to be fully processed,
// (b) the "Unicode" readers swallow characters that they think
// are byte order marks, so tests fail if they don't see the
// real byte order mark.
//
// It's got to do this efficiently: character I/O is solidly on the
// critical path. (So keep buffer length over 2 Kbytes to avoid
// excess buffering. Many URL handlers stuff a BufferedInputStream
// between here and the real data source, and larger buffers keep
// that from slowing you down.)
//
/**
 * Builds a reader over the given input stream, leaving the choice of
 * character encoding to auto-detection (the heuristic specified in the
 * XML 1.0 recommendation).
 *
 * @param in the input stream from which the reader is constructed
 * @return a reader over {@code in}
 * @throws IOException on error, such as unrecognized encoding
 */
public static Reader createReader(InputStream in) throws IOException {
    final Reader autodetecting = new XmlReader(in);
    return autodetecting;
}
/**
 * Creates a reader for a stream whose encoding is named by the caller,
 * translating standard encoding names into ones that are understood by
 * Java where necessary.
 *
 * @param in the input stream from which the reader is constructed
 * @param encoding the IETF standard name of the encoding to use;
 *     if null, auto-detection is used.
 * @return a reader decoding {@code in} with the requested encoding
 * @throws IOException on error, including unrecognized encoding
 */
public static Reader createReader(InputStream in, String encoding)
        throws IOException {
    if (encoding == null) {
        // No name supplied: fall back to auto-detection.
        return new XmlReader(in);
    }

    // A few common encodings are served by dedicated reader classes.
    final boolean utf8 = encoding.equalsIgnoreCase("UTF-8")
            || encoding.equalsIgnoreCase("UTF8");
    if (utf8) {
        return new Utf8Reader(in);
    }

    final boolean ascii = encoding.equalsIgnoreCase("US-ASCII")
            || encoding.equalsIgnoreCase("ASCII");
    if (ascii) {
        return new AsciiReader(in);
    }

    // ISO-8859-1 has numerous aliases; only this one name is matched here.
    if (encoding.equalsIgnoreCase("ISO-8859-1")) {
        return new Iso8859_1Reader(in);
    }

    //
    // What we really want is an administerable resource mapping
    // encoding names/aliases to classnames: for example a property
    // file resource, "readers/mapping.props", together with a set
    // of readers in that (sub)package... defaulting to the call
    // below only if no better choice is available.
    //
    final String javaName = std2java(encoding);
    return new InputStreamReader(in, javaName);
}
//
// JDK doesn't know all of the standard encoding names, and
// in particular none of the EBCDIC ones IANA defines (and
// which IBM encourages).
//
// Maps upper-case standard encoding names to the encoding names that are
// passed on to the JDK; std2java() performs the lookup.
private static final Hashtable charsets = new Hashtable(31);

static {
    // Each row is { standard name, JDK name }.
    final String[][] aliases = {
        {"UTF-16", "Unicode"},
        {"ISO-10646-UCS-2", "Unicode"},
        // NOTE: no support for ISO-10646-UCS-4 yet.

        {"EBCDIC-CP-US", "cp037"},
        {"EBCDIC-CP-CA", "cp037"},
        {"EBCDIC-CP-NL", "cp037"},
        {"EBCDIC-CP-WT", "cp037"},

        {"EBCDIC-CP-DK", "cp277"},
        {"EBCDIC-CP-NO", "cp277"},
        {"EBCDIC-CP-FI", "cp278"},
        {"EBCDIC-CP-SE", "cp278"},

        {"EBCDIC-CP-IT", "cp280"},
        {"EBCDIC-CP-ES", "cp284"},
        {"EBCDIC-CP-GB", "cp285"},
        {"EBCDIC-CP-FR", "cp297"},

        {"EBCDIC-CP-AR1", "cp420"},
        {"EBCDIC-CP-HE", "cp424"},
        {"EBCDIC-CP-BE", "cp500"},
        {"EBCDIC-CP-CH", "cp500"},

        {"EBCDIC-CP-ROECE", "cp870"},
        {"EBCDIC-CP-YU", "cp870"},
        {"EBCDIC-CP-IS", "cp871"},
        {"EBCDIC-CP-AR2", "cp918"},

        // IANA also defines two that JDK 1.2 doesn't handle:
        // EBCDIC-CP-GR --> CP423
        // EBCDIC-CP-TR --> CP905
    };

    for (int i = 0; i < aliases.length; i++) {
        charsets.put(aliases[i][0], aliases[i][1]);
    }
}
// Translates a standard encoding name into the name registered for it in
// the charsets table (covering some cases required by the XML spec, with
// names supported by JDK >= 1.1.6). The lookup key is the upper-cased
// name; when the table has no entry, the caller's string is returned
// unchanged.
private static String std2java(String encoding) {
    final Object javaName = charsets.get(encoding.toUpperCase(Locale.ENGLISH));
    if (javaName == null) {
        return encoding;
    }
    return (String) javaName;
}
/**
 * Reports the standard name of the encoding in use, as recorded in
 * {@code assignedEncoding}.
 *
 * @return the encoding name recorded for this reader
 */
public String getEncoding() {
    return this.assignedEncoding;
}
private XmlReader(InputStream stream) throws IOException {
super(stream);
PushbackInputStream pb;
byte buf [];
int len;
if (stream instanceof PushbackInputStream)
pb = (PushbackInputStream) stream;
else
pb = new PushbackInputStream(stream, MAXPUSHBACK);
//
// See if we can figure out the character encoding used
// in this file by peeking at the first few bytes.
//
buf = new byte[4];
len = pb.read(buf);
if (len > 0)
pb.unread(buf, 0, len);
if (len == 4)
switch (buf[0] & 0x0ff) {
case 0:
// 00 3c 00 3f == illegal UTF-16 big-endian
if (buf[1] == 0x3c && buf[2] == 0x00 && buf[3] == 0x3f) {
setEncoding(pb, "UnicodeBig");
return;
}
// else it's probably UCS-4
break;
case '<': // 0x3c: the most common cases!
switch (buf[1] & 0x0ff) {
// First character is '<'; could be XML without
// an XML directive such as "", "