All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.sun.xml.dtdparser.XmlReader Maven / Gradle / Ivy

There is a newer version: 4.0.5
Show newest version
/*
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
 *
 * Copyright (c) 1998-2012 Oracle and/or its affiliates. All rights reserved.
 *
 * The contents of this file are subject to the terms of either the GNU
 * General Public License Version 2 only ("GPL") or the Common Development
 * and Distribution License("CDDL") (collectively, the "License").  You
 * may not use this file except in compliance with the License.  You can
 * obtain a copy of the License at
 * http://glassfish.java.net/public/CDDL+GPL_1_1.html
 * or packager/legal/LICENSE.txt.  See the License for the specific
 * language governing permissions and limitations under the License.
 *
 * When distributing the software, include this License Header Notice in each
 * file and include the License file at packager/legal/LICENSE.txt.
 *
 * GPL Classpath Exception:
 * Oracle designates this particular file as subject to the "Classpath"
 * exception as provided by Oracle in the GPL Version 2 section of the License
 * file that accompanied this code.
 *
 * Modifications:
 * If applicable, add the following below the License Header, with the fields
 * enclosed by brackets [] replaced by your own identifying information:
 * "Portions Copyright [year] [name of copyright owner]"
 *
 * Contributor(s):
 * If you wish your version of this file to be governed by only the CDDL or
 * only the GPL Version 2, indicate your decision by adding "[Contributor]
 * elects to include this software in this distribution under the [CDDL or GPL
 * Version 2] license."  If you don't indicate a single choice of license, a
 * recipient has the option to distribute your version of this file under
 * either the CDDL, the GPL Version 2 or to extend the choice of license to
 * its licensees as provided above.  However, if you add GPL Version 2 code
 * and therefore, elected the GPL Version 2 license, then the option applies
 * only if the new code is made subject to such option by the copyright
 * holder.
 */

package com.sun.xml.dtdparser;

import java.io.ByteArrayInputStream;
import java.io.CharConversionException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackInputStream;
import java.io.Reader;
import java.util.Hashtable;
import java.util.Locale;


// NOTE:  Add I18N support to this class when JDK gets the ability to
// defer selection of locale for exception messages ... use the same
// technique for both.


/**
 * This handles several XML-related tasks that normal java.io Readers
 * don't support, including use of IETF standard encoding names and
 * automatic detection of most XML encodings.  The former is needed
 * for interoperability; the latter is needed to conform with the XML
 * spec.  This class also optimizes reading some common encodings by
 * providing low-overhead unsynchronized Reader support.
 *
 * <p>Note that the autodetection facility should be used only on
 * data streams which have an unknown character encoding.  For example,
 * it should never be used on MIME text/xml entities.
 *
 * <p>
Note that XML processors are only required to support UTF-8 and * UTF-16 character encodings. Autodetection permits the underlying Java * implementation to provide support for many other encodings, such as * US-ASCII, ISO-8859-5, Shift_JIS, EUC-JP, and ISO-2022-JP. * * @author David Brownell * @author Janet Koenig * @version 1.3 00/02/24 */ // package private final class XmlReader extends Reader { private static final int MAXPUSHBACK = 512; private Reader in; private String assignedEncoding; private boolean closed; // // This class always delegates I/O to a reader, which gets // its data from the very beginning of the XML text. It needs // to use a pushback stream since (a) autodetection can read // partial UTF-8 characters which need to be fully processed, // (b) the "Unicode" readers swallow characters that they think // are byte order marks, so tests fail if they don't see the // real byte order mark. // // It's got do this efficiently: character I/O is solidly on the // critical path. (So keep buffer length over 2 Kbytes to avoid // excess buffering. Many URL handlers stuff a BufferedInputStream // between here and the real data source, and larger buffers keep // that from slowing you down.) // /** * Constructs the reader from an input stream, auto-detecting * the encoding to use according to the heuristic specified * in the XML 1.0 recommendation. * * @param in the input stream from which the reader is constructed * @throws IOException on error, such as unrecognized encoding */ public static Reader createReader(InputStream in) throws IOException { return new XmlReader(in); } /** * Creates a reader supporting the given encoding, mapping * from standard encoding names to ones that understood by * Java where necessary. * * @param in the input stream from which the reader is constructed * @param encoding the IETF standard name of the encoding to use; * if null, auto-detection is used. 
* @throws IOException on error, including unrecognized encoding */ public static Reader createReader(InputStream in, String encoding) throws IOException { if (encoding == null) return new XmlReader(in); if ("UTF-8".equalsIgnoreCase(encoding) || "UTF8".equalsIgnoreCase(encoding)) return new Utf8Reader(in); if ("US-ASCII".equalsIgnoreCase(encoding) || "ASCII".equalsIgnoreCase(encoding)) return new AsciiReader(in); if ("ISO-8859-1".equalsIgnoreCase(encoding) // plus numerous aliases ... ) return new Iso8859_1Reader(in); // // What we really want is an administerable resource mapping // encoding names/aliases to classnames. For example a property // file resource, "readers/mapping.props", holding and a set // of readers in that (sub)package... defaulting to this call // only if no better choice is available. // return new InputStreamReader(in, std2java(encoding)); } // // JDK doesn't know all of the standard encoding names, and // in particular none of the EBCDIC ones IANA defines (and // which IBM encourages). // static private final Hashtable charsets = new Hashtable(31); static { charsets.put("UTF-16", "Unicode"); charsets.put("ISO-10646-UCS-2", "Unicode"); // NOTE: no support for ISO-10646-UCS-4 yet. 
charsets.put("EBCDIC-CP-US", "cp037"); charsets.put("EBCDIC-CP-CA", "cp037"); charsets.put("EBCDIC-CP-NL", "cp037"); charsets.put("EBCDIC-CP-WT", "cp037"); charsets.put("EBCDIC-CP-DK", "cp277"); charsets.put("EBCDIC-CP-NO", "cp277"); charsets.put("EBCDIC-CP-FI", "cp278"); charsets.put("EBCDIC-CP-SE", "cp278"); charsets.put("EBCDIC-CP-IT", "cp280"); charsets.put("EBCDIC-CP-ES", "cp284"); charsets.put("EBCDIC-CP-GB", "cp285"); charsets.put("EBCDIC-CP-FR", "cp297"); charsets.put("EBCDIC-CP-AR1", "cp420"); charsets.put("EBCDIC-CP-HE", "cp424"); charsets.put("EBCDIC-CP-BE", "cp500"); charsets.put("EBCDIC-CP-CH", "cp500"); charsets.put("EBCDIC-CP-ROECE", "cp870"); charsets.put("EBCDIC-CP-YU", "cp870"); charsets.put("EBCDIC-CP-IS", "cp871"); charsets.put("EBCDIC-CP-AR2", "cp918"); // IANA also defines two that JDK 1.2 doesn't handle: // EBCDIC-CP-GR --> CP423 // EBCDIC-CP-TR --> CP905 } // returns an encoding name supported by JDK >= 1.1.6 // for some cases required by the XML spec private static String std2java(String encoding) { String temp = encoding.toUpperCase(Locale.ENGLISH); temp = (String) charsets.get(temp); return temp != null ? temp : encoding; } /** * Returns the standard name of the encoding in use */ public String getEncoding() { return assignedEncoding; } private XmlReader(InputStream stream) throws IOException { super(stream); PushbackInputStream pb; byte buf []; int len; if (stream instanceof PushbackInputStream) pb = (PushbackInputStream) stream; else pb = new PushbackInputStream(stream, MAXPUSHBACK); // // See if we can figure out the character encoding used // in this file by peeking at the first few bytes. // buf = new byte[4]; len = pb.read(buf); if (len > 0) pb.unread(buf, 0, len); if (len == 4) switch (buf[0] & 0x0ff) { case 0: // 00 3c 00 3f == illegal UTF-16 big-endian if (buf[1] == 0x3c && buf[2] == 0x00 && buf[3] == 0x3f) { setEncoding(pb, "UnicodeBig"); return; } // else it's probably UCS-4 break; case '<': // 0x3c: the most common cases! 
switch (buf[1] & 0x0ff) { // First character is '<'; could be XML without // an XML directive such as "", "



© 2015 - 2024 Weber Informatics LLC | Privacy Policy