com.sun.xml.rpc.sp.XmlReader Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of webservices-rt Show documentation
This module contains the Metro runtime code.
There is a newer version: 4.0.4
/*
 * Copyright (c) 1997, 2018 Oracle and/or its affiliates. All rights reserved.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Public License v. 2.0, which is available at
 * http://www.eclipse.org/legal/epl-2.0.
 *
 * This Source Code may also be made available under the following Secondary
 * Licenses when the conditions for such availability set forth in the
 * Eclipse Public License v. 2.0 are satisfied: GNU General Public License,
 * version 2 with the GNU Classpath Exception, which is available at
 * https://www.gnu.org/software/classpath/license.html.
 *
 * SPDX-License-Identifier: EPL-2.0 OR GPL-2.0 WITH Classpath-exception-2.0
 */

package com.sun.xml.rpc.sp;

import java.io.ByteArrayInputStream;
import java.io.CharConversionException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackInputStream;
import java.io.Reader;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;

// NOTE:  Add I18N support to this class when JDK gets the ability to
// defer selection of locale for exception messages ... use the same
// technique for both.

/**
 * This handles several XML-related tasks that normal java.io Readers
 * don't support, including use of IETF standard encoding names and
 * automatic detection of most XML encodings.  The former is needed
 * for interoperability; the latter is needed to conform with the XML
 * spec.  This class also optimizes reading some common encodings by
 * providing low-overhead unsynchronized Reader support.
 *
 * <p>Note that the autodetection facility should be used only on
 * data streams which have an unknown character encoding.  For example,
 * it should never be used on MIME text/xml entities.
 *
 * 
 * <p>Note that XML processors are only required to support UTF-8 and
 * UTF-16 character encodings.  Autodetection permits the underlying Java
 * implementation to provide support for many other encodings, such as
 * US-ASCII, ISO-8859-5, Shift_JIS, EUC-JP, and ISO-2022-JP.
 *
 * @author David Brownell
 * @author JAX-RPC RI Development Team
 */

final class XmlReader extends Reader {
    // Size, in bytes, of the pushback buffer used when the constructor has
    // to wrap the caller's stream in a new PushbackInputStream.
    private static final int MAXPUSHBACK = 512;

    // NOTE(review): presumably the reader that character I/O is delegated
    // to once the encoding has been chosen -- confirm against the rest of
    // the class.
    private Reader in;
    // Encoding name handed back by getEncoding().
    private String assignedEncoding;
    // NOTE(review): presumably records that this reader has been closed;
    // the code that reads or writes it is not shown here -- confirm.
    private boolean closed;

    //
    // This class always delegates I/O to a reader, which gets
    // its data from the very beginning of the XML text.  It needs
    // to use a pushback stream since (a) autodetection can read
    // partial UTF-8 characters which need to be fully processed,
    // (b) the "Unicode" readers swallow characters that they think
    // are byte order marks, so tests fail if they don't see the
    // real byte order mark.
    //
    // It's got to do this efficiently:  character I/O is solidly on the
    // critical path.  (So keep buffer length over 2 Kbytes to avoid
    // excess buffering. Many URL handlers stuff a BufferedInputStream
    // between here and the real data source, and larger buffers keep
    // that from slowing you down.)
    //

    /**
     * Builds a reader over a byte stream whose character encoding is
     * not known in advance.  The encoding is autodetected using the
     * heuristic specified in the XML 1.0 recommendation.
     *
     * @param in the input stream from which the reader is constructed
     * @return a reader over the contents of the stream
     * @exception IOException on error, such as unrecognized encoding
     */
    public static Reader createReader(InputStream in) throws IOException {
        final Reader autodetecting = new XmlReader(in);
        return autodetecting;
    }

    /**
     * Creates a reader that decodes the stream using the named encoding,
     * mapping from standard encoding names to ones that are understood
     * by Java where necessary.
     *
     * @param in the input stream from which the reader is constructed
     * @param encoding the IETF standard name of the encoding to use;
     *  if null, autodetection is used.
     * @return a reader over the contents of the stream
     * @exception IOException on error, including unrecognized encoding
     */
    public static Reader createReader(InputStream in, String encoding)
        throws IOException {
        // No encoding supplied: let the autodetecting reader work it out.
        if (encoding == null) {
            return new XmlReader(in);
        }

        // A few encodings are handled by dedicated reader classes; the
        // name comparisons are case-insensitive.
        final boolean isUtf8 =
            encoding.equalsIgnoreCase("UTF-8")
                || encoding.equalsIgnoreCase("UTF8");
        if (isUtf8) {
            return new Utf8Reader(in);
        }

        final boolean isAscii =
            encoding.equalsIgnoreCase("US-ASCII")
                || encoding.equalsIgnoreCase("ASCII");
        if (isAscii) {
            return new AsciiReader(in);
        }

        // ISO-8859-1 has numerous aliases; only the canonical name is
        // recognized here.
        if (encoding.equalsIgnoreCase("ISO-8859-1")) {
            return new Iso8859_1Reader(in);
        }

        //
        // What we really want is an administerable resource mapping
        // encoding names/aliases to classnames.  For example, a property
        // file resource such as "readers/mapping.props" together with a
        // set of readers in that (sub)package ... defaulting to this call
        // only if no better choice is available.
        //
        final String javaName = std2java(encoding);
        return new InputStreamReader(in, javaName);
    }

    //
    // The JDK doesn't know all of the standard encoding names, and
    // in particular none of the EBCDIC ones IANA defines (and
    // which IBM encourages).  This map goes from the upper-case
    // standard name to the name handed to the JDK instead.
    //
    static private final Map charsets = new HashMap(31);

    static {
        // Each row is { standard name, JDK name }.
        final String[][] aliases = {
            { "UTF-16", "Unicode" },
            { "ISO-10646-UCS-2", "Unicode" },

            // NOTE: no support for ISO-10646-UCS-4 yet.

            { "EBCDIC-CP-US", "cp037" },
            { "EBCDIC-CP-CA", "cp037" },
            { "EBCDIC-CP-NL", "cp037" },
            { "EBCDIC-CP-WT", "cp037" },

            { "EBCDIC-CP-DK", "cp277" },
            { "EBCDIC-CP-NO", "cp277" },
            { "EBCDIC-CP-FI", "cp278" },
            { "EBCDIC-CP-SE", "cp278" },

            { "EBCDIC-CP-IT", "cp280" },
            { "EBCDIC-CP-ES", "cp284" },
            { "EBCDIC-CP-GB", "cp285" },
            { "EBCDIC-CP-FR", "cp297" },

            { "EBCDIC-CP-AR1", "cp420" },
            { "EBCDIC-CP-HE", "cp424" },
            { "EBCDIC-CP-BE", "cp500" },
            { "EBCDIC-CP-CH", "cp500" },

            { "EBCDIC-CP-ROECE", "cp870" },
            { "EBCDIC-CP-YU", "cp870" },
            { "EBCDIC-CP-IS", "cp871" },
            { "EBCDIC-CP-AR2", "cp918" },

            // IANA also defines two that JDK 1.2 doesn't handle:
            //	EBCDIC-CP-GR		--> CP423
            //	EBCDIC-CP-TR		--> CP905
        };

        for (int row = 0; row < aliases.length; row++) {
            charsets.put(aliases[row][0], aliases[row][1]);
        }
    }

    // Returns an encoding name supported by JDK >= 1.1.6 for some
    // cases required by the XML spec.  The given standard name is
    // looked up case-insensitively in the alias table; when there is
    // no entry the name is returned unchanged.
    //
    // The encoding argument must not be null.
    private static String std2java(String encoding) {
        // Upper-case with a fixed locale.  The no-argument toUpperCase()
        // follows the default locale, and under a Turkish or Azeri
        // default 'i' becomes a dotted capital I, so a name such as
        // "iso-10646-ucs-2" or "ebcdic-cp-it" would miss the table.
        String key = encoding.toUpperCase(Locale.ENGLISH);
        String javaName = (String) charsets.get(key);
        return javaName != null ? javaName : encoding;
    }

    /**
     * Reports the standard name of the encoding in use.
     *
     * @return the encoding name currently assigned to this reader
     */
    public String getEncoding() {
        return this.assignedEncoding;
    }

    private XmlReader(InputStream stream) throws IOException {
        super(stream);
        PushbackInputStream pb;
        byte buf[];
        int len = 0;

        if (stream instanceof PushbackInputStream)
            pb = (PushbackInputStream) stream;
        else
            pb = new PushbackInputStream(stream, MAXPUSHBACK);

        //
        // See if we can figure out the character encoding used
        // in this file by peeking at the first few bytes.
        //
        final int peekSize = 4;
        buf = new byte[peekSize];
        while (len < peekSize) {
            int bytesRead = pb.read(buf, len, peekSize-len);
            if (bytesRead != -1){
                len += bytesRead;
            } else {                               
                break;
            }        
        }
        if (len != peekSize) {
            pb.unread(buf, 0, len);
        } else {
            if (buf[0] == (byte)0xEF && buf[1] == (byte)0xBB && buf[2] == (byte)0xBF) {
                setEncoding(pb, "UTF-8");
                pb.unread(buf, 3, peekSize-3);
                return;
            }

            // Unicode BOM mark not found, unread all bytes
            pb.unread(buf, 0, 4);
            // Use old logic to figure out encoding
            switch (buf[0] & 0x0ff) {
                case 0 :
                    // 00 3c 00 3f == illegal UTF-16 big-endian
                    if (buf[1] == 0x3c && buf[2] == 0x00 && buf[3] == 0x3f) {
                        setEncoding(pb, "UnicodeBig");
                        return;
                    }
                    // else it's probably UCS-4
                    break;

                case '<' : // 0x3c: the most common cases!
                    switch (buf[1] & 0x0ff) {
                        // First character is '<'; could be XML without
                        // an XML directive such as "", "