// Listing header (not part of the Java source): com.sun.syndication.io.XmlReader,
// as published in the Maven / Gradle / Ivy artifact "sesame-tools-rio-rss".
// Note: a newer version of that artifact exists (3.1.0-incubating).
/*
 * Copyright 2004 Sun Microsystems, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package com.sun.syndication.io;

import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.net.HttpURLConnection;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.text.MessageFormat;

/**
 * Character stream that handles (or at least attempts to) all the necessary voodoo to figure out
 * the charset encoding of the XML document within the stream.
 * <p>
 * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a
 * character stream.
 * <p>
 * All this has to be done without consuming characters from the stream; otherwise the XML parser
 * will not recognize the document as valid XML. This is not 100% true, but it's close enough
 * (the UTF-8 BOM is not handled by all parsers right now; XmlReader handles it and things work
 * in all parsers).
 * <p>
 * The XmlReader class handles the charset encoding of XML documents in Files, raw streams and
 * HTTP streams by offering a wide set of constructors.
 * <p>
 * By default the charset encoding detection is lenient; the constructors with the lenient flag
 * can be used for a strict detection (following the HTTP MIME and XML specifications).
 * All this is nicely explained by Mark Pilgrim in his blog post
 * "Determining the character encoding of a feed".
 *
 * @author Alejandro Abdelnur
 *
 */
public class XmlReader extends Reader {
    /** Size of the look-ahead buffer; the XML prolog must end within this many bytes. */
    private static final int BUFFER_SIZE = 4096;

    private static final String UTF_8 = "UTF-8";
    private static final String US_ASCII = "US-ASCII";
    private static final String UTF_16BE = "UTF-16BE";
    private static final String UTF_16LE = "UTF-16LE";
    private static final String UTF_16 = "UTF-16";

    /** Process-wide fallback encoding; NULL means "use the content-type based rules". */
    private static String _staticDefaultEncoding = null;

    /** Reader over the (buffered) input stream, decoding with the detected encoding. */
    private Reader _reader;
    /** Encoding that was finally selected for {@link #_reader}. */
    private String _encoding;
    /** Per-instance fallback encoding; may be NULL. */
    private String _defaultEncoding;

    /**
     * Sets the default encoding to use if none is set in HTTP content-type,
     * XML prolog and the rules based on content-type are not adequate.
     * <p>
     * If it is set to NULL the content-type based rules are used.
     * <p>
     * By default it is NULL.
     *
     * @param encoding charset encoding to default to.
     */
    public static void setDefaultEncoding(String encoding) {
        _staticDefaultEncoding = encoding;
    }

    /**
     * Returns the default encoding to use if none is set in HTTP content-type,
     * XML prolog and the rules based on content-type are not adequate.
     * <p>
     * If it is NULL the content-type based rules are used.
     *
     * @return the default encoding to use.
     */
    public static String getDefaultEncoding() {
        return _staticDefaultEncoding;
    }

    /**
     * Creates a Reader for a File.
     * <p>
     * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also
     * missing defaults to UTF-8.
     * <p>
     * It does a lenient charset encoding detection, check the constructor with the lenient parameter
     * for details.
     * <p>
     * The file stream opened by this constructor is closed again if the detection fails, so a
     * failed construction does not leak a file handle.
     *
     * @param file File to create a Reader from.
     * @throws IOException thrown if there is a problem reading the file.
     *
     */
    public XmlReader(File file) throws IOException {
        _defaultEncoding = _staticDefaultEncoding;
        InputStream is = new FileInputStream(file);
        boolean initialized = false;
        try {
            initRaw(is, true);
            initialized = true;
        }
        finally {
            if (!initialized) {
                try {
                    is.close();
                }
                catch (IOException ignored) {
                    // the detection failure being propagated is the relevant error
                }
            }
        }
    }

    /**
     * Creates a Reader for a raw InputStream.
     * <p>
     * It follows the same logic used for files.
     * <p>
     * It does a lenient charset encoding detection, check the constructor with the lenient parameter
     * for details.
     *
     * @param is InputStream to create a Reader from.
     * @throws IOException thrown if there is a problem reading the stream.
     *
     */
    public XmlReader(InputStream is) throws IOException {
        this(is, true);
    }

    /**
     * Creates a Reader for a raw InputStream and uses the provided default encoding if none is determined.
     * <p>
     * It follows the same logic used for files.
     * <p>
     * If lenient detection is indicated and the detection above fails as per specifications it then attempts
     * the following:
     * <ul>
     * <li>If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.</li>
     * <li>Else if the XML prolog had a charset encoding that encoding is used.</li>
     * <li>Else if the content type had a charset encoding that encoding is used.</li>
     * <li>Else the default encoding, or 'UTF-8' if there is none, is used.</li>
     * </ul>
     * If lenient detection is indicated an XmlReaderException is never thrown.
     *
     * @param is InputStream to create a Reader from.
     * @param lenient indicates if the charset encoding detection should be relaxed.
     * @param defaultEncoding default encoding to use if one cannot be detected; if NULL the
     *        value of {@link #getDefaultEncoding()} is used.
     * @throws IOException thrown if there is a problem reading the stream.
     * @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs.
     *
     */
    public XmlReader(InputStream is, boolean lenient, String defaultEncoding)
        throws IOException, XmlReaderException {
        _defaultEncoding = (defaultEncoding == null) ? _staticDefaultEncoding : defaultEncoding;
        initRaw(is, lenient);
    }

    /**
     * Creates a Reader for a raw InputStream.
     * <p>
     * It follows the same logic used for files; see
     * {@link #XmlReader(InputStream, boolean, String)} for the description of the lenient
     * fallback rules.
     * <p>
     * If lenient detection is indicated an XmlReaderException is never thrown.
     *
     * @param is InputStream to create a Reader from.
     * @param lenient indicates if the charset encoding detection should be relaxed.
     * @throws IOException thrown if there is a problem reading the stream.
     * @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs.
     *
     */
    public XmlReader(InputStream is, boolean lenient) throws IOException, XmlReaderException {
        this(is, lenient, null);
    }

    /**
     * Creates a Reader using the InputStream of a URL.
     * <p>
     * If the URL is not of type HTTP and there is not 'content-type' header in the fetched
     * data it uses the same logic used for Files.
     * <p>
     * If the URL is a HTTP Url or there is a 'content-type' header in the fetched
     * data it uses the same logic used for an InputStream with content-type.
     * <p>
     * It does a lenient charset encoding detection, check the constructor with the lenient parameter
     * for details.
     *
     * @param url URL to create a Reader from.
     * @throws IOException thrown if there is a problem reading the stream of the URL.
     *
     */
    public XmlReader(URL url) throws IOException {
        this(url.openConnection());
    }

    /**
     * Creates a Reader using the InputStream of a URLConnection.
     * <p>
     * If the URLConnection is not of type HttpURLConnection and there is not
     * 'content-type' header in the fetched data it uses the same logic used for files.
     * <p>
     * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched
     * data it uses the same logic used for an InputStream with content-type.
     * <p>
     * It does a lenient charset encoding detection, check the constructor with the lenient parameter
     * for details.
     *
     * @param conn URLConnection to create a Reader from.
     * @throws IOException thrown if there is a problem reading the stream of the URLConnection.
     *
     */
    public XmlReader(URLConnection conn) throws IOException {
        _defaultEncoding = _staticDefaultEncoding;
        // HTTP connections always go through the content-type rules (even with a NULL header);
        // any other connection does so only when it actually reports a content type.
        if (conn instanceof HttpURLConnection || conn.getContentType() != null) {
            initHttp(conn.getInputStream(), conn.getContentType(), true);
        }
        else {
            initRaw(conn.getInputStream(), true);
        }
    }

    /**
     * Creates a Reader using an InputStream and the associated content-type header.
     * <p>
     * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding.
     * If there is not content-type encoding checks the XML prolog encoding. If there is not XML
     * prolog encoding uses the default encoding mandated by the content-type MIME type.
     * <p>
     * It does a lenient charset encoding detection, check the constructor with the lenient parameter
     * for details.
     *
     * @param is InputStream to create the reader from.
     * @param httpContentType content-type header to use for the resolution of the charset encoding.
     * @throws IOException thrown if there is a problem reading the file.
     *
     */
    public XmlReader(InputStream is, String httpContentType) throws IOException {
        this(is, httpContentType, true);
    }

    /**
     * Creates a Reader using an InputStream and the associated content-type header.
     * <p>
     * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding.
     * If there is not content-type encoding checks the XML prolog encoding. If there is not XML
     * prolog encoding uses the default encoding mandated by the content-type MIME type.
     * <p>
     * If lenient detection is indicated and the detection above fails as per specifications it then attempts
     * the following:
     * <ul>
     * <li>If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.</li>
     * <li>Else if the XML prolog had a charset encoding that encoding is used.</li>
     * <li>Else if the content type had a charset encoding that encoding is used.</li>
     * <li>Else the default encoding, or 'UTF-8' if there is none, is used.</li>
     * </ul>
     * If lenient detection is indicated an XmlReaderException is never thrown.
     *
     * @param is InputStream to create the reader from.
     * @param httpContentType content-type header to use for the resolution of the charset encoding.
     * @param lenient indicates if the charset encoding detection should be relaxed.
     * @param defaultEncoding default encoding to use if one cannot be detected; if NULL the
     *        value of {@link #getDefaultEncoding()} is used.
     * @throws IOException thrown if there is a problem reading the file.
     * @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs.
     *
     */
    public XmlReader(InputStream is, String httpContentType, boolean lenient, String defaultEncoding)
        throws IOException, XmlReaderException {
        _defaultEncoding = (defaultEncoding == null) ? _staticDefaultEncoding : defaultEncoding;
        initHttp(is, httpContentType, lenient);
    }

    /**
     * Creates a Reader using an InputStream and the associated content-type header.
     * <p>
     * Detection order and lenient fallback rules are the ones described in
     * {@link #XmlReader(InputStream, String, boolean, String)}.
     * <p>
     * If lenient detection is indicated an XmlReaderException is never thrown.
     *
     * @param is InputStream to create the reader from.
     * @param httpContentType content-type header to use for the resolution of the charset encoding.
     * @param lenient indicates if the charset encoding detection should be relaxed.
     * @throws IOException thrown if there is a problem reading the file.
     * @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs.
     *
     */
    public XmlReader(InputStream is, String httpContentType, boolean lenient)
        throws IOException, XmlReaderException {
        this(is, httpContentType, lenient, null);
    }

    // runs the raw-stream detection; on a spec violation either rethrows (strict) or
    // falls back to the lenient rules
    private void initRaw(InputStream is, boolean lenient) throws IOException {
        try {
            doRawStream(is, lenient);
        }
        catch (XmlReaderException ex) {
            if (!lenient) {
                throw ex;
            }
            doLenientDetection(null, ex);
        }
    }

    // runs the content-type aware detection; on a spec violation either rethrows (strict) or
    // falls back to the lenient rules
    private void initHttp(InputStream is, String httpContentType, boolean lenient) throws IOException {
        try {
            doHttpStream(is, httpContentType, lenient);
        }
        catch (XmlReaderException ex) {
            if (!lenient) {
                throw ex;
            }
            doLenientDetection(httpContentType, ex);
        }
    }

    // lenient fallback: retries 'text/html' as 'text/xml', then uses the XML prolog encoding,
    // then the content-type encoding, then the default encoding (UTF-8 if none)
    private void doLenientDetection(String httpContentType, XmlReaderException ex) throws IOException {
        if (httpContentType != null && httpContentType.startsWith("text/html")) {
            // keep whatever followed the MIME type (e.g. the charset parameter)
            String retryContentType = "text/xml" + httpContentType.substring("text/html".length());
            try {
                doHttpStream(ex.getInputStream(), retryContentType, true);
                return;
            }
            catch (XmlReaderException ex2) {
                ex = ex2;
            }
        }
        String encoding = ex.getXmlEncoding();
        if (encoding == null) {
            encoding = ex.getContentTypeEncoding();
        }
        if (encoding == null) {
            encoding = (_defaultEncoding == null) ? UTF_8 : _defaultEncoding;
        }
        prepareReader(ex.getInputStream(), encoding);
    }

    /**
     * Returns the charset encoding of the XmlReader.
     *
     * @return charset encoding.
     *
     */
    public String getEncoding() {
        return _encoding;
    }

    /**
     * Reads characters into a portion of an array by delegating to the underlying reader,
     * which decodes the stream with the detected encoding.
     *
     * @param buf destination buffer.
     * @param offset offset at which to start storing characters.
     * @param len maximum number of characters to read.
     * @return the value returned by the underlying reader's read call.
     * @throws IOException thrown if there was a problem reading the stream.
     */
    public int read(char[] buf, int offset, int len) throws IOException {
        return _reader.read(buf, offset, len);
    }

    /**
     * Closes the XmlReader stream.
     *
     * @throws IOException thrown if there was a problem closing the stream.
     *
     */
    public void close() throws IOException {
        _reader.close();
    }

    // detection for streams without content-type information: BOM, byte-pattern guess, XML prolog
    private void doRawStream(InputStream is, boolean lenient) throws IOException {
        BufferedInputStream pis = new BufferedInputStream(is, BUFFER_SIZE);
        String bomEnc = getBOMEncoding(pis);
        String xmlGuessEnc = getXMLGuessEncoding(pis);
        String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
        String encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, pis);
        prepareReader(pis, encoding);
    }

    // detection for streams that come with a content-type header
    private void doHttpStream(InputStream is, String httpContentType, boolean lenient) throws IOException {
        BufferedInputStream pis = new BufferedInputStream(is, BUFFER_SIZE);
        String cTMime = getContentTypeMime(httpContentType);
        String cTEnc = getContentTypeEncoding(httpContentType);
        String bomEnc = getBOMEncoding(pis);
        String xmlGuessEnc = getXMLGuessEncoding(pis);
        String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
        String encoding = calculateHttpEncoding(cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, pis, lenient);
        prepareReader(pis, encoding);
    }

    // creates the decoding reader; _encoding is only recorded once the reader could be created
    private void prepareReader(InputStream is, String encoding) throws IOException {
        _reader = new InputStreamReader(is, encoding);
        _encoding = encoding;
    }

    // builds the exception for an inconsistency found by the raw-stream rules
    private static XmlReaderException rawException(MessageFormat format, String bomEnc, String xmlGuessEnc,
                                                   String xmlEnc, InputStream is) {
        String msg = format.format(new Object[]{bomEnc, xmlGuessEnc, xmlEnc});
        return new XmlReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc, is);
    }

    // builds the exception for an inconsistency found by the content-type rules
    private static XmlReaderException httpException(MessageFormat format, String cTMime, String cTEnc,
                                                    String bomEnc, String xmlGuessEnc, String xmlEnc,
                                                    InputStream is) {
        String msg = format.format(new Object[]{cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc});
        return new XmlReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, is);
    }

    // InputStream is passed for XmlReaderException creation only
    private String calculateRawEncoding(String bomEnc, String xmlGuessEnc, String xmlEnc, InputStream is) throws IOException {
        if (bomEnc == null) {
            if (xmlGuessEnc == null || xmlEnc == null) {
                return (_defaultEncoding == null) ? UTF_8 : _defaultEncoding;
            }
            // a prolog that only says "UTF-16" is refined by the byte order that was guessed
            if (xmlEnc.equals(UTF_16) && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
                return xmlGuessEnc;
            }
            return xmlEnc;
        }
        if (bomEnc.equals(UTF_8)) {
            if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) {
                throw rawException(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc, is);
            }
            if (xmlEnc != null && !xmlEnc.equals(UTF_8)) {
                throw rawException(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc, is);
            }
            return UTF_8;
        }
        if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
            if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
                // reported as XmlReaderException (like every other mismatch) so that the
                // lenient fallback can still recover from it
                throw rawException(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc, is);
            }
            if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
                throw rawException(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc, is);
            }
            return bomEnc;
        }
        throw rawException(RAW_EX_2, bomEnc, xmlGuessEnc, xmlEnc, is);
    }

    // InputStream is passed for XmlReaderException creation only
    private String calculateHttpEncoding(String cTMime, String cTEnc, String bomEnc, String xmlGuessEnc, String xmlEnc, InputStream is, boolean lenient) throws IOException {
        // in lenient mode an encoding declared in the XML prolog wins over everything else
        if (lenient && xmlEnc != null) {
            return xmlEnc;
        }
        boolean appXml = isAppXml(cTMime);
        boolean textXml = isTextXml(cTMime);
        if (!appXml && !textXml) {
            throw httpException(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, is);
        }
        if (cTEnc == null) {
            if (appXml) {
                return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, is);
            }
            return (_defaultEncoding == null) ? US_ASCII : _defaultEncoding;
        }
        // an explicit UTF-16BE/UTF-16LE charset parameter must not be combined with a BOM
        if (bomEnc != null && (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE))) {
            throw httpException(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, is);
        }
        if (cTEnc.equals(UTF_16)) {
            // plain UTF-16 needs a UTF-16 BOM to settle the byte order
            if (bomEnc != null && bomEnc.startsWith(UTF_16)) {
                return bomEnc;
            }
            throw httpException(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, is);
        }
        return cTEnc;
    }

    // returns MIME type or NULL if httpContentType is NULL
    private static String getContentTypeMime(String httpContentType) {
        if (httpContentType == null) {
            return null;
        }
        int i = httpContentType.indexOf(";");
        return ((i == -1) ? httpContentType : httpContentType.substring(0, i)).trim();
    }

    private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=([.[^; ]]*)");

    // returns charset parameter value, NULL if not present, NULL if httpContentType is NULL
    private static String getContentTypeEncoding(String httpContentType) {
        if (httpContentType == null) {
            return null;
        }
        int i = httpContentType.indexOf(";");
        if (i == -1) {
            return null;
        }
        Matcher m = CHARSET_PATTERN.matcher(httpContentType.substring(i + 1));
        if (!m.find()) {
            return null;
        }
        // fixed locale: charset names are ASCII and must not be upper-cased with
        // locale specific rules (e.g. the Turkish dotted/dotless i)
        String encoding = m.group(1).toUpperCase(java.util.Locale.ENGLISH);
        // strip a matching pair of quotes; the length check keeps a lone quote character
        // from producing an invalid substring range
        if (encoding.length() >= 2 &&
                ((encoding.startsWith("\"") && encoding.endsWith("\"")) ||
                 (encoding.startsWith("'") && encoding.endsWith("'")))) {
            encoding = encoding.substring(1, encoding.length() - 1);
        }
        // an empty charset value carries no information, treat it as not present
        return (encoding.length() == 0) ? null : encoding;
    }

    // returns the BOM in the stream, NULL if not present,
    // if there was BOM in the stream it is consumed
    private static String getBOMEncoding(BufferedInputStream is) throws IOException {
        is.mark(3);
        int b0 = is.read();
        int b1 = is.read();
        int b2 = is.read();

        if (b0 == 0xFE && b1 == 0xFF) {
            // UTF-16 BOMs are two bytes long: rewind and consume only those two
            is.reset();
            is.read();
            is.read();
            return UTF_16BE;
        }
        if (b0 == 0xFF && b1 == 0xFE) {
            is.reset();
            is.read();
            is.read();
            return UTF_16LE;
        }
        if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
            // the three bytes just read are the UTF-8 BOM, leave them consumed
            return UTF_8;
        }
        is.reset();
        return null;
    }

    // returns the best guess for the encoding by looking the first bytes of the stream, '<?xm',
    // NULL if none; the stream position is left untouched
    private static String getXMLGuessEncoding(BufferedInputStream is) throws IOException {
        is.mark(4);
        int b0 = is.read();
        int b1 = is.read();
        int b2 = is.read();
        int b3 = is.read();
        is.reset();

        // "<?" encoded as big-endian UTF-16
        if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
            return UTF_16BE;
        }
        // "<?" encoded as little-endian UTF-16
        if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
            return UTF_16LE;
        }
        // "<?xm" in an ASCII compatible encoding
        if (b0 == 0x3C && b1 == 0x3F && b2 == 0x78 && b3 == 0x6D) {
            return UTF_8;
        }
        return null;
    }

    // matches the encoding pseudo-attribute of the XML declaration; group 1 is the value
    // including its surrounding quotes
    private static final Pattern ENCODING_PATTERN =
        Pattern.compile("<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))", Pattern.MULTILINE);

    // returns the index of the first occurrence of target in buf[from..to), -1 if none
    private static int indexOfByte(byte[] buf, int from, int to, byte target) {
        for (int i = from; i < to; i++) {
            if (buf[i] == target) {
                return i;
            }
        }
        return -1;
    }

    // returns the encoding declared in the XML declaration (upper-cased), NULL if none or if
    // guessedEnc is NULL; the stream position is left untouched
    private static String getXmlProlog(BufferedInputStream is, String guessedEnc) throws IOException {
        if (guessedEnc == null) {
            return null;
        }
        byte[] bytes = new byte[BUFFER_SIZE];
        is.mark(BUFFER_SIZE);
        int offset = 0;
        int firstGT = -1;
        int c = is.read(bytes, 0, BUFFER_SIZE);
        while (c != -1 && firstGT == -1 && offset < BUFFER_SIZE) {
            // look for '>' directly in the bytes that were just read: byte positions are what
            // is needed below and they do not depend on the platform default charset
            firstGT = indexOfByte(bytes, offset, offset + c, (byte) '>');
            offset += c;
            if (firstGT == -1 && offset < BUFFER_SIZE) {
                c = is.read(bytes, offset, BUFFER_SIZE - offset);
            }
        }
        if (firstGT == -1) {
            if (c == -1) {
                throw new IOException("Unexpected end of XML stream");
            }
            throw new IOException("XML prolog or ROOT element not found on first " + offset + " bytes");
        }
        is.reset();

        // decode everything up to and including the first '>' with the guessed encoding and
        // join the lines so that a declaration spanning several lines is matched as one
        Reader reader = new InputStreamReader(new ByteArrayInputStream(bytes, 0, firstGT + 1), guessedEnc);
        BufferedReader bReader = new BufferedReader(reader);
        StringBuffer prolog = new StringBuffer();
        String line = bReader.readLine();
        while (line != null) {
            prolog.append(line);
            line = bReader.readLine();
        }

        String encoding = null;
        Matcher m = ENCODING_PATTERN.matcher(prolog);
        if (m.find()) {
            // fixed locale for the same reason as in getContentTypeEncoding
            String quoted = m.group(1).toUpperCase(java.util.Locale.ENGLISH);
            encoding = quoted.substring(1, quoted.length() - 1);
        }
        return encoding;
    }

    // indicates if the MIME type belongs to the APPLICATION XML family
    private static boolean isAppXml(String mime) {
        if (mime == null) {
            return false;
        }
        return mime.equals("application/xml") ||
               mime.equals("application/xml-dtd") ||
               mime.equals("application/xml-external-parsed-entity") ||
               (mime.startsWith("application/") && mime.endsWith("+xml"));
    }

    // indicates if the MIME type belongs to the TEXT XML family
    private static boolean isTextXml(String mime) {
        if (mime == null) {
            return false;
        }
        return mime.equals("text/xml") ||
               mime.equals("text/xml-external-parsed-entity") ||
               (mime.startsWith("text/") && mime.endsWith("+xml"));
    }

    private static final MessageFormat RAW_EX_1 = new MessageFormat(
            "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch");

    private static final MessageFormat RAW_EX_2 = new MessageFormat(
            "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM");

    private static final MessageFormat HTTP_EX_1 = new MessageFormat(
            "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL");

    private static final MessageFormat HTTP_EX_2 = new MessageFormat(
            "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch");

    private static final MessageFormat HTTP_EX_3 = new MessageFormat(
            "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME");

}