// org.openmdx.kernel.xml.AdaptiveInputStreamReader (Maven / Gradle / Ivy artifact listing)
//
// Go to download
// Show more of this group / Show more artifacts with this name
// Show all versions of openmdx-base / Show documentation
// openMDX/Base Library
// There is a newer version: 2.18.10
/*
 * ====================================================================
 * Project:     openmdx, http://www.openmdx.org/
 * Description: Adaptive InputStream Reader
 * Owner:       OMEX AG, Switzerland, http://www.omex.ch
 * ====================================================================
 *
 * This software is published under the BSD license  as listed below.
 * 
 * Copyright (c) 2005-2013, OMEX AG, Switzerland
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions 
 * are met:
 * 
 * * Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 * 
 * * Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in
 *   the documentation and/or other materials provided with the
 *   distribution.
 * 
 * * Neither the name of the openMDX team nor the names of its
 *   contributors may be used to endorse or promote products derived
 *   from this software without specific prior written permission.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 * 
 * ------------------
 * 
 * This product includes or is based on software developed by other
 * organizations as listed in the NOTICE file.
 */
package org.openmdx.kernel.xml;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * The Adaptive Input Stream Reader has the following encoding priorities
 * <ol>
 * <li>The constructor's encoding argument</li>
 * <li>A byte order mark</li>
 * <li>An XML declaration's encoding attribute</li>
 * <li>The platform's default encoding</li>
 * </ol>
 */
public class AdaptiveInputStreamReader extends Reader {

    /**
     * The delegate, unless the stream is closed
     */
    private Reader delegate;

    /**
     * Tells whether close operations shall be propagated to the underlying stream
     */
    private final boolean propagateClose;

    /**
     * Constructor
     * <p>
     * A byte order mark is consumed whenever {@code byteOrderMarkAware} is set,
     * and an XML declaration is consumed whenever {@code xmlDeclarationAware}
     * is set, even if the encoding they announce is overridden.
     *
     * @param in the stream to be decoded
     * @param overriddenEncoding overrides the adaptive encoding unless it is null
     * @param byteOrderMarkAware tells whether a leading byte order mark shall be
     * looked for (and skipped)
     * @param xmlDeclarationAware tells whether a leading XML declaration shall be
     * looked for (and skipped)
     * @param propagateClose tells whether a close request is propagated to the input stream
     *
     * @throws IOException if the stream can't be read or the encoding is unsupported
     */
    public AdaptiveInputStreamReader(
        InputStream in,
        String overriddenEncoding,
        boolean byteOrderMarkAware,
        boolean xmlDeclarationAware,
        boolean propagateClose
    ) throws IOException {
        this.propagateClose = propagateClose;
        String encoding = overriddenEncoding;
        if (!byteOrderMarkAware && !xmlDeclarationAware) {
            // Nothing to probe, the raw stream can be decoded directly
            this.delegate = new BufferedReader(newReader(in, encoding));
            return;
        }
        // Probing requires mark/reset. From now on ONLY the buffered stream
        // may be read, because it may already hold bytes fetched from 'in'.
        InputStream stream = new BufferedInputStream(in);
        if (byteOrderMarkAware) {
            String byteOrderMark = ByteOrderMark.readByteOrderMark(stream);
            if (encoding == null) {
                encoding = byteOrderMark;
            }
        }
        if (!xmlDeclarationAware) {
            this.delegate = new BufferedReader(newReader(stream, encoding));
        } else if (encoding == null) {
            // The encoding is still unknown: the declaration is parsed byte-wise
            XMLDeclaration xmlDeclaration = XMLDeclaration.readXMLDeclaration(stream);
            if (xmlDeclaration != null) {
                encoding = xmlDeclaration.getEncoding();
            }
            this.delegate = new BufferedReader(newReader(stream, encoding));
        } else {
            // The encoding is known: the declaration is skipped on character level
            this.delegate = new BufferedReader(new InputStreamReader(stream, encoding));
            XMLDeclaration.readXMLDeclaration(this.delegate);
        }
    }

    /**
     * Create a reader for the given encoding, or for the platform's default
     * encoding if none is given.
     *
     * @param source the stream to be decoded
     * @param encoding the encoding; may be null
     *
     * @return a new reader
     *
     * @throws IOException if the encoding is not supported
     */
    private static Reader newReader(
        InputStream source,
        String encoding
    ) throws IOException {
        return encoding == null ?
            new InputStreamReader(source) :
            new InputStreamReader(source, encoding);
    }

    /**
     * Ensure that the Reader is open
     *
     * @return the delegate
     *
     * @throws IOException if the reader has already been closed
     */
    private final Reader getDelegate(
    ) throws IOException {
        Reader current = this.delegate;
        if (current == null) {
            throw new IOException("The reader is already closed");
        }
        return current;
    }

    /* (non-Javadoc)
     * @see java.io.Reader#mark(int)
     */
    @Override
    public void mark(int readAheadLimit) throws IOException {
        getDelegate().mark(readAheadLimit);
    }

    /* (non-Javadoc)
     * @see java.io.Reader#markSupported()
     */
    @Override
    public boolean markSupported() {
        Reader current = this.delegate;
        return current != null && current.markSupported();
    }

    /* (non-Javadoc)
     * @see java.io.Reader#read()
     */
    @Override
    public int read() throws IOException {
        return getDelegate().read();
    }

    /* (non-Javadoc)
     * @see java.io.Reader#read(char[], int, int)
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        return getDelegate().read(cbuf, off, len);
    }

    /* (non-Javadoc)
     * @see java.io.Reader#read(char[])
     */
    @Override
    public int read(char[] cbuf) throws IOException {
        return getDelegate().read(cbuf);
    }

    /* (non-Javadoc)
     * @see java.io.Reader#ready()
     */
    @Override
    public boolean ready() throws IOException {
        return getDelegate().ready();
    }

    /* (non-Javadoc)
     * @see java.io.Reader#reset()
     */
    @Override
    public void reset() throws IOException {
        getDelegate().reset();
    }

    /* (non-Javadoc)
     * @see java.io.Reader#skip(long)
     */
    @Override
    public long skip(long n) throws IOException {
        return getDelegate().skip(n);
    }

    /**
     * Closes this reader. The underlying stream is closed only if
     * close propagation has been requested upon construction.
     * Closing an already closed reader has no effect.
     */
    @Override
    public void close() throws IOException {
        Reader current = this.delegate;
        if (current == null) {
            return;
        }
        // Mark the reader as closed even if the propagated close fails
        this.delegate = null;
        if (this.propagateClose) {
            current.close();
        }
    }


    //------------------------------------------------------------------------
    // Class XMLDeclaration
    //------------------------------------------------------------------------

    /**
     * XML Declaration
     */
    protected static final class XMLDeclaration {

        /**
         * Maximal number of characters to read ahead.
         */
        static final int READ_AHEAD_LIMIT = 100;

        /**
         * Matches the text between {@code <?xml} and {@code >}: group 1 is the
         * quoted version, group 3 the quoted encoding and group 5 the quoted
         * standalone value.
         */
        private static final Pattern XML_DECLARATION_PATTERN;

        static {
            String whitespace = "[ \n\r\t]";
            String optionalWhitespace = whitespace + '*';
            String mandatoryWhitespace = whitespace + '+';
            String value = optionalWhitespace + "=" + optionalWhitespace +
                "('[^']*'|\"[^\"]*\")";
            XML_DECLARATION_PATTERN = Pattern.compile(
                "^" +
                mandatoryWhitespace + "version" + value + "(" +
                mandatoryWhitespace + "encoding" + value + ")?(" +
                mandatoryWhitespace + "standalone" + value + ")?" +
                optionalWhitespace + "\\?$"
            );
        }

        /**
         * The mandatory XML version attribute
         */
        private String version;

        /**
         * The optional encoding attribute
         */
        private String encoding;

        /**
         * The optional standalone attribute
         */
        private String standalone;

        /**
         * Constructor for an XML declaration without any attribute values
         */
        protected XMLDeclaration(
        ) {
            super();
        }

        /**
         * Constructor
         *
         * @param version the (unquoted) version
         * @param encoding the (unquoted) encoding; may be null
         * @param standalone the (unquoted) standalone value; may be null
         */
        protected XMLDeclaration(
            String version,
            String encoding,
            String standalone
        ) {
            this.version = version;
            this.encoding = encoding;
            this.standalone = standalone;
        }

        /**
         * Copy constructor
         *
         * @param that the XML declaration to be copied
         */
        public XMLDeclaration(
            XMLDeclaration that
        ) {
            this(
                that.version,
                that.encoding,
                that.standalone
            );
        }

        /**
         * Retrieve the encoding, replacing the historical {@code java.io}
         * names known to {@link Encodings} by their canonical names.
         *
         * @return Returns the encoding.
         */
        public String getEncoding() {
            String declared = this.encoding;
            if (Encodings.ISO_8859_1.equals(declared)) {
                return "ISO-8859-1";
            }
            if (Encodings.UTF_16BE.equals(declared) || Encodings.UTF_16BE_WITH_BOM.equals(declared)) {
                return "UTF-16BE";
            }
            if (Encodings.UTF_16LE.equals(declared) || Encodings.UTF_16LE_WITH_BOM.equals(declared)) {
                return "UTF-16LE";
            }
            if (Encodings.UTF_8.equals(declared)) {
                return "UTF-8";
            }
            if (Encodings.WINDOWS_1252.equals(declared)) {
                return "windows-1252";
            }
            return declared;
        }

        /**
         * @param encoding The encoding to set, including its quotes.
         */
        public void setQuotedEncoding(String encoding) {
            this.encoding = unquote(encoding);
        }

        /**
         * @return Returns the standalone.
         */
        public String getStandalone() {
            return this.standalone;
        }

        /**
         * @param standalone The standalone to set, including its quotes.
         */
        public void setQuotedStandalone(String standalone) {
            this.standalone = unquote(standalone);
        }

        /**
         * @param version The version to set, including its quotes.
         */
        public void setQuotedVersion(String version) {
            this.version = unquote(version);
        }

        /**
         * @return Returns the version.
         */
        public String getVersion() {
            return this.version;
        }

        /**
         * Remove the surrounding «'» respectively «"»
         * characters.
         *
         * @param quoted the embedded string; may be null
         * @return the quoted string without its leading or trailing character;
         * or null if quoted was null or shorter than two characters.
         */
        private static final String unquote(
            String quoted
        ) {
            if (quoted == null || quoted.length() < 2) {
                return null;
            }
            return quoted.substring(1, quoted.length() - 1);
        }

        /**
         * Consume the XML Declaration and return it or reset the stream otherwise.
         * <p>
         * The stream is read byte-wise through an {@link ASCIIReader}, i.e.
         * its encoding does not have to be known yet.
         *
         * @param in the stream, which must support mark and reset
         *
         * @return the XML Declaration; or null if none has been
         * specified.
         *
         * @throws IOException
         */
        public static XMLDeclaration readXMLDeclaration(
            InputStream in
        ) throws IOException {
            return readXMLDeclaration(
                new ASCIIReader(in)
            );
        }

        /**
         * Consume the XML Declaration and return it or reset the reader otherwise.
         * <p>
         * At most {@link #READ_AHEAD_LIMIT} characters are inspected.
         *
         * @param in the reader, which must support mark and reset
         *
         * @return the XML Declaration; or null if none has been
         * specified.
         *
         * @throws IOException
         */
        public static XMLDeclaration readXMLDeclaration(
            Reader in
        ) throws IOException {
            in.mark(READ_AHEAD_LIMIT);
            try {
                XMLDeclaration reply = parse(in);
                if (reply == null) {
                    in.reset();
                }
                return reply;
            } catch (IOException exception) {
                resetQuietly(in);
                throw exception;
            } catch (RuntimeException exception) {
                resetQuietly(in);
                throw exception;
            }
        }

        /**
         * Parse the XML declaration without taking care of mark and reset.
         *
         * @param in the reader positioned at its marked position
         *
         * @return the XML Declaration; or null if the input does not start
         * with a well-formed XML declaration
         *
         * @throws IOException
         */
        private static XMLDeclaration parse(
            Reader in
        ) throws IOException {
            boolean declared =
                in.read() == '<' &&
                in.read() == '?' &&
                in.read() == 'x' &&
                in.read() == 'm' &&
                in.read() == 'l';
            if (!declared) {
                return null;
            }
            StringBuilder attributes = new StringBuilder();
            // Five characters are consumed already; never read beyond the
            // read ahead limit as this could invalidate the mark.
            for (int consumed = 5; consumed < READ_AHEAD_LIMIT; consumed++) {
                int c = in.read();
                if (c <= 0) {
                    return null; // end of stream or non-ASCII character
                }
                if (c == '>') {
                    Matcher matcher = XML_DECLARATION_PATTERN.matcher(attributes);
                    if (!matcher.matches()) {
                        return null;
                    }
                    XMLDeclaration reply = new XMLDeclaration();
                    reply.setQuotedVersion(matcher.group(1));
                    reply.setQuotedEncoding(matcher.group(3));
                    reply.setQuotedStandalone(matcher.group(5));
                    return reply;
                }
                attributes.append((char) c);
            }
            return null;
        }

        /**
         * Best effort reset used while another exception is propagated.
         *
         * @param in the reader to be reset
         */
        private static void resetQuietly(
            Reader in
        ) {
            try {
                in.reset();
            } catch (IOException ignored) {
                // The exception which led here is the one to be reported
            }
        }


        //------------------------------------------------------------------------
        // Extends Object
        //------------------------------------------------------------------------

        /**
         * Renders the XML declaration, e.g.
         * {@code <?xml version="1.0" encoding="UTF-8"?>}. The optional
         * attributes are omitted if they are null.
         *
         * @see java.lang.Object#toString()
         */
        @Override
        public String toString() {
            StringBuilder b = new StringBuilder(
                "<?xml version=\""
            ).append(
                this.version
            ).append(
                '"'
            );
            if (this.encoding != null) {
                b.append(" encoding=\"").append(this.encoding).append('"');
            }
            if (this.standalone != null) {
                b.append(" standalone=\"").append(this.standalone).append('"');
            }
            return b.append("?>").toString();
        }

    }


    //------------------------------------------------------------------------
    // Class ASCIIReader
    //------------------------------------------------------------------------

    /**
     * ASCII Reader
     * <p>
     * This InputStream Reader is able to read ASCII
     * characters encoded in any of the following formats provided the stream
     * does not contain a byte order mark or any other non-ASCII character up
     * to the position it is read through the ASCIIReader.
     * <ul>
     * <li>US-ASCII</li>
     * <li>ISO-8859-1</li>
     * <li>UTF-8</li>
     * <li>UTF-16</li>
     * <li>UTF-32</li>
     * </ul>
     * The ASCIIReader is designed not to read ahead: the number of zero bytes
     * preceding and following each character is derived from the first two
     * characters and applied to all the following ones.
     */
    protected static class ASCIIReader extends Reader {

        /**
         * Constructor
         *
         * @param in the stream to read the ASCII characters from
         */
        protected ASCIIReader(
            InputStream in
        ) {
            this.in = in;
        }

        /**
         * The underlying stream; null once the reader is closed
         */
        protected InputStream in;

        /**
         * The number of zero bytes preceding each character; or -1 if unknown
         */
        private int prefix0 = -1;

        /**
         * The number of zero bytes following each character; or -1 if unknown
         */
        private int suffix0 = -1;

        /**
         * Close disconnects the reader from the underlying
         * InputStream rather than closing it.
         *
         * @exception IOException
         */
        @Override
        public void close() throws IOException {
            this.in = null;
        }

        /* (non-Javadoc)
         * @see java.io.Reader#read(char[], int, int)
         */
        @Override
        public int read(
            char[] cbuf,
            int off,
            int len
        ) throws IOException {
            int count = 0;
            while (count < len) {
                int c = read();
                if (c < 0) {
                    return count == 0 ? -1 : count;
                }
                cbuf[off + count++] = (char) c;
            }
            return len;
        }

        /**
         * Read a single character.
         *
         * @return the character read; 0 if no ASCII character could be
         * determined; or -1 at the end of the stream
         */
        @Override
        public int read(
        ) throws IOException {
            if (isEncodingKnown()) {
                discard(this.prefix0);
                int c = this.in.read();
                discard(this.suffix0);
                return c > 127 ? 0 : c;
            }
            boolean prefixUnknown = this.prefix0 < 0;
            for (int zeros = 0; zeros < 4; zeros++) {
                int c = this.in.read();
                if (c < 0) {
                    return -1; // nothing can be learned from the end of the stream
                }
                if (c != 0) {
                    if (prefixUnknown) {
                        // Leading zeros imply a big-endian layout without trailing zeros
                        this.prefix0 = zeros;
                        if (zeros > 0) {
                            this.suffix0 = 0;
                        }
                    } else {
                        // The zeros seen belong to the PREVIOUS character; the
                        // current character's own trailing zeros have to be
                        // consumed as well to stay aligned.
                        this.suffix0 = zeros;
                        discard(zeros);
                    }
                    return c;
                }
            }
            return 0;
        }

        /**
         * Consume the given number of bytes from the underlying stream.
         *
         * @param count the number of bytes to be consumed
         *
         * @throws IOException
         */
        private void discard(
            int count
        ) throws IOException {
            for (int i = count; i > 0; i--) {
                this.in.read();
            }
        }

        /**
         * Tells whether the encoding is already known
         *
         * @return true if the encoding is already known
         */
        private boolean isEncodingKnown() {
            return this.prefix0 >= 0 && this.suffix0 >= 0;
        }

        /**
         * Determine the maximal number of bytes per character
         *
         * @return the maximal number of bytes per character
         */
        private int maxBytesPerCharacter(
        ) {
            return isEncodingKnown() ? 1 + this.prefix0 + this.suffix0 : 4;
        }

        /* (non-Javadoc)
         * @see java.io.Reader#mark(int)
         */
        @Override
        public void mark(int readAheadLimit) throws IOException {
            this.in.mark(
                readAheadLimit * maxBytesPerCharacter()
            );
        }

        /* (non-Javadoc)
         * @see java.io.Reader#markSupported()
         */
        @Override
        public boolean markSupported() {
            return this.in.markSupported();
        }

        /* (non-Javadoc)
         * @see java.io.Reader#reset()
         */
        @Override
        public void reset() throws IOException {
            this.in.reset();
        }

    }


    //------------------------------------------------------------------------
    // Class Encodings
    //------------------------------------------------------------------------

    /**
     * Canonical Encoding Names for <code>java.io</code> and
     * <code>java.lang</code> API.
     */
    protected static class Encodings {

        /**
         * Eight-bit UCS Transformation Format
         *
         * @since JRE 1.2
         */
        protected final static String UTF_8 = "UTF8";

        /**
         * American Standard Code for Information Interchange
         *
         * @since JRE 1.2
         */
        protected final static String US_ASCII = "ASCII";

        /**
         * ISO 8859-1, Latin Alphabet No. 1
         *
         * @since JRE 1.2
         */
        protected final static String ISO_8859_1 = "ISO8859_1";

        /**
         * Sixteen-bit Unicode Transformation Format, big-endian byte order, with byte-order mark
         *
         * @since JRE 1.2
         */
        protected final static String UTF_16BE_WITH_BOM = "UnicodeBig";

        /**
         * Sixteen-bit Unicode Transformation Format, little-endian byte order, with byte-order mark
         *
         * @since JRE 1.2
         */
        protected final static String UTF_16LE_WITH_BOM = "UnicodeLittle";

        /**
         * Windows Latin-1
         *
         * @since JRE 1.2
         */
        protected final static String WINDOWS_1252 = "Cp1252";

        /**
         * Sixteen-bit Unicode Transformation Format, big-endian byte order
         *
         * @since JRE 1.3
         */
        protected final static String UTF_16BE = "UnicodeBigUnmarked";

        /**
         * Sixteen-bit Unicode Transformation Format, little-endian byte order
         *
         * @since JRE 1.3
         */
        protected final static String UTF_16LE = "UnicodeLittleUnmarked";

        /**
         * Sixteen-bit UCS Transformation Format, byte order identified by
         * <ul>
         * <li>a mandatory initial byte-order mark (since JRE 1.3)</li>
         * <li>an optional byte-order mark (since JRE 1.4)</li>
         * </ul>
         */
        protected final static String UTF_16 = "UTF-16";

        /**
         * Thirtytwo-bit UCS Transformation Format, byte order identified by an optional byte-order mark.
         */
        protected final static String UTF_32 = "UTF-32";

        /**
         * Thirtytwo-bit UCS Transformation Format, big-endian byte order
         */
        protected final static String UTF_32BE = "UTF-32BE";

        /**
         * Thirtytwo-bit UCS Transformation Format, little-endian byte order
         */
        protected final static String UTF_32LE = "UTF-32LE";

    }


    //------------------------------------------------------------------------
    // Class ByteOrderMark
    //------------------------------------------------------------------------

    /**
     * Byte Order Mark
     * <p>
     * The exact bytes
     * comprising the BOM will be whatever the Unicode character FEFF is
     * converted into by that transformation format. In that form, the BOM
     * serves to indicate both that it is a Unicode file, and which of the
     * formats it is in. Examples:
     * <table>
     *   <tr><th>Bytes</th><th>Encoding Form</th></tr>
     *   <tr><td>EF BB BF</td><td>UTF-8</td></tr>
     *   <tr><td>00 00 FE FF</td><td>UTF-32, big-endian</td></tr>
     *   <tr><td>FF FE 00 00</td><td>UTF-32, little-endian</td></tr>
     *   <tr><td>FE FF</td><td>UTF-16, big-endian</td></tr>
     *   <tr><td>FF FE</td><td>UTF-16, little-endian</td></tr>
     * </table>
     */
    protected static class ByteOrderMark {

        /**
         * Constructor
         */
        private ByteOrderMark() {
            // Avoid instantiation
        }

        /**
         * The Unicode character point used as byte order mark.
         */
        public final static char VALUE = 0xFEFF;

        /**
         * Each <code>ENCODINGS</code> entry corresponds to a
         * <code>REPRESENTATIONS</code> entry.
         */
        final public static String[] ENCODINGS = {
            Encodings.UTF_8,
            Encodings.UTF_32BE,
            Encodings.UTF_32LE,
            Encodings.UTF_16BE,
            Encodings.UTF_16LE
        };

        /**
         * Each <code>REPRESENTATIONS</code> entry corresponds to an
         * <code>ENCODINGS</code> entry. The UTF-32 representations precede
         * the UTF-16 ones because the UTF-16 little-endian mark is a prefix
         * of the UTF-32 little-endian one.
         */
        final public static byte[][] REPRESENTATIONS = new byte[][]{
            new byte[]{(byte)0xEF, (byte)0xBB, (byte)0xBF},
            new byte[]{(byte)0x00, (byte)0x00, (byte)0xFE, (byte)0xFF},
            new byte[]{(byte)0xFF, (byte)0xFE, (byte)0x00, (byte)0x00},
            new byte[]{(byte)0xFE, (byte)0xFF},
            new byte[]{(byte)0xFF, (byte)0xFE}
        };

        /**
         * Consume the input stream's byte order mark if any and return the
         * corresponding encoding or reset the input stream otherwise.
         *
         * @param in the input stream, which must support mark and reset
         *
         * @return the byte order mark's encoding; or null in absence
         * of a byte order mark.
         *
         * @throws IOException
         */
        public static String readByteOrderMark(
            InputStream in
        ) throws IOException {
            byte[] head = new byte[4];
            in.mark(head.length);
            // A single read may legitimately deliver less than requested
            int limit = 0;
            while (limit < head.length) {
                int n = in.read(head, limit, head.length - limit);
                if (n < 0) {
                    break;
                }
                limit += n;
            }
            in.reset();
            for (int index = 0; index < REPRESENTATIONS.length; index++) {
                byte[] bom = REPRESENTATIONS[index];
                if (startsWith(head, limit, bom)) {
                    // Consume exactly the byte order mark
                    for (int i = 0; i < bom.length; i++) {
                        in.read();
                    }
                    return ENCODINGS[index];
                }
            }
            return null;
        }

        /**
         * Tells whether the first bytes of the head are equal to the given
         * byte order mark.
         *
         * @param head the bytes read from the stream
         * @param limit the number of valid bytes in head
         * @param bom the byte order mark to be compared with
         *
         * @return true if head starts with the given byte order mark
         */
        private static boolean startsWith(
            byte[] head,
            int limit,
            byte[] bom
        ) {
            if (limit < bom.length) {
                return false;
            }
            for (int i = 0; i < bom.length; i++) {
                if (bom[i] != head[i]) {
                    return false;
                }
            }
            return true;
        }

    }

}
/*
 * Byte order marks and the encoding forms they identify:
 *
 * Bytes          Encoding Form
 * EF BB BF       UTF-8
 * 00 00 FE FF    UTF-32, big-endian
 * FF FE 00 00    UTF-32, little-endian
 * FE FF          UTF-16, big-endian
 * FF FE          UTF-16, little-endian
 */