com.sun.org.apache.xml.internal.serializer.EncodingInfo Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of jaxp-ri Show documentation
Java API for XML Processing Reference Implementation
The newest version!
/*
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
 *
 * Copyright (c) 1997-2010 Oracle and/or its affiliates. All rights reserved.
 *
 * The contents of this file are subject to the terms of either the GNU
 * General Public License Version 2 only ("GPL") or the Common Development
 * and Distribution License("CDDL") (collectively, the "License").  You
 * may not use this file except in compliance with the License.  You can
 * obtain a copy of the License at
 * https://glassfish.dev.java.net/public/CDDL+GPL_1_1.html
 * or packager/legal/LICENSE.txt.  See the License for the specific
 * language governing permissions and limitations under the License.
 *
 * When distributing the software, include this License Header Notice in each
 * file and include the License file at packager/legal/LICENSE.txt.
 *
 * GPL Classpath Exception:
 * Oracle designates this particular file as subject to the "Classpath"
 * exception as provided by Oracle in the GPL Version 2 section of the License
 * file that accompanied this code.
 *
 * Modifications:
 * If applicable, add the following below the License Header, with the fields
 * enclosed by brackets [] replaced by your own identifying information:
 * "Portions Copyright [year] [name of copyright owner]"
 *
 * Contributor(s):
 * If you wish your version of this file to be governed by only the CDDL or
 * only the GPL Version 2, indicate your decision by adding "[Contributor]
 * elects to include this software in this distribution under the [CDDL or GPL
 * Version 2] license."  If you don't indicate a single choice of license, a
 * recipient has the option to distribute your version of this file under
 * either the CDDL, the GPL Version 2 or to extend the choice of license to
 * its licensees as provided above.  However, if you add GPL Version 2 code
 * and therefore, elected the GPL Version 2 license, then the option applies
 * only if the new code is made subject to such option by the copyright
 * holder.
 *
 *
 * This file incorporates work covered by the following copyright and
 * permission notice:
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * $Id: EncodingInfo.java,v 1.9 2010-11-01 04:34:44 joehw Exp $
 */
package com.sun.org.apache.xml.internal.serializer;

import java.io.UnsupportedEncodingException;

/**
 * Holds information about a given encoding, which is the Java name for the
 * encoding, the equivalent ISO name.
 * 
 * An object of this type has two useful methods
 * 
 * isInEncoding(char ch);
 * 
 * which can be called if the character is not the high one in
 * a surrogate pair and:
 *  * isInEncoding(char high, char low);
 * 
 * which can be called if the two characters from a high/low surrogate pair.
 * 
 * An EncodingInfo object is a node in a binary search tree. Such a node
 * will answer if a character is in the encoding, and do so for a given
 * range of unicode values (m_first to
 * m_last). It will handle a certain range of values
 * explicitly (m_explFirst to m_explLast).
 * If the unicode point is before that explicit range, that is it
 * is in the range m_first <= value < m_explFirst, then it will delegate to another EncodingInfo object for The root
 * of such a tree, m_before.  Likewise for values in the range 
 * m_explLast < value <= m_last, but delgating to m_after
 * 

 * Actually figuring out if a code point is in the encoding is expensive. So the
 * purpose of this tree is to cache such determinations, and not to build the
 * entire tree of information at the start, but only build up as much of the 
 * tree as is used during the transformation.
 * 

 * This Class is not a public API, and should only be used internally within
 * the serializer.
 * 
 * @xsl.usage internal
 */
public final class EncodingInfo extends Object
{

    /**
     * The ISO encoding name.
     */
    final String name;

    /**
     * The name used by the Java convertor.
     */
    final String javaName;

    /**
     * A helper object that we can ask if a
     * single char, or a surrogate UTF-16 pair
     * of chars that form a single character,
     * is in this encoding.
     */
    private InEncoding m_encoding;
    
    /**
     * This is not a public API. It returns true if the
     * char in question is in the encoding.
     * @param ch the char in question.
     * @xsl.usage internal
     */
    public boolean isInEncoding(char ch) {
        if (m_encoding == null) {
            m_encoding = new EncodingImpl();
            
            // One could put alternate logic in here to
            // instantiate another object that implements the
            // InEncoding interface. For example if the JRE is 1.4 or up
            // we could have an object that uses JRE 1.4 methods
        }
        return m_encoding.isInEncoding(ch); 
    }
    
    /**
     * This is not a public API. It returns true if the
     * character formed by the high/low pair is in the encoding.
     * @param high a char that the a high char of a high/low surrogate pair.
     * @param low a char that is the low char of a high/low surrogate pair.
     * @xsl.usage internal
     */
    public boolean isInEncoding(char high, char low) {
        if (m_encoding == null) {
            m_encoding = new EncodingImpl();
            
            // One could put alternate logic in here to
            // instantiate another object that implements the
            // InEncoding interface. For example if the JRE is 1.4 or up
            // we could have an object that uses JRE 1.4 methods
        }
        return m_encoding.isInEncoding(high, low); 
    }

    /**
     * Create an EncodingInfo object based on the ISO name and Java name.
     * If both parameters are null any character will be considered to
     * be in the encoding. This is useful for when the serializer is in
     * temporary output state, and has no assciated encoding.
     *
     * @param name reference to the ISO name.
     * @param javaName reference to the Java encoding name.
     */
    public EncodingInfo(String name, String javaName)
    {

        this.name = name;
        this.javaName = javaName;
    }
    
    
    
    /**
     * A simple interface to isolate the implementation.
     * We could also use some new JRE 1.4 methods in another implementation
     * provided we use reflection with them.
     * 

     * This interface is not a public API,
     * and should only be used internally within the serializer. 
     * @xsl.usage internal
     */
    private interface InEncoding {
        /**
         * Returns true if the char is in the encoding
         */
        public boolean isInEncoding(char ch);
        /**
         * Returns true if the high/low surrogate pair forms
         * a character that is in the encoding.
         */
        public boolean isInEncoding(char high, char low);
    }

    /**
     * This class implements the 
     */
    private class EncodingImpl implements InEncoding {
        


        public boolean isInEncoding(char ch1) {
            final boolean ret;
            int codePoint = Encodings.toCodePoint(ch1);
            if (codePoint < m_explFirst) {
                // The unicode value is before the range
                // that we explictly manage, so we delegate the answer.
                
                // If we don't have an m_before object to delegate to, make one.
                if (m_before == null)
                    m_before =
                        new EncodingImpl(
                            m_encoding,
                            m_first,
                            m_explFirst - 1,
                            codePoint);
                ret = m_before.isInEncoding(ch1);
            } else if (m_explLast < codePoint) {
                // The unicode value is after the range
                // that we explictly manage, so we delegate the answer.
                
                // If we don't have an m_after object to delegate to, make one.
                if (m_after == null)
                    m_after =
                        new EncodingImpl(
                            m_encoding,
                            m_explLast + 1,
                            m_last,
                            codePoint);
                ret = m_after.isInEncoding(ch1);
            } else {
                // The unicode value is in the range we explitly handle
                final int idx = codePoint - m_explFirst;
                
                // If we already know the answer, just return it.
                if (m_alreadyKnown[idx])
                    ret = m_isInEncoding[idx];
                else {
                    // We don't know the answer, so find out,
                    // which may be expensive, then cache the answer 
                    ret = inEncoding(ch1, m_encoding);
                    m_alreadyKnown[idx] = true;
                    m_isInEncoding[idx] = ret;
                }
            }
            return ret;
        }

        public boolean isInEncoding(char high, char low) {
            final boolean ret;
            int codePoint = Encodings.toCodePoint(high,low);
            if (codePoint < m_explFirst) {
                // The unicode value is before the range
                // that we explictly manage, so we delegate the answer.
                
                // If we don't have an m_before object to delegate to, make one.
                if (m_before == null)
                    m_before =
                        new EncodingImpl(
                            m_encoding,
                            m_first,
                            m_explFirst - 1,
                            codePoint);
                ret = m_before.isInEncoding(high,low);
            } else if (m_explLast < codePoint) {
                // The unicode value is after the range
                // that we explictly manage, so we delegate the answer.
                
                // If we don't have an m_after object to delegate to, make one.
                if (m_after == null)
                    m_after =
                        new EncodingImpl(
                            m_encoding,
                            m_explLast + 1,
                            m_last,
                            codePoint);
                ret = m_after.isInEncoding(high,low);
            } else {
                // The unicode value is in the range we explitly handle
                final int idx = codePoint - m_explFirst;
                
                // If we already know the answer, just return it.
                if (m_alreadyKnown[idx])
                    ret = m_isInEncoding[idx];
                else {
                    // We don't know the answer, so find out,
                    // which may be expensive, then cache the answer 
                    ret = inEncoding(high, low, m_encoding);
                    m_alreadyKnown[idx] = true;
                    m_isInEncoding[idx] = ret;
                }
            }
            return ret;
        }

        /**
         * The encoding.
         */
        final private String m_encoding;
        /**
         * m_first through m_last is the range of unicode
         * values that this object will return an answer on.
         * It may delegate to a similar object with a different
         * range
         */
        final private int m_first;
        
        /**
         * m_explFirst through m_explLast is the range of unicode
         * value that this object handles explicitly and does not
         * delegate to a similar object.
         */
        final private int m_explFirst;
        final private int m_explLast;
        final private int m_last;

        /**
         * The object, of the same type as this one,
         * that handles unicode values in a range before
         * the range explictly handled by this object, and
         * to which this object may delegate.
         */
        private InEncoding m_before;
        /**
         * The object, of the same type as this one,
         * that handles unicode values in a range after
         * the range explictly handled by this object, and
         * to which this object may delegate.
         */
        private InEncoding m_after;
        
        /**
         * The number of unicode values explicitly handled
         * by a single EncodingInfo object. This value is 
         * tuneable, but is set to 128 because that covers the
         * entire low range of ASCII type chars within a single
         * object.
         */
        private static final int RANGE = 128;

        /**
         * A flag to record if we already know the answer
         * for the given unicode value.
         */
        final private boolean m_alreadyKnown[] = new boolean[RANGE];
        /**
         * A table holding the answer on whether the given unicode
         * value is in the encoding.
         */
        final private boolean m_isInEncoding[] = new boolean[RANGE];
        
        private EncodingImpl() {
            // This object will answer whether any unicode value
            // is in the encoding, it handles values 0 through Integer.MAX_VALUE
            this(javaName, 0, Integer.MAX_VALUE, (char) 0);
        }

        private EncodingImpl(String encoding, int first, int last, int codePoint) {
            // Set the range of unicode values that this object manages
            // either explicitly or implicitly.
            m_first = first;
            m_last = last;  
                      
            // Set the range of unicode values that this object 
            // explicitly manages. Align the explicitly managed values
            // to RANGE so multiple EncodingImpl objects dont manage the same 
            // values.
            m_explFirst = codePoint / RANGE * RANGE;
            m_explLast = m_explFirst + (RANGE-1);
            
            m_encoding = encoding;
            
            if (javaName != null)
            {
                // Some optimization.
                if (0 <= m_explFirst && m_explFirst <= 127) {
                    // This particular EncodingImpl explicitly handles
                    // characters in the low range.
                    if ("UTF8".equals(javaName)
                        || "UTF-16".equals(javaName)
                        || "ASCII".equals(javaName)
                        || "US-ASCII".equals(javaName)
                        || "Unicode".equals(javaName)
                        || "UNICODE".equals(javaName)
                        || javaName.startsWith("ISO8859")) {
                        
                        // Not only does this EncodingImpl object explicitly
                        // handle chracters in the low range, it is
                        // also one that we know something about, without
                        // needing to call inEncoding(char ch, String encoding)
                        // for this low range
                        //
                        // By initializing the table ahead of time
                        // for these low values, we prevent the expensive
                        // inEncoding(char ch, String encoding)
                        // from being called, at least for these common
                        // encodings.
                        for (int unicode = 1; unicode < 127; unicode++) {
                            final int idx = unicode - m_explFirst;
                            if (0 <= idx && idx < RANGE) {
                                m_alreadyKnown[idx] = true;
                                m_isInEncoding[idx] = true;
                            }
                        }
                    }
                }

                /* A little bit more than optimization.
                 * 
                 * We will say that any character is in the encoding if
                 * we don't have an encoding.
                 * This is meaningful when the serializer is being used
                 * in temporary output state, where we are not writing to
                 * the final output tree.  It is when writing to the
                 * final output tree that we need to worry about the output
                 * encoding
                 */
                if (javaName == null) {
                    for (int idx = 0; idx < m_alreadyKnown.length; idx++) {
                        m_alreadyKnown[idx] = true;
                        m_isInEncoding[idx] = true;
                    }
                }
            }
        }
    }

    /**
     * This is heart of the code that determines if a given character
     * is in the given encoding. This method is probably expensive,
     * and the answer should be cached.
     * 

     * This method is not a public API,
     * and should only be used internally within the serializer.
     * @param ch the char in question, that is not a high char of
     * a high/low surrogate pair.
     * @param encoding the Java name of the enocding.
     * 
     * @xsl.usage internal
     * 
     */
    private static boolean inEncoding(char ch, String encoding) {
        boolean isInEncoding;
        try {
            char cArray[] = new char[1];
            cArray[0] = ch;
            // Construct a String from the char 
            String s = new String(cArray);
            // Encode the String into a sequence of bytes 
            // using the given, named charset. 
            byte[] bArray = s.getBytes(encoding);
            isInEncoding = inEncoding(ch, bArray);

        } catch (Exception e) {
            isInEncoding = false;
            
            // If for some reason the encoding is null, e.g.
            // for a temporary result tree, we should just
            // say that every character is in the encoding.
            if (encoding == null)
            	isInEncoding = true;
        }
        return isInEncoding;
    }
    
    /**
     * This is heart of the code that determines if a given high/low
     * surrogate pair forms a character that is in the given encoding.
     * This method is probably expensive, and the answer should be cached. 
     * 
     * This method is not a public API,
     * and should only be used internally within the serializer.
     * @param high the high char of
     * a high/low surrogate pair.
     * @param low the low char of a high/low surrogate pair.
     * @param encoding the Java name of the encoding.
     * 
     * @xsl.usage internal
     * 
     */ 
    private static boolean inEncoding(char high, char low, String encoding) {
        boolean isInEncoding;
        try {
            char cArray[] = new char[2];
            cArray[0] = high;
            cArray[1] = low;
            // Construct a String from the char 
            String s = new String(cArray);
            // Encode the String into a sequence of bytes 
            // using the given, named charset. 
            byte[] bArray = s.getBytes(encoding);
            isInEncoding = inEncoding(high,bArray);
        } catch (Exception e) {
            isInEncoding = false;
        }
        
        return isInEncoding;
    } 
    
    /**
     * This method is the core of determining if character
     * is in the encoding. The method is not foolproof, because
     * s.getBytes(encoding) has specified behavior only if the
     * characters are in the specified encoding. However this
     * method tries it's best.
     * @param ch the char that was converted using getBytes, or
     * the first char of a high/low pair that was converted.
     * @param data the bytes written out by the call to s.getBytes(encoding);
     * @return true if the character is in the encoding.
     */
    private static boolean inEncoding(char ch, byte[] data) {
        final boolean isInEncoding;
        // If the string written out as data is not in the encoding,
        // the output is not specified according to the documentation
        // on the String.getBytes(encoding) method,
        // but we do our best here.        
        if (data==null || data.length == 0) {
            isInEncoding = false;
        }
        else {
            if (data[0] == 0)
                isInEncoding = false;
            else if (data[0] == '?' && ch != '?')
                isInEncoding = false;
            /*
             * else if (isJapanese) {
             *   // isJapanese is really 
             *   //   (    "EUC-JP".equals(javaName) 
             *   //    ||  "EUC_JP".equals(javaName)
             *  //     ||  "SJIS".equals(javaName)   )
             * 
             *   // Work around some bugs in JRE for Japanese
             *   if(data[0] == 0x21)
             *     isInEncoding = false;
             *   else if (ch == 0xA5)
             *     isInEncoding = false;
             *   else
             *     isInEncoding = true;
             * }
             */ 
                
            else {
                // We don't know for sure, but it looks like it is in the encoding
                isInEncoding = true; 
            }
        }
        return isInEncoding;
    }

}