// org.apache.xml.serializer.CharInfo Maven / Gradle / Ivy
//
// Go to download
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the  "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * $Id: CharInfo.java 468654 2006-10-28 07:09:23Z minchau $
 */
package org.apache.xml.serializer;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.PropertyResourceBundle;
import java.util.ResourceBundle;
import java.security.AccessController;
import java.security.PrivilegedAction;

import javax.xml.transform.TransformerException;

import org.apache.xml.serializer.utils.MsgKey;
import org.apache.xml.serializer.utils.SystemIDResolver;
import org.apache.xml.serializer.utils.Utils;
import org.apache.xml.serializer.utils.WrappedRuntimeException;

/**
 * This class provides services that tell if a character should have
 * special treatment, such as entity reference substitution or normalization
 * of a newline character.  It also provides character to entity reference
 * lookup.
 *
 * DEVELOPERS: See Known Issue in the constructor.
 * 
 * @xsl.usage internal
 */
final class CharInfo
{
    /**
     * Given a character, lookup a String to output (e.g. a decorated entity reference).
     * Keys are CharKey objects wrapping the character; values are the output Strings.
     * Not created by the base constructor; it is assigned by the entity-loading
     * constructor or by mutableCopyOf(CharInfo).
     */
    private HashMap m_charToString;

    /**
     * The name of the HTML entities file.
     * If specified, the file will be resource loaded with the default class loader.
     */
    public static final String HTML_ENTITIES_RESOURCE = 
                SerializerBase.PKG_NAME+".HTMLEntities";

    /**
     * The name of the XML entities file.
     * If specified, the file will be resource loaded with the default class loader.
     */
    public static final String XML_ENTITIES_RESOURCE = 
                SerializerBase.PKG_NAME+".XMLEntities";

    /** The horizontal tab character, which the parser should always normalize. */
    static final char S_HORIZONAL_TAB = 0x09;

    /** The linefeed character, which the parser should always normalize. */
    static final char S_LINEFEED = 0x0A;

    /** The carriage return character, which the parser should always normalize. */
    static final char S_CARRIAGERETURN = 0x0D;
    /** The space character (0x20). */
    static final char S_SPACE = 0x20;
    /** The double quotation mark character (0x22). */
    static final char S_QUOTE = 0x22;
    /** The less-than character (0x3C). */
    static final char S_LT = 0x3C;
    /** The greater-than character (0x3E). */
    static final char S_GT = 0x3E;
    /** The next-line (NEL) control character (0x85). */
    static final char S_NEL = 0x85;    
    /** The Unicode line separator character (0x2028). */
    static final char S_LINE_SEPARATOR = 0x2028;
    
    /** This flag is an optimization for HTML entities. It is false if entities 
     * other than quot (34), amp (38), lt (60) and gt (62) are defined
     * in the range 0 to 127.
     * @xsl.usage internal
     */    
    boolean onlyQuotAmpLtGt;
    
    /**
     * Size of the two boolean lookup arrays below: characters with values
     * 0, 1 ... ASCII_MAX-1 are answered from those arrays.
     */
    static final int ASCII_MAX = 128;
    
    /** Array of values is faster access than a set of bits 
     * to quickly check ASCII characters in attribute values,
     * the value is true if the character in an attribute value
     * should be mapped to a String. 
     */
    private final boolean[] shouldMapAttrChar_ASCII;
    
    /** Array of values is faster access than a set of bits 
     * to quickly check ASCII characters in text nodes, 
     * the value is true if the character in a text node
     * should be mapped to a String. 
     */
    private final boolean[] shouldMapTextChar_ASCII;

    /** An array of bits to record if the character is in the set.
     * Every character that gets a mapping is recorded here, but for
     * values below ASCII_MAX the shouldMapAttrChar_ASCII and
     * shouldMapTextChar_ASCII arrays are consulted first because access
     * to their values is common and faster.
     */   
    private final int array_of_bits[];
     
    
    // 5 for 32 bit words,  6 for 64 bit words ...
    /*
     * This constant is used to shift an integer to quickly
     * calculate which element its bit is stored in.
     * 5 for 32 bit words (int) ,  6 for 64 bit words (long)
     */
    private static final int SHIFT_PER_WORD = 5;
    
    /*
     * A mask to get the low order bits which are used to
     * calculate the value of the bit within a given word,
     * that will represent the presence of the integer in the 
     * set.
     * 
     * 0x1F for 32 bit words (int),
     * or 0x3F for 64 bit words (long) 
     */
    private static final int LOW_ORDER_BITMASK = 0x1f;
    
    /*
     * This is used for optimizing the lookup of bits representing
     * the integers in the set. It is the index of the first element
     * in the array array_of_bits[] that is not used.
     */
    private int firstWordNotUsed;


    /**
     * Base constructor that allocates every field except
     * <code>m_charToString</code>.
     *
     * The character-to-String map is deliberately left unset here: the
     * constructor that delegates to this one creates it, and
     * {@link #mutableCopyOf(CharInfo)} installs a clone of the cached map,
     * so building a HashMap here would only produce an object that is
     * immediately replaced.
     */
    private CharInfo() 
    {
        // Per-character lookup tables for the range 0 .. ASCII_MAX-1.
        this.shouldMapAttrChar_ASCII = new boolean[ASCII_MAX];
        this.shouldMapTextChar_ASCII = new boolean[ASCII_MAX];

        // Bit set large enough for every char value; nothing is in it yet.
        this.array_of_bits = createEmptySetOfIntegers(65535);
        this.firstWordNotUsed = 0;

        // Reusable lookup key for m_charToString.
        this.m_charKey = new CharKey();

        // Until an entity outside quot/amp/lt/gt is defined, the fast path holds.
        this.onlyQuotAmpLtGt = true;
    }
    
    /**
     * Builds a CharInfo by loading entity definitions from a resource and
     * then applying output-method specific adjustments to the ASCII tables.
     *
     * The resource is located as follows:
     * <ol>
     * <li>if <code>internal</code> is true, as a properties bundle through
     *     <code>PropertyResourceBundle.getBundle</code>;</li>
     * <li>otherwise (or if that fails) as a stream: through
     *     <code>CharInfo.class.getResourceAsStream</code> when internal, or
     *     through the class loader returned by
     *     <code>ObjectFactory.findClassLoader()</code> (the system class
     *     loader when that is null) when not internal;</li>
     * <li>for non-internal resources only, as a last resort by treating the
     *     name as a URL.</li>
     * </ol>
     *
     * Stream resources are read as UTF-8 text. Empty lines and lines starting
     * with '#' are skipped; other lines are "name code" pairs separated by a
     * single space, where code is a decimal integer.
     *
     * @param entitiesResource name of the entities resource to load
     * @param method the output method; only Method.XML and Method.HTML
     * cause adjustments here
     * @param internal true if the resource should be looked up as one of the
     * serializer's own resources
     * @throws RuntimeException if the stream resource cannot be found or read;
     * the underlying exception is attached as the cause
     */
    private CharInfo(String entitiesResource, String method, boolean internal)
    {
        // call the default constructor to create the fields
        this();
        m_charToString = new HashMap();

        ResourceBundle entities = null;
        boolean noExtraEntities = true;

        if (internal) {
            try {
                // Load entity property files by using PropertyResourceBundle,
                // because of a security issue for applets
                entities = PropertyResourceBundle.getBundle(entitiesResource);
            } catch (Exception ignored) {
                // Deliberately ignored: no bundle by that name, so fall
                // through to loading the resource as a stream below.
            }
        }

        if (entities != null) {
            Enumeration keys = entities.getKeys();
            while (keys.hasMoreElements()) {
                String name = (String) keys.nextElement();
                String value = entities.getString(name);
                int code = Integer.parseInt(value);
                if (defineEntity(name, (char) code)) {
                    noExtraEntities = false;
                }
            }
        } else {
            InputStream is = null;

            try {
                if (internal) {
                    is = CharInfo.class.getResourceAsStream(entitiesResource);
                } else {
                    ClassLoader cl = ObjectFactory.findClassLoader();
                    if (cl == null) {
                        is = ClassLoader.getSystemResourceAsStream(entitiesResource);
                    } else {
                        is = cl.getResourceAsStream(entitiesResource);
                    }

                    if (is == null) {
                        // Last resort for user-specified resources: the name
                        // may be a URL.
                        try {
                            URL url = new URL(entitiesResource);
                            is = url.openStream();
                        } catch (Exception ignored) {
                            // Deliberately ignored: 'is' stays null and the
                            // "could not find" error below is reported.
                        }
                    }
                }

                if (is == null) {
                    throw new RuntimeException(
                        Utils.messages.createMessage(
                            MsgKey.ER_RESOURCE_COULD_NOT_FIND,
                            new Object[] {entitiesResource, entitiesResource}));
                }

                // Fix Bugzilla#4000: force reading in UTF-8.
                // This creates the de facto standard that Xalan's resource
                // files must be encoded in UTF-8.
                //
                // %REVIEW% KNOWN ISSUE: a JVM without UTF-8 support (reported
                // for Microsoft VJ++) ends up in the fallback below, which
                // makes a second attempt using the platform's default
                // encoding. That is adequate as long as the entity files stay
                // within the 7-bit ASCII range.
                BufferedReader reader;
                try {
                    reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
                } catch (UnsupportedEncodingException e) {
                    reader = new BufferedReader(new InputStreamReader(is));
                }

                for (String line = reader.readLine();
                     line != null;
                     line = reader.readLine()) {

                    // skip blank lines and comments
                    if (line.length() == 0 || line.charAt(0) == '#') {
                        continue;
                    }

                    int index = line.indexOf(' ');

                    // the entity name must be at least two characters long
                    if (index > 1) {
                        String name = line.substring(0, index);

                        ++index;

                        if (index < line.length()) {
                            // the code is the token after the first space;
                            // anything after a second space is ignored
                            String value = line.substring(index);
                            index = value.indexOf(' ');

                            if (index > 0) {
                                value = value.substring(0, index);
                            }

                            int code = Integer.parseInt(value);

                            if (defineEntity(name, (char) code)) {
                                noExtraEntities = false;
                            }
                        }
                    }
                }
                // The stream is closed exactly once, in the finally block.
            } catch (Exception e) {
                // Keep the original exception as the cause so that the real
                // failure (I/O error, bad number, ...) is not lost.
                throw new RuntimeException(
                    Utils.messages.createMessage(
                        MsgKey.ER_RESOURCE_COULD_NOT_LOAD,
                        new Object[] { entitiesResource,
                                       e.toString(),
                                       entitiesResource,
                                       e.toString()}),
                    e);
            } finally {
                if (is != null) {
                    try {
                        is.close();
                    } catch (Exception ignored) {
                        // Deliberately ignored: nothing useful can be done
                        // if closing the input stream fails.
                    }
                }
            }
        }

        onlyQuotAmpLtGt = noExtraEntities;

        /* Defining the entities above marked every mapped ASCII character in
         * both the text and the attribute tables. The adjustments below must
         * therefore come after loading, because they clear some of those
         * marks again depending on the output method.
         */
        if (Method.XML.equals(method)) 
        {
            // We choose not to escape the quotation mark as &quot; in text nodes
            shouldMapTextChar_ASCII[S_QUOTE] = false;
        }

        if (Method.HTML.equals(method)) {
            // The XSLT 1.0 recommendation says 
            // "The html output method should not escape < characters occurring in attribute values."
            // So we don't escape '<' in an attribute for HTML
            shouldMapAttrChar_ASCII['<'] = false;

            // We choose not to escape the quotation mark as &quot; in text nodes.
            shouldMapTextChar_ASCII[S_QUOTE] = false;
        }
    }

    /**
     * Defines a new character reference from an entity name and the
     * character it stands for. The name is decorated with a leading '&amp;'
     * and a trailing ';' and the resulting String is registered as the
     * output String for the character.
     *
     * Unlike internal entities, character references are a string to single
     * character mapping. They are used to map non-ASCII characters both on
     * parsing and printing, primarily for HTML documents.
     * {@code "&lt;"} is an example of a character reference.
     *
     * @param name The entity's name
     * @param value The entity's value
     * @return true if the mapping is not one of:
     * <ul>
     * <li>{@code '<'} to {@code "&lt;"}</li>
     * <li>{@code '>'} to {@code "&gt;"}</li>
     * <li>{@code '&'} to {@code "&amp;"}</li>
     * <li>{@code '"'} to {@code "&quot;"}</li>
     * </ul>
     */
    private boolean defineEntity(String name, char value)
    {
        // Decorate the bare entity name, e.g. "lt" becomes "&lt;"
        String entityString = "&" + name + ';';
        return defineChar2StringMapping(entityString, value);
    }

    /**
     * A utility object, just used to map characters to output Strings,
     * needed because a HashMap needs to map an object as a key, not a 
     * Java primitive type, like a char, so this object gets around that
     * and it is reusable.
     *
     * getOutputStringForChar(char) overwrites the character held by this
     * single shared key on every lookup, so concurrent lookups on the same
     * CharInfo instance can interfere with each other.
     */
    private final CharKey m_charKey;

    /**
     * Map a character to a String. For example given
     * the character {@code '<'} this method would return the fully decorated
     * entity name {@code "&lt;"}.
     * Strings for entity references are loaded from a properties file,
     * but additional mappings defined through calls to
     * defineChar2StringMapping() are possible, and a later mapping for the
     * same character replaces an earlier one.
     *
     * The lookup reuses one stored key object in an effort to avoid
     * heap activity. That introduces a threading risk: two threads looking
     * up characters on the same CharInfo at the same time can disturb each
     * other. Options are to synchronize this method or to allocate a new
     * key per call. A long-term solution would be to replace the map with
     * a sparse array keyed directly from the character's integer value.
     *
     * @param value The character that should be resolved to
     * a String, e.g. resolve {@code '<'} to {@code "&lt;"}.
     *
     * @return The String that the character is mapped to, or null if not found.
     * @xsl.usage internal
     */
    String getOutputStringForChar(char value)
    {
        // Point the shared key at the requested character, then look it up.
        m_charKey.setChar(value);
        Object mapped = m_charToString.get(m_charKey);
        return (String) mapped;
    }
    
    /**
     * Tell if the character argument, taken from an attribute value,
     * has a mapping to a String.
     *
     * Values below ASCII_MAX are answered from the attribute lookup array;
     * all other values are answered from the private bit set.
     *
     * @param value the value of a character that is in an attribute value
     * @return true if the character should have any special treatment, 
     * such as when writing out entity references.
     * @xsl.usage internal
     */
    final boolean shouldMapAttrChar(int value)
    {
        // The boolean array is the fast path for common ASCII values; the
        // private bit set (faster and less general than java.util.BitSet)
        // covers everything else.
        return (value < ASCII_MAX)
            ? shouldMapAttrChar_ASCII[value]
            : get(value);
    }    

    /**
     * Tell if the character argument, taken from a text node, has a
     * mapping to a String, for example to map {@code '<'} to
     * {@code "&lt;"}.
     *
     * Values below ASCII_MAX are answered from the text lookup array;
     * all other values are answered from the private bit set.
     *
     * @param value the value of a character that is in a text node
     * @return true if the character has a mapping to a String, 
     * such as when writing out entity references.
     * @xsl.usage internal
     */
    final boolean shouldMapTextChar(int value)
    {
        // The boolean array is the fast path for common ASCII values; the
        // private bit set (faster and less general than java.util.BitSet)
        // covers everything else.
        return (value < ASCII_MAX)
            ? shouldMapTextChar_ASCII[value]
            : get(value);
    }
    

     
    /**
     * Constructs a CharInfo inside an AccessController.doPrivileged call,
     * so that the resource loading done by the constructor runs as a
     * privileged action.
     *
     * @param entitiesFileName name of the entities resource to load
     * @param method the output method, passed through to the constructor
     * @param internal passed through to the constructor
     * @return the newly constructed CharInfo
     */
    private static CharInfo getCharInfoBasedOnPrivilege(
        final String entitiesFileName, final String method, 
        final boolean internal){
        PrivilegedAction createAction = new PrivilegedAction() {
            public Object run() {
                return new CharInfo(entitiesFileName, method, internal);
            }
        };
        Object created = AccessController.doPrivileged(createAction);
        return (CharInfo) created;
    }
     
    /**
     * Factory that reads in a resource file that describes the mapping of
     * characters to entity references.
     *
     * Resource files must be encoded in UTF-8 and have a format like:
     * <pre>
     * # First char # is a comment
     * Entity numericValue
     * quot 34
     * amp 38
     * </pre>
     * (Note: Why don't we just switch to .properties files? Oct-01 -sc)
     *
     * The lookup proceeds in this order, stopping at the first success:
     * <ol>
     * <li>the cache of previously loaded internal resources;</li>
     * <li>loading the name as an internal resource (the result is cached);</li>
     * <li>loading the name as an external resource (not cached);</li>
     * <li>resolving the name to an absolute URI and loading that as an
     *     external resource (not cached).</li>
     * </ol>
     * Cached instances are never handed out directly; callers always get a
     * mutable copy.
     *
     * @param entitiesFileName Name of entities resource file that should
     * be loaded, which describes that mapping of characters to entity references.
     * @param method the output method type, which should be one of "xml", "html", "text"...
     * @return a CharInfo populated from the named resource
     * @throws WrappedRuntimeException if the name cannot be resolved to an
     * absolute URI
     * @throws RuntimeException if the final load attempt fails
     *
     * @xsl.usage internal
     */
    static CharInfo getCharInfo(String entitiesFileName, String method)
    {
        // NOTE(review): the cache is keyed by file name only, while the
        // constructor applies method-specific adjustments; the same entities
        // file requested with a different method gets the adjustments of the
        // first request. Confirm whether callers ever mix methods per file.
        CharInfo charInfo = (CharInfo) m_getCharInfoCache.get(entitiesFileName);
        if (charInfo != null) {
            return mutableCopyOf(charInfo);
        }

        // try to load it internally - cache
        try {
            charInfo = getCharInfoBasedOnPrivilege(entitiesFileName, 
                                        method, true);
            // Put the common copy of charInfo in the cache, but return
            // a copy of it.
            m_getCharInfoCache.put(entitiesFileName, charInfo);
            return mutableCopyOf(charInfo);
        } catch (Exception ignored) {
            // Deliberately ignored: not an internal resource, try the
            // external lookups below.
        }

        // try to load it externally - do not cache
        try {
            return getCharInfoBasedOnPrivilege(entitiesFileName, 
                                method, false);
        } catch (Exception ignored) {
            // Deliberately ignored: the raw name could not be loaded, so
            // retry below with the name resolved to an absolute URI.
        }

        String absoluteEntitiesFileName;

        if (entitiesFileName.indexOf(':') < 0) {
            absoluteEntitiesFileName =
                SystemIDResolver.getAbsoluteURIFromRelative(entitiesFileName);
        } else {
            try {
                absoluteEntitiesFileName =
                    SystemIDResolver.getAbsoluteURI(entitiesFileName, null);
            } catch (TransformerException te) {
                throw new WrappedRuntimeException(te);
            }
        }

        // Use the resolved absolute name; retrying with the original name
        // would only repeat the attempt that has just failed.
        return getCharInfoBasedOnPrivilege(absoluteEntitiesFileName, 
                                method, false);
    }

    /**
     * Create a mutable copy of a cached CharInfo, so that mappings added to
     * the copy do not change the cached instance.
     *
     * The bit set and both ASCII lookup arrays are copied element by
     * element; the character-to-String map is cloned (the keys and values
     * themselves are shared).
     *
     * @param charInfo The cached one.
     * @return a new CharInfo with the same contents as <code>charInfo</code>
     */
    private static CharInfo mutableCopyOf(CharInfo charInfo) {
        // The default constructor allocates the arrays and the reusable
        // lookup key; only the map is missing afterwards.
        CharInfo copy = new CharInfo();

        // scalar state
        copy.onlyQuotAmpLtGt = charInfo.onlyQuotAmpLtGt;
        copy.firstWordNotUsed = charInfo.firstWordNotUsed;

        // bit set of all mapped characters
        System.arraycopy(charInfo.array_of_bits, 0,
                         copy.array_of_bits, 0,
                         charInfo.array_of_bits.length);

        // fast-path tables for the ASCII range
        System.arraycopy(charInfo.shouldMapAttrChar_ASCII, 0,
                         copy.shouldMapAttrChar_ASCII, 0,
                         charInfo.shouldMapAttrChar_ASCII.length);
        System.arraycopy(charInfo.shouldMapTextChar_ASCII, 0,
                         copy.shouldMapTextChar_ASCII, 0,
                         charInfo.shouldMapTextChar_ASCII.length);

        // character to output String mappings
        copy.m_charToString = (HashMap) charInfo.m_charToString.clone();

        return copy;
    }

	/** 
	 * Cache of CharInfo objects, filled by getCharInfo(String, String).
	 * The table maps entity file names (the name of the
	 * property file without the .properties extension)
	 * to CharInfo objects populated with entities defined in 
	 * corresponding property file. Only resources that were loaded
	 * as internal resources are stored here.
	 */
    private static Hashtable m_getCharInfoCache = new Hashtable();

    /**
     * Returns the index of the array element (word) that holds the bit
     * for the given integer.
     *
     * @param i the integer that might be in the set of integers
     * @return <code>i</code> shifted right by SHIFT_PER_WORD
     */
    private static int arrayIndex(int i) {
        int wordIndex = i >> SHIFT_PER_WORD;
        return wordIndex;
    }

    /**
     * For a given integer, returns the single-bit mask that marks its
     * position inside the word that represents it in the set.
     *
     * @param i the integer whose bit mask is wanted
     * @return a value with exactly one bit set, selected by the low order
     * bits of <code>i</code>
     */
    private static int bit(int i) {
        int positionInWord = i & LOW_ORDER_BITMASK;
        return 1 << positionInWord;
    }

    /**
     * Creates a new empty set of integers (characters) and resets the
     * "first word not used" marker.
     *
     * @param max the maximum integer to be in the set.
     * @return a zero-filled array with one word for every group of
     * integers up to the word that holds <code>max - 1</code>
     */
    private int[] createEmptySetOfIntegers(int max) {
        // an empty set uses no words at all (lookup optimization)
        firstWordNotUsed = 0;

        int wordCount = arrayIndex(max - 1) + 1;
        return new int[wordCount];
    }

    /**
     * Adds the integer (character) to the set of integers, and marks it in
     * both ASCII lookup arrays when it lies in the ASCII range.
     *
     * @param i the integer to add to the set, valid values are 
     * 0, 1, 2 ... up to the maximum that was specified at
     * the creation of the set.
     */
    private final void set(int i) {   
        setASCIItextDirty(i);
        setASCIIattrDirty(i); 

        int wordIndex = arrayIndex(i);   // this word is used
        int wordAfter = wordIndex + 1;

        // keep track of the highest word in use, for the lookup optimization
        if (wordAfter > firstWordNotUsed) {
            firstWordNotUsed = wordAfter;
        }

        array_of_bits[wordIndex] |= bit(i);
    }


    /**
     * Return true if the integer (character) is in the set of integers.
     * 
     * The set is stored as an array of ints with 32 bits per int. If a
     * bit is set to 1 the corresponding integer is in the set.
     * 
     * @param i an integer that is tested to see if it is in the
     * set of integers, or not.
     * @return true if <code>i</code> was added to the set
     */
    private final boolean get(int i) {
        int wordIndex = arrayIndex(i);

        // quick test: an integer beyond every word in use cannot be a member
        if (wordIndex >= firstWordNotUsed) {
            return false;
        }

        return (array_of_bits[wordIndex] & bit(i)) != 0;
    }
    
    /**
     * This method returns true if there are some non-standard mappings to
     * entities other than quot, amp, lt, gt, and its only purpose is for
     * performance.
     * @param charToMap The value of the character that is mapped to a String
     * @param outputString The String to which the character is mapped, usually
     * an entity reference such as "<".
     * @return true if the mapping is not one of:
     * 
     *  '<' to "<"
     * 
 '>' to ">"
     * 
 '&' to "&"
     * 
 '"' to """
     * 
     */
    private boolean extraEntity(String outputString, int charToMap)
    {
        boolean extra = false;
        if (charToMap < ASCII_MAX)
        {
            switch (charToMap)
            {
                case '"' : // quot
                	if (!outputString.equals("""))
                		extra = true;  
                	break;
                case '&' : // amp
                	if (!outputString.equals("&"))
                		extra = true;
                	break;
                case '<' : // lt
                	if (!outputString.equals("<"))
                		extra = true;
                	break;
                case '>' : // gt
                	if (!outputString.equals(">"))
                		extra = true;
                    break;
                default : // other entity in range 0 to 127  
                    extra = true;
            }
        }
        return extra;
    }    
    
    /**
     * If the character is in the ASCII range (0 to ASCII_MAX-1) then
     * mark it as needing replacement with a String on output when it
     * occurs in a text node. Other values are left alone.
     *
     * @param j the value of the character to mark
     */
    private void setASCIItextDirty(int j) 
    {
        boolean outsideAscii = (j < 0) || (j >= ASCII_MAX);
        if (outsideAscii)
        {
            return;
        }
        shouldMapTextChar_ASCII[j] = true;
    }
    
    /**
     * If the character is in the ASCII range (0 to ASCII_MAX-1) then
     * mark it as needing replacement with a String on output when it
     * occurs in an attribute value. Other values are left alone.
     *
     * @param j the value of the character to mark
     */
    private void setASCIIattrDirty(int j) 
    {
        boolean outsideAscii = (j < 0) || (j >= ASCII_MAX);
        if (outsideAscii)
        {
            return;
        }
        shouldMapAttrChar_ASCII[j] = true;
    }

    
    /**
     * Call this method to register a char to String mapping, for example
     * to map {@code '<'} to {@code "&lt;"}. A mapping registered earlier
     * for the same character is replaced.
     *
     * @param outputString The String to map to.
     * @param inputChar The char to map from.
     * @return true if the mapping is not one of:
     * <ul>
     * <li>{@code '<'} to {@code "&lt;"}</li>
     * <li>{@code '>'} to {@code "&gt;"}</li>
     * <li>{@code '&'} to {@code "&amp;"}</li>
     * <li>{@code '"'} to {@code "&quot;"}</li>
     * </ul>
     */
    boolean defineChar2StringMapping(String outputString, char inputChar) 
    {
        // A fresh key per mapping: the shared lookup key must not be stored.
        m_charToString.put(new CharKey(inputChar), outputString);

        // mark the character as having a mapping to a String
        set(inputChar);

        return extraEntity(outputString, inputChar);
    }

    /**
     * Simple class for fast lookup of char values, when used with
     * hashtables.  You can set the char, then use it as a key.
     *
     * Two keys are equal when they wrap the same char, and the hash code is
     * the char value itself. The wrapped char is mutable, so a key that is
     * stored in a map must not be changed afterwards.
     *  
     * @xsl.usage internal
     */
    private static class CharKey extends Object
    {

      /** The character wrapped by this key. */
      private char m_char;

      /**
       * Constructor CharKey
       *
       * @param key char value of this object.
       */
      public CharKey(char key)
      {
        m_char = key;
      }
  
      /**
       * Default constructor for a CharKey. The wrapped character keeps its
       * default value until {@link #setChar(char)} is called.
       */
      public CharKey()
      {
      }
  
      /**
       * Set the character wrapped by this key.
       *
       * @param c the new char value of this object.
       */
      public final void setChar(char c)
      {
        m_char = c;
      }



      /**
       * Get the hash value of the character.  
       *
       * @return hash value of the character, which is the char value itself.
       */
      public final int hashCode()
      {
        return (int)m_char;
      }

      /**
       * Override of equals() for this object 
       *
       * @param obj to compare to
       *
       * @return True if obj is a CharKey wrapping the same char value;
       * false for null and for objects of any other type.
       */
      public final boolean equals(Object obj)
      {
        if (this == obj)
        {
          return true;
        }
        // instanceof is false for null, so null and foreign types are
        // rejected instead of raising an exception.
        if (!(obj instanceof CharKey))
        {
          return false;
        }
        return ((CharKey)obj).m_char == m_char;
      }
    }
   

}