All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.xml.serializer.CharInfo Maven / Gradle / Ivy

Go to download

Tool to convert CSV and XLS to XML, to transform XML and to convert XML to CSV, HTML, other text files, PDF etc., useful as command line tool and integrated in other projects.

There is a newer version: 3.119
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the  "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * $Id: CharInfo.java 468654 2006-10-28 07:09:23Z minchau $
 */
package org.apache.xml.serializer;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.PropertyResourceBundle;
import java.util.ResourceBundle;
import java.security.AccessController;
import java.security.PrivilegedAction;

import javax.xml.transform.TransformerException;

import org.apache.xml.serializer.utils.MsgKey;
import org.apache.xml.serializer.utils.SystemIDResolver;
import org.apache.xml.serializer.utils.Utils;
import org.apache.xml.serializer.utils.WrappedRuntimeException;

/**
 * This class provides services that tell if a character should have
 * special treatement, such as entity reference substitution or normalization
 * of a newline character.  It also provides character to entity reference
 * lookup.
 *
 * DEVELOPERS: See Known Issue in the constructor.
 * 
 * @xsl.usage internal
 */
final class CharInfo
{
    /** Given a character, lookup a String to output (e.g. a decorated entity reference). */
    private HashMap m_charToString;

    /**
     * The name of the HTML entities file.
     * If specified, the file will be resource loaded with the default class loader.
     */
    public static final String HTML_ENTITIES_RESOURCE = 
                SerializerBase.PKG_NAME+".HTMLEntities";

    /**
     * The name of the XML entities file.
     * If specified, the file will be resource loaded with the default class loader.
     */
    public static final String XML_ENTITIES_RESOURCE = 
                SerializerBase.PKG_NAME+".XMLEntities";

    /** The horizontal tab character, which the parser should always normalize. */
    static final char S_HORIZONAL_TAB = 0x09;

    /** The linefeed character, which the parser should always normalize. */
    static final char S_LINEFEED = 0x0A;

    /** The carriage return character, which the parser should always normalize. */
    static final char S_CARRIAGERETURN = 0x0D;
    static final char S_SPACE = 0x20;
    static final char S_QUOTE = 0x22;
    static final char S_LT = 0x3C;
    static final char S_GT = 0x3E;
    static final char S_NEL = 0x85;    
    static final char S_LINE_SEPARATOR = 0x2028;
    
    /** This flag is an optimization for HTML entities. It false if entities 
     * other than quot (34), amp (38), lt (60) and gt (62) are defined
     * in the range 0 to 127.
     * @xsl.usage internal
     */    
    boolean onlyQuotAmpLtGt;
    
    /** Copy the first 0,1 ... ASCII_MAX values into an array */
    static final int ASCII_MAX = 128;
    
    /** Array of values is faster access than a set of bits 
     * to quickly check ASCII characters in attribute values,
     * the value is true if the character in an attribute value
     * should be mapped to a String. 
     */
    private final boolean[] shouldMapAttrChar_ASCII;
    
    /** Array of values is faster access than a set of bits 
     * to quickly check ASCII characters in text nodes, 
     * the value is true if the character in a text node
     * should be mapped to a String. 
     */
    private final boolean[] shouldMapTextChar_ASCII;

    /** An array of bits to record if the character is in the set.
     * Although information in this array is complete, the
     * isSpecialAttrASCII array is used first because access to its values
     * is common and faster.
     */   
    private final int array_of_bits[];
     
    
    // 5 for 32 bit words,  6 for 64 bit words ...
    /*
     * This constant is used to shift an integer to quickly
     * calculate which element its bit is stored in.
     * 5 for 32 bit words (int) ,  6 for 64 bit words (long)
     */
    private static final int SHIFT_PER_WORD = 5;
    
    /*
     * A mask to get the low order bits which are used to
     * calculate the value of the bit within a given word,
     * that will represent the presence of the integer in the 
     * set.
     * 
     * 0x1F for 32 bit words (int),
     * or 0x3F for 64 bit words (long) 
     */
    private static final int LOW_ORDER_BITMASK = 0x1f;
    
    /*
     * This is used for optimizing the lookup of bits representing
     * the integers in the set. It is the index of the first element
     * in the array array_of_bits[] that is not used.
     */
    private int firstWordNotUsed;


    /**
     * A base constructor just to explicitly create the fields,
     * with the exception of m_charToString which is handled
     * by the constructor that delegates base construction to this one.
     * 

* m_charToString is not created here only for performance reasons, * to avoid creating a Hashtable that will be replaced when * making a mutable copy, {@link #mutableCopyOf(CharInfo)}. * */ private CharInfo() { this.array_of_bits = createEmptySetOfIntegers(65535); this.firstWordNotUsed = 0; this.shouldMapAttrChar_ASCII = new boolean[ASCII_MAX]; this.shouldMapTextChar_ASCII = new boolean[ASCII_MAX]; this.m_charKey = new CharKey(); // Not set here, but in a constructor that uses this one // this.m_charToString = new Hashtable(); this.onlyQuotAmpLtGt = true; return; } private CharInfo(String entitiesResource, String method, boolean internal) { // call the default constructor to create the fields this(); m_charToString = new HashMap(); ResourceBundle entities = null; boolean noExtraEntities = true; // Make various attempts to interpret the parameter as a properties // file or resource file, as follows: // // 1) attempt to load .properties file using ResourceBundle // 2) try using the class loader to find the specified file a resource // file // 3) try treating the resource a URI if (internal) { try { // Load entity property files by using PropertyResourceBundle, // cause of security issure for applets entities = PropertyResourceBundle.getBundle(entitiesResource); } catch (Exception e) {} } if (entities != null) { Enumeration keys = entities.getKeys(); while (keys.hasMoreElements()){ String name = (String) keys.nextElement(); String value = entities.getString(name); int code = Integer.parseInt(value); boolean extra = defineEntity(name, (char) code); if (extra) noExtraEntities = false; } } else { InputStream is = null; // Load user specified resource file by using URL loading, it // requires a valid URI as parameter try { if (internal) { is = CharInfo.class.getResourceAsStream(entitiesResource); } else { ClassLoader cl = ObjectFactory.findClassLoader(); if (cl == null) { is = ClassLoader.getSystemResourceAsStream(entitiesResource); } else { is = cl.getResourceAsStream(entitiesResource); } if (is == null) { try { URL url = new URL(entitiesResource); is = url.openStream(); } catch (Exception e) {} } } if (is == null) { throw new RuntimeException( Utils.messages.createMessage( MsgKey.ER_RESOURCE_COULD_NOT_FIND, new Object[] {entitiesResource, entitiesResource})); } // Fix Bugzilla#4000: force reading in UTF-8 // This creates the de facto standard that Xalan's resource // files must be encoded in UTF-8. This should work in all // JVMs. // // %REVIEW% KNOWN ISSUE: IT FAILS IN MICROSOFT VJ++, which // didn't implement the UTF-8 encoding. Theoretically, we should // simply let it fail in that case, since the JVM is obviously // broken if it doesn't support such a basic standard. But // since there are still some users attempting to use VJ++ for // development, we have dropped in a fallback which makes a // second attempt using the platform's default encoding. In VJ++ // this is apparently ASCII, which is subset of UTF-8... and // since the strings we'll be reading here are also primarily // limited to the 7-bit ASCII range (at least, in English // versions of Xalan), this should work well enough to keep us // on the air until we're ready to officially decommit from // VJ++. BufferedReader reader; try { reader = new BufferedReader(new InputStreamReader(is, "UTF-8")); } catch (UnsupportedEncodingException e) { reader = new BufferedReader(new InputStreamReader(is)); } String line = reader.readLine(); while (line != null) { if (line.length() == 0 || line.charAt(0) == '#') { line = reader.readLine(); continue; } int index = line.indexOf(' '); if (index > 1) { String name = line.substring(0, index); ++index; if (index < line.length()) { String value = line.substring(index); index = value.indexOf(' '); if (index > 0) { value = value.substring(0, index); } int code = Integer.parseInt(value); boolean extra = defineEntity(name, (char) code); if (extra) noExtraEntities = false; } } line = reader.readLine(); } is.close(); } catch (Exception e) { throw new RuntimeException( Utils.messages.createMessage( MsgKey.ER_RESOURCE_COULD_NOT_LOAD, new Object[] { entitiesResource, e.toString(), entitiesResource, e.toString()})); } finally { if (is != null) { try { is.close(); } catch (Exception except) {} } } } onlyQuotAmpLtGt = noExtraEntities; /* Now that we've used get(ch) just above to initialize the * two arrays we will change by adding a tab to the set of * special chars for XML (but not HTML!). * We do this because a tab is always a * special character in an XML attribute, * but only a special character in XML text * if it has an entity defined for it. * This is the reason for this delay. */ if (Method.XML.equals(method)) { // We choose not to escape the quotation mark as " in text nodes shouldMapTextChar_ASCII[S_QUOTE] = false; } if (Method.HTML.equals(method)) { // The XSLT 1.0 recommendation says // "The html output method should not escape < characters occurring in attribute values." // So we don't escape '<' in an attribute for HTML shouldMapAttrChar_ASCII['<'] = false; // We choose not to escape the quotation mark as " in text nodes. shouldMapTextChar_ASCII[S_QUOTE] = false; } } /** * Defines a new character reference. The reference's name and value are * supplied. Nothing happens if the character reference is already defined. *

Unlike internal entities, character references are a string to single * character mapping. They are used to map non-ASCII characters both on * parsing and printing, primarily for HTML documents. '&lt;' is an * example of a character reference.

* * @param name The entity's name * @param value The entity's value * @return true if the mapping is not one of: *
    *
  • '<' to "<" *
  • '>' to ">" *
  • '&' to "&" *
  • '"' to """ *
*/ private boolean defineEntity(String name, char value) { StringBuffer sb = new StringBuffer("&"); sb.append(name); sb.append(';'); String entityString = sb.toString(); boolean extra = defineChar2StringMapping(entityString, value); return extra; } /** * A utility object, just used to map characters to output Strings, * needed because a HashMap needs to map an object as a key, not a * Java primitive type, like a char, so this object gets around that * and it is reusable. */ private final CharKey m_charKey; /** * Map a character to a String. For example given * the character '>' this method would return the fully decorated * entity name "<". * Strings for entity references are loaded from a properties file, * but additional mappings defined through calls to defineChar2String() * are possible. Such entity reference mappings could be over-ridden. * * This is reusing a stored key object, in an effort to avoid * heap activity. Unfortunately, that introduces a threading risk. * Simplest fix for now is to make it a synchronized method, or to give * up the reuse; I see very little performance difference between them. * Long-term solution would be to replace the hashtable with a sparse array * keyed directly from the character's integer value; see DTM's * string pool for a related solution. * * @param value The character that should be resolved to * a String, e.g. resolve '>' to "<". * * @return The String that the character is mapped to, or null if not found. * @xsl.usage internal */ String getOutputStringForChar(char value) { // CharKey m_charKey = new CharKey(); //Alternative to synchronized m_charKey.setChar(value); return (String) m_charToString.get(m_charKey); } /** * Tell if the character argument that is from * an attribute value has a mapping to a String. * * @param value the value of a character that is in an attribute value * @return true if the character should have any special treatment, * such as when writing out entity references. * @xsl.usage internal */ final boolean shouldMapAttrChar(int value) { // for performance try the values in the boolean array first, // this is faster access than the BitSet for common ASCII values if (value < ASCII_MAX) return shouldMapAttrChar_ASCII[value]; // rather than java.util.BitSet, our private // implementation is faster (and less general). return get(value); } /** * Tell if the character argument that is from a * text node has a mapping to a String, for example * to map '<' to "<". * * @param value the value of a character that is in a text node * @return true if the character has a mapping to a String, * such as when writing out entity references. * @xsl.usage internal */ final boolean shouldMapTextChar(int value) { // for performance try the values in the boolean array first, // this is faster access than the BitSet for common ASCII values if (value < ASCII_MAX) return shouldMapTextChar_ASCII[value]; // rather than java.util.BitSet, our private // implementation is faster (and less general). return get(value); } private static CharInfo getCharInfoBasedOnPrivilege( final String entitiesFileName, final String method, final boolean internal){ return (CharInfo) AccessController.doPrivileged( new PrivilegedAction() { public Object run() { return new CharInfo(entitiesFileName, method, internal);} }); } /** * Factory that reads in a resource file that describes the mapping of * characters to entity references. * * Resource files must be encoded in UTF-8 and have a format like: *
     * # First char # is a comment
     * Entity numericValue
     * quot 34
     * amp 38
     * 
* (Note: Why don't we just switch to .properties files? Oct-01 -sc) * * @param entitiesResource Name of entities resource file that should * be loaded, which describes that mapping of characters to entity references. * @param method the output method type, which should be one of "xml", "html", "text"... * * @xsl.usage internal */ static CharInfo getCharInfo(String entitiesFileName, String method) { CharInfo charInfo = (CharInfo) m_getCharInfoCache.get(entitiesFileName); if (charInfo != null) { return mutableCopyOf(charInfo); } // try to load it internally - cache try { charInfo = getCharInfoBasedOnPrivilege(entitiesFileName, method, true); // Put the common copy of charInfo in the cache, but return // a copy of it. m_getCharInfoCache.put(entitiesFileName, charInfo); return mutableCopyOf(charInfo); } catch (Exception e) {} // try to load it externally - do not cache try { return getCharInfoBasedOnPrivilege(entitiesFileName, method, false); } catch (Exception e) {} String absoluteEntitiesFileName; if (entitiesFileName.indexOf(':') < 0) { absoluteEntitiesFileName = SystemIDResolver.getAbsoluteURIFromRelative(entitiesFileName); } else { try { absoluteEntitiesFileName = SystemIDResolver.getAbsoluteURI(entitiesFileName, null); } catch (TransformerException te) { throw new WrappedRuntimeException(te); } } return getCharInfoBasedOnPrivilege(entitiesFileName, method, false); } /** * Create a mutable copy of the cached one. * @param charInfo The cached one. * @return */ private static CharInfo mutableCopyOf(CharInfo charInfo) { CharInfo copy = new CharInfo(); int max = charInfo.array_of_bits.length; System.arraycopy(charInfo.array_of_bits,0,copy.array_of_bits,0,max); copy.firstWordNotUsed = charInfo.firstWordNotUsed; max = charInfo.shouldMapAttrChar_ASCII.length; System.arraycopy(charInfo.shouldMapAttrChar_ASCII,0,copy.shouldMapAttrChar_ASCII,0,max); max = charInfo.shouldMapTextChar_ASCII.length; System.arraycopy(charInfo.shouldMapTextChar_ASCII,0,copy.shouldMapTextChar_ASCII,0,max); // utility field copy.m_charKey is already created in the default constructor copy.m_charToString = (HashMap) charInfo.m_charToString.clone(); copy.onlyQuotAmpLtGt = charInfo.onlyQuotAmpLtGt; return copy; } /** * Table of user-specified char infos. * The table maps entify file names (the name of the * property file without the .properties extension) * to CharInfo objects populated with entities defined in * corresponding property file. */ private static Hashtable m_getCharInfoCache = new Hashtable(); /** * Returns the array element holding the bit value for the * given integer * @param i the integer that might be in the set of integers * */ private static int arrayIndex(int i) { return (i >> SHIFT_PER_WORD); } /** * For a given integer in the set it returns the single bit * value used within a given word that represents whether * the integer is in the set or not. */ private static int bit(int i) { int ret = (1 << (i & LOW_ORDER_BITMASK)); return ret; } /** * Creates a new empty set of integers (characters) * @param max the maximum integer to be in the set. */ private int[] createEmptySetOfIntegers(int max) { firstWordNotUsed = 0; // an optimization int[] arr = new int[arrayIndex(max - 1) + 1]; return arr; } /** * Adds the integer (character) to the set of integers. * @param i the integer to add to the set, valid values are * 0, 1, 2 ... up to the maximum that was specified at * the creation of the set. */ private final void set(int i) { setASCIItextDirty(i); setASCIIattrDirty(i); int j = (i >> SHIFT_PER_WORD); // this word is used int k = j + 1; if(firstWordNotUsed < k) // for optimization purposes. firstWordNotUsed = k; array_of_bits[j] |= (1 << (i & LOW_ORDER_BITMASK)); } /** * Return true if the integer (character)is in the set of integers. * * This implementation uses an array of integers with 32 bits per * integer. If a bit is set to 1 the corresponding integer is * in the set of integers. * * @param i an integer that is tested to see if it is the * set of integers, or not. */ private final boolean get(int i) { boolean in_the_set = false; int j = (i >> SHIFT_PER_WORD); // wordIndex(i) // an optimization here, ... a quick test to see // if this integer is beyond any of the words in use if(j < firstWordNotUsed) in_the_set = (array_of_bits[j] & (1 << (i & LOW_ORDER_BITMASK)) ) != 0; // 0L for 64 bit words return in_the_set; } /** * This method returns true if there are some non-standard mappings to * entities other than quot, amp, lt, gt, and its only purpose is for * performance. * @param charToMap The value of the character that is mapped to a String * @param outputString The String to which the character is mapped, usually * an entity reference such as "<". * @return true if the mapping is not one of: *
    *
  • '<' to "<" *
  • '>' to ">" *
  • '&' to "&" *
  • '"' to """ *
*/ private boolean extraEntity(String outputString, int charToMap) { boolean extra = false; if (charToMap < ASCII_MAX) { switch (charToMap) { case '"' : // quot if (!outputString.equals(""")) extra = true; break; case '&' : // amp if (!outputString.equals("&")) extra = true; break; case '<' : // lt if (!outputString.equals("<")) extra = true; break; case '>' : // gt if (!outputString.equals(">")) extra = true; break; default : // other entity in range 0 to 127 extra = true; } } return extra; } /** * If the character is in the ASCII range then * mark it as needing replacement with * a String on output if it occurs in a text node. * @param ch */ private void setASCIItextDirty(int j) { if (0 <= j && j < ASCII_MAX) { shouldMapTextChar_ASCII[j] = true; } } /** * If the character is in the ASCII range then * mark it as needing replacement with * a String on output if it occurs in a attribute value. * @param ch */ private void setASCIIattrDirty(int j) { if (0 <= j && j < ASCII_MAX) { shouldMapAttrChar_ASCII[j] = true; } } /** * Call this method to register a char to String mapping, for example * to map '<' to "<". * @param outputString The String to map to. * @param inputChar The char to map from. * @return true if the mapping is not one of: *
    *
  • '<' to "<" *
  • '>' to ">" *
  • '&' to "&" *
  • '"' to """ *
*/ boolean defineChar2StringMapping(String outputString, char inputChar) { CharKey character = new CharKey(inputChar); m_charToString.put(character, outputString); set(inputChar); // mark the character has having a mapping to a String boolean extraMapping = extraEntity(outputString, inputChar); return extraMapping; } /** * Simple class for fast lookup of char values, when used with * hashtables. You can set the char, then use it as a key. * * @xsl.usage internal */ private static class CharKey extends Object { /** String value */ private char m_char; /** * Constructor CharKey * * @param key char value of this object. */ public CharKey(char key) { m_char = key; } /** * Default constructor for a CharKey. * * @param key char value of this object. */ public CharKey() { } /** * Get the hash value of the character. * * @return hash value of the character. */ public final void setChar(char c) { m_char = c; } /** * Get the hash value of the character. * * @return hash value of the character. */ public final int hashCode() { return (int)m_char; } /** * Override of equals() for this object * * @param obj to compare to * * @return True if this object equals this string value */ public final boolean equals(Object obj) { return ((CharKey)obj).m_char == m_char; } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy