All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.hfg.xml.XMLUtil Maven / Gradle / Ivy

There is a newer version: 20240423
Show newest version
package com.hfg.xml;

// IMPORTS
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.regex.Matcher;

import org.xml.sax.Attributes;

import com.hfg.util.StringUtil;
import com.hfg.util.collection.CollectionUtil;
import com.hfg.xml.parser.Latin1Entities;
import com.hfg.xml.parser.SpecialCharacterEntities;
import com.hfg.xml.parser.SymbolEntities;


//------------------------------------------------------------------------------
/**
  Helper class with static methods for XML tag construction and
  some parser helper methods.
  
  @author J. Alex Taylor
 */
//------------------------------------------------------------------------------
// com.hfg XML/HTML Coding Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------

public class XMLUtil
{
      
   //###########################################################################
   // PUBLIC FIELDS
   //###########################################################################


    public static final String COMPRESSION_ATT = "compression";
    public static final String GZIP            = "gzip";
    public static final String ENCODING_ATT    = "encoding";
    public static final String BASE64          = "base64";

   
   /**
     Constant which can be used with composeStartTag().
    */
   public static final boolean EMPTY_TAG     = true;
   
   /**
     Constant which can be used with composeStartTag().
    */
   public static final boolean NOT_EMPTY_TAG = false;

   //###########################################################################
   // PRIVATE FIELDS
   //###########################################################################

   // Delimiter placed around attribute values; switched by useDoubleQuotes().
   private static char sQuoteChar = '\'';

   // Remembers names that already passed checkXMLNameValidity().
   private static final ValidatedNameCache sValidatedNameCache = new ValidatedNameCache();

   // A named character entity anywhere in the input (ex: "&nbsp;"). Group 1 is the name.
   private static final Pattern sCharacterEntityPattern = Pattern.compile("&(\\w{2,6});");

   // A named or decimal numeric entity located at the very start of the input.
   private static final Pattern sEntityPattern          = Pattern.compile("^&(\\w{2,6}|#\\d{1,6});");

   // Cheap pre-check for "does this text contain a control character that may need
   // to be written as a numeric entity?". Covers every ISO control character except
   // tab (0x09), line feed (0x0A) and carriage return (0x0D). The previous expression
   // used decimal code values inside \\u (hexadecimal) escapes, so it missed
   // 0x0B, 0x0C, 0x0E-0x10, 0x13 and 0x7F while matching ordinary characters such as
   // the space and the digits '0' and '1'.
   private static final Pattern sISOControlPattern =
         Pattern.compile("[\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F\\x7F-\\x9F]");

   // A decimal numeric entity for a control character, with up to three leading
   // zeros (ex: "&#8;" or "&#0031;"). Group 1 is the decimal code without the zeros.
   private static final Pattern sISOControlEntityPattern =
         Pattern.compile("&#0{0,3}([1-8]|11|12|14|15|16|17|18|19|20|21|22|23|24|25|26|27|28|29|30|31|127);");


   //###########################################################################
   // PUBLIC FUNCTIONS
   //###########################################################################


   //---------------------------------------------------------------------------
   /**
    Selects the quote character that is written around attribute values.
    Until this is called, single quotes are used.

    @param inValue true to use double quotes, false to use single quotes
    */
   public static void useDoubleQuotes(boolean inValue)
   {
      if (inValue)
      {
         sQuoteChar = '"';
      }
      else
      {
         sQuoteChar = '\'';
      }
   }

   //---------------------------------------------------------------------------
   /**
     Composes an xml start tag without attributes (ex: "<inName>").

     @param inName the tag name
     @return the text of the start tag
    */
   public static String composeStartTag(String inName)
   {
      // A typed null (rather than a bare null literal) is needed so that the
      // Collection-based overload is selected unambiguously.
      Collection<XMLAttribute> noAttributes = null;
      return composeStartTag(inName, noAttributes, NOT_EMPTY_TAG);
   }
   
   //---------------------------------------------------------------------------
   /**
     Builds a non-empty xml start tag from SAX attributes
     (ex: "<inName att1='value1' att2='value2'>").
     Attributes appear in alphabetical order for consistency.

     @param inName       the tag name
     @param inAttributes the SAX attributes to write into the tag
     @return the text of the start tag
    */
   public static String composeStartTag(String inName, Attributes inAttributes)
   {
      final boolean emptyTagFlag = NOT_EMPTY_TAG;
      return composeStartTag(inName, inAttributes, emptyTagFlag);
   }

   //---------------------------------------------------------------------------
   /**
     Composes an xml start tag from SAX attributes
     (ex: "<inName att1='value1' att2='value2'>").
     Each SAX attribute is converted to an XMLAttribute built from its qualified
     name and value, and the result is handed to the Collection-based overload.
     Attributes appear in alphabetical order for consistency.

     @param inName       the tag name
     @param inAttributes the SAX attributes; null or empty produces a tag without attributes
     @param isEmptyTag   whether the tag should be closed immediately (" />")
     @return the text of the start tag
    */
   public static String composeStartTag(String inName, Attributes inAttributes, boolean isEmptyTag)
   {
      // Guard against a null Attributes object instead of failing with a NullPointerException.
      int attributeCount = (inAttributes != null ? inAttributes.getLength() : 0);

      List<XMLAttribute> attributes = null;
      if (attributeCount > 0)
      {
         attributes = new ArrayList<>(attributeCount);
         for (int i = 0; i < attributeCount; i++)
         {
            attributes.add(new XMLAttribute(inAttributes.getQName(i), inAttributes.getValue(i)));
         }
      }

      return composeStartTag(inName, attributes, isEmptyTag);
   }

   //---------------------------------------------------------------------------
   /**
     Composes a non-empty xml start tag (ex: "<inName att1='value1' att2='value2'>").
     Attributes appear in alphabetical order for consistency.

     @param inName       the tag name
     @param inAttributes the XMLAttribute objects to write; may be null
     @return the text of the start tag
    */
   public static String composeStartTag(String inName, Collection<XMLAttribute> inAttributes)
   {
      return composeStartTag(inName, inAttributes, NOT_EMPTY_TAG);
   }

   //---------------------------------------------------------------------------
   /**
     Composes an xml start tag (ex: "<inName att1='value1' att2='value2'>").
     If the isEmptyTag parameter is true, the output tag will end with " />".
     ex: "<inName att1='value1' att2='value2' />". Attributes are written in the
     order produced by Collections.sort() (alphabetical, for consistency).

     Note: when inAttributes is a List it is sorted in place, so the caller's
     list is reordered (and must be modifiable). Any other Collection is
     copied before sorting.

     @param inName the tag name
     @param inAttributes a List of XMLAttribute objects; may be null
     @param isEmptyTag   whether or not the tag has any content or subtags
     @return the text of the start tag
    */
   public static String composeStartTag(String inName, Collection<XMLAttribute> inAttributes,
                                        boolean isEmptyTag)
   {
      StringBuilder tag = new StringBuilder();

      tag.append("<");
      tag.append(inName);

      // Write attributes.
      if (inAttributes != null)
      {
         List<XMLAttribute> sortedAttributes;
         if (inAttributes instanceof List)
         {
            // Avoids a copy; the caller's list is sorted directly.
            sortedAttributes = (List<XMLAttribute>) inAttributes;
         }
         else
         {
            sortedAttributes = new ArrayList<>(inAttributes);
         }

         Collections.sort(sortedAttributes);

         for (XMLAttribute attribute : sortedAttributes)
         {
            XMLNamespace namespace = attribute.getNamespace();
            tag.append(" ");
            // NOTE(review): the prefix text comes from XMLNamespace.toString();
            // presumably that yields the prefix - confirm against XMLNamespace.
            tag.append(namespace != null && StringUtil.isSet(namespace.getPrefix()) ? namespace + ":" : "");
            tag.append(attribute.getName());

            String value = attribute.getValue();
            if (null == value)
            {
               value = "";
            }
            tag.append("=");
            tag.append(getQuotedAttributeValue(value));
         }
      }

      if (isEmptyTag)
      {
         tag.append(" /");
      }
      tag.append(">");

      return tag.toString();
   }
   
   //---------------------------------------------------------------------------
   /**
     Composes a non-empty xml start tag (ex: "<inName att1='value1' att2='value2'>").
     Attributes appear in alphabetical order for consistency.

     @param inName       the tag name
     @param inAttributes attribute values keyed by attribute name; may be null
     @return the text of the start tag
    */
   public static String composeStartTag(String inName, Map<String, ?> inAttributes)
   {
      return composeStartTag(inName, inAttributes, NOT_EMPTY_TAG);
   }
   
   //---------------------------------------------------------------------------
   /**
     Composes an xml start tag (ex: "<inName att1='value1' att2='value2'>").
     If the isEmptyTag parameter is true, the output tag will end with " />".
     ex: "<inName att1='value1' att2='value2' />". Attributes appear in
     alphabetical order of their names for consistency.

     @param inName       the tag name
     @param inAttributes attribute values keyed by attribute name; may be null.
                         Values are written using toString(); a null value is
                         written as an empty string.
     @param isEmptyTag   whether or not the tag has any content or subtags
     @return the text of the start tag
    */
   public static String composeStartTag(String inName, Map<String, ?> inAttributes,
                                        boolean isEmptyTag)
   {
      StringBuilder tag = new StringBuilder();

      tag.append("<");
      tag.append(inName);

      // Write attributes.
      if (inAttributes != null)
      {
         // Sort a copy of the keys so that the caller's map is left untouched.
         List<String> attributeNames = new ArrayList<>(inAttributes.keySet());
         Collections.sort(attributeNames);

         for (String attributeName : attributeNames)
         {
            tag.append(" ");
            tag.append(attributeName);

            Object value = inAttributes.get(attributeName);
            if (null == value)
            {
               value = "";
            }
            tag.append("=");
            tag.append(getQuotedAttributeValue(value.toString()));
         }
      }

      if (isEmptyTag)
      {
         tag.append(" /");
      }
      tag.append(">");

      return tag.toString();
   }
  
   //---------------------------------------------------------------------------
   /**
     Composes an xml end tag (ex: "</inName>").

     @param inName the tag name
     @return the end tag text: "</" followed by the name and ">"
    */
   public static String composeEndTag(String inName)
   {
      return "</" + inName + ">";
   }

   //---------------------------------------------------------------------------
   /**
    Reports whether the specified text can be read as an XMLTag.

    @param inXML the xml text to test
    @return true if constructing an XMLTag from the text completed without
            throwing; false if any exception was thrown
    */
   public static boolean isWellFormedFragment(String inXML)
   {
      try
      {
         // Only the success or failure of the construction matters here.
         new XMLTag(new StringReader(inXML));
         return true;
      }
      catch (Exception e)
      {
         return false;
      }
   }

   //---------------------------------------------------------------------------
   /**
    Returns whether or not the content fragment is well-formed and if
    tags are present internal to the fragment, whether the tag section
    and preceding or trailing raw content is well-formed.
    
    Examples of valid content fragments:  '<foo />', 'foo &amp; bar', 'foo', 'foo <bar>1</bar> one'
    Examples of invalid content fragments:  '<foo>', 'foo & bar', 'foo <bar>1</zoot> one'
    
*/ public static boolean isWellFormedContentFragment(String inXML) { boolean result = true; int startIndex = inXML.indexOf("<"); int endIndex = inXML.lastIndexOf(">"); if (startIndex > -1 && endIndex > startIndex) { // Test the tag frag result = isWellFormedFragment(inXML.substring(startIndex, endIndex + 1)); } if (startIndex > 0 && result) { // Test preceeding raw content String preceedingContent = inXML.substring(0, startIndex); result = preceedingContent.equals(escapeContentIfNecessary(preceedingContent)); } if (endIndex < inXML.length() - 1 && result) { // Test trailing raw content String trailingContent = inXML.substring(endIndex + 1); result = trailingContent.equals(escapeContentIfNecessary(trailingContent)); } return result; } //--------------------------------------------------------------------------- public static String convertCharacterEntitiesToNumeric(String inContent) { String outContent = inContent; if (inContent != null) { if (inContent.contains("&")) { StringBuilder buffer = new StringBuilder(inContent); int index = 0; Matcher m = sCharacterEntityPattern.matcher(buffer); while (index < buffer.length() && m.find(index)) { String numericEntity = Latin1Entities.getInstance().getNumericEntity(m.group(1)); if (null == numericEntity) { numericEntity = SymbolEntities.getInstance().getNumericEntity(m.group(1)); } index = m.end(); if (numericEntity != null) { int length = m.group(1).length(); buffer.replace(m.start(1), m.end(1), numericEntity); index += (numericEntity.length() - length); } // Since the buffer changed we need to reinstantiate the matcher m = sCharacterEntityPattern.matcher(buffer); } outContent = buffer.toString(); } } return outContent; } //--------------------------------------------------------------------------- public static String convertCharacterEntitiesToUnicode(String inContent) { String outContent = inContent; if (inContent != null) { if (inContent.contains("&")) { StringBuilder buffer = new StringBuilder(inContent); int index = 0; 
Matcher m = sCharacterEntityPattern.matcher(buffer); while (index < buffer.length() && m.find(index)) { Character unicodeChar = Latin1Entities.getInstance().getUnicodeChar(m.group(1)); if (null == unicodeChar) { unicodeChar = SymbolEntities.getInstance().getUnicodeChar(m.group(1)); } index = m.end(); if (unicodeChar != null) { int length = m.group(0).length(); buffer.replace(m.start(0), m.end(0), unicodeChar + ""); index += (1 - length); } // Since the buffer changed we need to reinstantiate the matcher m = sCharacterEntityPattern.matcher(buffer); } outContent = buffer.toString(); } } return outContent; } // // RULES FOR ESCAPING: // - '&' and '<' must *always* be escaped as & and <, without // exception. // - '>' needs to be escaped as > only when (a) it is in character // data content and (b) it immediately follows the string "]]". // - '"' and "'" *never* need to be escaped in character data content; in // attribute value literals, they need to be escaped only when that // character is being used as a delimiter, i.e. """ or '''. //--------------------------------------------------------------------------- /** * Makes the content XML safe by ensuring that all '<'s are all escaped and that * all '&'s are part of an entity. Avoids double escaping. 
* @param inContent * @return XML-safe content string */ public static String escapeContentIfNecessary(String inContent) { String safeContent = inContent; if (inContent != null) { if (inContent.indexOf("&") >= 0) { StringBuilder buffer = new StringBuilder(inContent); int index = 0; while ((index = buffer.indexOf("&", index)) >= 0) { Matcher m = sEntityPattern.matcher(buffer.substring(index)); if (!m.find()) { buffer.replace(index, index + 1, "&"); index += 3; } else { index++; } } safeContent = buffer.toString(); } safeContent = safeContent.replaceAll("<", "<"); if (sISOControlPattern.matcher(safeContent).find()) { safeContent = replaceISOControlWithEntities(safeContent); } } return safeContent; } //--------------------------------------------------------------------------- public static String escapeContent(String inContent) { String encodedString = null; if (inContent != null) { encodedString = inContent.replaceAll("&", "&"); encodedString = encodedString.replaceAll("<", "<"); } return encodedString; } //--------------------------------------------------------------------------- public static String unescapeContent(String inContent) { String unencodedString = inContent; if (inContent != null) { unencodedString = inContent.replaceAll("<", "<"); unencodedString = unencodedString.replaceAll("&", "&"); unencodedString = expandISOControlEntities(unencodedString); } return unencodedString; } //--------------------------------------------------------------------------- public static String escapeAttributeValue(String inAttributeValue) { String encodedString = null; if (inAttributeValue != null) { encodedString = escapeAmp(inAttributeValue); encodedString = escapeApos(encodedString); encodedString = encodedString.replaceAll("<", "<"); if (sISOControlPattern.matcher(encodedString).find()) { encodedString = replaceISOControlWithEntities(encodedString); } } return encodedString; } //--------------------------------------------------------------------------- public static 
String unescapeAttributeValue(String inAttributeValue) { String unencodedString = null; if (inAttributeValue != null) { unencodedString = inAttributeValue.replaceAll("'", "'"); unencodedString = unencodedString.replaceAll("<", "<"); unencodedString = unencodedString.replaceAll("&", "&"); } return unencodedString; } //--------------------------------------------------------------------------- public static String escapeApos(String inAttributeValue) { String escapedVaule = null; if (inAttributeValue != null) { escapedVaule = inAttributeValue.replaceAll("'", "'"); } return escapedVaule; } //--------------------------------------------------------------------------- public static String escapeQuote(String inAttributeValue) { String escapedVaule = null; if (inAttributeValue != null) { escapedVaule = inAttributeValue.replaceAll("\"", """); } return escapedVaule; } //--------------------------------------------------------------------------- public static String escapeAmp(String inAttributeValue) { String escapedVaule = null; if (inAttributeValue != null) { escapedVaule = inAttributeValue.replaceAll("&", "&"); } return escapedVaule; } public static Pattern sEntityPattern2 = Pattern.compile("&(\\S{3,8});"); //--------------------------------------------------------------------------- public static synchronized String unescapeEntities(String inValue) { StringBuilder unescapedVaule = null; if (inValue != null) { unescapedVaule = new StringBuilder(inValue); Matcher m = sEntityPattern2.matcher(unescapedVaule); int index = 0; while (m.find(index)) { String entity = m.group(1); String expandedEntity; if (entity.startsWith("#")) { if (entity.charAt(1) == 'x') { // Hex expandedEntity = "" + (char) Integer.parseInt(entity.substring(2), 16); } else { // Decimal expandedEntity = "" + (char) Integer.parseInt(entity.substring(1)); } } else { // Try to resolve it with our battery of standard entity classes. 
expandedEntity = SpecialCharacterEntities.resolveEntity(entity); if (null == expandedEntity) { expandedEntity = Latin1Entities.getInstance().getNumericEntity(entity); } if (null == expandedEntity) { expandedEntity = SymbolEntities.getInstance().getNumericEntity(entity); } if (expandedEntity != null && expandedEntity.startsWith("#")) { expandedEntity = "" + (char) Integer.parseInt(expandedEntity.substring(1)); } } if (expandedEntity != null) { unescapedVaule.replace(m.start(), m.start() + m.group(1).length() + 2, expandedEntity); index = m.start(1) + expandedEntity.length(); } else { index++; } if (index >= unescapedVaule.length()) { break; } } } return unescapedVaule != null ? unescapedVaule.toString() : null; } //--------------------------------------------------------------------------- /** Is the tag or element name valid? */ public static void checkXMLNameValidity(String inValue) throws InvalidXMLNameException { if (null == inValue) { throw new InvalidXMLNameException("XML tag/element names cannot be set to null."); } // Check for xml validity of the tag name. if (inValue.length() == 0) { throw new InvalidXMLNameException("XML tag/element names cannot be an empty string."); } else if (! sValidatedNameCache.contains(inValue)) { if (! Character.isLetter(inValue.charAt(0)) && inValue.charAt(0) != '_') { throw new InvalidXMLNameException("'" + inValue + "' is not a valid XML tag/element name. " + "The first character must be a letter " + "or an underscore."); } else if (inValue.contains(" ")) { throw new InvalidXMLNameException("'" + inValue + "' is not a valid XML tag/element name. " + "It cannot contain whitespace."); } else { for (int i = 0; i < inValue.length(); i++) { char tagChar = inValue.charAt(i); if (! Character.isLetterOrDigit(tagChar) && tagChar != '_' && tagChar != '.' && tagChar != '-') { throw new InvalidXMLNameException("'" + inValue + "' is not a valid XML tag/element name. 
" + "'" + tagChar + "' is an invalid character."); } } sValidatedNameCache.add(inValue); } } } //--------------------------------------------------------------------------- public static List findNodesByAttribute(XMLNode inRootNode, XMLAttribute inAttribute) { List nodes = new ArrayList<>(); recursivelyFindByAttribute(nodes, inRootNode, inAttribute); return nodes; } //--------------------------------------------------------------------------- public static List findNodesByAttribute(XMLNode inRootNode, String inAttribute) { List nodes = new ArrayList<>(); recursivelyFindByAttribute(nodes, inRootNode, new XMLAttribute(inAttribute, null)); return nodes; } //--------------------------------------------------------------------------- private static void recursivelyFindByAttribute(List inNodes, XMLNode inNode, XMLAttribute inAttribute) { if (inNode.hasAttribute(inAttribute.getName()) && null == inAttribute.getValue() || inAttribute.getValue().equals(inNode.getAttributeValue(inAttribute.getName()))) { inNodes.add(inNode); } List subnodes = inNode.getSubtags(); if (CollectionUtil.hasValues(subnodes)) { for (XMLNode subnode : subnodes) { recursivelyFindByAttribute(inNodes, subnode, inAttribute); } } } //########################################################################### // PRIVATE FUNCTIONS //########################################################################### //--------------------------------------------------------------------------- private static String getQuotedAttributeValue(String inAttributeValue) { String safeValue = "''"; if (inAttributeValue != null) { if (sQuoteChar == '\'') { safeValue = "'" + escapeAttributeValue(inAttributeValue) + "'"; } else { safeValue = "\"" + escapeDoubleQuotedAttributeValue(inAttributeValue) + "\""; } } return safeValue; } //--------------------------------------------------------------------------- public static String escapeDoubleQuotedAttributeValue(String inAttributeValue) { String encodedString = null; if 
(inAttributeValue != null) { encodedString = escapeAmp(inAttributeValue); encodedString = escapeQuote(encodedString); encodedString = encodedString.replaceAll("<", "<"); } return encodedString; } //--------------------------------------------------------------------------- private static String replaceISOControlWithEntities(String inString) { StringBuilder buffer = new StringBuilder(); for (char theChar : inString.toCharArray()) { if (! Character.isWhitespace(theChar) // Skip \t, \r, and \n && Character.isISOControl(theChar)) { buffer.append("&#" + (int)theChar + ";"); } else { buffer.append(theChar); } } return buffer.toString(); } //--------------------------------------------------------------------------- private static String expandISOControlEntities(String inString) { String resultString = inString; if (sISOControlEntityPattern.matcher(inString).find()) { StringBuilder buffer = new StringBuilder(inString); int start = 0; Matcher m = sISOControlEntityPattern.matcher(buffer); while (m.find(start)) { buffer.replace(m.start(), m.end(), "" + (char)Integer.parseInt(m.group(1))); start = m.start() + 1; } resultString = buffer.toString(); } return resultString; } //--------------------------------------------------------------------------- public static String replaceUnicodeWithEntities(String inString) { StringBuilder buffer = new StringBuilder(); if (inString != null) { for (int i = 0; i < inString.length(); i++) { char theChar = inString.charAt(i); if (126 < (int) theChar) { buffer.append("&#" + ((int) theChar) + ";"); } else { buffer.append(theChar); } } } return inString != null ? buffer.toString() : null; } // This cache is used for improved efficiency in name validation. 
//--------------------------------------------------------------------------- private static class ValidatedNameCache { private int mMaxCacheSize = 150; private Set mNameSet = new HashSet<>(); private LinkedList mNameQueue = new LinkedList<>(); //------------------------------------------------------------------------ public boolean contains(String inValue) { return (mNameSet.contains(inValue)); } //------------------------------------------------------------------------ public synchronized void add(String inValue) { if (mNameSet.size() >= mMaxCacheSize) { mNameSet.remove(mNameQueue.removeFirst()); } mNameSet.add(inValue); mNameQueue.add(inValue); } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy