com.hfg.xml.XMLUtil Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of com_hfg Show documentation
Show all versions of com_hfg Show documentation
com.hfg xml, html, svg, and bioinformatics utility library
package com.hfg.xml;
// IMPORTS
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import org.xml.sax.Attributes;
import com.hfg.util.StringBuilderPlus;
import com.hfg.util.StringUtil;
import com.hfg.util.collection.CollectionUtil;
import com.hfg.xml.parser.Latin1Entities;
import com.hfg.xml.parser.SpecialCharacterEntities;
import com.hfg.xml.parser.SymbolEntities;
//------------------------------------------------------------------------------
/**
Helper class with static methods for XML tag construction and
some parser helper methods.
@author J. Alex Taylor
*/
//------------------------------------------------------------------------------
// com.hfg XML/HTML Coding Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------
public class XMLUtil
{
//###########################################################################
// PUBLIC FIELDS
//###########################################################################
/** Name of the attribute used to flag compressed content. */
public static final String COMPRESSION_ATT = "compression";
/** Value used with {@link #COMPRESSION_ATT}. */
public static final String GZIP = "gzip";
/** Name of the attribute used to flag encoded content. */
public static final String ENCODING_ATT = "encoding";
/** Value used with {@link #ENCODING_ATT}. */
public static final String BASE64 = "base64";
/**
 Constant which can be used with composeStartTag().
 */
public static final boolean EMPTY_TAG = true;
/**
 Constant which can be used with composeStartTag().
 */
public static final boolean NOT_EMPTY_TAG = false;
//###########################################################################
// PRIVATE FIELDS
//###########################################################################
// Quote character placed around attribute values; changed via useDoubleQuotes().
private static char sQuoteChar = '\'';
// Remembers names that already passed checkXMLNameValidity().
private static final ValidatedNameCache sValidatedNameCache = new ValidatedNameCache();
// A named entity anywhere in the text, e.g. "&nbsp;". Group 1 is the entity name.
private static final Pattern sCharacterEntityPattern = Pattern.compile("&(\\w{2,6});");
// A named or decimal numeric entity anchored at the start of the text being examined.
private static final Pattern sEntityPattern = Pattern.compile("^&(\\w{2,6}|#\\d{1,6});");
// Pre-check used before calling replaceISOControlWithEntities(). It matches exactly the
// characters that method rewrites: ISO control characters that Character.isWhitespace()
// does not report as whitespace (0x00-0x08, 0x0E-0x1B and 0x7F-0x9F).
// The previous pattern wrote decimal code values into \\u escapes (which are hexadecimal),
// so it matched spaces and digits and missed 0x0B-0x10, 0x13 and 0x7F.
private static final Pattern sISOControlPattern = Pattern.compile("[\\x00-\\x08\\x0E-\\x1B\\x7F-\\x9F]");
// A decimal character reference (optionally zero padded) for one of the control characters
// 1-8, 11, 12, 14-31 or 127. Group 1 is the decimal value without the padding.
private static final Pattern sISOControlEntityPattern = Pattern.compile("&#0{0,3}([1-8]|11|12|14|15|16|17|18|19|20|21|22|23|24|25|26|27|28|29|30|31|127);");
//###########################################################################
// PUBLIC FUNCTIONS
//###########################################################################
//---------------------------------------------------------------------------
/**
 Chooses the quote character written around attribute values.
 Single quotes are the default.
 @param inValue true to quote attribute values with double quotes,
                false to quote them with single quotes
 */
public static void useDoubleQuotes(boolean inValue)
{
   if (inValue)
   {
      sQuoteChar = '"';
   }
   else
   {
      sQuoteChar = '\'';
   }
}
//---------------------------------------------------------------------------
/**
 Builds an attribute-less xml start tag (ex: "<inName>").
 @param inName the tag name
 @return the composed start tag
 */
public static String composeStartTag(String inName)
{
   // A typed null picks the Collection overload; a bare null literal would be ambiguous.
   final Collection noAttributes = null;
   return composeStartTag(inName, noAttributes, NOT_EMPTY_TAG);
}
//---------------------------------------------------------------------------
/**
 Builds a non-empty xml start tag (ex: "<inName att1='value1' att2='value2'>")
 from SAX attributes. Delegates to the three-argument overload with NOT_EMPTY_TAG.
 @param inName the tag name
 @param inAttributes the SAX attributes to write into the tag
 @return the composed start tag
 */
public static String composeStartTag(String inName, Attributes inAttributes)
{
   final boolean emptyTag = NOT_EMPTY_TAG;
   return composeStartTag(inName, inAttributes, emptyTag);
}
//---------------------------------------------------------------------------
/**
 Composes an xml start tag (ex: "<inName att1='value1' att2='value2'>")
 from SAX attributes. Each SAX attribute is converted to an XMLAttribute using
 its qualified name and value, and the result is handed to the Collection-based
 overload, which sorts the attributes before writing them.
 @param inName the tag name
 @param inAttributes the SAX attributes; null or empty produces a tag without attributes
 @param isEmptyTag whether or not the tag has any content or subtags
 @return the composed start tag
 */
public static String composeStartTag(String inName, Attributes inAttributes, boolean isEmptyTag)
{
   Collection<XMLAttribute> attributes = null;
   // Treat a missing Attributes object like an empty one, as the other overloads do for null.
   int attributeCount = (inAttributes != null ? inAttributes.getLength() : 0);
   if (attributeCount > 0)
   {
      attributes = new ArrayList<>(attributeCount);
      for (int i = 0; i < attributeCount; i++)
      {
         attributes.add(new XMLAttribute(inAttributes.getQName(i), inAttributes.getValue(i)));
      }
   }
   return composeStartTag(inName, attributes, isEmptyTag);
}
//---------------------------------------------------------------------------
/**
 Builds a non-empty xml start tag (ex: "<inName att1='value1' att2='value2'>")
 from a collection of attributes. Delegates to the three-argument overload
 with NOT_EMPTY_TAG.
 @param inName the tag name
 @param inAttributes the attributes to write into the tag (may be null)
 @return the composed start tag
 */
public static String composeStartTag(String inName, Collection inAttributes)
{
   final boolean emptyTag = NOT_EMPTY_TAG;
   return composeStartTag(inName, inAttributes, emptyTag);
}
//---------------------------------------------------------------------------
/**
 Composes an xml start tag (ex: "<inName att1='value1' att2='value2'>").
 If the isEmptyTag parameter is true, the output tag will end with " />"
 (ex: "<inName att1='value1' att2='value2' />"). Attributes are written in
 the natural ordering of XMLAttribute (described by the original author as
 alphabetical) for consistency. The caller's collection is never reordered.
 @param inName the tag name
 @param inAttributes a Collection of XMLAttribute objects (may be null)
 @param isEmptyTag whether or not the tag has any content or subtags
 @return the composed start tag
 */
public static String composeStartTag(String inName, Collection inAttributes,
                                     boolean isEmptyTag)
{
   StringBuilder tag = new StringBuilder();
   tag.append("<");
   tag.append(inName);
   // Write attributes.
   if (inAttributes != null)
   {
      // Always sort a private copy. Sorting the caller's own List in place would
      // silently reorder it and fails outright on unmodifiable lists.
      @SuppressWarnings("unchecked") // the documented contract is a collection of XMLAttribute
      List<XMLAttribute> sortedAttributes = new ArrayList<XMLAttribute>(inAttributes);
      Collections.sort(sortedAttributes);
      for (XMLAttribute attribute : sortedAttributes)
      {
         XMLNamespace namespace = attribute.getNamespace();
         tag.append(" ");
         // NOTE(review): the prefix written is namespace.toString(), not getPrefix() - confirm they agree.
         if (namespace != null && StringUtil.isSet(namespace.getPrefix()))
         {
            tag.append(namespace + ":");
         }
         tag.append(attribute.getName());
         String value = attribute.getValue();
         if (null == value) value = "";
         tag.append("=");
         tag.append(getQuotedAttributeValue(value));
      }
   }
   if (isEmptyTag) tag.append(" /");
   tag.append(">");
   return tag.toString();
}
//---------------------------------------------------------------------------
/**
 Builds a non-empty xml start tag (ex: "<inName att1='value1' att2='value2'>")
 from a map of attribute names to values. Delegates to the three-argument
 overload with NOT_EMPTY_TAG.
 @param inName the tag name
 @param inAttributes attribute names mapped to their values (may be null)
 @return the composed start tag
 */
public static String composeStartTag(String inName, Map inAttributes)
{
   final boolean emptyTag = NOT_EMPTY_TAG;
   return composeStartTag(inName, inAttributes, emptyTag);
}
//---------------------------------------------------------------------------
/**
 Composes an xml start tag (ex: "<inName att1='value1' att2='value2'>").
 If the isEmptyTag parameter is true, the output tag will end with " />"
 (ex: "<inName att1='value1' att2='value2' />"). Attribute names are sorted
 with String's natural ordering for consistency. A null attribute value is
 written as an empty string; any other value is written via toString().
 @param inName the tag name
 @param inAttributes attribute names (Strings) mapped to their values (may be null)
 @param isEmptyTag whether or not the tag has any content or subtags
 @return the composed start tag
 */
public static String composeStartTag(String inName, Map inAttributes,
                                     boolean isEmptyTag)
{
   StringBuilder tag = new StringBuilder();
   tag.append("<");
   tag.append(inName);
   // Write attributes.
   if (inAttributes != null)
   {
      @SuppressWarnings("unchecked") // keys are attribute names, i.e. Strings
      Map<String, ?> attributeMap = inAttributes;
      List<String> attributeNames = new ArrayList<>(attributeMap.keySet());
      Collections.sort(attributeNames);
      for (String attributeName : attributeNames)
      {
         tag.append(" ");
         tag.append(attributeName);
         Object value = attributeMap.get(attributeName);
         if (null == value) value = "";
         tag.append("=");
         tag.append(getQuotedAttributeValue(value.toString()));
      }
   }
   if (isEmptyTag) tag.append(" /");
   tag.append(">");
   return tag.toString();
}
//---------------------------------------------------------------------------
/**
 Composes an xml end tag (ex: "</inName>").
 @param inName the tag name
 @return the tag name wrapped in "</" and ">"
 */
public static String composeEndTag(String inName)
{
   return "</" + inName + ">";
}
//---------------------------------------------------------------------------
/**
 Reports whether the text can be parsed as an XMLTag.
 @param inXML the xml fragment to test
 @return true if constructing an XMLTag from the text succeeds;
         false if the construction throws any exception
 */
public static boolean isWellFormedFragment(String inXML)
{
   try
   {
      // Only the success or failure of the parse matters; the tag itself is discarded.
      new XMLTag(new StringReader(inXML));
      return true;
   }
   catch (Exception e)
   {
      return false;
   }
}
//---------------------------------------------------------------------------
/**
 Returns whether or not the content fragment is well-formed and if
 tags are present internal to the fragment, whether the tag section
 and preceding or trailing raw content is well-formed.
 Examples of valid content fragments: '<foo />', 'foo &amp; bar', 'foo', 'foo <bar>1</bar> one'
 Examples of invalid content fragments: '<foo>', 'foo & bar', 'foo <bar>1</zoot> one'
 @param inXML the content fragment to test
 @return true if the fragment is well-formed; false otherwise (including for null)
 */
public static boolean isWellFormedContentFragment(String inXML)
{
   if (null == inXML)
   {
      return false;
   }
   int startIndex = inXML.indexOf('<');
   int endIndex = inXML.lastIndexOf('>');
   if (startIndex < 0
       || endIndex < startIndex)
   {
      // No '<...>' span to hand to the tag parser, so the whole string is raw content.
      // Checking only the text after the last '>' would let 'a & b > c' slip through.
      return isUnchangedByEscaping(inXML);
   }
   // Test the tag frag
   boolean result = isWellFormedFragment(inXML.substring(startIndex, endIndex + 1));
   if (result
       && startIndex > 0)
   {
      // Test preceding raw content
      result = isUnchangedByEscaping(inXML.substring(0, startIndex));
   }
   if (result
       && endIndex < inXML.length() - 1)
   {
      // Test trailing raw content
      result = isUnchangedByEscaping(inXML.substring(endIndex + 1));
   }
   return result;
}
//---------------------------------------------------------------------------
// Raw content is acceptable when escapeContentIfNecessary() finds nothing to escape in it.
private static boolean isUnchangedByEscaping(String inContent)
{
   return inContent.equals(escapeContentIfNecessary(inContent));
}
//---------------------------------------------------------------------------
/**
 Rewrites named character entities as their numeric equivalents.
 Each entity name matched by sCharacterEntityPattern is looked up in
 Latin1Entities and then SymbolEntities; when a numeric form is found it
 replaces the name between the '&' and the ';'. Unknown names are left as is.
 @param inContent the text to convert (may be null)
 @return the converted text; null if inContent is null
 */
public static String convertCharacterEntitiesToNumeric(String inContent)
{
   if (null == inContent
       || ! inContent.contains("&"))
   {
      return inContent;
   }
   StringBuilder converted = new StringBuilder(inContent.length());
   int copiedTo = 0; // everything before this offset has already been written out
   Matcher entityMatcher = sCharacterEntityPattern.matcher(inContent);
   while (entityMatcher.find())
   {
      String entityName = entityMatcher.group(1);
      String numericEntity = Latin1Entities.getInstance().getNumericEntity(entityName);
      if (null == numericEntity)
      {
         numericEntity = SymbolEntities.getInstance().getNumericEntity(entityName);
      }
      if (numericEntity != null)
      {
         // Swap only the name; the surrounding '&' and ';' are kept.
         converted.append(inContent, copiedTo, entityMatcher.start(1));
         converted.append(numericEntity);
         copiedTo = entityMatcher.end(1);
      }
   }
   converted.append(inContent, copiedTo, inContent.length());
   return converted.toString();
}
//---------------------------------------------------------------------------
/**
 Replaces named character entities with the characters they stand for.
 Each entity name matched by sCharacterEntityPattern is looked up in
 Latin1Entities and then SymbolEntities; when a character is found, the whole
 entity (from '&' through ';') is replaced by it. Unknown names are left as is.
 @param inContent the text to convert (may be null)
 @return the converted text; null if inContent is null
 */
public static String convertCharacterEntitiesToUnicode(String inContent)
{
   if (null == inContent
       || ! inContent.contains("&"))
   {
      return inContent;
   }
   StringBuilder converted = new StringBuilder(inContent.length());
   int copiedTo = 0; // everything before this offset has already been written out
   Matcher entityMatcher = sCharacterEntityPattern.matcher(inContent);
   while (entityMatcher.find())
   {
      String entityName = entityMatcher.group(1);
      Character unicodeChar = Latin1Entities.getInstance().getUnicodeChar(entityName);
      if (null == unicodeChar)
      {
         unicodeChar = SymbolEntities.getInstance().getUnicodeChar(entityName);
      }
      if (unicodeChar != null)
      {
         converted.append(inContent, copiedTo, entityMatcher.start());
         converted.append(unicodeChar.charValue());
         copiedTo = entityMatcher.end();
      }
   }
   converted.append(inContent, copiedTo, inContent.length());
   return converted.toString();
}
//
// RULES FOR ESCAPING:
// - '&' and '<' must *always* be escaped as &amp; and &lt;, without
// exception.
// - '>' needs to be escaped as &gt; only when (a) it is in character
// data content and (b) it immediately follows the string "]]".
// - '"' and "'" *never* need to be escaped in character data content; in
// attribute value literals, they need to be escaped only when that
// character is being used as a delimiter, i.e. as &quot; or &apos;.
//---------------------------------------------------------------------------
/**
 * Makes the content XML safe by ensuring that all '<'s are escaped as &lt; and that
 * every '&' is either the start of an entity or escaped as &amp;. Avoids double
 * escaping: an '&' that already begins something matching sEntityPattern is left alone.
 * Control characters are converted to numeric entities via replaceISOControlWithEntities()
 * when sISOControlPattern finds any.
 * @param inContent the raw content (may be null)
 * @return XML-safe content string; null if inContent is null
 */
public static String escapeContentIfNecessary(String inContent)
{
   if (null == inContent)
   {
      return null;
   }
   String safeContent = inContent;
   if (inContent.indexOf('&') >= 0)
   {
      StringBuilder buffer = new StringBuilder(inContent);
      int index = 0;
      while ((index = buffer.indexOf("&", index)) >= 0)
      {
         // Anchor the match at the ampersand through a region instead of copying the
         // remainder of the buffer for every ampersand found.
         Matcher m = sEntityPattern.matcher(buffer).region(index, buffer.length());
         if (m.lookingAt())
         {
            // Already an entity. Step past the ampersand.
            index++;
         }
         else
         {
            buffer.replace(index, index + 1, "&amp;");
            index += 5; // length of "&amp;"
         }
      }
      safeContent = buffer.toString();
   }
   safeContent = safeContent.replace("<", "&lt;");
   if (sISOControlPattern.matcher(safeContent).find())
   {
      safeContent = replaceISOControlWithEntities(safeContent);
   }
   return safeContent;
}
//---------------------------------------------------------------------------
/**
 Unconditionally escapes character content: every '&' becomes &amp; and every
 '<' becomes &lt;. Existing entities are NOT recognized, so already-escaped text
 is escaped a second time; use escapeContentIfNecessary() to avoid that.
 @param inContent the raw content (may be null)
 @return the escaped content; null if inContent is null
 */
public static String escapeContent(String inContent)
{
   if (null == inContent)
   {
      return null;
   }
   // Ampersands first, so the ones introduced by "&lt;" are not escaped again.
   return inContent.replace("&", "&amp;").replace("<", "&lt;");
}
//---------------------------------------------------------------------------
/**
 Reverses content escaping: &lt; becomes '<', numeric references to control
 characters are expanded by expandISOControlEntities(), and &amp; becomes '&'.
 @param inContent the escaped content (may be null)
 @return the unescaped content; null if inContent is null
 */
public static String unescapeContent(String inContent)
{
   String unencodedString = inContent;
   if (inContent != null)
   {
      unencodedString = inContent.replace("&lt;", "<");
      // Expand control-character references BEFORE &amp; so that an escaped
      // ampersand followed by "#1;" (i.e. "&amp;#1;") decodes to the literal
      // text "&#1;" rather than being unescaped twice into a control character.
      unencodedString = expandISOControlEntities(unencodedString);
      unencodedString = unencodedString.replace("&amp;", "&");
   }
   return unencodedString;
}
//---------------------------------------------------------------------------
/**
 Escapes a value for use inside a single-quoted attribute. '&' becomes &amp;,
 the apostrophe becomes &apos;, '<' becomes &lt;, and every character below 32
 as well as 127 (the ISO control range) becomes a decimal character reference
 such as &#9;. Double quotes are left untouched.
 @param inAttributeValue the raw attribute value (may be null)
 @return the escaped value; the very same String instance if nothing needed
         escaping; null if inAttributeValue is null
 */
public static String escapeAttributeValue(String inAttributeValue)
{
   if (null == inAttributeValue)
   {
      return null;
   }
   StringBuilder buffer = null; // We won't instantiate this unless it is necessary
   int length = inAttributeValue.length();
   for (int i = 0; i < length; i++)
   {
      char c = inAttributeValue.charAt(i);
      String entity = null;
      if ('&' == c)
      {
         entity = "&amp;";
      }
      else if ('\'' == c)
      {
         entity = "&apos;";
      }
      else if ('<' == c)
      {
         entity = "&lt;";
      }
      else if (c < 32 || 127 == c) // ISO control character range
      {
         entity = "&#" + (int) c + ";";
      }

      if (entity != null)
      {
         if (null == buffer)
         {
            buffer = new StringBuilder(length + 16);
            // Include any regular characters that we have skipped
            buffer.append(inAttributeValue, 0, i);
         }
         buffer.append(entity);
      }
      else if (buffer != null)
      {
         // This character should be safe without escaping
         buffer.append(c);
      }
   }
   return (buffer != null ? buffer.toString() : inAttributeValue);
}
//---------------------------------------------------------------------------
/**
 Expands the entities in an attribute value by delegating to unescapeEntities().
 @param inAttributeValue the escaped attribute value
 @return whatever unescapeEntities() returns for the value
 */
public static String unescapeAttributeValue(String inAttributeValue)
{
   final String unescapedValue = unescapeEntities(inAttributeValue);
   return unescapedValue;
}
//---------------------------------------------------------------------------
/**
 Replaces every apostrophe with the &apos; entity.
 @param inAttributeValue the raw value (may be null)
 @return the value with apostrophes escaped; null if inAttributeValue is null
 */
public static String escapeApos(String inAttributeValue)
{
   String escapedValue = null;
   if (inAttributeValue != null)
   {
      escapedValue = inAttributeValue.replace("'", "&apos;");
   }
   return escapedValue;
}
//---------------------------------------------------------------------------
public static String escapeQuote(String inAttributeValue)
{
String escapedVaule = null;
if (inAttributeValue != null)
{
escapedVaule = inAttributeValue.replaceAll("\"", """);
}
return escapedVaule;
}
//---------------------------------------------------------------------------
/**
 Replaces every ampersand with the &amp; entity. Existing entities are not
 recognized, so "&lt;" becomes "&amp;lt;".
 @param inAttributeValue the raw value (may be null)
 @return the value with ampersands escaped; null if inAttributeValue is null
 */
public static String escapeAmp(String inAttributeValue)
{
   String escapedValue = null;
   if (inAttributeValue != null)
   {
      escapedValue = inAttributeValue.replace("&", "&amp;");
   }
   return escapedValue;
}
// Matches an entity or character reference anywhere in the text: '&', an optional '#',
// one to seven word characters, then ';'. Group 1 is the text between '&' and ';'.
// Used by unescapeEntities().
// NOTE(review): public and non-final, so outside code can reassign it - confirm that is intended.
// (An earlier, broader pattern was: "&(\\S{3,8});")
public static Pattern sEntityPattern2 = Pattern.compile("&(#?[\\w\\d]{1,7});");
//---------------------------------------------------------------------------
/**
 Expands the entities and numeric character references found in the value.
 Decimal ("&#65;") and hexadecimal ("&#x41;") references are decoded to the
 code point they name, including code points above U+FFFF. Named entities are
 resolved through SpecialCharacterEntities, then Latin1Entities, then
 SymbolEntities. Anything that cannot be resolved (unknown names, malformed or
 out-of-range numbers) is left in the text unchanged. Expanded text is not
 rescanned, so "&amp;lt;" yields "&lt;".
 @param inValue the text to unescape (may be null)
 @return the unescaped text; null if inValue is null
 */
public static synchronized String unescapeEntities(String inValue)
{
   if (null == inValue)
   {
      return null;
   }
   StringBuilder unescapedValue = new StringBuilder(inValue.length());
   int copiedTo = 0; // everything before this offset has already been written out
   Matcher m = sEntityPattern2.matcher(inValue);
   while (m.find())
   {
      String expandedEntity = expandEntity(m.group(1));
      if (expandedEntity != null)
      {
         unescapedValue.append(inValue, copiedTo, m.start());
         unescapedValue.append(expandedEntity);
         copiedTo = m.end();
      }
      // Unresolvable matches are simply copied through with the surrounding text.
   }
   unescapedValue.append(inValue, copiedTo, inValue.length());
   return unescapedValue.toString();
}
//---------------------------------------------------------------------------
// Resolves the text found between '&' and ';'. Returns null if it cannot be resolved.
private static String expandEntity(String inEntity)
{
   if (inEntity.startsWith("#"))
   {
      return decodeNumericReference(inEntity);
   }
   // Try to resolve it with our battery of standard entity classes.
   String expandedEntity = SpecialCharacterEntities.resolveEntity(inEntity);
   if (null == expandedEntity)
   {
      expandedEntity = Latin1Entities.getInstance().getNumericEntity(inEntity);
   }
   if (null == expandedEntity)
   {
      expandedEntity = SymbolEntities.getInstance().getNumericEntity(inEntity);
   }
   if (expandedEntity != null
       && expandedEntity.startsWith("#"))
   {
      // The lookup produced a numeric form such as "#160". Turn it into the character.
      String decoded = decodeNumericReference(expandedEntity);
      if (decoded != null)
      {
         expandedEntity = decoded;
      }
   }
   return expandedEntity;
}
//---------------------------------------------------------------------------
// Decodes "#NNN" (decimal) or "#xHHH" (hex) into the character(s) for that code point.
// Returns null if the digits are malformed or do not name a valid code point.
private static String decodeNumericReference(String inReference)
{
   try
   {
      int codePoint;
      if (inReference.length() > 1
          && inReference.charAt(1) == 'x')
      {
         // Hex
         codePoint = Integer.parseInt(inReference.substring(2), 16);
      }
      else
      {
         // Decimal
         codePoint = Integer.parseInt(inReference.substring(1));
      }
      return Character.isValidCodePoint(codePoint) ? new String(Character.toChars(codePoint)) : null;
   }
   catch (NumberFormatException e)
   {
      // Not a number after all (ex: "&#abc;"). The caller leaves the text untouched.
      return null;
   }
}
//---------------------------------------------------------------------------
/**
 Verifies that the value is acceptable as a tag or element name.
 The name must be non-null and non-empty, start with a letter or an underscore,
 and consist only of letters, digits, '_', '.', and '-'. Names that pass are
 remembered in sValidatedNameCache so that repeat checks return immediately.
 @param inValue the candidate name
 @throws InvalidXMLNameException if the name is null, empty, or breaks one of the rules
 */
public static void checkXMLNameValidity(String inValue)
   throws InvalidXMLNameException
{
   if (null == inValue)
   {
      throw new InvalidXMLNameException("XML tag/element names cannot be set to null.");
   }
   if (inValue.length() == 0)
   {
      throw new InvalidXMLNameException("XML tag/element names cannot be an empty string.");
   }
   if (sValidatedNameCache.contains(inValue))
   {
      // Already known to be valid.
      return;
   }

   char firstChar = inValue.charAt(0);
   if (firstChar != '_'
       && ! Character.isLetter(firstChar))
   {
      throw new InvalidXMLNameException("'" + inValue + "' is not a valid XML tag/element name. " +
                                        "The first character must be a letter " +
                                        "or an underscore.");
   }
   if (inValue.indexOf(' ') >= 0)
   {
      throw new InvalidXMLNameException("'" + inValue + "' is not a valid XML tag/element name. " +
                                        "It cannot contain whitespace.");
   }
   for (char tagChar : inValue.toCharArray())
   {
      boolean permitted = Character.isLetterOrDigit(tagChar)
                          || '_' == tagChar
                          || '.' == tagChar
                          || '-' == tagChar;
      if (! permitted)
      {
         throw new InvalidXMLNameException("'" + inValue + "' is not a valid XML tag/element name. " +
                                           "'" + tagChar + "' is an invalid character.");
      }
   }
   sValidatedNameCache.add(inValue);
}
//---------------------------------------------------------------------------
/**
 Collects the nodes at or below the root that recursivelyFindByAttribute()
 accepts for the given search attribute.
 @param inRootNode the node at which the search starts
 @param inAttribute the attribute (name and value) to look for
 @return a new list holding the accepted nodes
 */
public static List findNodesByAttribute(XMLNode inRootNode, XMLAttribute inAttribute)
{
   List matches = new ArrayList<>();
   recursivelyFindByAttribute(matches, inRootNode, inAttribute);
   return matches;
}
//---------------------------------------------------------------------------
/**
 Collects nodes at or below the root by attribute name. The search attribute
 is built from the name with a null value and handed to recursivelyFindByAttribute().
 @param inRootNode the node at which the search starts
 @param inAttribute the attribute name to look for
 @return a new list holding the accepted nodes
 */
public static List findNodesByAttribute(XMLNode inRootNode, String inAttribute)
{
   List matches = new ArrayList<>();
   XMLAttribute nameOnlyAttribute = new XMLAttribute(inAttribute, null);
   recursivelyFindByAttribute(matches, inRootNode, nameOnlyAttribute);
   return matches;
}
//---------------------------------------------------------------------------
private static void recursivelyFindByAttribute(List inNodes, XMLNode inNode, XMLAttribute inAttribute)
{
if (inNode.hasAttribute(inAttribute.getName())
&& null == inAttribute.getValue() || inAttribute.getValue().equals(inNode.getAttributeValue(inAttribute.getName())))
{
inNodes.add(inNode);
}
List extends XMLNode> subnodes = inNode.getSubtags();
if (CollectionUtil.hasValues(subnodes))
{
for (XMLNode subnode : subnodes)
{
recursivelyFindByAttribute(inNodes, subnode, inAttribute);
}
}
}
//###########################################################################
// PRIVATE FUNCTIONS
//###########################################################################
//---------------------------------------------------------------------------
/**
 Escapes the attribute value and wraps it in the quote character currently
 selected by sQuoteChar. A null value yields a pair of single quotes.
 @param inAttributeValue the raw attribute value
 @return the quoted, escaped value
 */
private static String getQuotedAttributeValue(String inAttributeValue)
{
   if (null == inAttributeValue)
   {
      return "''";
   }
   return ('\'' == sQuoteChar)
          ? "'" + escapeAttributeValue(inAttributeValue) + "'"
          : "\"" + escapeDoubleQuotedAttributeValue(inAttributeValue) + "\"";
}
//---------------------------------------------------------------------------
/**
 Escapes a value for use inside a double-quoted attribute: '&' becomes &amp;,
 '"' becomes &quot;, and '<' becomes &lt;. Apostrophes are left untouched.
 @param inAttributeValue the raw attribute value (may be null)
 @return the escaped value; null if inAttributeValue is null
 */
public static String escapeDoubleQuotedAttributeValue(String inAttributeValue)
{
   if (null == inAttributeValue)
   {
      return null;
   }
   // Ampersands must go first so the ones introduced by the later entities are not re-escaped.
   return inAttributeValue.replace("&", "&amp;")
                          .replace("\"", "&quot;")
                          .replace("<", "&lt;");
}
//---------------------------------------------------------------------------
/**
 Replaces ISO control characters with decimal character references (ex: "&#1;").
 Control characters that Character.isWhitespace() reports as whitespace are
 kept as they are; that covers tab, newline and carriage return, and also
 vertical tab, form feed and the separators 0x1C-0x1F.
 @param inString the text to process (must not be null)
 @return the text with the remaining control characters written as references
 */
private static String replaceISOControlWithEntities(String inString)
{
   StringBuilder buffer = new StringBuilder(inString.length());
   for (int i = 0; i < inString.length(); i++)
   {
      char theChar = inString.charAt(i);
      if (Character.isISOControl(theChar)
          && ! Character.isWhitespace(theChar))
      {
         buffer.append("&#").append((int) theChar).append(';');
      }
      else
      {
         buffer.append(theChar);
      }
   }
   return buffer.toString();
}
//---------------------------------------------------------------------------
/**
 Replaces each match of sISOControlEntityPattern with the character whose
 decimal code is captured in group 1.
 @param inString the text to process
 @return the expanded text; inString itself when the pattern does not occur in it
 */
private static String expandISOControlEntities(String inString)
{
   Matcher entityMatcher = sISOControlEntityPattern.matcher(inString);
   if (! entityMatcher.find())
   {
      return inString;
   }
   StringBuilder expanded = new StringBuilder(inString.length());
   int copiedTo = 0; // everything before this offset has already been written out
   do
   {
      expanded.append(inString, copiedTo, entityMatcher.start());
      expanded.append((char) Integer.parseInt(entityMatcher.group(1)));
      copiedTo = entityMatcher.end();
   }
   while (entityMatcher.find());
   expanded.append(inString, copiedTo, inString.length());
   return expanded.toString();
}
//---------------------------------------------------------------------------
/**
 Replaces every character whose code point is above 126 with a decimal
 character reference (ex: "&#233;"). The text is processed by code point, so
 a surrogate pair is written as one reference to the character it encodes
 rather than as two references to the individual surrogates.
 @param inString the text to process (may be null)
 @return the text with only code points 0-126 left as literal characters;
         null if inString is null
 */
public static String replaceUnicodeWithEntities(String inString)
{
   if (null == inString)
   {
      return null;
   }
   StringBuilder buffer = new StringBuilder(inString.length());
   int i = 0;
   while (i < inString.length())
   {
      int codePoint = inString.codePointAt(i);
      if (126 < codePoint)
      {
         buffer.append("&#").append(codePoint).append(';');
      }
      else
      {
         buffer.append((char) codePoint);
      }
      i += Character.charCount(codePoint);
   }
   return buffer.toString();
}
// This cache is used for improved efficiency in name validation.
// It holds at most mMaxCacheSize names and evicts the oldest one first.
// Both methods are synchronized: contains() reads the same HashSet that
// add() mutates, so an unsynchronized read could observe it mid-update.
//---------------------------------------------------------------------------
private static class ValidatedNameCache
{
   private final int mMaxCacheSize = 150;
   private final Set<String> mNameSet = new HashSet<>();
   // Insertion order of the names in mNameSet; the head is the next to be evicted.
   private final LinkedList<String> mNameQueue = new LinkedList<>();
   //------------------------------------------------------------------------
   public synchronized boolean contains(String inValue)
   {
      return (mNameSet.contains(inValue));
   }
   //------------------------------------------------------------------------
   public synchronized void add(String inValue)
   {
      if (mNameSet.contains(inValue))
      {
         // Two threads can validate the same name concurrently. Queueing it twice
         // would let the queue and the set drift apart and defeat the size limit.
         return;
      }
      if (mNameSet.size() >= mMaxCacheSize)
      {
         mNameSet.remove(mNameQueue.removeFirst());
      }
      mNameSet.add(inValue);
      mNameQueue.add(inValue);
   }
}
}