// Source listing of com.helger.xml.serialize.write.XMLMaskHelper (artifact: ph-xml; Maven / Gradle / Ivy).
// ph-xml: Java 1.8+ Library with XML handling routines
/*
* Copyright (C) 2014-2024 Philip Helger (www.helger.com)
* philip[at]helger[dot]com
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.helger.xml.serialize.write;
import java.io.IOException;
import java.io.Writer;
import java.util.Set;
import javax.annotation.Nonnegative;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import com.helger.commons.CGlobal;
import com.helger.commons.ValueEnforcer;
import com.helger.commons.annotation.Nonempty;
import com.helger.commons.annotation.PresentForCodeCoverage;
import com.helger.commons.annotation.ReturnsMutableCopy;
import com.helger.commons.annotation.ReturnsMutableObject;
import com.helger.commons.collection.ArrayHelper;
import com.helger.commons.collection.impl.ICommonsOrderedSet;
import com.helger.commons.collection.impl.ICommonsSet;
import com.helger.commons.string.StringHelper;
import com.helger.xml.EXMLVersion;
/**
* This class contains all the methods for masking XML content.
*
* @author Philip Helger
*/
public final class XMLMaskHelper
{
// Named character constants referenced by the masking tables and by the
// entity reference methods of this class.
// Whitespace characters (part of the XML 1.0 attribute value mask tables)
private static final char TAB = '\t';
private static final char CR = '\r';
private static final char LF = '\n';
// Quote and markup characters
private static final char DOUBLE_QUOTE = '"';
private static final char AMPERSAND = '&';
private static final char LT = '<';
private static final char GT = '>';
private static final char APOS = '\'';
// XML 1.0
// Source characters replaced in attribute values delimited by double quotes
// (selected by _findSourceMap for XML_10 / ATTRIBUTE_VALUE_DOUBLE_QUOTES)
private static final char [] MASK_ATTRIBUTE_VALUE_XML10_DQ = { TAB, LF, CR, DOUBLE_QUOTE, AMPERSAND, LT };
// Source characters replaced in attribute values delimited by single quotes;
// identical to the double quote table except that APOS is listed instead of
// DOUBLE_QUOTE
private static final char [] MASK_ATTRIBUTE_VALUE_XML10_SQ = { TAB, LF, CR, APOS, AMPERSAND, LT };
// Source characters replaced in text content: CR, the markup characters
// AMPERSAND, LT and GT, and every code point from 0x7f to 0x9f (inclusive)
private static final char [] MASK_TEXT_XML10 = { CR,
AMPERSAND,
LT,
GT,
0x7f,
0x80,
0x81,
0x82,
0x83,
0x84,
0x85,
0x86,
0x87,
0x88,
0x89,
0x8a,
0x8b,
0x8c,
0x8d,
0x8e,
0x8f,
0x90,
0x91,
0x92,
0x93,
0x94,
0x95,
0x96,
0x97,
0x98,
0x99,
0x9a,
0x9b,
0x9c,
0x9d,
0x9e,
0x9f };
// XML 1.1
// Source characters replaced in attribute values delimited by double quotes:
// every code point from 0x1 to 0x1f (inclusive) followed by DOUBLE_QUOTE,
// AMPERSAND and LT
private static final char [] MASK_ATTRIBUTE_VALUE_XML11_DQ = { 0x1,
0x2,
0x3,
0x4,
0x5,
0x6,
0x7,
0x8,
0x9,
0xa,
0xb,
0xc,
0xd,
0xe,
0xf,
0x10,
0x11,
0x12,
0x13,
0x14,
0x15,
0x16,
0x17,
0x18,
0x19,
0x1a,
0x1b,
0x1c,
0x1d,
0x1e,
0x1f,
DOUBLE_QUOTE,
AMPERSAND,
LT };
// XML 1.1 source characters replaced in attribute values delimited by single
// quotes: every code point from 0x1 to 0x1f (inclusive) followed by APOS,
// AMPERSAND and LT
private static final char [] MASK_ATTRIBUTE_VALUE_XML11_SQ = { 0x1,
0x2,
0x3,
0x4,
0x5,
0x6,
0x7,
0x8,
0x9,
0xa,
0xb,
0xc,
0xd,
0xe,
0xf,
0x10,
0x11,
0x12,
0x13,
0x14,
0x15,
0x16,
0x17,
0x18,
0x19,
0x1a,
0x1b,
0x1c,
0x1d,
0x1e,
0x1f,
APOS,
AMPERSAND,
LT };
// XML 1.1 source characters replaced in text content: 0x1-0x8 and 0xb-0x1f
// (0x9 and 0xa are deliberately absent from this table), the markup
// characters AMPERSAND, LT and GT, the range 0x7f-0x9f and finally 0x2028
private static final char [] MASK_TEXT_XML11 = { 0x1,
0x2,
0x3,
0x4,
0x5,
0x6,
0x7,
0x8,
0xb,
0xc,
0xd,
0xe,
0xf,
0x10,
0x11,
0x12,
0x13,
0x14,
0x15,
0x16,
0x17,
0x18,
0x19,
0x1a,
0x1b,
0x1c,
0x1d,
0x1e,
0x1f,
AMPERSAND,
LT,
GT,
0x7f,
0x80,
0x81,
0x82,
0x83,
0x84,
0x85,
0x86,
0x87,
0x88,
0x89,
0x8a,
0x8b,
0x8c,
0x8d,
0x8e,
0x8f,
0x90,
0x91,
0x92,
0x93,
0x94,
0x95,
0x96,
0x97,
0x98,
0x99,
0x9a,
0x9b,
0x9c,
0x9d,
0x9e,
0x9f,
0x2028 };
// HTML
// Source characters replaced for HTML text content and for attribute values
// delimited by double quotes (both modes map to this table in
// _findSourceMap): AMPERSAND, DOUBLE_QUOTE, LT, GT, APOS and 0x80-0x9f
private static final char [] MASK_TEXT_HTML_DQ = { AMPERSAND,
DOUBLE_QUOTE,
LT,
GT,
APOS,
0x80,
0x81,
0x82,
0x83,
0x84,
0x85,
0x86,
0x87,
0x88,
0x89,
0x8a,
0x8b,
0x8c,
0x8d,
0x8e,
0x8f,
0x90,
0x91,
0x92,
0x93,
0x94,
0x95,
0x96,
0x97,
0x98,
0x99,
0x9a,
0x9b,
0x9c,
0x9d,
0x9e,
0x9f };
// HTML source characters replaced in attribute values delimited by single
// quotes: same as the double quote table but without DOUBLE_QUOTE
private static final char [] MASK_TEXT_HTML_SQ = { AMPERSAND,
LT,
GT,
APOS,
0x80,
0x81,
0x82,
0x83,
0x84,
0x85,
0x86,
0x87,
0x88,
0x89,
0x8a,
0x8b,
0x8c,
0x8d,
0x8e,
0x8f,
0x90,
0x91,
0x92,
0x93,
0x94,
0x95,
0x96,
0x97,
0x98,
0x99,
0x9a,
0x9b,
0x9c,
0x9d,
0x9e,
0x9f };
// Replacement tables. Each array has the same length as the source table of
// the same name (without the _REPLACE suffix); element i holds the character
// sequence that replaces source character i. The elements are assigned in
// the static initializer of this class.
// XML 1.0
private static final char [] [] MASK_ATTRIBUTE_VALUE_XML10_DQ_REPLACE = new char [MASK_ATTRIBUTE_VALUE_XML10_DQ.length] [];
private static final char [] [] MASK_ATTRIBUTE_VALUE_XML10_SQ_REPLACE = new char [MASK_ATTRIBUTE_VALUE_XML10_SQ.length] [];
private static final char [] [] MASK_TEXT_XML10_REPLACE = new char [MASK_TEXT_XML10.length] [];
// XML 1.1
private static final char [] [] MASK_ATTRIBUTE_VALUE_XML11_DQ_REPLACE = new char [MASK_ATTRIBUTE_VALUE_XML11_DQ.length] [];
private static final char [] [] MASK_ATTRIBUTE_VALUE_XML11_SQ_REPLACE = new char [MASK_ATTRIBUTE_VALUE_XML11_SQ.length] [];
private static final char [] [] MASK_TEXT_XML11_REPLACE = new char [MASK_TEXT_XML11.length] [];
// HTML
private static final char [] [] MASK_TEXT_HTML_DQ_REPLACE = new char [MASK_TEXT_HTML_DQ.length] [];
private static final char [] [] MASK_TEXT_HTML_SQ_REPLACE = new char [MASK_TEXT_HTML_SQ.length] [];
// Upper case hexadecimal digits, indexed by nibble value (0-15)
private static final char [] INT_HEX_UC = "0123456789ABCDEF".toCharArray ();

/**
 * Build the hexadecimal numeric character reference for the passed
 * character, e.g. {@code &#x41;} for 'A' or {@code &#x2028;} for U+2028.
 * The hex digits are upper case and no leading zeroes are emitted.
 *
 * @param n
 *        The character to be referenced.
 * @return The numeric character reference. Never <code>null</code>.
 */
@Nonnull
public static String getXMLNumericReference (final char n)
{
  final StringBuilder aSB = new StringBuilder (16);
  // Every numeric reference starts with "&#x" - without this prefix the
  // result is plain text and not a character reference
  aSB.append ("&#x");
  // Don't use Integer.toString because it delivers lowercase chars
  // Emit the higher nibbles only if they are needed (no leading zeroes)
  if (n > 0xfff)
    aSB.append (INT_HEX_UC[(n >> 12) & 0xf]);
  if (n > 0xff)
    aSB.append (INT_HEX_UC[(n >> 8) & 0xf]);
  if (n > 0xf)
    aSB.append (INT_HEX_UC[(n >> 4) & 0xf]);
  // The lowest nibble is always written
  aSB.append (INT_HEX_UC[n & 0xf]);
  aSB.append (';');
  return aSB.toString ();
}
/**
* Get the entity reference for the specified character. This returns e.g.
* < for '<' etc. This method has special handling for <, >,
* &, " and '. All other chars are encoded by their numeric value
* (e.g. È)
*
* @param c
* Character to use.
* @return The entity reference string. Never null
nor empty.
*/
@Nonnull
@Nonempty
public static String getXML10EntityReferenceString (final char c)
{
if (c == LT)
return "<";
if (c == GT)
return ">";
if (c == AMPERSAND)
return "&";
if (c == DOUBLE_QUOTE)
return """;
if (c == APOS)
return "'";
return getXMLNumericReference (c);
}
/**
* Get the entity reference for the specified character. This returns e.g.
* < for '<' etc. This method has special handling for <, >,
* &, " and '. All other chars are encoded by their numeric value
* (e.g. È)
*
* @param c
* Character to use.
* @return The entity reference string. Never null
nor empty.
*/
@Nonnull
@Nonempty
public static String getXML11EntityReferenceString (final char c)
{
if (c == LT)
return "<";
if (c == GT)
return ">";
if (c == AMPERSAND)
return "&";
if (c == DOUBLE_QUOTE)
return """;
if (c == APOS)
return "'";
if (c == '\u2028')
return "\n";
return getXMLNumericReference (c);
}
/**
* Get the entity reference for the specified character. This returns e.g.
* <
for '<
' etc. This method has special
* handling for <, >, & and ". All other chars are encoded by
* their numeric value (e.g. È
). In contrast to
* {@link #getXML10EntityReferenceString(char)} this method does not handle
* '
*
* @param c
* Character to use.
* @return The entity reference string. Never null
nor empty.
*/
@Nonnull
@Nonempty
public static String getHTMLEntityReferenceString (final char c)
{
if (c == LT)
return "<";
if (c == GT)
return ">";
if (c == AMPERSAND)
return "&";
if (c == DOUBLE_QUOTE)
return """;
// Use of ' in XHTML should generally be avoided for compatibility
// reasons. ' or ' may be used instead.
return "" + (int) c + ";";
}
static
{
  // Compute the replacement sequence of every maskable character exactly
  // once. Each *_REPLACE table is filled in the order of its source table so
  // that both arrays can be used as parallel arrays.
  int nPos;

  // XML 1.0
  nPos = 0;
  for (final char cSrc : MASK_ATTRIBUTE_VALUE_XML10_DQ)
    MASK_ATTRIBUTE_VALUE_XML10_DQ_REPLACE[nPos++] = getXML10EntityReferenceString (cSrc).toCharArray ();
  nPos = 0;
  for (final char cSrc : MASK_ATTRIBUTE_VALUE_XML10_SQ)
    MASK_ATTRIBUTE_VALUE_XML10_SQ_REPLACE[nPos++] = getXML10EntityReferenceString (cSrc).toCharArray ();
  nPos = 0;
  for (final char cSrc : MASK_TEXT_XML10)
    MASK_TEXT_XML10_REPLACE[nPos++] = getXML10EntityReferenceString (cSrc).toCharArray ();

  // XML 1.1
  nPos = 0;
  for (final char cSrc : MASK_ATTRIBUTE_VALUE_XML11_DQ)
    MASK_ATTRIBUTE_VALUE_XML11_DQ_REPLACE[nPos++] = getXML11EntityReferenceString (cSrc).toCharArray ();
  nPos = 0;
  for (final char cSrc : MASK_ATTRIBUTE_VALUE_XML11_SQ)
    MASK_ATTRIBUTE_VALUE_XML11_SQ_REPLACE[nPos++] = getXML11EntityReferenceString (cSrc).toCharArray ();
  nPos = 0;
  for (final char cSrc : MASK_TEXT_XML11)
    MASK_TEXT_XML11_REPLACE[nPos++] = getXML11EntityReferenceString (cSrc).toCharArray ();

  // HTML
  nPos = 0;
  for (final char cSrc : MASK_TEXT_HTML_DQ)
    MASK_TEXT_HTML_DQ_REPLACE[nPos++] = getHTMLEntityReferenceString (cSrc).toCharArray ();
  nPos = 0;
  for (final char cSrc : MASK_TEXT_HTML_SQ)
    MASK_TEXT_HTML_SQ_REPLACE[nPos++] = getHTMLEntityReferenceString (cSrc).toCharArray ();
}
// Instance that is never handed out; judging by the annotation name it
// presumably exists only so that the private constructor is executed for
// code coverage purposes - TODO confirm
@PresentForCodeCoverage
private static final XMLMaskHelper INSTANCE = new XMLMaskHelper ();
// Private constructor: this class only offers static methods
private XMLMaskHelper ()
{}
/**
 * Pick one of three source tables depending on the character mode.
 *
 * @param eXMLCharMode
 *        The character mode to evaluate.
 * @param aDoubleQuoted
 *        Result for attribute values in double quotes.
 * @param aSingleQuoted
 *        Result for attribute values in single quotes.
 * @param aText
 *        Result for text content.
 * @return <code>null</code> for every other character mode.
 */
@Nullable
private static char [] _selectSourceMap (@Nonnull final EXMLCharMode eXMLCharMode,
                                         @Nonnull final char [] aDoubleQuoted,
                                         @Nonnull final char [] aSingleQuoted,
                                         @Nonnull final char [] aText)
{
  switch (eXMLCharMode)
  {
    case ATTRIBUTE_VALUE_DOUBLE_QUOTES:
      return aDoubleQuoted;
    case ATTRIBUTE_VALUE_SINGLE_QUOTES:
      return aSingleQuoted;
    case TEXT:
      return aText;
    default:
      return null;
  }
}

/**
 * Find the table of characters that need to be replaced for the passed
 * combination of serialization version and character mode.
 *
 * @param eXMLVersion
 *        The serialization version.
 * @param eXMLCharMode
 *        The character mode.
 * @return The internal table (not a copy) or <code>null</code> if the
 *         character mode has no table for the version.
 * @throws IllegalArgumentException
 *         If the version is none of XML_10, XML_11 and HTML.
 */
@Nullable
@ReturnsMutableObject ("internal use only")
private static char [] _findSourceMap (@Nonnull final EXMLSerializeVersion eXMLVersion,
                                       @Nonnull final EXMLCharMode eXMLCharMode)
{
  switch (eXMLVersion)
  {
    case XML_10:
      return _selectSourceMap (eXMLCharMode,
                               MASK_ATTRIBUTE_VALUE_XML10_DQ,
                               MASK_ATTRIBUTE_VALUE_XML10_SQ,
                               MASK_TEXT_XML10);
    case XML_11:
      return _selectSourceMap (eXMLCharMode,
                               MASK_ATTRIBUTE_VALUE_XML11_DQ,
                               MASK_ATTRIBUTE_VALUE_XML11_SQ,
                               MASK_TEXT_XML11);
    case HTML:
      // HTML uses the double quote table for text content as well
      return _selectSourceMap (eXMLCharMode, MASK_TEXT_HTML_DQ, MASK_TEXT_HTML_SQ, MASK_TEXT_HTML_DQ);
    default:
      throw new IllegalArgumentException ("Unsupported XML version " + eXMLVersion + "!");
  }
}
/**
 * Pick one of three replacement tables depending on the character mode.
 *
 * @param eXMLCharMode
 *        The character mode to evaluate.
 * @param aDoubleQuoted
 *        Result for attribute values in double quotes.
 * @param aSingleQuoted
 *        Result for attribute values in single quotes.
 * @param aText
 *        Result for text content.
 * @return <code>null</code> for every other character mode.
 */
@Nullable
private static char [] [] _selectReplaceMap (@Nonnull final EXMLCharMode eXMLCharMode,
                                             @Nonnull final char [] [] aDoubleQuoted,
                                             @Nonnull final char [] [] aSingleQuoted,
                                             @Nonnull final char [] [] aText)
{
  switch (eXMLCharMode)
  {
    case ATTRIBUTE_VALUE_DOUBLE_QUOTES:
      return aDoubleQuoted;
    case ATTRIBUTE_VALUE_SINGLE_QUOTES:
      return aSingleQuoted;
    case TEXT:
      return aText;
    default:
      return null;
  }
}

/**
 * Find the table of replacement sequences for the passed combination of
 * serialization version and character mode. The result is the counterpart of
 * the table returned by <code>_findSourceMap</code> for the same arguments.
 *
 * @param eXMLVersion
 *        The serialization version.
 * @param eXMLCharMode
 *        The character mode.
 * @return The internal table (not a copy) or <code>null</code> if the
 *         character mode has no table for the version.
 * @throws IllegalArgumentException
 *         If the version is none of XML_10, XML_11 and HTML.
 */
@Nullable
@ReturnsMutableObject ("internal use only")
private static char [] [] _findReplaceMap (@Nonnull final EXMLSerializeVersion eXMLVersion,
                                           @Nonnull final EXMLCharMode eXMLCharMode)
{
  switch (eXMLVersion)
  {
    case XML_10:
      return _selectReplaceMap (eXMLCharMode,
                                MASK_ATTRIBUTE_VALUE_XML10_DQ_REPLACE,
                                MASK_ATTRIBUTE_VALUE_XML10_SQ_REPLACE,
                                MASK_TEXT_XML10_REPLACE);
    case XML_11:
      return _selectReplaceMap (eXMLCharMode,
                                MASK_ATTRIBUTE_VALUE_XML11_DQ_REPLACE,
                                MASK_ATTRIBUTE_VALUE_XML11_SQ_REPLACE,
                                MASK_TEXT_XML11_REPLACE);
    case HTML:
      // HTML uses the double quote table for text content as well
      return _selectReplaceMap (eXMLCharMode,
                                MASK_TEXT_HTML_DQ_REPLACE,
                                MASK_TEXT_HTML_SQ_REPLACE,
                                MASK_TEXT_HTML_DQ_REPLACE);
    default:
      throw new IllegalArgumentException ("Unsupported XML version " + eXMLVersion + "!");
  }
}
/**
 * Convert the passed set of characters to a primitive char array. The
 * elements are copied in the iteration order of the set.
 *
 * @param aChars
 *        Character set to use. May not be <code>null</code>.
 * @return A new array with the same length as the source set.
 */
@Nonnull
@ReturnsMutableCopy
private static char [] _getAsCharArray (@Nonnull final Set <Character> aChars)
{
  ValueEnforcer.notNull (aChars, "Chars");
  final char [] ret = new char [aChars.size ()];
  int nIndex = 0;
  // The type argument is required here: iterating a raw Set only yields
  // Object and cannot be assigned to Character
  for (final Character aChar : aChars)
    ret[nIndex++] = aChar.charValue ();
  return ret;
}
/**
 * Create a replacement table that maps every source character to an empty
 * character sequence.
 *
 * @param aSrcMap
 *        The source characters. Only the length is used.
 * @return A new array with one empty replacement per source character.
 */
@Nonnull
@ReturnsMutableCopy
private static char [] [] _createEmptyReplacement (@Nonnull final char [] aSrcMap)
{
  final char [] [] aEmptyReplacements = new char [aSrcMap.length] [];
  // All slots share the same empty array instance
  java.util.Arrays.fill (aEmptyReplacements, ArrayHelper.EMPTY_CHAR_ARRAY);
  return aEmptyReplacements;
}
/**
 * Mask the passed text for the given serialization version and character
 * mode and return the result as a char array.
 *
 * @param eXMLVersion
 *        The serialization version that selects the masking tables.
 * @param eXMLCharMode
 *        The character mode that selects the masking tables.
 * @param eIncorrectCharHandling
 *        Determines whether invalid characters are searched, reported and
 *        optionally removed before masking.
 * @param s
 *        The text to mask. May be <code>null</code>.
 * @return An empty array if the text is <code>null</code> or empty, the
 *         masked characters otherwise. Never <code>null</code>.
 */
@Nonnull
public static char [] getMaskedXMLText (@Nonnull final EXMLSerializeVersion eXMLVersion,
                                        @Nonnull final EXMLCharMode eXMLCharMode,
                                        @Nonnull final EXMLIncorrectCharacterHandling eIncorrectCharHandling,
                                        @Nullable final String s)
{
  if (StringHelper.hasNoText (s))
    return ArrayHelper.EMPTY_CHAR_ARRAY;

  char [] aChars = s.toCharArray ();

  // 1. do incorrect character handling
  if (eIncorrectCharHandling.isTestRequired ())
  {
    if (XMLCharHelper.containsInvalidXMLChar (eXMLVersion, eXMLCharMode, aChars))
    {
      // Parameterized instead of raw type, so that the calls below are type
      // checked
      final ICommonsSet <Character> aAllInvalidChars = XMLCharHelper.getAllInvalidXMLChars (eXMLVersion,
                                                                                            eXMLCharMode,
                                                                                            aChars);
      eIncorrectCharHandling.notifyOnInvalidXMLCharacter (s, aAllInvalidChars);
      if (eIncorrectCharHandling.isReplaceWithNothing ())
      {
        // Drop each invalid character by replacing it with an empty sequence
        final char [] aSrcMap = _getAsCharArray (aAllInvalidChars);
        final char [] [] aDstMap = _createEmptyReplacement (aSrcMap);
        aChars = StringHelper.replaceMultiple (s, aSrcMap, aDstMap);
      }
    }
  }

  // 2. perform entity replacements if necessary
  final char [] aSrcMap = _findSourceMap (eXMLVersion, eXMLCharMode);
  if (aSrcMap == null)
  {
    // Nothing to replace
    return aChars;
  }
  final char [] [] aDstMap = _findReplaceMap (eXMLVersion, eXMLCharMode);
  return StringHelper.replaceMultiple (aChars, aSrcMap, aDstMap);
}
/**
 * Convenience overload that converts the passed {@link EXMLVersion} via
 * <code>EXMLSerializeVersion.getFromXMLVersionOrThrow</code> and then
 * delegates to the overload taking an <code>EXMLSerializeVersion</code>.
 *
 * @param eXMLVersion
 *        The XML version to be converted.
 * @param eXMLCharMode
 *        The character mode, passed on unchanged.
 * @param eIncorrectCharHandling
 *        The invalid character handling, passed on unchanged.
 * @param s
 *        The text to measure. May be <code>null</code>.
 * @return The result of the delegate.
 */
@Nonnegative
public static int getMaskedXMLTextLength (@Nonnull final EXMLVersion eXMLVersion,
                                          @Nonnull final EXMLCharMode eXMLCharMode,
                                          @Nonnull final EXMLIncorrectCharacterHandling eIncorrectCharHandling,
                                          @Nullable final String s)
{
  final EXMLSerializeVersion eSerializeVersion = EXMLSerializeVersion.getFromXMLVersionOrThrow (eXMLVersion);
  return getMaskedXMLTextLength (eSerializeVersion, eXMLCharMode, eIncorrectCharHandling, s);
}
/**
 * Determine the number of chars the passed text has after masking, without
 * building the masked text for the entity replacement step.
 *
 * @param eXMLVersion
 *        The serialization version that selects the masking tables.
 * @param eXMLCharMode
 *        The character mode that selects the masking tables.
 * @param eIncorrectCharHandling
 *        Determines whether invalid characters are searched, reported and
 *        optionally removed before measuring.
 * @param s
 *        The text to measure. May be <code>null</code>.
 * @return 0 if the text is <code>null</code> or empty, the length in chars
 *         otherwise.
 */
@Nonnegative
public static int getMaskedXMLTextLength (@Nonnull final EXMLSerializeVersion eXMLVersion,
                                          @Nonnull final EXMLCharMode eXMLCharMode,
                                          @Nonnull final EXMLIncorrectCharacterHandling eIncorrectCharHandling,
                                          @Nullable final String s)
{
  if (StringHelper.hasNoText (s))
    return 0;

  char [] aChars = s.toCharArray ();

  // 1. do incorrect character handling
  if (eIncorrectCharHandling.isTestRequired () &&
      XMLCharHelper.containsInvalidXMLChar (eXMLVersion, eXMLCharMode, aChars))
  {
    // Parameterized instead of raw type, so that the calls below are type
    // checked
    final ICommonsSet <Character> aAllInvalidChars = XMLCharHelper.getAllInvalidXMLChars (eXMLVersion,
                                                                                          eXMLCharMode,
                                                                                          aChars);
    eIncorrectCharHandling.notifyOnInvalidXMLCharacter (s, aAllInvalidChars);
    if (eIncorrectCharHandling.isReplaceWithNothing ())
    {
      // Drop each invalid character by replacing it with an empty sequence
      final char [] aSrcMap = _getAsCharArray (aAllInvalidChars);
      final char [] [] aDstMap = _createEmptyReplacement (aSrcMap);
      aChars = StringHelper.replaceMultiple (s, aSrcMap, aDstMap);
    }
  }

  // 2. perform entity replacements if necessary
  final char [] aSrcMap = _findSourceMap (eXMLVersion, eXMLCharMode);
  if (aSrcMap == null)
  {
    // Nothing to replace
    return aChars.length;
  }

  final char [] [] aDstMap = _findReplaceMap (eXMLVersion, eXMLCharMode);
  final int nResLen = StringHelper.getReplaceMultipleResultLength (aChars, aSrcMap, aDstMap);
  // ILLEGAL_UINT is treated as "length unchanged"
  // Reminder: Surrogate characters count as 2
  return nResLen == CGlobal.ILLEGAL_UINT ? aChars.length : nResLen;
}
/**
 * Mask the passed text and write the result to the passed writer. Nothing is
 * written if the text is <code>null</code> or empty.
 *
 * @param eXMLVersion
 *        The serialization version that selects the masking tables.
 * @param eXMLCharMode
 *        The character mode that selects the masking tables.
 * @param eIncorrectCharHandling
 *        The invalid character handling to apply.
 * @param s
 *        The text to mask. May be <code>null</code>.
 * @param aWriter
 *        The writer to write to.
 * @throws IOException
 *         If the char array based overload throws it.
 */
public static void maskXMLTextTo (@Nonnull final EXMLSerializeVersion eXMLVersion,
                                  @Nonnull final EXMLCharMode eXMLCharMode,
                                  @Nonnull final EXMLIncorrectCharacterHandling eIncorrectCharHandling,
                                  @Nullable final String s,
                                  @Nonnull final Writer aWriter) throws IOException
{
  if (!StringHelper.hasText (s))
    return;

  // Delegate the complete text to the char array based overload
  final char [] aAllChars = s.toCharArray ();
  maskXMLTextTo (eXMLVersion, eXMLCharMode, eIncorrectCharHandling, aAllChars, 0, aAllChars.length, aWriter);
}
/**
 * Mask a range of the passed char array and write the result to the passed
 * writer. Nothing is written if the length is 0 or if nothing is left after
 * invalid characters were removed.
 *
 * @param eXMLVersion
 *        The serialization version that selects the masking tables.
 * @param eXMLCharMode
 *        The character mode that selects the masking tables.
 * @param eIncorrectCharHandling
 *        Determines whether invalid characters are searched, reported and
 *        optionally removed before masking.
 * @param aSrcText
 *        The chars to mask.
 * @param nOfs
 *        Index of the first char to use.
 * @param nLen
 *        Number of chars to use.
 * @param aWriter
 *        The writer to write to.
 * @throws IOException
 *         If writing fails.
 */
public static void maskXMLTextTo (@Nonnull final EXMLSerializeVersion eXMLVersion,
                                  @Nonnull final EXMLCharMode eXMLCharMode,
                                  @Nonnull final EXMLIncorrectCharacterHandling eIncorrectCharHandling,
                                  @Nonnull final char [] aSrcText,
                                  @Nonnegative final int nOfs,
                                  @Nonnegative final int nLen,
                                  @Nonnull final Writer aWriter) throws IOException
{
  if (nLen == 0)
    return;

  // Range to be written; switched to a filtered copy if invalid characters
  // are removed
  char [] aChars = aSrcText;
  int nRealOfs = nOfs;
  int nRealLen = nLen;

  // 1. do incorrect character handling
  if (eIncorrectCharHandling.isTestRequired ())
  {
    if (XMLCharHelper.containsInvalidXMLChar (eXMLVersion, eXMLCharMode, aChars, nRealOfs, nRealLen))
    {
      // Parameterized instead of raw type, so that the calls below are type
      // checked
      final ICommonsOrderedSet <Character> aAllInvalidChars = XMLCharHelper.getAllInvalidXMLChars (eXMLVersion,
                                                                                                   eXMLCharMode,
                                                                                                   aChars,
                                                                                                   nRealOfs,
                                                                                                   nRealLen);
      // Here we can convert, because this part should not be called very
      // often
      final String s = new String (aSrcText, nRealOfs, nRealLen);
      eIncorrectCharHandling.notifyOnInvalidXMLCharacter (s, aAllInvalidChars);
      if (eIncorrectCharHandling.isReplaceWithNothing ())
      {
        // Drop each invalid character by replacing it with an empty sequence
        final char [] aSrcMap = _getAsCharArray (aAllInvalidChars);
        final char [] [] aDstMap = _createEmptyReplacement (aSrcMap);
        aChars = StringHelper.replaceMultiple (s, aSrcMap, aDstMap);
        // The filtered copy only contains the requested range
        nRealOfs = 0;
        nRealLen = aChars.length;
        if (nRealLen == 0)
        {
          // Nothing left after replacement
          return;
        }
      }
    }
  }

  // 2. perform entity replacements if necessary
  final char [] aSrcMap = _findSourceMap (eXMLVersion, eXMLCharMode);
  if (aSrcMap == null)
  {
    // Nothing to replace
    // TODO write code points as XML entities
    aWriter.write (aChars, nRealOfs, nRealLen);
  }
  else
  {
    final char [] [] aDstMap = _findReplaceMap (eXMLVersion, eXMLCharMode);
    // TODO write code points as XML entities
    StringHelper.replaceMultipleTo (aChars, nRealOfs, nRealLen, aSrcMap, aDstMap, aWriter);
  }
}
}