// com.fitbur.guava.common.xml.XmlEscapers (artifact page header: Maven / Gradle / Ivy)
/*
* Copyright (C) 2009 The Guava Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.fitbur.guava.common.xml;
import com.fitbur.guava.common.annotations.Beta;
import com.fitbur.guava.common.annotations.GwtCompatible;
import com.fitbur.guava.common.escape.Escaper;
import com.fitbur.guava.common.escape.Escapers;
/**
* {@code Escaper} instances suitable for strings to be included in XML
* attribute values and elements' text contents. When possible, avoid manual
* escaping by using templating systems and high-level APIs that provide
* autoescaping. For example, consider XOM or
* JDOM.
*
* Note: Currently the escapers provided by this class do not escape
* any characters outside the ASCII character range. Unlike HTML escaping the
* XML escapers will not escape non-ASCII characters to their numeric entity
* replacements. These XML escapers provide the minimal level of escaping to
* ensure that the output can be safely included in a Unicode XML document.
*
*
*
For details on the behavior of the escapers in this class, see sections
* 2.2 and
* 2.4 of the
* XML specification.
*
* @author Alex Matevossian
* @author David Beaumont
* @since 15.0
*/
@Beta
@GwtCompatible
public class XmlEscapers {
private XmlEscapers() {}
private static final char MIN_ASCII_CONTROL_CHAR = 0x00;
private static final char MAX_ASCII_CONTROL_CHAR = 0x1F;
// For each xxxEscaper() method, please add links to external reference pages
// that are considered authoritative for the behavior of that escaper.
/**
* Returns an {@link Escaper} instance that escapes special characters in a
* string so it can safely be included in an XML document as element content.
* See section
* 2.4 of the
* XML specification.
*
*
Note: Double and single quotes are not escaped, so it is not
* safe to use this escaper to escape attribute values. Use
* {@link #xmlContentEscaper} if the output can appear in element content or
* {@link #xmlAttributeEscaper} in attribute values.
*
*
This escaper substitutes {@code 0xFFFD} for non-whitespace control
* characters and the character values {@code 0xFFFE} and {@code 0xFFFF} which
* are not permitted in XML. For more detail see section 2.2 of the
* XML specification.
*
*
This escaper does not escape non-ASCII characters to their numeric
* character references (NCR). Any non-ASCII characters appearing in the input
* will be preserved in the output. Specifically "\r" (carriage return) is
* preserved in the output, which may result in it being silently converted to
* "\n" when the XML is parsed.
*
*
This escaper does not treat surrogate pairs specially and does not
* perform Unicode validation on its input.
*/
public static Escaper xmlContentEscaper() {
return XML_CONTENT_ESCAPER;
}
/**
* Returns an {@link Escaper} instance that escapes special characters in a
* string so it can safely be included in XML document as an attribute value.
* See section
* 3.3.3
* of the XML specification.
*
*
This escaper substitutes {@code 0xFFFD} for non-whitespace control
* characters and the character values {@code 0xFFFE} and {@code 0xFFFF} which
* are not permitted in XML. For more detail see section 2.2 of the
* XML specification.
*
*
This escaper does not escape non-ASCII characters to their numeric
* character references (NCR). However, horizontal tab {@code '\t'}, line feed
* {@code '\n'} and carriage return {@code '\r'} are escaped to a
* corresponding NCR {@code " "}, {@code "
"}, and {@code "
"}
* respectively. Any other non-ASCII characters appearing in the input will
* be preserved in the output.
*
*
This escaper does not treat surrogate pairs specially and does not
* perform Unicode validation on its input.
*/
public static Escaper xmlAttributeEscaper() {
return XML_ATTRIBUTE_ESCAPER;
}
private static final Escaper XML_ESCAPER;
private static final Escaper XML_CONTENT_ESCAPER;
private static final Escaper XML_ATTRIBUTE_ESCAPER;
static {
Escapers.Builder builder = Escapers.builder();
// The char values \uFFFE and \uFFFF are explicitly not allowed in XML
// (Unicode code points above \uFFFF are represented via surrogate pairs
// which means they are treated as pairs of safe characters).
builder.setSafeRange(Character.MIN_VALUE, '\uFFFD');
// Unsafe characters are replaced with the Unicode replacement character.
builder.setUnsafeReplacement("\uFFFD");
/*
* Except for \n, \t, and \r, all ASCII control characters are replaced with
* the Unicode replacement character.
*
* Implementation note: An alternative to the following would be to make a
* map that simply replaces the allowed ASCII whitespace characters with
* themselves and to set the minimum safe character to 0x20. However this
* would slow down the escaping of simple strings that contain \t, \n, or
* \r.
*/
for (char c = MIN_ASCII_CONTROL_CHAR; c <= MAX_ASCII_CONTROL_CHAR; c++) {
if (c != '\t' && c != '\n' && c != '\r') {
builder.addEscape(c, "\uFFFD");
}
}
// Build the content escaper first and then add quote escaping for the
// general escaper.
builder.addEscape('&', "&");
builder.addEscape('<', "<");
builder.addEscape('>', ">");
XML_CONTENT_ESCAPER = builder.build();
builder.addEscape('\'', "'");
builder.addEscape('"', """);
XML_ESCAPER = builder.build();
builder.addEscape('\t', " ");
builder.addEscape('\n', "
");
builder.addEscape('\r', "
");
XML_ATTRIBUTE_ESCAPER = builder.build();
}
}