// Source: com.landawn.abacus.util.EscapeUtil (Maven / Gradle / Ivy)
// Artifact: abacus-util-se
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.landawn.abacus.util;
import java.io.IOException;
import java.io.StringWriter;
import java.io.Writer;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
/**
* Note: this class is copied from StringEscapeUtils in Apache Commons Lang, under the Apache License 2.0.
*
* Escapes and unescapes {@code String}s for
* Java, JavaScript (EcmaScript), JSON, HTML, XML and CSV.
*
* #ThreadSafe#
* @since 2.0
*/
public class EscapeUtil {
/**
 * Linefeed character LF ({@code '\n'}, U+000A).
 *
 * @see "Java Language Specification: Escape Sequences for Character and String Literals"
 * @since 2.2
 */
static final char LF = '\n';
/**
 * Carriage return character CR ({@code '\r'}, U+000D).
 *
 * @see "Java Language Specification: Escape Sequences for Character and String Literals"
 * @since 2.2
 */
static final char CR = '\r';
/* ESCAPE TRANSLATORS */
/**
 * Translator object for escaping Java.
 *
 * <p>Chains, in order: a lookup table for the double quote and the backslash,
 * a lookup table built from {@code EntityArrays.JAVA_CTRL_CHARS_ESCAPE()}, and
 * {@code JavaUnicodeEscaper.outsideOf(32, 0x7f)}.</p>
 *
 * <p>While {@link #escapeJava(String)} is the expected method of use, this
 * object allows the Java escaping functionality to be used
 * as the foundation for a custom translator.</p>
 *
 * @since 3.0
 */
public static final CharSequenceTranslator ESCAPE_JAVA = new LookupTranslator(new String[][] { { "\"", "\\\"" }, { "\\", "\\\\" }, })
.with(new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_ESCAPE()))
.with(JavaUnicodeEscaper.outsideOf(32, 0x7f));
/**
 * Translator object for escaping EcmaScript/JavaScript.
 *
 * <p>Aggregates, in order: a lookup table for the single quote, double quote,
 * backslash and forward slash, a lookup table built from
 * {@code EntityArrays.JAVA_CTRL_CHARS_ESCAPE()}, and
 * {@code JavaUnicodeEscaper.outsideOf(32, 0x7f)}.</p>
 *
 * <p>While {@link #escapeEcmaScript(String)} is the expected method of use, this
 * object allows the EcmaScript escaping functionality to be used
 * as the foundation for a custom translator.</p>
 *
 * @since 3.0
 */
public static final CharSequenceTranslator ESCAPE_ECMASCRIPT = new AggregateTranslator(
new LookupTranslator(new String[][] { { "'", "\\'" }, { "\"", "\\\"" }, { "\\", "\\\\" }, { "/", "\\/" } }),
new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_ESCAPE()), JavaUnicodeEscaper.outsideOf(32, 0x7f));
/**
 * Translator object for escaping Json.
 *
 * <p>Aggregates, in order: a lookup table for the double quote, backslash and
 * forward slash, a lookup table built from
 * {@code EntityArrays.JAVA_CTRL_CHARS_ESCAPE()}, and
 * {@code JavaUnicodeEscaper.outsideOf(32, 0x7f)}. Unlike
 * {@link #ESCAPE_ECMASCRIPT}, the single quote is not in the lookup table.</p>
 *
 * <p>While {@link #escapeJson(String)} is the expected method of use, this
 * object allows the Json escaping functionality to be used
 * as the foundation for a custom translator.</p>
 *
 * @since 3.2
 */
public static final CharSequenceTranslator ESCAPE_JSON = new AggregateTranslator(
new LookupTranslator(new String[][] { { "\"", "\\\"" }, { "\\", "\\\\" }, { "/", "\\/" } }),
new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_ESCAPE()), JavaUnicodeEscaper.outsideOf(32, 0x7f));
/**
 * Translator object for escaping XML 1.0.
 *
 * <p>Aggregates, in order: lookup tables built from
 * {@code EntityArrays.BASIC_ESCAPE()} and {@code EntityArrays.APOS_ESCAPE()};
 * a lookup table that maps U+0000-U+0008, U+000B, U+000C, U+000E-U+001F,
 * U+FFFE and U+FFFF to {@code N.EMPTY_STRING} (tab U+0009, LF U+000A and
 * CR U+000D are deliberately absent from that table); numeric entity escapers
 * for 0x7f-0x84 and 0x86-0x9f; and a {@code UnicodeUnpairedSurrogateRemover}.</p>
 *
 * <p>While {@link #escapeXml10(String)} is the expected method of use, this
 * object allows the XML escaping functionality to be used
 * as the foundation for a custom translator.</p>
 *
 * @since 3.3
 */
public static final CharSequenceTranslator ESCAPE_XML10 = new AggregateTranslator(new LookupTranslator(EntityArrays.BASIC_ESCAPE()),
new LookupTranslator(EntityArrays.APOS_ESCAPE()),
new LookupTranslator(new String[][] { { "\u0000", N.EMPTY_STRING }, { "\u0001", N.EMPTY_STRING }, { "\u0002", N.EMPTY_STRING },
{ "\u0003", N.EMPTY_STRING }, { "\u0004", N.EMPTY_STRING }, { "\u0005", N.EMPTY_STRING }, { "\u0006", N.EMPTY_STRING },
{ "\u0007", N.EMPTY_STRING }, { "\u0008", N.EMPTY_STRING }, { "\u000b", N.EMPTY_STRING }, { "\u000c", N.EMPTY_STRING },
{ "\u000e", N.EMPTY_STRING }, { "\u000f", N.EMPTY_STRING }, { "\u0010", N.EMPTY_STRING }, { "\u0011", N.EMPTY_STRING },
{ "\u0012", N.EMPTY_STRING }, { "\u0013", N.EMPTY_STRING }, { "\u0014", N.EMPTY_STRING }, { "\u0015", N.EMPTY_STRING },
{ "\u0016", N.EMPTY_STRING }, { "\u0017", N.EMPTY_STRING }, { "\u0018", N.EMPTY_STRING }, { "\u0019", N.EMPTY_STRING },
{ "\u001a", N.EMPTY_STRING }, { "\u001b", N.EMPTY_STRING }, { "\u001c", N.EMPTY_STRING }, { "\u001d", N.EMPTY_STRING },
{ "\u001e", N.EMPTY_STRING }, { "\u001f", N.EMPTY_STRING }, { "\ufffe", N.EMPTY_STRING }, { "\uffff", N.EMPTY_STRING } }),
NumericEntityEscaper.between(0x7f, 0x84), NumericEntityEscaper.between(0x86, 0x9f), new UnicodeUnpairedSurrogateRemover());
/**
 * Translator object for escaping XML 1.1.
 *
 * <p>Aggregates, in order: lookup tables built from
 * {@code EntityArrays.BASIC_ESCAPE()} and {@code EntityArrays.APOS_ESCAPE()};
 * a lookup table that removes U+0000, U+FFFE and U+FFFF and replaces U+000B and
 * U+000C with their numeric character references; numeric entity escapers for
 * 0x1-0x8, 0xe-0x1f, 0x7f-0x84 and 0x86-0x9f; and a
 * {@code UnicodeUnpairedSurrogateRemover}.</p>
 *
 * <p>While {@link #escapeXml11(String)} is the expected method of use, this
 * object allows the XML escaping functionality to be used
 * as the foundation for a custom translator.</p>
 *
 * @since 3.3
 */
public static final CharSequenceTranslator ESCAPE_XML11 = new AggregateTranslator(new LookupTranslator(EntityArrays.BASIC_ESCAPE()),
        new LookupTranslator(EntityArrays.APOS_ESCAPE()),
        // U+000B and U+000C are legal in XML 1.1 only as character references, so they are
        // escaped (not dropped), matching the documented contract of escapeXml11.
        new LookupTranslator(new String[][] { { "\u0000", N.EMPTY_STRING }, { "\u000b", "&#11;" }, { "\u000c", "&#12;" },
                { "\ufffe", N.EMPTY_STRING }, { "\uffff", N.EMPTY_STRING } }),
        NumericEntityEscaper.between(0x1, 0x8), NumericEntityEscaper.between(0xe, 0x1f), NumericEntityEscaper.between(0x7f, 0x84),
        NumericEntityEscaper.between(0x86, 0x9f), new UnicodeUnpairedSurrogateRemover());
/**
 * Translator object for escaping HTML version 3.0.
 *
 * <p>Aggregates lookup tables built from {@code EntityArrays.BASIC_ESCAPE()}
 * and {@code EntityArrays.ISO8859_1_ESCAPE()}, in that order.</p>
 *
 * <p>While {@link #escapeHtml3(String)} is the expected method of use, this
 * object allows the HTML escaping functionality to be used
 * as the foundation for a custom translator.</p>
 *
 * @since 3.0
 */
public static final CharSequenceTranslator ESCAPE_HTML3 = new AggregateTranslator(new LookupTranslator(EntityArrays.BASIC_ESCAPE()),
new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE()));
/**
 * Translator object for escaping HTML version 4.0.
 *
 * <p>Aggregates lookup tables built from {@code EntityArrays.BASIC_ESCAPE()},
 * {@code EntityArrays.ISO8859_1_ESCAPE()} and
 * {@code EntityArrays.HTML40_EXTENDED_ESCAPE()}, in that order.</p>
 *
 * <p>While {@link #escapeHtml4(String)} is the expected method of use, this
 * object allows the HTML escaping functionality to be used
 * as the foundation for a custom translator.</p>
 *
 * @since 3.0
 */
public static final CharSequenceTranslator ESCAPE_HTML4 = new AggregateTranslator(new LookupTranslator(EntityArrays.BASIC_ESCAPE()),
new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE()), new LookupTranslator(EntityArrays.HTML40_EXTENDED_ESCAPE()));
/**
 * Translator object for escaping individual Comma Separated Values.
 *
 * <p>Backed by a {@code CsvEscaper} instance.</p>
 *
 * <p>While {@link #escapeCsv(String)} is the expected method of use, this
 * object allows the CSV escaping functionality to be used
 * as the foundation for a custom translator.</p>
 *
 * @since 3.0
 */
public static final CharSequenceTranslator ESCAPE_CSV = new CsvEscaper();
/* UNESCAPE TRANSLATORS */
/**
 * Translator object for unescaping escaped Java.
 *
 * <p>Aggregates, in order: an {@code OctalUnescaper}, a {@code UnicodeUnescaper},
 * a lookup table built from {@code EntityArrays.JAVA_CTRL_CHARS_UNESCAPE()}, and
 * a lookup table for the escaped backslash, double quote and single quote whose
 * last entry maps a lone backslash to the empty string.</p>
 *
 * <p>While {@link #unescapeJava(String)} is the expected method of use, this
 * object allows the Java unescaping functionality to be used
 * as the foundation for a custom translator.</p>
 *
 * @since 3.0
 */
// TODO: throw "illegal character: \92" as an Exception if a \ on the end of the Java (as per the compiler)?
public static final CharSequenceTranslator UNESCAPE_JAVA = new AggregateTranslator(new OctalUnescaper(), // .between('\1', '\377'),
new UnicodeUnescaper(), new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_UNESCAPE()),
new LookupTranslator(new String[][] { { "\\\\", "\\" }, { "\\\"", "\"" }, { "\\'", "'" }, { "\\", "" } }));
/**
 * Translator object for unescaping escaped EcmaScript.
 *
 * <p>This is the same instance as {@link #UNESCAPE_JAVA}.</p>
 *
 * <p>While {@link #unescapeEcmaScript(String)} is the expected method of use, this
 * object allows the EcmaScript unescaping functionality to be used
 * as the foundation for a custom translator.</p>
 *
 * @since 3.0
 */
public static final CharSequenceTranslator UNESCAPE_ECMASCRIPT = UNESCAPE_JAVA;
/**
 * Translator object for unescaping escaped Json.
 *
 * <p>This is the same instance as {@link #UNESCAPE_JAVA}.</p>
 *
 * <p>While {@link #unescapeJson(String)} is the expected method of use, this
 * object allows the Json unescaping functionality to be used
 * as the foundation for a custom translator.</p>
 *
 * @since 3.2
 */
public static final CharSequenceTranslator UNESCAPE_JSON = UNESCAPE_JAVA;
/**
 * Translator object for unescaping escaped HTML 3.0.
 *
 * <p>Aggregates lookup tables built from {@code EntityArrays.BASIC_UNESCAPE()}
 * and {@code EntityArrays.ISO8859_1_UNESCAPE()}, followed by a
 * {@code NumericEntityUnescaper}.</p>
 *
 * <p>While {@link #unescapeHtml3(String)} is the expected method of use, this
 * object allows the HTML unescaping functionality to be used
 * as the foundation for a custom translator.</p>
 *
 * @since 3.0
 */
public static final CharSequenceTranslator UNESCAPE_HTML3 = new AggregateTranslator(new LookupTranslator(EntityArrays.BASIC_UNESCAPE()),
new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE()), new NumericEntityUnescaper());
/**
 * Translator object for unescaping escaped HTML 4.0.
 *
 * <p>Aggregates lookup tables built from {@code EntityArrays.BASIC_UNESCAPE()},
 * {@code EntityArrays.ISO8859_1_UNESCAPE()} and
 * {@code EntityArrays.HTML40_EXTENDED_UNESCAPE()}, followed by a
 * {@code NumericEntityUnescaper}.</p>
 *
 * <p>While {@link #unescapeHtml4(String)} is the expected method of use, this
 * object allows the HTML unescaping functionality to be used
 * as the foundation for a custom translator.</p>
 *
 * @since 3.0
 */
public static final CharSequenceTranslator UNESCAPE_HTML4 = new AggregateTranslator(new LookupTranslator(EntityArrays.BASIC_UNESCAPE()),
new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE()), new LookupTranslator(EntityArrays.HTML40_EXTENDED_UNESCAPE()),
new NumericEntityUnescaper());
/**
 * Translator object for unescaping escaped XML.
 *
 * <p>Aggregates lookup tables built from {@code EntityArrays.BASIC_UNESCAPE()}
 * and {@code EntityArrays.APOS_UNESCAPE()}, followed by a
 * {@code NumericEntityUnescaper}.</p>
 *
 * <p>While {@link #unescapeXml(String)} is the expected method of use, this
 * object allows the XML unescaping functionality to be used
 * as the foundation for a custom translator.</p>
 *
 * @since 3.0
 */
public static final CharSequenceTranslator UNESCAPE_XML = new AggregateTranslator(new LookupTranslator(EntityArrays.BASIC_UNESCAPE()),
new LookupTranslator(EntityArrays.APOS_UNESCAPE()), new NumericEntityUnescaper());
/**
 * Translator object for unescaping escaped Comma Separated Value entries.
 *
 * <p>Backed by a {@code CsvUnescaper} instance.</p>
 *
 * <p>While {@link #unescapeCsv(String)} is the expected method of use, this
 * object allows the CSV unescaping functionality to be used
 * as the foundation for a custom translator.</p>
 *
 * @since 3.0
 */
public static final CharSequenceTranslator UNESCAPE_CSV = new CsvUnescaper();
/* Helper functions */
/**
 * {@code EscapeUtil} is a static utility class and is not meant to be
 * instantiated.
 *
 * <p>Use it through its static members, for example:</p>
 * <pre>EscapeUtil.escapeJava("foo");</pre>
 *
 * <p>The constructor is private, so no instance can be created from outside
 * this class.</p>
 */
private EscapeUtil() {
// utility class: no instances.
}
// Java and JavaScript
//--------------------------------------------------------------------------
/**
 * Escapes the characters in a {@code String} using Java String rules.
 *
 * <p>Deals correctly with quotes and control-chars (tab, backslash, cr, ff, etc.),
 * so a tab becomes the characters {@code '\\'} and {@code 't'}.</p>
 *
 * <p>The only difference between Java strings and JavaScript strings
 * is that in JavaScript, a single quote and forward-slash (/) are escaped.</p>
 *
 * <p>Example:</p>
 * <pre>
 * input string: He didn't say, "Stop!"
 * output string: He didn't say, \"Stop!\"
 * </pre>
 *
 * <p>Delegates to {@link #ESCAPE_JAVA}.</p>
 *
 * @param input String to escape values in, may be null
 * @return String with escaped values, {@code null} if null string input
 */
public static final String escapeJava(final String input) {
    final String escaped = ESCAPE_JAVA.translate(input);
    return escaped;
}
/**
 * Escapes the characters in a {@code String} using EcmaScript String rules.
 *
 * <p>Escapes any values it finds into their EcmaScript String form and
 * deals correctly with quotes and control-chars (tab, backslash, cr, ff, etc.),
 * so a tab becomes the characters {@code '\\'} and {@code 't'}.</p>
 *
 * <p>The only difference between Java strings and EcmaScript strings
 * is that in EcmaScript, a single quote and forward-slash (/) are escaped.</p>
 *
 * <p>Note that EcmaScript is best known by the JavaScript and ActionScript dialects.</p>
 *
 * <p>Example:</p>
 * <pre>
 * input string: He didn't say, "Stop!"
 * output string: He didn\'t say, \"Stop!\"
 * </pre>
 *
 * <p>Delegates to {@link #ESCAPE_ECMASCRIPT}.</p>
 *
 * @param input String to escape values in, may be null
 * @return String with escaped values, {@code null} if null string input
 *
 * @since 3.0
 */
public static final String escapeEcmaScript(final String input) {
    final String escaped = ESCAPE_ECMASCRIPT.translate(input);
    return escaped;
}
/**
 * Escapes the characters in a {@code String} using Json String rules.
 *
 * <p>Escapes any values it finds into their Json String form and
 * deals correctly with quotes and control-chars (tab, backslash, cr, ff, etc.),
 * so a tab becomes the characters {@code '\\'} and {@code 't'}.</p>
 *
 * <p>The only difference between Java strings and Json strings
 * is that in Json, forward-slash (/) is escaped.</p>
 *
 * <p>See http://www.ietf.org/rfc/rfc4627.txt for further details.</p>
 *
 * <p>Example:</p>
 * <pre>
 * input string: He didn't say, "Stop!"
 * output string: He didn't say, \"Stop!\"
 * </pre>
 *
 * <p>Delegates to {@link #ESCAPE_JSON}.</p>
 *
 * @param input String to escape values in, may be null
 * @return String with escaped values, {@code null} if null string input
 *
 * @since 3.2
 */
public static final String escapeJson(final String input) {
    final String escaped = ESCAPE_JSON.translate(input);
    return escaped;
}
/**
 * Unescapes any Java literals found in the {@code String}.
 *
 * <p>For example, it will turn a sequence of {@code '\'} and
 * {@code 'n'} into a newline character, unless the {@code '\'}
 * is preceded by another {@code '\'}.</p>
 *
 * <p>Delegates to {@link #UNESCAPE_JAVA}.</p>
 *
 * @param input the {@code String} to unescape, may be null
 * @return a new unescaped {@code String}, {@code null} if null string input
 */
public static final String unescapeJava(final String input) {
    final String unescaped = UNESCAPE_JAVA.translate(input);
    return unescaped;
}
/**
 * Unescapes any EcmaScript literals found in the {@code String}.
 *
 * <p>For example, it will turn a sequence of {@code '\'} and {@code 'n'}
 * into a newline character, unless the {@code '\'} is preceded by another
 * {@code '\'}.</p>
 *
 * <p>Delegates to {@link #UNESCAPE_ECMASCRIPT}.</p>
 *
 * @param input the {@code String} to unescape, may be null
 * @return A new unescaped {@code String}, {@code null} if null string input
 * @see #unescapeJava(String)
 * @since 3.0
 */
public static final String unescapeEcmaScript(final String input) {
    final String unescaped = UNESCAPE_ECMASCRIPT.translate(input);
    return unescaped;
}
/**
 * Unescapes any Json literals found in the {@code String}.
 *
 * <p>For example, it will turn a sequence of {@code '\'} and {@code 'n'}
 * into a newline character, unless the {@code '\'} is preceded by another
 * {@code '\'}.</p>
 *
 * <p>Delegates to {@link #UNESCAPE_JSON}.</p>
 *
 * @param input the {@code String} to unescape, may be null
 * @return A new unescaped {@code String}, {@code null} if null string input
 * @see #unescapeJava(String)
 * @since 3.2
 */
public static final String unescapeJson(final String input) {
    final String unescaped = UNESCAPE_JSON.translate(input);
    return unescaped;
}
// HTML and XML
//--------------------------------------------------------------------------
/**
 * Escapes the characters in a {@code String} using HTML entities.
 *
 * <p>For example:</p>
 * <p>{@code "bread" & "butter"}</p>
 * <p>becomes:</p>
 * <p>{@code &quot;bread&quot; &amp; &quot;butter&quot;}.</p>
 *
 * <p>Supports all known HTML 4.0 entities, including funky accents.
 * Note that the commonly used apostrophe escape character ({@code &apos;})
 * is not a legal entity and so is not supported.</p>
 *
 * <p>Delegates to {@link #ESCAPE_HTML4}.</p>
 *
 * @param input the {@code String} to escape, may be null
 * @return a new escaped {@code String}, {@code null} if null string input
 *
 * @see "ISO Entities"
 * @see "HTML 3.2 Character Entities for ISO Latin-1"
 * @see "HTML 4.0 Character entity references"
 * @see "HTML 4.01 Character References"
 * @see "HTML 4.01 Code positions"
 *
 * @since 3.0
 */
public static final String escapeHtml4(final String input) {
    final String escaped = ESCAPE_HTML4.translate(input);
    return escaped;
}
/**
 * Escapes the characters in a {@code String} using HTML entities.
 *
 * <p>Supports only the HTML 3.0 entities. Delegates to {@link #ESCAPE_HTML3}.</p>
 *
 * @param input the {@code String} to escape, may be null
 * @return a new escaped {@code String}, {@code null} if null string input
 *
 * @since 3.0
 */
public static final String escapeHtml3(final String input) {
    final String escaped = ESCAPE_HTML3.translate(input);
    return escaped;
}
//-----------------------------------------------------------------------
/**
 * Unescapes a string containing entity escapes to a string
 * containing the actual Unicode characters corresponding to the
 * escapes. Supports HTML 4.0 entities.
 *
 * <p>For example, the string {@code "&lt;Fran&ccedil;ais&gt;"}
 * will become {@code "<Français>"}.</p>
 *
 * <p>If an entity is unrecognized, it is left alone, and inserted
 * verbatim into the result string. e.g. {@code "&gt;&zzzz;x"} will
 * become {@code ">&zzzz;x"}.</p>
 *
 * <p>Delegates to {@link #UNESCAPE_HTML4}.</p>
 *
 * @param input the {@code String} to unescape, may be null
 * @return a new unescaped {@code String}, {@code null} if null string input
 *
 * @since 3.0
 */
public static final String unescapeHtml4(final String input) {
    final String unescaped = UNESCAPE_HTML4.translate(input);
    return unescaped;
}
/**
 * Unescapes a string containing entity escapes to a string
 * containing the actual Unicode characters corresponding to the
 * escapes. Supports only HTML 3.0 entities.
 *
 * <p>Delegates to {@link #UNESCAPE_HTML3}.</p>
 *
 * @param input the {@code String} to unescape, may be null
 * @return a new unescaped {@code String}, {@code null} if null string input
 *
 * @since 3.0
 */
public static final String unescapeHtml3(final String input) {
    final String unescaped = UNESCAPE_HTML3.translate(input);
    return unescaped;
}
/**
 * Escapes the characters in a {@code String} using XML entities.
 *
 * <p>For example: {@code "bread" & "butter"} becomes
 * {@code &quot;bread&quot; &amp; &quot;butter&quot;}.</p>
 *
 * <p>Note that XML 1.0 is a text-only format: it cannot represent control
 * characters or unpaired Unicode surrogate codepoints, even after escaping.
 * {@code escapeXml10} will remove characters that do not fit in the
 * following ranges:</p>
 *
 * <p>{@code #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]}</p>
 *
 * <p>Though not strictly necessary, {@code escapeXml10} will escape
 * characters in the following ranges:</p>
 *
 * <p>{@code [#x7F-#x84] | [#x86-#x9F]}</p>
 *
 * <p>The returned string can be inserted into a valid XML 1.0 or XML 1.1
 * document. If you want to allow more non-text characters in an XML 1.1
 * document, use {@link #escapeXml11(String)}.</p>
 *
 * <p>Delegates to {@link #ESCAPE_XML10}.</p>
 *
 * @param input the {@code String} to escape, may be null
 * @return a new escaped {@code String}, {@code null} if null string input
 * @see #unescapeXml(java.lang.String)
 * @since 3.3
 */
public static String escapeXml10(final String input) {
    final String escaped = ESCAPE_XML10.translate(input);
    return escaped;
}
/**
 * Escapes the characters in a {@code String} using XML entities.
 *
 * <p>For example: {@code "bread" & "butter"} becomes
 * {@code &quot;bread&quot; &amp; &quot;butter&quot;}.</p>
 *
 * <p>XML 1.1 can represent certain control characters, but it cannot represent
 * the null byte or unpaired Unicode surrogate codepoints, even after escaping.
 * {@code escapeXml11} will remove characters that do not fit in the following
 * ranges:</p>
 *
 * <p>{@code [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]}</p>
 *
 * <p>{@code escapeXml11} will escape characters in the following ranges:</p>
 *
 * <p>{@code [#x1-#x8] | [#xB-#xC] | [#xE-#x1F] | [#x7F-#x84] | [#x86-#x9F]}</p>
 *
 * <p>The returned string can be inserted into a valid XML 1.1 document. Do not
 * use it for XML 1.0 documents.</p>
 *
 * <p>Delegates to {@link #ESCAPE_XML11}.</p>
 *
 * @param input the {@code String} to escape, may be null
 * @return a new escaped {@code String}, {@code null} if null string input
 * @see #unescapeXml(java.lang.String)
 * @since 3.3
 */
public static String escapeXml11(final String input) {
    final String escaped = ESCAPE_XML11.translate(input);
    return escaped;
}
//-----------------------------------------------------------------------
/**
 * Unescapes a string containing XML entity escapes to a string
 * containing the actual Unicode characters corresponding to the
 * escapes.
 *
 * <p>Supports only the five basic XML entities (gt, lt, quot, amp, apos).
 * Does not support DTDs or external entities.</p>
 *
 * <p>Note that numeric character references are also unescaped to their
 * respective Unicode characters (a {@code NumericEntityUnescaper} is part of
 * {@link #UNESCAPE_XML}). This may change in future releases.</p>
 *
 * @param input the {@code String} to unescape, may be null
 * @return a new unescaped {@code String}, {@code null} if null string input
 * @see #escapeXml10(String)
 * @see #escapeXml11(String)
 */
public static final String unescapeXml(final String input) {
    final String unescaped = UNESCAPE_XML.translate(input);
    return unescaped;
}
//-----------------------------------------------------------------------
/**
 * Returns a {@code String} value for a CSV column enclosed in double quotes,
 * if required.
 *
 * <p>If the value contains a comma, newline or double quote, then the
 * String value is returned enclosed in double quotes.</p>
 *
 * <p>Any double quote characters in the value are escaped with another double quote.</p>
 *
 * <p>If the value does not contain a comma, newline or double quote, then the
 * String value is returned unchanged.</p>
 *
 * <p>See Wikipedia and RFC 4180. Delegates to {@link #ESCAPE_CSV}.</p>
 *
 * @param input the input CSV column String, may be null
 * @return the input String, enclosed in double quotes if the value contains a comma,
 * newline or double quote, {@code null} if null string input
 * @since 2.4
 */
public static final String escapeCsv(final String input) {
    final String escaped = ESCAPE_CSV.translate(input);
    return escaped;
}
/**
 * Returns a {@code String} value for an unescaped CSV column.
 *
 * <p>If the value is enclosed in double quotes, and contains a comma, newline
 * or double quote, then quotes are removed.</p>
 *
 * <p>Any double quote escaped characters (a pair of double quotes) are unescaped
 * to just one double quote.</p>
 *
 * <p>If the value is not enclosed in double quotes, or is and does not contain a
 * comma, newline or double quote, then the String value is returned unchanged.</p>
 *
 * <p>See Wikipedia and RFC 4180. Delegates to {@link #UNESCAPE_CSV}.</p>
 *
 * @param input the input CSV column String, may be null
 * @return the input String, with enclosing double quotes removed and embedded double
 * quotes unescaped, {@code null} if null string input
 * @since 2.4
 */
public static final String unescapeCsv(final String input) {
    final String unescaped = UNESCAPE_CSV.translate(input);
    return unescaped;
}
/**
 * An API for translating text.
 *
 * <p>Its core use is to escape and unescape text. Because escaping and unescaping
 * is completely contextual, the API does not present two separate signatures.</p>
 *
 * @since 3.0
 */
public static abstract class CharSequenceTranslator {
    /** Upper-case hexadecimal digits, positioned at the index of their value. */
    static final char[] HEX_DIGITS = new char[] { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };

    /**
     * Translates the code points starting at {@code index} of {@code input},
     * writing the replacement to {@code out}.
     *
     * <p>The number of code points consumed must be returned; a return value of
     * {@code 0} means this translator did not handle the current position. The
     * only IOExceptions thrown must come from interacting with the Writer, so
     * that the top level API may reliably ignore StringWriter IOExceptions.</p>
     *
     * @param input CharSequence that is being translated
     * @param index int representing the current point of translation
     * @param out Writer to translate the text to
     * @return int count of codepoints consumed
     * @throws IOException if and only if the Writer produces an IOException
     */
    public abstract int translate(CharSequence input, int index, Writer out) throws IOException;

    /**
     * Helper for non-Writer usage: translates the whole input into a new String.
     *
     * @param input CharSequence to be translated, may be null
     * @return String output of translation, {@code null} if the input is null
     */
    public final String translate(final CharSequence input) {
        if (input != null) {
            try {
                final StringWriter sink = new StringWriter(input.length() * 2);
                translate(input, sink);
                return sink.toString();
            } catch (final IOException ioe) {
                // this should never ever happen while writing to a StringWriter
                throw new RuntimeException(ioe);
            }
        }

        return null;
    }

    /**
     * Translates an input onto a Writer. This is intentionally final as its algorithm is
     * tightly coupled with the abstract method of this class.
     *
     * <p>A null input writes nothing.</p>
     *
     * @param input CharSequence that is being translated
     * @param out Writer to translate the text to
     * @throws IOException if and only if the Writer produces an IOException
     * @throws IllegalArgumentException if {@code out} is null
     */
    public final void translate(final CharSequence input, final Writer out) throws IOException {
        if (out == null) {
            throw new IllegalArgumentException("The Writer must not be null");
        }

        if (input == null) {
            return;
        }

        final int length = input.length();

        for (int cursor = 0; cursor < length;) {
            final int codePoints = translate(input, cursor, out);

            if (codePoints != 0) {
                // The translator handled this position. Its contract is to count in code
                // points, so step over each one (one or two chars) that it reported.
                for (int remaining = codePoints; remaining > 0; remaining--) {
                    cursor += Character.charCount(Character.codePointAt(input, cursor));
                }
            } else {
                // Nothing was translated: copy the current code point through unchanged.
                // Done char by char to avoid allocating a temporary array via
                // Character.toChars(Character.codePointAt(input, cursor)).
                final char first = input.charAt(cursor);
                out.write(first);
                cursor++;

                if (cursor < length && Character.isHighSurrogate(first)) {
                    final char second = input.charAt(cursor);

                    if (Character.isLowSurrogate(second)) {
                        out.write(second);
                        cursor++;
                    }
                }
            }
        }
    }

    /**
     * Helper method to create a merger of this translator with another set of
     * translators. Useful in customizing the standard functionality.
     *
     * <p>This translator comes first in the merged sequence, followed by the
     * given translators in their original order.</p>
     *
     * @param translators CharSequenceTranslator array of translators to merge with this one
     * @return CharSequenceTranslator merging this translator with the others
     */
    @SafeVarargs
    public final CharSequenceTranslator with(final CharSequenceTranslator... translators) {
        final int extra = translators.length;
        final CharSequenceTranslator[] merged = new CharSequenceTranslator[extra + 1];
        merged[0] = this;

        for (int i = 0; i < extra; i++) {
            merged[i + 1] = translators[i];
        }

        return new AggregateTranslator(merged);
    }

    /**
     * Returns an upper case hexadecimal {@code String} for the given character.
     *
     * @param codepoint The codepoint to convert.
     * @return An upper case hexadecimal {@code String}
     */
    public static String hex(final int codepoint) {
        final String lowerHex = Integer.toHexString(codepoint);
        return lowerHex.toUpperCase(Locale.ENGLISH);
    }
}
/**
 * Executes a sequence of translators one after the other. Execution ends as soon as
 * one of the translators consumes codepoints from the input.
 *
 * @since 3.0
 */
static class AggregateTranslator extends CharSequenceTranslator {
    /** The delegates, tried in array order. */
    private final CharSequenceTranslator[] translators;

    /**
     * Specify the translators to be used at creation time. The array is copied
     * through {@code N.clone}.
     *
     * @param translators CharSequenceTranslator array to aggregate
     */
    @SafeVarargs
    public AggregateTranslator(final CharSequenceTranslator... translators) {
        this.translators = N.clone(translators);
    }

    /**
     * The first translator to consume codepoints from the input is the 'winner'.
     * Execution stops with the number of consumed codepoints being returned;
     * {@code 0} is returned when no translator consumed anything.
     * {@inheritDoc}
     */
    @Override
    public int translate(final CharSequence input, final int index, final Writer out) throws IOException {
        int consumed = 0;

        // Stop at the first delegate that reports a non-zero count.
        for (int i = 0; i < translators.length && consumed == 0; i++) {
            consumed = translators[i].translate(input, index, out);
        }

        return consumed;
    }
}
/**
 * Translates codepoints to their Unicode escaped value suitable for Java source.
 *
 * @since 3.2
 */
static class JavaUnicodeEscaper extends UnicodeEscaper {
    /**
     * Constructs a {@code JavaUnicodeEscaper} above the specified value (exclusive).
     * Equivalent to {@code outsideOf(0, codepoint)}.
     *
     * @param codepoint above which to escape
     * @return the newly created {@code JavaUnicodeEscaper} instance
     */
    public static JavaUnicodeEscaper above(final int codepoint) {
        return new JavaUnicodeEscaper(0, codepoint, false);
    }

    /**
     * Constructs a {@code JavaUnicodeEscaper} below the specified value (exclusive).
     * Equivalent to {@code outsideOf(codepoint, Integer.MAX_VALUE)}.
     *
     * @param codepoint below which to escape
     * @return the newly created {@code JavaUnicodeEscaper} instance
     */
    public static JavaUnicodeEscaper below(final int codepoint) {
        return new JavaUnicodeEscaper(codepoint, Integer.MAX_VALUE, false);
    }

    /**
     * Constructs a {@code JavaUnicodeEscaper} between the specified values (inclusive).
     *
     * @param codepointLow above which to escape
     * @param codepointHigh below which to escape
     * @return the newly created {@code JavaUnicodeEscaper} instance
     */
    public static JavaUnicodeEscaper between(final int codepointLow, final int codepointHigh) {
        final boolean insideRange = true;
        return new JavaUnicodeEscaper(codepointLow, codepointHigh, insideRange);
    }

    /**
     * Constructs a {@code JavaUnicodeEscaper} outside of the specified values (exclusive).
     *
     * @param codepointLow below which to escape
     * @param codepointHigh above which to escape
     * @return the newly created {@code JavaUnicodeEscaper} instance
     */
    public static JavaUnicodeEscaper outsideOf(final int codepointLow, final int codepointHigh) {
        final boolean insideRange = false;
        return new JavaUnicodeEscaper(codepointLow, codepointHigh, insideRange);
    }

    /**
     * Constructs a {@code JavaUnicodeEscaper} for the specified range. This is the
     * underlying method for the other constructors/builders. The {@code below} and
     * {@code above} boundaries are inclusive when {@code between} is {@code true}
     * and exclusive when it is {@code false}.
     *
     * @param below int value representing the lowest codepoint boundary
     * @param above int value representing the highest codepoint boundary
     * @param between whether to escape between the boundaries or outside them
     */
    public JavaUnicodeEscaper(final int below, final int above, final boolean between) {
        super(below, above, between);
    }

    /**
     * Converts the given codepoint to a hex string of the form {@code "\\uXXXX\\uXXXX"}.
     *
     * <p>Both elements of {@code Character.toChars(codepoint)} are read, so the
     * codepoint must be one that is represented as a surrogate pair.</p>
     *
     * @param codepoint a Unicode code point
     * @return the two-unit escape for the code point's surrogate pair
     */
    @Override
    protected String toUtf16Escape(final int codepoint) {
        final char[] units = Character.toChars(codepoint);
        final StringBuilder escape = new StringBuilder();
        escape.append("\\u").append(hex(units[0]));
        escape.append("\\u").append(hex(units[1]));
        return escape.toString();
    }
}
/**
 * Translates codepoints to their XML numeric entity escaped value, i.e. the
 * decimal character reference {@code &#NNN;}.
 *
 * @since 3.0
 */
static class NumericEntityEscaper extends CodePointTranslator {
    /** Lower codepoint boundary of the configured range. */
    private final int below;
    /** Upper codepoint boundary of the configured range. */
    private final int above;
    /** {@code true} to escape inside the range, {@code false} to escape outside it. */
    private final boolean between;

    /**
     * Constructs a {@code NumericEntityEscaper} for the specified range. This is
     * the underlying method for the other constructors/builders. The {@code below}
     * and {@code above} boundaries are inclusive when {@code between} is
     * {@code true} and exclusive when it is {@code false}.
     *
     * @param below int value representing the lowest codepoint boundary
     * @param above int value representing the highest codepoint boundary
     * @param between whether to escape between the boundaries or outside them
     */
    private NumericEntityEscaper(final int below, final int above, final boolean between) {
        this.below = below;
        this.above = above;
        this.between = between;
    }

    /**
     * Constructs a {@code NumericEntityEscaper} for all characters.
     */
    public NumericEntityEscaper() {
        this(0, Integer.MAX_VALUE, true);
    }

    /**
     * Constructs a {@code NumericEntityEscaper} below the specified value (exclusive).
     *
     * @param codepoint below which to escape
     * @return the newly created {@code NumericEntityEscaper} instance
     */
    public static NumericEntityEscaper below(final int codepoint) {
        return outsideOf(codepoint, Integer.MAX_VALUE);
    }

    /**
     * Constructs a {@code NumericEntityEscaper} above the specified value (exclusive).
     *
     * @param codepoint above which to escape
     * @return the newly created {@code NumericEntityEscaper} instance
     */
    public static NumericEntityEscaper above(final int codepoint) {
        return outsideOf(0, codepoint);
    }

    /**
     * Constructs a {@code NumericEntityEscaper} between the specified values (inclusive).
     *
     * @param codepointLow above which to escape
     * @param codepointHigh below which to escape
     * @return the newly created {@code NumericEntityEscaper} instance
     */
    public static NumericEntityEscaper between(final int codepointLow, final int codepointHigh) {
        return new NumericEntityEscaper(codepointLow, codepointHigh, true);
    }

    /**
     * Constructs a {@code NumericEntityEscaper} outside of the specified values (exclusive).
     *
     * @param codepointLow below which to escape
     * @param codepointHigh above which to escape
     * @return the newly created {@code NumericEntityEscaper} instance
     */
    public static NumericEntityEscaper outsideOf(final int codepointLow, final int codepointHigh) {
        return new NumericEntityEscaper(codepointLow, codepointHigh, false);
    }

    /**
     * Writes the decimal numeric character reference for the codepoint when it
     * falls in the configured selection.
     *
     * @param codepoint the codepoint to translate
     * @param out Writer to write the reference to
     * @return {@code true} if the reference was written, {@code false} if the
     *         codepoint is not selected and nothing was written
     * @throws IOException if the Writer produces an IOException
     */
    @Override
    public boolean translate(final int codepoint, final Writer out) throws IOException {
        if (between) {
            if (codepoint < below || codepoint > above) {
                return false;
            }
        } else {
            if (codepoint >= below && codepoint <= above) {
                return false;
            }
        }

        // A numeric character reference is "&#" + decimal value + ";". Without the
        // prefix the output would be the bare number followed by a semicolon.
        out.write("&#");
        out.write(Integer.toString(codepoint, 10));
        out.write(';');
        return true;
    }
}
/**
 * Helper subclass to CharSequenceTranslator to remove unpaired surrogates.
 */
static class UnicodeUnpairedSurrogateRemover extends CodePointTranslator {
    /**
     * Implementation of translate that throws out unpaired surrogates.
     *
     * <p>A codepoint in the surrogate range is reported as translated without
     * writing anything, which drops it from the output; any other codepoint is
     * reported as not translated.</p>
     * {@inheritDoc}
     */
    @Override
    public boolean translate(final int codepoint, final Writer out) throws IOException {
        final boolean aboveOrAtFirstSurrogate = codepoint >= Character.MIN_SURROGATE;
        final boolean belowOrAtLastSurrogate = codepoint <= Character.MAX_SURROGATE;

        // true == "translated" (to nothing); false == leave the codepoint alone.
        return aboveOrAtFirstSurrogate && belowOrAtLastSurrogate;
    }
}
/**
 * Translates escaped Unicode values of the form \\u+\d\d\d\d back to
 * Unicode. It supports multiple 'u' characters and will work with or
 * without the +.
 *
 * @since 3.0
 */
static class UnicodeUnescaper extends CharSequenceTranslator {
    /**
     * Recognises a backslash followed by one or more {@code 'u'} characters, an
     * optional {@code '+'} and four hex digits, and writes the decoded char.
     *
     * @param input CharSequence that is being translated
     * @param index int representing the current point of translation
     * @param out Writer to translate the text to
     * @return the number of chars making up the escape, or {@code 0} when the
     *         input does not start an escape at {@code index}
     * @throws IOException if the Writer produces an IOException
     * @throws IllegalArgumentException if fewer than four chars follow the prefix,
     *         or they cannot be parsed as a hexadecimal number
     */
    @Override
    public int translate(final CharSequence input, final int index, final Writer out) throws IOException {
        final int length = input.length();
        final boolean startsEscape = input.charAt(index) == '\\' && index + 1 < length && input.charAt(index + 1) == 'u';

        if (!startsEscape) {
            return 0;
        }

        // Length of the prefix: the backslash, the first 'u', then any further 'u' chars.
        int prefix = 2;

        while (index + prefix < length && input.charAt(index + prefix) == 'u') {
            prefix++;
        }

        // An optional '+' may sit between the 'u' run and the digits.
        if (index + prefix < length && input.charAt(index + prefix) == '+') {
            prefix++;
        }

        final int digitsStart = index + prefix;
        final int digitsEnd = digitsStart + 4;

        if (digitsEnd > length) {
            throw new IllegalArgumentException(
                    "Less than 4 hex digits in unicode value: '" + input.subSequence(index, input.length()) + "' due to end of CharSequence");
        }

        // Get 4 hex digits
        final CharSequence unicode = input.subSequence(digitsStart, digitsEnd);

        try {
            final int value = Integer.parseInt(unicode.toString(), 16);
            out.write((char) value);
        } catch (final NumberFormatException nfe) {
            throw new IllegalArgumentException("Unable to parse unicode value: " + unicode, nfe);
        }

        return prefix + 4;
    }
}
/**
 * Translates codepoints to their Unicode escaped value.
 *
 * @since 3.0
 */
static class UnicodeEscaper extends CodePointTranslator {
    /** Lower codepoint boundary of the configured range. */
    private final int below;

    /** Upper codepoint boundary of the configured range. */
    private final int above;

    /** {@code true} to escape inside the range, {@code false} to escape outside of it. */
    private final boolean between;

    /**
     * Constructs a {@code UnicodeEscaper} for all characters.
     */
    public UnicodeEscaper() {
        this(0, Integer.MAX_VALUE, true);
    }

    /**
     * Constructs a {@code UnicodeEscaper} for the specified range. This is the
     * underlying constructor used by the other constructors and factory methods.
     * The {@code below} and {@code above} boundaries are inclusive when
     * {@code between} is {@code true} and exclusive when it is {@code false}.
     *
     * @param below int value representing the lowest codepoint boundary
     * @param above int value representing the highest codepoint boundary
     * @param between whether to escape between the boundaries or outside them
     */
    protected UnicodeEscaper(final int below, final int above, final boolean between) {
        this.below = below;
        this.above = above;
        this.between = between;
    }

    /**
     * Constructs a {@code UnicodeEscaper} below the specified value (exclusive).
     *
     * @param codepoint below which to escape
     * @return a new escaper with range {@code [codepoint, Integer.MAX_VALUE]} left unescaped
     */
    public static UnicodeEscaper below(final int codepoint) {
        return new UnicodeEscaper(codepoint, Integer.MAX_VALUE, false);
    }

    /**
     * Constructs a {@code UnicodeEscaper} above the specified value (exclusive).
     *
     * @param codepoint above which to escape
     * @return a new escaper with range {@code [0, codepoint]} left unescaped
     */
    public static UnicodeEscaper above(final int codepoint) {
        return new UnicodeEscaper(0, codepoint, false);
    }

    /**
     * Constructs a {@code UnicodeEscaper} outside of the specified values (exclusive).
     *
     * @param codepointLow below which to escape
     * @param codepointHigh above which to escape
     * @return a new escaper with range {@code [codepointLow, codepointHigh]} left unescaped
     */
    public static UnicodeEscaper outsideOf(final int codepointLow, final int codepointHigh) {
        return new UnicodeEscaper(codepointLow, codepointHigh, false);
    }

    /**
     * Constructs a {@code UnicodeEscaper} between the specified values (inclusive).
     *
     * @param codepointLow above which to escape
     * @param codepointHigh below which to escape
     * @return a new escaper that escapes the range {@code [codepointLow, codepointHigh]}
     */
    public static UnicodeEscaper between(final int codepointLow, final int codepointHigh) {
        return new UnicodeEscaper(codepointLow, codepointHigh, true);
    }

    /**
     * Writes the escaped form of the codepoint when it belongs to the set this
     * escaper is configured for. Codepoints up to 0xffff are written as a
     * backslash, a 'u' and four digits taken from {@code HEX_DIGITS}; larger
     * codepoints are delegated to {@link #toUtf16Escape(int)}.
     *
     * @param codepoint the codepoint to examine
     * @param out the writer receiving the escape sequence
     * @return {@code true} if an escape was written, {@code false} if the codepoint was left untouched
     * @throws IOException if writing to {@code out} fails
     */
    @Override
    public boolean translate(final int codepoint, final Writer out) throws IOException {
        final boolean inRange = codepoint >= below && codepoint <= above;
        if (inRange != between) {
            // Inside the range in "outside" mode, or outside it in "between" mode.
            return false;
        }

        // TODO: Handle potential + sign per various Unicode escape implementations
        if (codepoint > 0xffff) {
            out.write(toUtf16Escape(codepoint));
            return true;
        }

        out.write("\\u");
        // Emit the four nibbles from most to least significant.
        for (int shift = 12; shift >= 0; shift -= 4) {
            out.write(HEX_DIGITS[(codepoint >> shift) & 15]);
        }
        return true;
    }

    /**
     * Converts the given codepoint to a hex string of the form {@code "\\uXXXX"}
     * by prefixing the result of {@code hex(codepoint)}.
     *
     * @param codepoint a Unicode code point
     * @return the escape prefix followed by the hex representation of the codepoint
     * @since 3.2
     */
    protected String toUtf16Escape(final int codepoint) {
        return "\\u" + hex(codepoint);
    }
}
/**
 * Translate escaped octal Strings back to their octal values.
 *
 * For example, "\45" should go back to being the specific value (a %).
 *
 * Note that this currently only supports the viable range of octal for Java; namely
 * 1 to 377. This is because parsing Java is the main use case.
 *
 * @since 3.0
 */
static class OctalUnescaper extends CharSequenceTranslator {
    /**
     * Decodes a backslash followed by one to three octal digits starting at
     * {@code index}. A third digit is only consumed when the first digit is
     * between 0 and 3.
     *
     * @param input the text being translated
     * @param index the position at which to look for an escape
     * @param out the writer receiving the decoded character
     * @return the number of characters consumed (backslash plus digits), or {@code 0} if no
     *         octal escape starts at {@code index}
     * @throws IOException if writing to {@code out} fails
     */
    @Override
    public int translate(final CharSequence input, final int index, final Writer out) throws IOException {
        // Number of characters that follow the one at index.
        final int available = input.length() - index - 1;
        if (input.charAt(index) != '\\' || available < 1 || !isOctalDigit(input.charAt(index + 1))) {
            return 0;
        }

        final char first = input.charAt(index + 1);
        int value = first - '0';
        int digits = 1;

        if (available >= 2 && isOctalDigit(input.charAt(index + 2))) {
            value = value * 8 + (input.charAt(index + 2) - '0');
            digits = 2;
            // Only a leading 0-3 leaves room for a third digit.
            if (available >= 3 && isZeroToThree(first) && isOctalDigit(input.charAt(index + 3))) {
                value = value * 8 + (input.charAt(index + 3) - '0');
                digits = 3;
            }
        }

        out.write(value);
        return 1 + digits;
    }

    /**
     * Checks if the given char is an octal digit, i.e. one of '0' to '7'.
     *
     * @param ch the char to check
     * @return true if the given char is the character representation of one of the digits from 0 to 7
     */
    private boolean isOctalDigit(final char ch) {
        return '0' <= ch && ch <= '7';
    }

    /**
     * Checks if the given char is one of '0' to '3'.
     *
     * @param ch the char to check
     * @return true if the given char is the character representation of one of the digits from 0 to 3
     */
    private boolean isZeroToThree(final char ch) {
        return '0' <= ch && ch <= '3';
    }
}
/**
* Translate XML numeric entities of the form &#[xX]?\d+;? to
* the specific codepoint.
*
* Note that the semi-colon is optional.
*
* @since 3.0
*/
static class NumericEntityUnescaper extends CharSequenceTranslator {
/**
* The Enum OPTION.
*/
public static enum OPTION {
/** The semi colon required. */
semiColonRequired,
/** The semi colon optional. */
semiColonOptional,
/** The error if no semi colon. */
errorIfNoSemiColon
}
/** The options. */
// TODO?: Create an OptionsSet class to hide some of the conditional logic below
private final EnumSet