All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.landawn.abacus.util.EscapeUtil Maven / Gradle / Ivy

Go to download

A general programming library in Java/Android. It's easy to learn and simple to use with concise and powerful APIs.

There is a newer version: 2.1.12
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.landawn.abacus.util;

import java.io.IOException;
import java.io.StringWriter;
import java.io.Writer;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

// TODO: Auto-generated Javadoc
/**
 * Note: it's copied from StringEscaperUtils in Apache Commons Lang under Apache License 2.0
 * 
 * 

 * <p>Escapes and unescapes {@code String}s for
 * Java, Java Script, HTML and XML.</p>
 *
 * <p>#ThreadSafe#</p>
 *

* @since 2.0
 */
public class EscapeUtil {

    /**
     * {@code \u000a} linefeed LF ('\n').
     *
     * @see "JLS: Escape Sequences for Character and String Literals"
     * @since 2.2
     */
    static final char LF = '\n';

    /**
     * {@code \u000d} carriage return CR ('\r').
     *
     * @see "JLS: Escape Sequences for Character and String Literals"
     * @since 2.2
     */
    static final char CR = '\r';

    /* ESCAPE TRANSLATORS */

    /**
     * Translator object for escaping Java.
     *
     * While {@link #escapeJava(String)} is the expected method of use, this
     * object allows the Java escaping functionality to be used
     * as the foundation for a custom translator.
     *
     * The chain is: double quote and backslash lookup, then the Java control
     * character lookup, then a Unicode escaper for code points outside 32..0x7f.
     *
     * @since 3.0
     */
    public static final CharSequenceTranslator ESCAPE_JAVA = new LookupTranslator(new String[][] { { "\"", "\\\"" }, { "\\", "\\\\" }, })
            .with(new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_ESCAPE()))
            .with(JavaUnicodeEscaper.outsideOf(32, 0x7f));

    /**
     * Translator object for escaping EcmaScript/JavaScript.
     *
     * While {@link #escapeEcmaScript(String)} is the expected method of use, this
     * object allows the EcmaScript escaping functionality to be used
     * as the foundation for a custom translator.
     *
     * Compared with {@link #ESCAPE_JAVA}, the first lookup table additionally
     * escapes the single quote and the forward slash.
     *
     * @since 3.0
     */
    public static final CharSequenceTranslator ESCAPE_ECMASCRIPT = new AggregateTranslator(
            new LookupTranslator(new String[][] { { "'", "\\'" }, { "\"", "\\\"" }, { "\\", "\\\\" }, { "/", "\\/" } }),
            new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_ESCAPE()), JavaUnicodeEscaper.outsideOf(32, 0x7f));

    /**
     * Translator object for escaping Json.
     *
     * While {@link #escapeJson(String)} is the expected method of use, this
     * object allows the Json escaping functionality to be used
     * as the foundation for a custom translator.
     *
     * Compared with {@link #ESCAPE_JAVA}, the first lookup table additionally
     * escapes the forward slash.
     *
     * @since 3.2
     */
    public static final CharSequenceTranslator ESCAPE_JSON = new AggregateTranslator(
            new LookupTranslator(new String[][] { { "\"", "\\\"" }, { "\\", "\\\\" }, { "/", "\\/" } }),
            new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_ESCAPE()), JavaUnicodeEscaper.outsideOf(32, 0x7f));

    /**
     * Translator object for escaping XML 1.0.
*
     * While {@link #escapeXml10(String)} is the expected method of use, this
     * object allows the XML escaping functionality to be used
     * as the foundation for a custom translator.
     *
     * @since 3.3
     */
    public static final CharSequenceTranslator ESCAPE_XML10 = new AggregateTranslator(new LookupTranslator(EntityArrays.BASIC_ESCAPE()),
            new LookupTranslator(EntityArrays.APOS_ESCAPE()),
            // Control characters other than TAB, LF and CR, plus U+FFFE and U+FFFF, are mapped to N.EMPTY_STRING.
            new LookupTranslator(new String[][] { { "\u0000", N.EMPTY_STRING }, { "\u0001", N.EMPTY_STRING }, { "\u0002", N.EMPTY_STRING },
                    { "\u0003", N.EMPTY_STRING }, { "\u0004", N.EMPTY_STRING }, { "\u0005", N.EMPTY_STRING }, { "\u0006", N.EMPTY_STRING },
                    { "\u0007", N.EMPTY_STRING }, { "\u0008", N.EMPTY_STRING }, { "\u000b", N.EMPTY_STRING }, { "\u000c", N.EMPTY_STRING },
                    { "\u000e", N.EMPTY_STRING }, { "\u000f", N.EMPTY_STRING }, { "\u0010", N.EMPTY_STRING }, { "\u0011", N.EMPTY_STRING },
                    { "\u0012", N.EMPTY_STRING }, { "\u0013", N.EMPTY_STRING }, { "\u0014", N.EMPTY_STRING }, { "\u0015", N.EMPTY_STRING },
                    { "\u0016", N.EMPTY_STRING }, { "\u0017", N.EMPTY_STRING }, { "\u0018", N.EMPTY_STRING }, { "\u0019", N.EMPTY_STRING },
                    { "\u001a", N.EMPTY_STRING }, { "\u001b", N.EMPTY_STRING }, { "\u001c", N.EMPTY_STRING }, { "\u001d", N.EMPTY_STRING },
                    { "\u001e", N.EMPTY_STRING }, { "\u001f", N.EMPTY_STRING }, { "\ufffe", N.EMPTY_STRING }, { "\uffff", N.EMPTY_STRING } }),
            // [0x7f-0x84] and [0x86-0x9f] are written as numeric entities; 0x85 is deliberately left out of both ranges.
            NumericEntityEscaper.between(0x7f, 0x84), NumericEntityEscaper.between(0x86, 0x9f), new UnicodeUnpairedSurrogateRemover());

    /**
     * Translator object for escaping XML 1.1.
     *
     * While {@link #escapeXml11(String)} is the expected method of use, this
     * object allows the XML escaping functionality to be used
     * as the foundation for a custom translator.
* * @since 3.3 */ public static final CharSequenceTranslator ESCAPE_XML11 = new AggregateTranslator(new LookupTranslator(EntityArrays.BASIC_ESCAPE()), new LookupTranslator(EntityArrays.APOS_ESCAPE()), new LookupTranslator(new String[][] { { "\u0000", N.EMPTY_STRING }, { "\u000b", " " }, { "\u000c", " " }, { "\ufffe", N.EMPTY_STRING }, { "\uffff", N.EMPTY_STRING } }), NumericEntityEscaper.between(0x1, 0x8), NumericEntityEscaper.between(0xe, 0x1f), NumericEntityEscaper.between(0x7f, 0x84), NumericEntityEscaper.between(0x86, 0x9f), new UnicodeUnpairedSurrogateRemover()); /** * Translator object for escaping HTML version 3.0. * * While {@link #escapeHtml3(String)} is the expected method of use, this * object allows the HTML escaping functionality to be used * as the foundation for a custom translator. * * @since 3.0 */ public static final CharSequenceTranslator ESCAPE_HTML3 = new AggregateTranslator(new LookupTranslator(EntityArrays.BASIC_ESCAPE()), new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE())); /** * Translator object for escaping HTML version 4.0. * * While {@link #escapeHtml4(String)} is the expected method of use, this * object allows the HTML escaping functionality to be used * as the foundation for a custom translator. * * @since 3.0 */ public static final CharSequenceTranslator ESCAPE_HTML4 = new AggregateTranslator(new LookupTranslator(EntityArrays.BASIC_ESCAPE()), new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE()), new LookupTranslator(EntityArrays.HTML40_EXTENDED_ESCAPE())); /** * Translator object for escaping individual Comma Separated Values. * * While {@link #escapeCsv(String)} is the expected method of use, this * object allows the CSV escaping functionality to be used * as the foundation for a custom translator. * * @since 3.0 */ public static final CharSequenceTranslator ESCAPE_CSV = new CsvEscaper(); /* UNESCAPE TRANSLATORS */ /** * Translator object for unescaping escaped Java. 
* * While {@link #unescapeJava(String)} is the expected method of use, this * object allows the Java unescaping functionality to be used * as the foundation for a custom translator. * * @since 3.0 */ // TODO: throw "illegal character: \92" as an Exception if a \ on the end of the Java (as per the compiler)? public static final CharSequenceTranslator UNESCAPE_JAVA = new AggregateTranslator(new OctalUnescaper(), // .between('\1', '\377'), new UnicodeUnescaper(), new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_UNESCAPE()), new LookupTranslator(new String[][] { { "\\\\", "\\" }, { "\\\"", "\"" }, { "\\'", "'" }, { "\\", "" } })); /** * Translator object for unescaping escaped EcmaScript. * * While {@link #unescapeEcmaScript(String)} is the expected method of use, this * object allows the EcmaScript unescaping functionality to be used * as the foundation for a custom translator. * * @since 3.0 */ public static final CharSequenceTranslator UNESCAPE_ECMASCRIPT = UNESCAPE_JAVA; /** * Translator object for unescaping escaped Json. * * While {@link #unescapeJson(String)} is the expected method of use, this * object allows the Json unescaping functionality to be used * as the foundation for a custom translator. * * @since 3.2 */ public static final CharSequenceTranslator UNESCAPE_JSON = UNESCAPE_JAVA; /** * Translator object for unescaping escaped HTML 3.0. * * While {@link #unescapeHtml3(String)} is the expected method of use, this * object allows the HTML unescaping functionality to be used * as the foundation for a custom translator. * * @since 3.0 */ public static final CharSequenceTranslator UNESCAPE_HTML3 = new AggregateTranslator(new LookupTranslator(EntityArrays.BASIC_UNESCAPE()), new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE()), new NumericEntityUnescaper()); /** * Translator object for unescaping escaped HTML 4.0. 
*
     * While {@link #unescapeHtml4(String)} is the expected method of use, this
     * object allows the HTML unescaping functionality to be used
     * as the foundation for a custom translator.
     *
     * @since 3.0
     */
    public static final CharSequenceTranslator UNESCAPE_HTML4 = new AggregateTranslator(new LookupTranslator(EntityArrays.BASIC_UNESCAPE()),
            new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE()), new LookupTranslator(EntityArrays.HTML40_EXTENDED_UNESCAPE()),
            new NumericEntityUnescaper());

    /**
     * Translator object for unescaping escaped XML.
     *
     * While {@link #unescapeXml(String)} is the expected method of use, this
     * object allows the XML unescaping functionality to be used
     * as the foundation for a custom translator.
     *
     * @since 3.0
     */
    public static final CharSequenceTranslator UNESCAPE_XML = new AggregateTranslator(new LookupTranslator(EntityArrays.BASIC_UNESCAPE()),
            new LookupTranslator(EntityArrays.APOS_UNESCAPE()), new NumericEntityUnescaper());

    /**
     * Translator object for unescaping escaped Comma Separated Value entries.
     *
     * While {@link #unescapeCsv(String)} is the expected method of use, this
     * object allows the CSV unescaping functionality to be used
     * as the foundation for a custom translator.
     *
     * @since 3.0
     */
    public static final CharSequenceTranslator UNESCAPE_CSV = new CsvUnescaper();

    /* Helper functions */

    /**
     *

{@code StringEscapeUtils} instances should NOT be constructed in * standard programming.

* *

Instead, the class should be used as:

*
StringEscapeUtils.escapeJava("foo");
* *

This constructor is public to permit tools that require a JavaBean * instance to operate.

*/ private EscapeUtil() { // singlton. } // Java and JavaScript //-------------------------------------------------------------------------- /** *

* <p>Escapes the characters in a {@code String} using Java String rules.</p>
     *
     * <p>Deals correctly with quotes and control-chars (tab, backslash, cr, ff, etc.)</p>
     *
     * <p>So a tab becomes the characters {@code '\\'} and
     * {@code 't'}.</p>
     *
     * <p>The only difference between Java strings and JavaScript strings
     * is that in JavaScript, a single quote and forward-slash (/) are escaped.</p>
     *
     * <p>Example:</p>
     * <pre>
     * input string: He didn't say, "Stop!"
     * output string: He didn't say, \"Stop!\"
     * </pre>
     *
     * @param input String to escape values in, may be null
     * @return String with escaped values, {@code null} if null string input
     */
    public static final String escapeJava(final String input) {
        return ESCAPE_JAVA.translate(input);
    }

    /**
     *

* <p>Escapes the characters in a {@code String} using EcmaScript String rules.</p>
     *
     * <p>Escapes any values it finds into their EcmaScript String form.
     * Deals correctly with quotes and control-chars (tab, backslash, cr, ff, etc.)</p>
     *
     * <p>So a tab becomes the characters {@code '\\'} and
     * {@code 't'}.</p>
     *
     * <p>The only difference between Java strings and EcmaScript strings
     * is that in EcmaScript, a single quote and forward-slash (/) are escaped.</p>
     *
     * <p>Note that EcmaScript is best known by the JavaScript and ActionScript dialects.</p>
     *
     * <p>Example:</p>
     * <pre>
     * input string: He didn't say, "Stop!"
     * output string: He didn\'t say, \"Stop!\"
     * </pre>
     *
     * @param input String to escape values in, may be null
     * @return String with escaped values, {@code null} if null string input
     *
     * @since 3.0
     */
    public static final String escapeEcmaScript(final String input) {
        return ESCAPE_ECMASCRIPT.translate(input);
    }

    /**
     *

* <p>Escapes the characters in a {@code String} using Json String rules.</p>
     *
     * <p>Escapes any values it finds into their Json String form.
     * Deals correctly with quotes and control-chars (tab, backslash, cr, ff, etc.)</p>
     *
     * <p>So a tab becomes the characters {@code '\\'} and
     * {@code 't'}.</p>
     *
     * <p>The only difference between Java strings and Json strings
     * is that in Json, forward-slash (/) is escaped.</p>
     *
     * <p>See http://www.ietf.org/rfc/rfc4627.txt for further details.</p>
     *
     * <p>Example:</p>
     * <pre>
     * input string: He didn't say, "Stop!"
     * output string: He didn't say, \"Stop!\"
     * </pre>
     *
     * @param input String to escape values in, may be null
     * @return String with escaped values, {@code null} if null string input
     *
     * @since 3.2
     */
    public static final String escapeJson(final String input) {
        return ESCAPE_JSON.translate(input);
    }

    /**
     *

* <p>Unescapes any Java literals found in the {@code String}.
     * For example, it will turn a sequence of {@code '\'} and
     * {@code 'n'} into a newline character, unless the {@code '\'}
     * is preceded by another {@code '\'}.</p>
     *
     * @param input the {@code String} to unescape, may be null
     * @return a new unescaped {@code String}, {@code null} if null string input
     */
    public static final String unescapeJava(final String input) {
        return UNESCAPE_JAVA.translate(input);
    }

    /**
     *

* <p>Unescapes any EcmaScript literals found in the {@code String}.</p>
     *
     * <p>For example, it will turn a sequence of {@code '\'} and {@code 'n'}
     * into a newline character, unless the {@code '\'} is preceded by another
     * {@code '\'}.</p>
     *
     * @param input the {@code String} to unescape, may be null
     * @return A new unescaped {@code String}, {@code null} if null string input
     * @see #unescapeJava(String)
     * @since 3.0
     */
    public static final String unescapeEcmaScript(final String input) {
        return UNESCAPE_ECMASCRIPT.translate(input);
    }

    /**
     *

Unescapes any Json literals found in the {@code String}.

* *

For example, it will turn a sequence of {@code '\'} and {@code 'n'} * into a newline character, unless the {@code '\'} is preceded by another * {@code '\'}.

* input the {@code String} to unescape, may be null * @return A new unescaped {@code String}, {@code null} if null string input * @see #unescapeJava(String) * @since 3.2 */ public static final String unescapeJson(final String input) { return UNESCAPE_JSON.translate(input); } // HTML and XML //-------------------------------------------------------------------------- /** *

* <p>Escapes the characters in a {@code String} using HTML entities.</p>
     *
     * <p>For example:</p>
     * <p>{@code "bread" & "butter"}</p>
     * becomes:
     * <p>{@code &quot;bread&quot; &amp; &quot;butter&quot;}.</p>
     *
     * <p>Supports all known HTML 4.0 entities, including funky accents.
     * Note that the commonly used apostrophe escape character ({@code &apos;})
     * is not a legal entity and so is not supported).</p>
     *
     * @param input the {@code String} to escape, may be null
     * @return a new escaped {@code String}, {@code null} if null string input
     *
     * @see "ISO Entities"
     * @see "HTML 3.2 Character Entities for ISO Latin-1"
     * @see "HTML 4.0 Character entity references"
     * @see "HTML 4.01 Character References"
     * @see "HTML 4.01 Code positions"
     *
     * @since 3.0
     */
    public static final String escapeHtml4(final String input) {
        return ESCAPE_HTML4.translate(input);
    }

    /**
     *

Escapes the characters in a {@code String} using HTML entities.

*

Supports only the HTML 3.0 entities.

* input the {@code String} to escape, may be null * @return a new escaped {@code String}, {@code null} if null string input * * @since 3.0 */ public static final String escapeHtml3(final String input) { return ESCAPE_HTML3.translate(input); } //----------------------------------------------------------------------- /** *

* <p>Unescapes a string containing entity escapes to a string
     * containing the actual Unicode characters corresponding to the
     * escapes. Supports HTML 4.0 entities.</p>
     *
     * <p>For example, the string {@code "&lt;Fran&ccedil;ais&gt;"}
     * will become {@code "<Français>"}</p>
     *
     * <p>If an entity is unrecognized, it is left alone, and inserted
     * verbatim into the result string. e.g. {@code "&gt;&zzzz;x"} will
     * become {@code ">&zzzz;x"}.</p>
     *
     * @param input the {@code String} to unescape, may be null
     * @return a new unescaped {@code String}, {@code null} if null string input
     *
     * @since 3.0
     */
    public static final String unescapeHtml4(final String input) {
        return UNESCAPE_HTML4.translate(input);
    }

    /**
     *

* <p>Unescapes a string containing entity escapes to a string
     * containing the actual Unicode characters corresponding to the
     * escapes. Supports only HTML 3.0 entities.</p>
     *
     * @param input the {@code String} to unescape, may be null
     * @return a new unescaped {@code String}, {@code null} if null string input
     *
     * @since 3.0
     */
    public static final String unescapeHtml3(final String input) {
        return UNESCAPE_HTML3.translate(input);
    }

    /**
     *

* <p>Escapes the characters in a {@code String} using XML entities.</p>
     *
     * <p>For example: {@code "bread" & "butter"} =>
     * {@code &quot;bread&quot; &amp; &quot;butter&quot;}.</p>
     *
     * <p>Note that XML 1.0 is a text-only format: it cannot represent control
     * characters or unpaired Unicode surrogate codepoints, even after escaping.
     * {@code escapeXml10} will remove characters that do not fit in the
     * following ranges:</p>
     *
     * <p>{@code #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]}</p>
     *
     * <p>Though not strictly necessary, {@code escapeXml10} will escape
     * characters in the following ranges:</p>
     *
     * <p>{@code [#x7F-#x84] | [#x86-#x9F]}</p>
     *
     * <p>The returned string can be inserted into a valid XML 1.0 or XML 1.1
     * document. If you want to allow more non-text characters in an XML 1.1
     * document, use {@link #escapeXml11(String)}.</p>
     *
     * @param input the {@code String} to escape, may be null
     * @return a new escaped {@code String}, {@code null} if null string input
     * @see #unescapeXml(java.lang.String)
     * @since 3.3
     */
    public static String escapeXml10(final String input) {
        return ESCAPE_XML10.translate(input);
    }

    /**
     *

Escapes the characters in a {@code String} using XML entities.

* *

For example: {@code "bread" & "butter"} => * {@code "bread" & "butter"}. *

* *

XML 1.1 can represent certain control characters, but it cannot represent * the null byte or unpaired Unicode surrogate codepoints, even after escaping. * {@code escapeXml11} will remove characters that do not fit in the following * ranges:

* *

{@code [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]}

* *

{@code escapeXml11} will escape characters in the following ranges:

* *

{@code [#x1-#x8] | [#xB-#xC] | [#xE-#x1F] | [#x7F-#x84] | [#x86-#x9F]}

* *

The returned string can be inserted into a valid XML 1.1 document. Do not * use it for XML 1.0 documents.

* input the {@code String} to escape, may be null * @return a new escaped {@code String}, {@code null} if null string input * @see #unescapeXml(java.lang.String) * @since 3.3 */ public static String escapeXml11(final String input) { return ESCAPE_XML11.translate(input); } //----------------------------------------------------------------------- /** *

Unescapes a string containing XML entity escapes to a string * containing the actual Unicode characters corresponding to the * escapes.

* *

Supports only the five basic XML entities (gt, lt, quot, amp, apos). * Does not support DTDs or external entities.

* *

Note that numerical \\u Unicode codes are unescaped to their respective * Unicode characters. This may change in future releases.

* input the {@code String} to unescape, may be null * @return a new unescaped {@code String}, {@code null} if null string input * @see #escapeXml(String) * @see #escapeXml10(String) * @see #escapeXml11(String) */ public static final String unescapeXml(final String input) { return UNESCAPE_XML.translate(input); } //----------------------------------------------------------------------- /** *

* <p>Returns a {@code String} value for a CSV column enclosed in double quotes,
     * if required.</p>
     *
     * <p>If the value contains a comma, newline or double quote, then the
     * String value is returned enclosed in double quotes.</p>
     *
     * <p>Any double quote characters in the value are escaped with another double quote.</p>
     *
     * <p>If the value does not contain a comma, newline or double quote, then the
     * String value is returned unchanged.</p>
     *
     * see <a href="http://en.wikipedia.org/wiki/Comma-separated_values">Wikipedia</a> and
     * <a href="http://tools.ietf.org/html/rfc4180">RFC 4180</a>.
     *
     * @param input the input CSV column String, may be null
     * @return the input String, enclosed in double quotes if the value contains a comma,
     *         newline or double quote, {@code null} if null string input
     * @since 2.4
     */
    public static final String escapeCsv(final String input) {
        return ESCAPE_CSV.translate(input);
    }

    /**
     *

Returns a {@code String} value for an unescaped CSV column.

* *

If the value is enclosed in double quotes, and contains a comma, newline * or double quote, then quotes are removed. *

* *

Any double quote escaped characters (a pair of double quotes) are unescaped * to just one double quote.

* *

If the value is not enclosed in double quotes, or is and does not contain a * comma, newline or double quote, then the String value is returned unchanged.

* * see Wikipedia and * RFC 4180. * * @param input the input CSV column String, may be null * @return * quotes unescaped, {@code null} if null string input * @since 2.4 */ public static final String unescapeCsv(final String input) { return UNESCAPE_CSV.translate(input); } /** * An API for translating text. * Its core use is to escape and unescape text. Because escaping and unescaping * is completely contextual, the API does not present two separate signatures. * * @since 3.0 */ public static abstract class CharSequenceTranslator { /** The Constant HEX_DIGITS. */ static final char[] HEX_DIGITS = new char[] { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' }; /** * Translate a set of codepoints, represented by an int index into a CharSequence, * into another set of codepoints. The number of codepoints consumed must be returned, * and the only IOExceptions thrown must be from interacting with the Writer so that * the top level API may reliably ignore StringWriter IOExceptions. * * @param input CharSequence that is being translated * @param index int representing the current point of translation * @param out Writer to translate the text to * @return int count of codepoints consumed * @throws IOException if and only if the Writer produces an IOException */ public abstract int translate(CharSequence input, int index, Writer out) throws IOException; /** * Helper for non-Writer usage. * @param input CharSequence to be translated * @return String output of translation */ public final String translate(final CharSequence input) { if (input == null) { return null; } try { final StringWriter writer = new StringWriter(input.length() * 2); translate(input, writer); return writer.toString(); } catch (final IOException ioe) { // this should never ever happen while writing to a StringWriter throw new RuntimeException(ioe); } } /** * Translate an input onto a Writer. 
This is intentionally final as its algorithm is * tightly coupled with the abstract method of this class. * * @param input CharSequence that is being translated * @param out Writer to translate the text to * @throws IOException if and only if the Writer produces an IOException */ public final void translate(final CharSequence input, final Writer out) throws IOException { if (out == null) { throw new IllegalArgumentException("The Writer must not be null"); } if (input == null) { return; } int pos = 0; final int len = input.length(); while (pos < len) { final int consumed = translate(input, pos, out); if (consumed == 0) { // inlined implementation of Character.toChars(Character.codePointAt(input, pos)) // avoids allocating temp char arrays and duplicate checks char c1 = input.charAt(pos); out.write(c1); pos++; if (Character.isHighSurrogate(c1) && pos < len) { char c2 = input.charAt(pos); if (Character.isLowSurrogate(c2)) { out.write(c2); pos++; } } continue; } // contract with translators is that they have to understand codepoints // and they just took care of a surrogate pair for (int pt = 0; pt < consumed; pt++) { pos += Character.charCount(Character.codePointAt(input, pos)); } } } /** * Helper method to create a merger of this translator with another set of * translators. Useful in customizing the standard functionality. * * @param translators CharSequenceTranslator array of translators to merge with this one * @return CharSequenceTranslator merging this translator with the others */ @SafeVarargs public final CharSequenceTranslator with(final CharSequenceTranslator... translators) { final CharSequenceTranslator[] newArray = new CharSequenceTranslator[translators.length + 1]; newArray[0] = this; System.arraycopy(translators, 0, newArray, 1, translators.length); return new AggregateTranslator(newArray); } /** *

Returns an upper case hexadecimal String for the given * character.

* * @param codepoint The codepoint to convert. * @return An upper case hexadecimal String */ public static String hex(final int codepoint) { return Integer.toHexString(codepoint).toUpperCase(Locale.ENGLISH); } } /** * Executes a sequence of translators one after the other. Execution ends whenever * the first translator consumes codepoints from the input. * * @since 3.0 */ static class AggregateTranslator extends CharSequenceTranslator { /** The translators. */ private final CharSequenceTranslator[] translators; /** * Specify the translators to be used at creation time. * * @param translators CharSequenceTranslator array to aggregate */ @SafeVarargs public AggregateTranslator(final CharSequenceTranslator... translators) { this.translators = N.clone(translators); } /** * The first translator to consume codepoints from the input is the 'winner'. * Execution stops with the number of consumed codepoints being returned. * {@inheritDoc} */ @Override public int translate(final CharSequence input, final int index, final Writer out) throws IOException { for (final CharSequenceTranslator translator : translators) { final int consumed = translator.translate(input, index, out); if (consumed != 0) { return consumed; } } return 0; } } /** * Translates codepoints to their Unicode escaped value suitable for Java source. * * @since 3.2 */ static class JavaUnicodeEscaper extends UnicodeEscaper { /** *

* <p>Constructs a {@code JavaUnicodeEscaper} above the specified value (exclusive).</p>
         *
         * @param codepoint above which to escape
         * @return the newly created {@code JavaUnicodeEscaper} instance
         */
        public static JavaUnicodeEscaper above(final int codepoint) {
            return outsideOf(0, codepoint);
        }

        /**
         *

* <p>Constructs a {@code JavaUnicodeEscaper} below the specified value (exclusive).</p>
         *
         * @param codepoint below which to escape
         * @return the newly created {@code JavaUnicodeEscaper} instance
         */
        public static JavaUnicodeEscaper below(final int codepoint) {
            return outsideOf(codepoint, Integer.MAX_VALUE);
        }

        /**
         *

* <p>Constructs a {@code JavaUnicodeEscaper} between the specified values (inclusive).</p>
         *
         * @param codepointLow above which to escape
         * @param codepointHigh below which to escape
         * @return the newly created {@code JavaUnicodeEscaper} instance
         */
        public static JavaUnicodeEscaper between(final int codepointLow, final int codepointHigh) {
            return new JavaUnicodeEscaper(codepointLow, codepointHigh, true);
        }

        /**
         *

* <p>Constructs a {@code JavaUnicodeEscaper} outside of the specified values (exclusive).</p>
         *
         * @param codepointLow below which to escape
         * @param codepointHigh above which to escape
         * @return the newly created {@code JavaUnicodeEscaper} instance
         */
        public static JavaUnicodeEscaper outsideOf(final int codepointLow, final int codepointHigh) {
            return new JavaUnicodeEscaper(codepointLow, codepointHigh, false);
        }

        /**
         *

* Constructs a JavaUnicodeEscaper for the specified range. This is the underlying method for the * other constructors/builders. The below and above boundaries are inclusive when * between is true and exclusive when it is false. *

* * @param below * int value representing the lowest codepoint boundary * @param above * int value representing the highest codepoint boundary * @param between * whether to escape between the boundaries or outside them */ public JavaUnicodeEscaper(final int below, final int above, final boolean between) { super(below, above, between); } /** * Converts the given codepoint to a hex string of the form {@code "\\uXXXX\\uXXXX"}. * codepoint a Unicode code point * @return */ @Override protected String toUtf16Escape(final int codepoint) { final char[] surrogatePair = Character.toChars(codepoint); return "\\u" + hex(surrogatePair[0]) + "\\u" + hex(surrogatePair[1]); } } /** * Translates codepoints to their XML numeric entity escaped value. * * @since 3.0 */ static class NumericEntityEscaper extends CodePointTranslator { /** The below. */ private final int below; /** The above. */ private final int above; /** The between. */ private final boolean between; /** *

* <p>Constructs a {@code NumericEntityEscaper} for the specified range. This is
         * the underlying method for the other constructors/builders. The {@code below}
         * and {@code above} boundaries are inclusive when {@code between} is
         * {@code true} and exclusive when it is {@code false}.</p>
         *
         * @param below int value representing the lowest codepoint boundary
         * @param above int value representing the highest codepoint boundary
         * @param between whether to escape between the boundaries or outside them
         */
        private NumericEntityEscaper(final int below, final int above, final boolean between) {
            this.below = below;
            this.above = above;
            this.between = between;
        }

        /**
         *

* <p>Constructs a {@code NumericEntityEscaper} for all characters.</p>
         */
        public NumericEntityEscaper() {
            this(0, Integer.MAX_VALUE, true);
        }

        /**
         *

* <p>Constructs a {@code NumericEntityEscaper} below the specified value (exclusive).</p>
         *
         * @param codepoint below which to escape
         * @return the newly created {@code NumericEntityEscaper} instance
         */
        public static NumericEntityEscaper below(final int codepoint) {
            return outsideOf(codepoint, Integer.MAX_VALUE);
        }

        /**
         *

* <p>Constructs a {@code NumericEntityEscaper} above the specified value (exclusive).</p>
         *
         * @param codepoint above which to escape
         * @return the newly created {@code NumericEntityEscaper} instance
         */
        public static NumericEntityEscaper above(final int codepoint) {
            return outsideOf(0, codepoint);
        }

        /**
         *

* <p>Constructs a {@code NumericEntityEscaper} between the specified values (inclusive).</p>
         *
         * @param codepointLow above which to escape
         * @param codepointHigh below which to escape
         * @return the newly created {@code NumericEntityEscaper} instance
         */
        public static NumericEntityEscaper between(final int codepointLow, final int codepointHigh) {
            return new NumericEntityEscaper(codepointLow, codepointHigh, true);
        }

        /**
         *

Constructs a NumericEntityEscaper outside of the specified values (exclusive).

* * @param codepointLow below which to escape * @param codepointHigh above which to escape * @return */ public static NumericEntityEscaper outsideOf(final int codepointLow, final int codepointHigh) { return new NumericEntityEscaper(codepointLow, codepointHigh, false); } /** * {@inheritDoc} */ @Override public boolean translate(final int codepoint, final Writer out) throws IOException { if (between) { if (codepoint < below || codepoint > above) { return false; } } else { if (codepoint >= below && codepoint <= above) { return false; } } out.write("&#"); out.write(Integer.toString(codepoint, 10)); out.write(';'); return true; } } /** * Helper subclass to CharSequenceTranslator to remove unpaired surrogates. */ static class UnicodeUnpairedSurrogateRemover extends CodePointTranslator { /** * Implementation of translate that throws out unpaired surrogates. * {@inheritDoc} */ @Override public boolean translate(final int codepoint, final Writer out) throws IOException { if (codepoint >= Character.MIN_SURROGATE && codepoint <= Character.MAX_SURROGATE) { // It's a surrogate. Write nothing and say we've translated. return true; } // It's not a surrogate. Don't translate it. return false; } } /** * Translates escaped Unicode values of the form \\u+\d\d\d\d back to * Unicode. It supports multiple 'u' characters and will work with or * without the +. 
* * @since 3.0 */ static class UnicodeUnescaper extends CharSequenceTranslator { /** * {@inheritDoc} */ @Override public int translate(final CharSequence input, final int index, final Writer out) throws IOException { if (input.charAt(index) == '\\' && index + 1 < input.length() && input.charAt(index + 1) == 'u') { // consume optional additional 'u' chars int i = 2; while (index + i < input.length() && input.charAt(index + i) == 'u') { i++; } if (index + i < input.length() && input.charAt(index + i) == '+') { i++; } if (index + i + 4 <= input.length()) { // Get 4 hex digits final CharSequence unicode = input.subSequence(index + i, index + i + 4); try { final int value = Integer.parseInt(unicode.toString(), 16); out.write((char) value); } catch (final NumberFormatException nfe) { throw new IllegalArgumentException("Unable to parse unicode value: " + unicode, nfe); } return i + 4; } throw new IllegalArgumentException( "Less than 4 hex digits in unicode value: '" + input.subSequence(index, input.length()) + "' due to end of CharSequence"); } return 0; } } /** * Translates codepoints to their Unicode escaped value. * * @since 3.0 */ static class UnicodeEscaper extends CodePointTranslator { /** The below. */ private final int below; /** The above. */ private final int above; /** The between. */ private final boolean between; /** *

Constructs a UnicodeEscaper for all characters.

*/ public UnicodeEscaper() { this(0, Integer.MAX_VALUE, true); } /** *

Constructs a UnicodeEscaper for the specified range. This is * the underlying method for the other constructors/builders. The below * and above boundaries are inclusive when between is * true and exclusive when it is false.

* * @param below int value representing the lowest codepoint boundary * @param above int value representing the highest codepoint boundary * @param between whether to escape between the boundaries or outside them */ protected UnicodeEscaper(final int below, final int above, final boolean between) { this.below = below; this.above = above; this.between = between; } /** *

Constructs a UnicodeEscaper below the specified value (exclusive).

* * @param codepoint below which to escape * @return */ public static UnicodeEscaper below(final int codepoint) { return outsideOf(codepoint, Integer.MAX_VALUE); } /** *

Constructs a UnicodeEscaper above the specified value (exclusive).

* * @param codepoint above which to escape * @return */ public static UnicodeEscaper above(final int codepoint) { return outsideOf(0, codepoint); } /** *

Constructs a UnicodeEscaper outside of the specified values (exclusive).

* * @param codepointLow below which to escape * @param codepointHigh above which to escape * @return */ public static UnicodeEscaper outsideOf(final int codepointLow, final int codepointHigh) { return new UnicodeEscaper(codepointLow, codepointHigh, false); } /** *

Constructs a UnicodeEscaper between the specified values (inclusive).

* * @param codepointLow above which to escape * @param codepointHigh below which to escape * @return */ public static UnicodeEscaper between(final int codepointLow, final int codepointHigh) { return new UnicodeEscaper(codepointLow, codepointHigh, true); } /** * {@inheritDoc} */ @Override public boolean translate(final int codepoint, final Writer out) throws IOException { if (between) { if (codepoint < below || codepoint > above) { return false; } } else { if (codepoint >= below && codepoint <= above) { return false; } } // TODO: Handle potential + sign per various Unicode escape implementations if (codepoint > 0xffff) { out.write(toUtf16Escape(codepoint)); } else { out.write("\\u"); out.write(HEX_DIGITS[(codepoint >> 12) & 15]); out.write(HEX_DIGITS[(codepoint >> 8) & 15]); out.write(HEX_DIGITS[(codepoint >> 4) & 15]); out.write(HEX_DIGITS[(codepoint) & 15]); } return true; } /** * Converts the given codepoint to a hex string of the form {@code "\\uXXXX"}. * codepoint a Unicode code point * @return * @since 3.2 */ protected String toUtf16Escape(final int codepoint) { return "\\u" + hex(codepoint); } } /** * Translate escaped octal Strings back to their octal values. * * For example, "\45" should go back to being the specific value (a %). * * Note that this currently only supports the viable range of octal for Java; namely * 1 to 377. This is because parsing Java is the main use case. 
* * @since 3.0 */ static class OctalUnescaper extends CharSequenceTranslator { /** * {@inheritDoc} */ @Override public int translate(final CharSequence input, final int index, final Writer out) throws IOException { final int remaining = input.length() - index - 1; // how many characters left, ignoring the first \ final StringBuilder builder = new StringBuilder(); if (input.charAt(index) == '\\' && remaining > 0 && isOctalDigit(input.charAt(index + 1))) { final int next = index + 1; final int next2 = index + 2; final int next3 = index + 3; // we know this is good as we checked it in the if block above builder.append(input.charAt(next)); if (remaining > 1 && isOctalDigit(input.charAt(next2))) { builder.append(input.charAt(next2)); if (remaining > 2 && isZeroToThree(input.charAt(next)) && isOctalDigit(input.charAt(next3))) { builder.append(input.charAt(next3)); } } out.write(Integer.parseInt(builder.toString(), 8)); return 1 + builder.length(); } return 0; } /** * Checks if the given char is an octal digit. Octal digits are the character representations of the digits 0 to 7. * @param ch the char to check * @return true if the given char is the character representation of one of the digits from 0 to 7 */ private boolean isOctalDigit(final char ch) { return ch >= '0' && ch <= '7'; } /** * Checks if the given char is the character representation of one of the digit from 0 to 3. * @param ch the char to check * @return true if the given char is the character representation of one of the digits from 0 to 3 */ private boolean isZeroToThree(final char ch) { return ch >= '0' && ch <= '3'; } } /** * Translate XML numeric entities of the form &#[xX]?\d+;? to * the specific codepoint. * * Note that the semi-colon is optional. * * @since 3.0 */ static class NumericEntityUnescaper extends CharSequenceTranslator { /** * The Enum OPTION. */ public static enum OPTION { /** The semi colon required. */ semiColonRequired, /** The semi colon optional. 
*/ semiColonOptional, /** The error if no semi colon. */ errorIfNoSemiColon } /** The options. */ // TODO?: Create an OptionsSet class to hide some of the conditional logic below private final EnumSet




© 2015 - 2025 Weber Informatics LLC | Privacy Policy