// org.apache.juneau.xml.XmlUtils (Maven / Gradle / Ivy)
// ***************************************************************************************************************************
// * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file *
// * distributed with this work for additional information regarding copyright ownership. The ASF licenses this file *
// * to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance *
// * with the License. You may obtain a copy of the License at *
// * *
// * http://www.apache.org/licenses/LICENSE-2.0 *
// * *
// * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an *
// * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the *
// * specific language governing permissions and limitations under the License. *
// ***************************************************************************************************************************
package org.apache.juneau.xml;
import static org.apache.juneau.internal.StringUtils.*;
import java.io.*;
import java.net.*;
import java.util.*;
import javax.xml.stream.*;
import org.apache.juneau.*;
import org.apache.juneau.internal.*;
import org.apache.juneau.xml.annotation.*;
/**
* XML utility methods.
*/
public final class XmlUtils {
//--------------------------------------------------------------------------------
// XML element names
//--------------------------------------------------------------------------------
/**
* Encodes any invalid XML element name characters to {@code _x####_} sequences.
*
* @param w The writer to send the output to.
* @param o The object being encoded.
* @return The same writer passed in.
* @throws IOException Thrown by the writer.
*/
public static final Writer encodeElementName(Writer w, Object o) throws IOException {
    // A null name is written as the reserved _x0000_ marker.
    if (o == null)
        return w.append("_x0000_");
    String s = o.toString();
    // An empty string cannot be used as an element name. Emit the reserved _xE000_ marker,
    // which is what the String-returning overload of this method produces for the same input.
    if (s.isEmpty())
        return w.append("_xE000_");
    // Only pay for the character-by-character pass when the quick pre-check says it is needed.
    if (needsElementNameEncoding(s))
        return encodeElementNameInner(w, s);
    w.append(s);
    return w;
}
/**
* Encodes any invalid XML element name characters to {@code _x####_} sequences.
*
* @param o The object being encoded.
* @return The encoded element name string.
*/
public static final String encodeElementName(Object o) {
    // Null and empty names have dedicated marker sequences.
    if (o == null)
        return "_x0000_";
    final String name = o.toString();
    if (name.isEmpty())
        return "_xE000_";
    // Names that pass the quick pre-check are returned untouched.
    if (! needsElementNameEncoding(name))
        return name;
    try {
        // Encoded output can be longer than the input, so start with twice the input length.
        Writer buffer = new StringBuilderWriter(name.length() * 2);
        return encodeElementNameInner(buffer, name).toString();
    } catch (IOException e) {
        throw new RuntimeException(e); // Never happens
    }
}
private static final Writer encodeElementNameInner(Writer w, String s) throws IOException {
    // Copies each character of s to w, replacing every character that is not accepted
    // at its position with its padded-hex form.
    final int len = s.length();
    for (int pos = 0; pos < len; pos++) {
        final char ch = s.charAt(pos);

        final boolean keep;
        if (ch == '_') {
            // An underscore is kept at any position unless isEscapeSequence reports that an
            // escape sequence starts here, in which case the underscore itself is escaped.
            keep = ! isEscapeSequence(s, pos);
        } else {
            // Characters accepted at every position, including the first.
            final boolean anywhere =
                (ch >= 'A' && ch <= 'Z')
                || (ch >= 'a' && ch <= 'z')
                || (ch >= '\u00c0' && ch <= '\u00d6')
                || (ch >= '\u00d8' && ch <= '\u00f6')
                || (ch >= '\u00f8' && ch <= '\u02ff')
                || (ch >= '\u0370' && ch <= '\u037d')
                || (ch >= '\u037f' && ch <= '\u1fff')
                || (ch >= '\u200c' && ch <= '\u200d')
                || (ch >= '\u2070' && ch <= '\u218f')
                || (ch >= '\u2c00' && ch <= '\u2fef')
                || (ch >= '\u3001' && ch <= '\ud7ff')
                || (ch >= '\uf900' && ch <= '\ufdcf')
                || (ch >= '\ufdf0' && ch <= '\ufffd');
            // Characters accepted only after the first position.
            final boolean afterFirst =
                ch == '-'
                || ch == '.'
                || (ch >= '0' && ch <= '9')
                || ch == '\u00b7'
                || (ch >= '\u0300' && ch <= '\u036f')
                || (ch >= '\u203f' && ch <= '\u2040');
            keep = anywhere || (pos != 0 && afterFirst);
        }

        if (keep)
            w.append(ch);
        else
            appendPaddedHexChar(w, ch);
    }
    return w;
}
private static final boolean needsElementNameEncoding(String s) {
    // Deliberately cheap and conservative: anything other than ASCII letters and digits,
    // or a digit in the first position, sends the caller down the full encoding path.
    int idx = 0;
    while (idx < s.length()) {
        final char ch = s.charAt(idx);
        final boolean digit = ch >= '0' && ch <= '9';
        final boolean letter = (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
        if (! digit && ! letter)
            return true;
        if (digit && idx == 0)
            return true;
        idx++;
    }
    return false;
}
//--------------------------------------------------------------------------------
// XML element text
//--------------------------------------------------------------------------------
/**
* Escapes invalid XML text characters to {@code _x####_} sequences.
*
* @param o The object being encoded.
* @return The encoded string.
*/
public static final String escapeText(Object o) {
    // Null is represented by the reserved _x0000_ marker.
    if (o == null)
        return "_x0000_";
    final String text = o.toString();
    try {
        // Strings that pass the pre-check are returned as-is.
        if (! needsTextEncoding(text))
            return text;
        final int last = text.length() - 1;
        final StringWriter out = new StringWriter(text.length() * 2);
        for (int pos = 0; pos <= last; pos++) {
            final char ch = text.charAt(pos);
            // Whitespace is escaped only in the first or last position.
            final boolean edgeWhitespace = (pos == 0 || pos == last) && Character.isWhitespace(ch);
            if (edgeWhitespace || (ch == '_' && isEscapeSequence(text, pos)) || ! isValidXmlCharacter(ch))
                appendPaddedHexChar(out, ch);
            else
                out.append(ch);
        }
        return out.toString();
    } catch (IOException e) {
        throw new RuntimeException(e); // Never happens
    }
}
/**
* Encodes the specified element text and sends the results to the specified writer.
*
* <p>
* Encodes any invalid XML text characters to {@code _x####_} sequences and sends the response to the specified
* writer.
* <ul>
* <li>Encodes {@code '&'}, {@code '<'}, and {@code '>'} as XML entities.
* <li>Encodes invalid XML text characters to {@code _x####_} sequences.
* </ul>
*
* @param w The writer to send the output to.
* @param o The object being encoded.
* @param trim Trim the text before serializing it.
* @param preserveWhitespace
* Specifies whether we're in preserve-whitespace mode
* (e.g. {@link XmlFormat#MIXED_PWS} or {@link XmlFormat#TEXT_PWS}).
* If {@code false}, leading and trailing whitespace characters are written as {@code _x####_} sequences.
* @return The same writer passed in.
* @throws IOException Thrown from the writer.
*/
public static final Writer encodeText(Writer w, Object o, boolean trim, boolean preserveWhitespace) throws IOException {
    // Null and empty text have dedicated marker sequences.
    if (o == null)
        return w.append("_x0000_");
    String text = o.toString();
    if (text.isEmpty())
        return w.append("_xE000_");
    if (trim)
        text = text.trim();

    // Fast path: nothing in the text needs rewriting.
    if (! needsTextEncoding(text)) {
        w.append(text);
        return w;
    }

    final int last = text.length() - 1;
    for (int pos = 0; pos <= last; pos++) {
        final char ch = text.charAt(pos);
        // Outside preserve-whitespace mode, whitespace in the first or last position is hex-escaped.
        final boolean escapeEdge = ! preserveWhitespace && (pos == 0 || pos == last) && Character.isWhitespace(ch);
        if (escapeEdge)
            appendPaddedHexChar(w, ch);
        else if (REPLACE_TEXT.contains(ch))
            w.append(REPLACE_TEXT.get(ch));
        else if ((ch == '_' && isEscapeSequence(text, pos)) || ! isValidXmlCharacter(ch))
            appendPaddedHexChar(w, ch);
        else
            w.append(ch);
    }
    return w;
}
private static final boolean needsTextEncoding(String s) {
    // Cheap pre-scan so callers can skip the more expensive rewriting pass
    // when no character in the string needs attention.
    final int last = s.length() - 1;
    for (int pos = 0; pos <= last; pos++) {
        final char ch = s.charAt(pos);
        final boolean atEdge = pos == 0 || pos == last;
        if (atEdge && Character.isWhitespace(ch))
            return true;
        if (REPLACE_TEXT.contains(ch))
            return true;
        if (! isValidXmlCharacter(ch))
            return true;
        if (ch == '_' && isEscapeSequence(s, pos))
            return true;
    }
    return false;
}
private static AsciiMap REPLACE_TEXT = new AsciiMap()
.append('&', "&")
.append('<', "<")
.append('>', ">")
.append((char)0x09, " ")
.append((char)0x0A, "
")
.append((char)0x0D, "
");
//--------------------------------------------------------------------------------
// XML attribute names
//--------------------------------------------------------------------------------
/**
* Serializes and encodes the specified object as a valid XML attribute name.
*
* @param w The writer to send the output to.
* @param o The object being serialized.
* @return The same writer passed in.
* @throws IOException If a problem occurred.
*/
public static final Writer encodeAttrName(Writer w, Object o) throws IOException {
    // Null is represented by the reserved _x0000_ marker.
    if (o == null)
        return w.append("_x0000_");
    final String name = o.toString();

    // Fast path: the name passed the pre-check and is written unchanged.
    if (! needsAttrNameEncoding(name)) {
        w.append(name);
        return w;
    }

    final int len = name.length();
    for (int pos = 0; pos < len; pos++) {
        final char ch = name.charAt(pos);
        final boolean letterOrColon = (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || ch == ':';
        // Digits are accepted everywhere except in the first position.
        final boolean digitAfterFirst = pos != 0 && ch >= '0' && ch <= '9';
        // An underscore is kept unless isEscapeSequence reports an escape sequence starting here.
        if (letterOrColon || digitAfterFirst || (ch == '_' && ! isEscapeSequence(name, pos)))
            w.append(ch);
        else
            appendPaddedHexChar(w, ch);
    }
    return w;
}
private static final boolean needsAttrNameEncoding(String s) {
    // Deliberately cheap and conservative: only pure ASCII alphanumeric names
    // that start with a letter skip the full encoding pass.
    int pos = 0;
    for (char ch : s.toCharArray()) {
        final boolean letter = (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
        final boolean digit = ch >= '0' && ch <= '9';
        if (! (letter || digit))
            return true;
        if (pos == 0 && ! letter)
            return true;
        pos++;
    }
    return false;
}
//--------------------------------------------------------------------------------
// XML attribute values
//--------------------------------------------------------------------------------
/**
* Encodes the specified attribute value and sends the results to the specified writer.
*
* <p>
* Encodes any invalid XML text characters to {@code _x####_} sequences and sends the response to the specified
* writer.
* <ul>
* <li>Encodes {@code '&'}, {@code '<'}, {@code '>'}, {@code '"'}, and {@code '\''} as XML entities.
* <li>Encodes invalid XML text characters to {@code _x####_} sequences.
* </ul>
*
* @param w The writer to send the output to.
* @param o The object being encoded.
* @param trim
* Trim the text before serializing it.
* Regardless of this flag, a whitespace character in the first or last position of the resulting text is
* written as a {@code _x####_} sequence.
* @return The same writer passed in.
* @throws IOException Thrown from the writer.
*/
public static final Writer encodeAttrValue(Writer w, Object o, boolean trim) throws IOException {
    // Null is represented by the reserved _x0000_ marker; an empty value writes nothing.
    if (o == null)
        return w.append("_x0000_");
    String value = o.toString();
    if (value.isEmpty())
        return w;
    if (trim)
        value = value.trim();

    // Fast path: nothing in the value needs rewriting.
    if (! needsAttrValueEncoding(value)) {
        w.append(value);
        return w;
    }

    final int last = value.length() - 1;
    for (int pos = 0; pos <= last; pos++) {
        final char ch = value.charAt(pos);
        // Whitespace in the first or last position is always hex-escaped.
        final boolean edgeWhitespace = (pos == 0 || pos == last) && Character.isWhitespace(ch);
        if (edgeWhitespace)
            appendPaddedHexChar(w, ch);
        else if (REPLACE_ATTR_VAL.contains(ch))
            w.append(REPLACE_ATTR_VAL.get(ch));
        else if ((ch == '_' && isEscapeSequence(value, pos)) || ! isValidXmlCharacter(ch))
            appendPaddedHexChar(w, ch);
        else
            w.append(ch);
    }
    return w;
}
private static final boolean needsAttrValueEncoding(String s) {
    // Cheap pre-scan so callers can skip the more expensive rewriting pass
    // when no character in the value needs attention.
    final int last = s.length() - 1;
    for (int pos = 0; pos <= last; pos++) {
        final char ch = s.charAt(pos);
        final boolean atEdge = pos == 0 || pos == last;
        if (atEdge && Character.isWhitespace(ch))
            return true;
        if (REPLACE_ATTR_VAL.contains(ch))
            return true;
        if (! isValidXmlCharacter(ch))
            return true;
        if (ch == '_' && isEscapeSequence(s, pos))
            return true;
    }
    return false;
}
private static AsciiMap REPLACE_ATTR_VAL = new AsciiMap()
.append('&', "&")
.append('<', "<")
.append('>', ">")
.append('"', """)
.append('\'', "'")
.append((char)0x09, " ")
.append((char)0x0A, "
")
.append((char)0x0D, "
");
//--------------------------------------------------------------------------------
// Decode XML text
//--------------------------------------------------------------------------------
/**
* Translates any _x####_ sequences (introduced by the various encode methods) back into their original characters.
*
* @param s The string being decoded.
* @param sb The string builder to use as a scratch pad.
* @return The decoded string.
*/
public static final String decode(String s, StringBuilder sb) {
    if (s == null)
        return null;
    // Every escape sequence starts with an underscore, so a string without one
    // (including the empty string) is returned unchanged.
    if (s.isEmpty() || s.indexOf('_') == -1)
        return s;
    if (sb == null)
        sb = new StringBuilder(s.length());
    else
        sb.setLength(0); // The scratch pad may hold leftovers from an earlier call; start clean.
    for (int i = 0; i < s.length(); i++) {
        char c = s.charAt(i);
        if (c == '_' && isEscapeSequence(s,i)) {
            // The four characters after "_x" are the hex code of the original character.
            int x = Integer.parseInt(s.substring(i+2, i+6), 16);
            // If we find _x0000_, then that means a null.
            // If we find _xE000_, then that means an empty string.
            if (x == 0)
                return null;
            else if (x != 0xE000)
                sb.append((char)x);
            // Skip the rest of the sequence; the loop increment steps past the final character.
            i+=6;
        } else {
            sb.append(c);
        }
    }
    return sb.toString();
}
/**
* Given a list of Strings and other Objects, combines Strings that are next to each other in the list.
*
* @param l The list of text nodes to collapse.
* @return The same list.
*/
public static LinkedList