// fr.cenotelie.commons.utils.xml.XmlCanonicalizer Maven / Gradle / Ivy
/*******************************************************************************
* Copyright (c) 2017 Association Cénotélie (cenotelie.fr)
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General
* Public License along with this program.
* If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/
package fr.cenotelie.commons.utils.xml;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
/**
 * Represents a canonicalizer for RDF/XML inputs.
 * <p>
 * Only element and text nodes are serialized; every other node type is skipped.
 * Attributes of an element are written sorted by their node name, and characters
 * that would be altered or misread when the output is parsed again are escaped.
 * <p>
 * The output writer is instance state that is replaced on each call to
 * {@link #canonicalize(NodeList)}, so a single instance should not be shared
 * between concurrent callers.
 *
 * @author Laurent Wouters
 */
public class XmlCanonicalizer {
    /**
     * The output string writer
     */
    protected StringWriter output;
    /**
     * Flags whether the current document is XML 1.1.
     * This class never sets it itself, so it stays {@code false} unless a subclass changes it.
     */
    protected boolean xmlVersion11;

    /**
     * Canonicalizes the specified list of nodes
     *
     * @param nodes The XML nodes to canonicalize
     * @return The String representation of the canonical XML
     */
    public String canonicalize(NodeList nodes) {
        output = new StringWriter();
        onNodes(nodes);
        return output.toString();
    }

    /**
     * Canonicalizes the specified list of nodes, in document order
     *
     * @param nodes A list of nodes
     */
    private void onNodes(NodeList nodes) {
        for (int i = 0, count = nodes.getLength(); i < count; i++) {
            onOne(nodes.item(i));
        }
    }

    /**
     * Canonicalizes the specified node
     *
     * @param node A node
     */
    private void onOne(Node node) {
        switch (node.getNodeType()) {
            case Node.ELEMENT_NODE:
                onElementStart(node);
                onNodes(node.getChildNodes());
                onElementEnd(node);
                break;
            case Node.TEXT_NODE:
                onText(node);
                break;
            default:
                // any other kind of node is not part of the canonical output
                break;
        }
    }

    /**
     * Writes the canonical element start markup for the specified node
     *
     * @param node An element node
     */
    private void onElementStart(Node node) {
        output.write('<');
        output.write(node.getNodeName());
        int count = node.getAttributes().getLength();
        List<Node> attributes = new ArrayList<>(count);
        for (int i = 0; i < count; i++) {
            attributes.add(node.getAttributes().item(i));
        }
        // the DOM does not order attributes; sort them by name for a stable output
        Collections.sort(attributes, new Comparator<Node>() {
            @Override
            public int compare(Node node1, Node node2) {
                return node1.getNodeName().compareTo(node2.getNodeName());
            }
        });
        for (Node attribute : attributes) {
            onAttribute(attribute);
        }
        output.write('>');
    }

    /**
     * Writes the canonical representation of the specified attribute
     *
     * @param node An attribute node
     */
    private void onAttribute(Node node) {
        output.write(" ");
        output.write(node.getNodeName());
        output.write("=\"");
        normalizeAndPrint(node.getNodeValue(), true);
        output.write('"');
    }

    /**
     * Writes the canonical element end markup for the specified node
     *
     * @param node An element node
     */
    private void onElementEnd(Node node) {
        output.write("</");
        output.write(node.getNodeName());
        output.write('>');
    }

    /**
     * Writes the canonical representation of the specified text node
     *
     * @param node A text node
     */
    private void onText(Node node) {
        normalizeAndPrint(node.getNodeValue(), false);
    }

    /**
     * Normalizes and prints the specified string; a null string prints nothing
     *
     * @param value            The string to normalize and print
     * @param isAttributeValue Whether this is the value of an attribute
     */
    private void normalizeAndPrint(String value, boolean isAttributeValue) {
        if (value == null) {
            return;
        }
        for (int i = 0, len = value.length(); i < len; i++) {
            normalizeAndPrint(value.charAt(i), isAttributeValue);
        }
    }

    /**
     * Normalizes and prints the specified character
     *
     * @param character        A character
     * @param isAttributeValue Whether this is the value of an attribute
     */
    private void normalizeAndPrint(char character, boolean isAttributeValue) {
        switch (character) {
            case '<': {
                output.write("&lt;");
                break;
            }
            case '>': {
                output.write("&gt;");
                break;
            }
            case '&': {
                output.write("&amp;");
                break;
            }
            case '"': {
                // A '"' that appears in character data
                // does not need to be escaped.
                if (isAttributeValue) {
                    output.write("&quot;");
                } else {
                    output.write("\"");
                }
                break;
            }
            case '\r': {
                // If CR is part of the document's content, it
                // must not be printed as a literal otherwise
                // it would be normalized to LF when the document
                // is reparsed.
                output.write("&#xD;");
                break;
            }
            case '\n': {
                output.write("&#xA;");
                break;
            }
            default: {
                // In XML 1.1, control chars in the ranges [#x1-#x1F, #x7F-#x9F] must be escaped.
                // NEL (0x85, within the second range) and LSEP (0x2028) that appear in content
                // are escaped as well since they would be normalized to LF when the document
                // is reparsed.
                boolean xml11Escape = xmlVersion11
                        && ((character >= 0x01 && character <= 0x1F && character != 0x09)
                        || (character >= 0x7F && character <= 0x9F)
                        || character == 0x2028);
                // Escape the tab character that would be normalized to #x20 in attribute values
                // when the document is reparsed.
                boolean attributeTab = isAttributeValue && character == 0x09;
                if (xml11Escape || attributeTab) {
                    output.write("&#x");
                    output.write(Integer.toHexString(character).toUpperCase());
                    output.write(";");
                } else {
                    output.write(character);
                }
            }
        }
    }
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy