// cdc.io.tools.XmlNormalizer (Maven / Gradle / Ivy)
package cdc.io.tools;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.UnaryOperator;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import cdc.io.data.Attribute;
import cdc.io.data.Document;
import cdc.io.data.Element;
import cdc.io.data.Node;
import cdc.io.data.paths.SPath;
import cdc.io.data.util.AttributeNameConverter;
import cdc.io.data.util.AttributePredicate;
import cdc.io.data.util.AttributeValueConverter;
import cdc.io.data.util.DataUtils;
import cdc.io.data.util.ElementNameConverter;
import cdc.io.data.util.ElementPredicate;
import cdc.io.data.util.PatternReplacement;
import cdc.io.data.util.TextContentConverter;
import cdc.io.data.util.TextPredicate;
import cdc.io.data.xml.XmlDataReader;
import cdc.io.data.xml.XmlDataWriter;
import cdc.io.tools.XmlNormalizer.MainArgs.Feature;
import cdc.io.utils.NonCloseableOutputStream;
import cdc.io.xml.XmlWriter;
import cdc.util.cli.AbstractMainSupport;
import cdc.util.cli.FeatureMask;
import cdc.util.cli.OptionEnum;
import cdc.util.lang.MapUtils;
/**
 * Toy utility that can be used to "normalize" or modify an XML file.
 * <p>
 * <b>WARNING:</b> This is not standard XML normalization, but it can be useful.<br>
 * This can:
 * <ul>
 * <li>Pretty print file (indentation string can be specified).</li>
 * <li>Sorting
 * <ul>
 * <li>Sort all attributes.</li>
 * <li>Move some attributes to first position.</li>
 * <li>Sort all elements.</li>
 * </ul>
 * </li>
 * <li>Scrambling
 * <ul>
 * <li>Scramble values of all attributes.</li>
 * <li>Scramble values of some attributes.</li>
 * <li>Scramble content of all elements.</li>
 * <li>Scramble content of some elements.</li>
 * </ul>
 * </li>
 * <li>Removal
 * <ul>
 * <li>Remove all empty attributes.</li>
 * <li>Remove all pure elements.</li>
 * <li>Remove all comments.</li>
 * <li>Remove all elements that have a given name (before any local renaming).</li>
 * <li>Remove all attributes that have a given name (before any local renaming).</li>
 * </ul>
 * </li>
 * <li>Renaming
 * <ul>
 * <li>Rename attributes names.</li>
 * <li>Rename elements names.</li>
 * </ul>
 * </li>
 * <li>Setting
 * <ul>
 * <li>Set attributes values (of existing attributes).</li>
 * <li>Set elements contents (of existing elements that have a text child or no children).</li>
 * </ul>
 * </li>
 * </ul>
 * <p>
 * <b>WARNING:</b> Renaming may interact with other actions. It may be safer to rename in a separate step.
 *
 * @author Damien Carbonne
 */
public final class XmlNormalizer {
private static final Logger LOGGER = LogManager.getLogger(XmlNormalizer.class);
private static final String ALIAS_SEPARATOR = "::";
private final MainArgs margs;
public static class MainArgs {
public enum Feature implements OptionEnum {
PRETTY_PRINT("pretty-print", "Pretty prints."),
USE_XML_EOL("use-xml-eol", "Use xml eol instead of platform eol."),
APPEND_FINAL_EOL("append-final-eol", "Appends a final eol."),
USE_SINGLE_QUOTE("use-single-quote", "Use single quote (') instead of quote (\") for attributes delimiters."),
ALWAYS_ENTITIZE_ATTRIBUTES("always-entitize-attributes",
"Replaces all special characters by entities in attributes, even if not necessary."),
SORT_ATTRIBUTES("sort-attributes", "Sorts all attributes in alphabetical order."),
SCRAMBLE_ATTRIBUTES("scramble-attributes", "Scramble values of all attributes. This can alter schema compliance."),
SORT_ELEMENTS("sort-elements", "Sorts all children elements in alphabetical order. This can alter schema compliance."),
SCRAMBLE_ELEMENTS("scramble-elements",
"Scramble content of all (non-ignorable white space) elements. This can alter schema compliance."),
REMOVE_EMPTY_ATTRIBUTES("remove-empty-attributes", "Removes all empty attributes. This can alter schema compliance."),
REMOVE_PURE_ELEMENTS("remove-pure-elements",
"Removes all elements that are empty and have no attributes. This can alter schema compliance."),
REMOVE_COMMENTS("remove-comments", "Removes all comments."),
DELAYED_PROCESSING("delayed-processing",
"If set, processing is done after all data has been loaded. Otherwise, it is done during loading, if possible.");
private final String name;
private final String description;
private Feature(String name,
String description) {
this.name = name;
this.description = description;
}
@Override
public final String getName() {
return name;
}
@Override
public final String getDescription() {
return description;
}
}
/** String to use for indentation. */
public String indentString = " ";
public File inputFile;
public File outputFile;
/** List of attributes names to move to first position. Order matters. */
public final List firstAttributes = new ArrayList<>();
/** Set of attributes and elements to scramble. */
public final Set scrambled = new HashSet<>();
/** Set of attributes and elements to remove. */
public final Set removed = new HashSet<>();
/** Renaming of attributes names and elements names. */
public final Map> renamed = new HashMap<>();
/** Setting of attributes values and elements contents. */
public final Map> set = new HashMap<>();
protected final FeatureMask features = new FeatureMask<>();
public void setEnabled(Feature feature,
boolean enabled) {
features.setEnabled(feature, enabled);
}
public boolean isEnabled(Feature feature) {
return features.isEnabled(feature);
}
}
/**
 * Creates a normalizer bound to the given arguments.
 * Private: instances are only created by {@link #execute(MainArgs)}.
 *
 * @param margs The arguments driving the normalization.
 */
private XmlNormalizer(MainArgs margs) {
    this.margs = margs;
}
private void execute() throws IOException {
// Read input file
final XmlDataReader reader = new XmlDataReader();
reader.setEnabled(XmlDataReader.Feature.ALLOW_MIXED_CONTENT, true);
reader.setEnabled(XmlDataReader.Feature.LOAD_COMMENTS, !margs.isEnabled(Feature.REMOVE_COMMENTS));
reader.setEnabled(XmlDataReader.Feature.LOAD_SPACES, !margs.isEnabled(Feature.PRETTY_PRINT));
final Map> renamedAttributes = MapUtils.filterKeys(margs.renamed, SPath::isAttribute);
final Map> renamedElements = MapUtils.filterKeys(margs.renamed, SPath::isElement);
final Map> setAttributes = MapUtils.filterKeys(margs.set, SPath::isAttribute);
final Map> setElements = MapUtils.filterKeys(margs.set, SPath::isElement);
final boolean delayed =
margs.isEnabled(Feature.DELAYED_PROCESSING)
|| renamedAttributes.size() * renamedElements.size() != 0;
if (delayed != margs.isEnabled(Feature.DELAYED_PROCESSING)) {
LOGGER.warn("Activate {}", Feature.DELAYED_PROCESSING);
}
if (!delayed) {
// If delayed processing is disabled, do as many processing as possible during loading
// Build the element pre filter used to remove specific elements
ElementPredicate elementPreFilter = ElementPredicate.ANY_ELEMENT;
if (!margs.removed.isEmpty()) {
elementPreFilter = ElementPredicate.fromPaths(margs.removed).not();
}
reader.setElementPreFilter(elementPreFilter);
// Build the element post filter used to remove pure elements
ElementPredicate elementPostFilter = ElementPredicate.ANY_ELEMENT;
if (margs.isEnabled(Feature.REMOVE_PURE_ELEMENTS)) {
elementPostFilter = ElementPredicate.PURE_ELEMENT.not();
}
reader.setElementPostFilter(elementPostFilter);
// Build the attribute filter used to remove empty and specific attributes
AttributePredicate attributeFilter = AttributePredicate.ANY_ATTRIBUTE;
if (margs.isEnabled(Feature.REMOVE_EMPTY_ATTRIBUTES)) {
attributeFilter = AttributePredicate.IS_NOT_EMPTY_ATTRIBUTE;
}
if (!margs.removed.isEmpty()) {
attributeFilter = attributeFilter.and(AttributePredicate.fromPaths(margs.removed).not());
}
reader.setAttributeFilter(attributeFilter);
// Build the attributes and elements renaming
if (!renamedAttributes.isEmpty()) {
reader.setAttributeNameConverter(AttributeNameConverter.fromPathNameFunctionMap(renamedAttributes));
}
if (!renamedElements.isEmpty()) {
reader.setElementNameConverter(ElementNameConverter.fromPathNameFunctionMap(renamedElements));
}
// Build the attribute value converter
if (!margs.set.isEmpty()) {
reader.setAttributeValueConverter(AttributeValueConverter.fromPathValueFunctionMap(margs.set));
}
// Elements content setting can not be done during loading.
}
final Document document = reader.read(margs.inputFile);
// Modify data
if (delayed) {
// Remove specific elements first
if (!margs.removed.isEmpty()) {
DataUtils.removeMatchingChildren(document,
Element.matching(margs.removed),
DataUtils.RECURSE);
}
// Remove attributes
if (margs.isEnabled(Feature.REMOVE_EMPTY_ATTRIBUTES)) {
DataUtils.removeEmptyAttributes(document, DataUtils.RECURSE);
}
if (!margs.removed.isEmpty()) {
DataUtils.removeMatchingAttributes(document,
AttributePredicate.fromPaths(margs.removed),
DataUtils.RECURSE);
}
// Now remove pure elements
// Attributes removal can make some elements empty
if (margs.isEnabled(Feature.REMOVE_PURE_ELEMENTS)) {
DataUtils.removePureElements(document, DataUtils.RECURSE);
}
// Set attributes values (before renaming)
if (!setAttributes.isEmpty()) {
DataUtils.setValueOfMatchingAttributes(document,
AttributePredicate.fromPaths(setAttributes.keySet()),
AttributeValueConverter.fromPathValueFunctionMap(setAttributes),
DataUtils.RECURSE);
}
// Rename attributes and elements
if (!renamedAttributes.isEmpty()) {
DataUtils.setNameOfMatchingAttributes(document,
AttributePredicate.fromPaths(renamedAttributes.keySet()),
AttributeNameConverter.fromPathNameFunctionMap(renamedAttributes),
DataUtils.RECURSE);
}
if (!renamedElements.isEmpty()) {
DataUtils.setNameOfMatchingElements(document,
SPath.toStandardPredicate(renamedElements.keySet()),
ElementNameConverter.fromPathNameFunctionMap(renamedElements),
DataUtils.RECURSE);
}
}
// Set elements contents (after renaming)
if (!setElements.isEmpty()) {
DataUtils.setContentOfMatchingElements(document,
ElementPredicate.fromPaths(setElements.keySet()),
TextContentConverter.fromPathContentFunctionMap(setElements),
DataUtils.RECURSE);
}
if (margs.isEnabled(Feature.SORT_ATTRIBUTES)) {
DataUtils.sortAttributes(document,
Attribute.NAME_COMPARATOR,
DataUtils.RECURSE);
}
if (margs.isEnabled(Feature.SORT_ELEMENTS)) {
DataUtils.sortChildren(document,
Node.ANY_NODE,
Node.ELEMENT_NAME_AND_ATTRIBUTES_COMPARATOR,
DataUtils.RECURSE);
}
if (margs.isEnabled(Feature.SCRAMBLE_ATTRIBUTES)) {
DataUtils.setValueOfMatchingAttributes(document,
AttributePredicate.ANY_ATTRIBUTE,
AttributeValueConverter.scramble(true),
DataUtils.RECURSE);
}
if (margs.isEnabled(Feature.SCRAMBLE_ELEMENTS)) {
DataUtils.setContentOfMatchingTexts(document,
TextPredicate.ANY_TEXT,
TextContentConverter.scramble(true),
DataUtils.RECURSE);
}
for (final String name : margs.firstAttributes) {
DataUtils.moveAttributeFirst(document, Node.ANY_NODE, name, DataUtils.RECURSE);
}
// Now we can scramble specific attributes and elements
if (!margs.scrambled.isEmpty()) {
DataUtils.setValueOfMatchingAttributes(document,
AttributePredicate.fromPaths(margs.scrambled),
AttributeValueConverter.scramble(true),
DataUtils.RECURSE);
DataUtils.setContentOfMatchingTexts(document,
TextPredicate.fromElementPaths(margs.scrambled),
TextContentConverter.scramble(true),
DataUtils.RECURSE);
}
// Write output
try (final XmlDataWriter writer = margs.outputFile == null
? new XmlDataWriter(NonCloseableOutputStream.NON_CLOSABLE_SYSTEM_OUT)
: new XmlDataWriter(margs.outputFile)) {
writer.getXmlWriter().setEnabled(XmlWriter.Feature.PRETTY_PRINT, margs.isEnabled(Feature.PRETTY_PRINT));
writer.getXmlWriter().setEnabled(XmlWriter.Feature.USE_XML_EOL, margs.isEnabled(Feature.USE_XML_EOL));
writer.getXmlWriter().setEnabled(XmlWriter.Feature.APPEND_FINAL_EOL, margs.isEnabled(Feature.APPEND_FINAL_EOL));
writer.getXmlWriter().setEnabled(XmlWriter.Feature.USE_SINGLE_QUOTE, margs.isEnabled(Feature.USE_SINGLE_QUOTE));
writer.getXmlWriter()
.setEnabled(XmlWriter.Feature.ALWAYS_ENTITIZE_ATTRIBUTES, margs.isEnabled(Feature.ALWAYS_ENTITIZE_ATTRIBUTES));
writer.getXmlWriter().setIndentString(margs.indentString);
writer.write(document);
writer.flush();
}
}
/**
 * Runs a normalization with the given arguments.
 *
 * @param margs The arguments.
 * @throws IOException When the underlying processing raises an I/O error.
 */
public static void execute(MainArgs margs) throws IOException {
    new XmlNormalizer(margs).execute();
}
/**
 * Command line entry point. Delegates to {@code MainSupport}.
 *
 * @param args The command line arguments.
 */
public static void main(String[] args) {
    new MainSupport().main(args);
}
private static class MainSupport extends AbstractMainSupport {
private static final String FIRST_ATTRIBUTE = "first-attribute";
private static final String INDENT_STRING = "indent-string";
private static final String REMOVE = "remove";
private static final String RENAME = "rename";
private static final String SCRAMBLE = "scramble";
private static final String SET = "set";
public MainSupport() {
super(XmlNormalizer.class, LOGGER);
}
@Override
protected String getVersion() {
return Config.VERSION;
}
@Override
protected void addSpecificOptions(Options options) {
options.addOption(Option.builder()
.longOpt(INPUT)
.desc("Xml input file.")
.hasArg()
.required()
.build());
options.addOption(Option.builder()
.longOpt(OUTPUT)
.desc("Optional Xml output file.")
.hasArg()
.build());
options.addOption(Option.builder()
.longOpt(INDENT_STRING)
.desc("Optional indentation string. Used when " + Feature.PRETTY_PRINT.getName()
+ " is enabled. (Default '')")
.hasArg()
.build());
options.addOption(Option.builder()
.longOpt(FIRST_ATTRIBUTE)
.desc("Optional name(s) of attributes to move to first position."
+ " Order of declarations matters."
+ " Executed after attributes sorting.")
.hasArgs()
.build());
options.addOption(Option.builder()
.longOpt(SCRAMBLE)
.desc("Optional path(s) of attributes and elements to scramble. Have the form: name(/name)* to designate an element or (name(/name)*)?@name to designate an attribute.")
.hasArgs()
.build());
options.addOption(Option.builder()
.longOpt(REMOVE)
.desc("Optional path(s) of attributes and elements to remove. Have the form: name(/name)* to designate an element or (name(/name)*)?@name to designate an attribute.")
.hasArgs()
.build());
options.addOption(Option.builder()
.longOpt(RENAME)
.desc("Optional pair(s) (path, name) or triplets (path, pattern, replacement) of attributes or elements to rename.\n"
+ "Have these forms:\n"
+ "- path" + ALIAS_SEPARATOR + "name.\n"
+ "- path" + ALIAS_SEPARATOR + "pattern" + ALIAS_SEPARATOR + "replacement")
.hasArgs()
.build());
options.addOption(Option.builder()
.longOpt(SET)
.desc("Optional pair(s) (path, text) or triplets (path, pattern, replacement) of attributes or elements to set.\n"
+ "Have these forms:\n"
+ "- path" + ALIAS_SEPARATOR + "text.\n"
+ "- path" + ALIAS_SEPARATOR + "pattern" + ALIAS_SEPARATOR + "replacement")
.hasArgs()
.build());
addNoArgOptions(options, MainArgs.Feature.class);
}
private static void analyzePath(String s,
Map> map) throws ParseException {
final String part0 = getPart(s, ALIAS_SEPARATOR, 0, null);
final String part1 = getPart(s, ALIAS_SEPARATOR, 1, null);
final String part2 = getPart(s, ALIAS_SEPARATOR, 2, null);
if (part1 == null) {
throw new ParseException("Invalid alias arg: '" + s + "'");
}
if (part2 == null) {
map.put(new SPath(part0), n -> part1);
} else {
map.put(new SPath(part0), new PatternReplacement(part1, part2));
}
}
private static void fillPaths(CommandLine cl,
String optionName,
Collection values) {
if (cl.hasOption(optionName)) {
for (final String s : cl.getOptionValues(optionName)) {
values.add(new SPath(s));
}
}
}
@Override
protected MainArgs analyze(CommandLine cl) throws ParseException {
final MainArgs margs = new MainArgs();
margs.inputFile = getValueAsFile(cl, INPUT, IS_NULL_OR_FILE);
margs.outputFile = getValueAsFile(cl, OUTPUT);
margs.indentString = cl.getOptionValue(INDENT_STRING);
fillValues(cl, FIRST_ATTRIBUTE, margs.firstAttributes);
fillPaths(cl, SCRAMBLE, margs.scrambled);
fillPaths(cl, REMOVE, margs.removed);
if (cl.getOptionValues(RENAME) != null) {
for (final String s : cl.getOptionValues(RENAME)) {
analyzePath(s, margs.renamed);
}
}
if (cl.getOptionValues(SET) != null) {
for (final String s : cl.getOptionValues(SET)) {
analyzePath(s, margs.set);
}
}
setMask(cl, MainArgs.Feature.class, margs.features::setEnabled);
return margs;
}
@Override
protected Void execute(MainArgs margs) throws Exception {
XmlNormalizer.execute(margs);
return null;
}
}
}