All Downloads are FREE. Search and download functionalities are using the official Maven repository.

cdc.io.tools.XmlNormalizer Maven / Gradle / Ivy

There is a newer version: 0.52.1
Show newest version
package cdc.io.tools;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.UnaryOperator;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import cdc.io.data.Attribute;
import cdc.io.data.Document;
import cdc.io.data.Element;
import cdc.io.data.Node;
import cdc.io.data.paths.SPath;
import cdc.io.data.util.AttributeNameConverter;
import cdc.io.data.util.AttributePredicate;
import cdc.io.data.util.AttributeValueConverter;
import cdc.io.data.util.DataUtils;
import cdc.io.data.util.ElementNameConverter;
import cdc.io.data.util.ElementPredicate;
import cdc.io.data.util.PatternReplacement;
import cdc.io.data.util.TextContentConverter;
import cdc.io.data.util.TextPredicate;
import cdc.io.data.xml.XmlDataReader;
import cdc.io.data.xml.XmlDataWriter;
import cdc.io.tools.XmlNormalizer.MainArgs.Feature;
import cdc.io.utils.NonCloseableOutputStream;
import cdc.io.xml.XmlWriter;
import cdc.util.cli.AbstractMainSupport;
import cdc.util.cli.FeatureMask;
import cdc.util.cli.OptionEnum;
import cdc.util.lang.MapUtils;

/**
 * Toy utility that can be used to "normalize" or modify an XML file.
 * 

* WARNING: This is not standard XML normalization, but it can be useful.
* This can: *

    *
  • Pretty print file (indentation string can be specified). *
  • Sorting *
      *
    • Sort all attributes. *
    • Move some attributes to first position. *
    • Sort all elements. *
    *
  • *
  • Scrambling *
      *
    • Scramble values of all attributes. *
    • Scramble values of some attributes. *
    • Scramble content of all elements. *
    • Scramble content of some elements. *
    *
  • *
  • Removal *
      *
    • Remove all empty attributes. *
    • Remove all pure elements. *
    • Remove all comments. *
    • Remove all elements that have a given name (before any local renaming). *
    • Remove all attributes that have a given name (before any local renaming). *
    *
  • *
  • Renaming *
      *
    • Rename attributes names. *
    • Rename elements names. *
    *
  • *
  • Setting *
      *
    • Set attributes values (of existing attributes). *
    • Set elements contents (of existing elements that have a text child or no children). *
    *
  • *
* WARNING: Renaming may interact with other actions. It may be safe to rename in a separate step. * * @author Damien Carbonne */ public final class XmlNormalizer { private static final Logger LOGGER = LogManager.getLogger(XmlNormalizer.class); private static final String ALIAS_SEPARATOR = "::"; private final MainArgs margs; public static class MainArgs { public enum Feature implements OptionEnum { PRETTY_PRINT("pretty-print", "Pretty prints."), USE_XML_EOL("use-xml-eol", "Use xml eol instead of platform eol."), APPEND_FINAL_EOL("append-final-eol", "Appends a final eol."), USE_SINGLE_QUOTE("use-single-quote", "Use single quote (') instead of quote (\") for attributes delimiters."), ALWAYS_ENTITIZE_ATTRIBUTES("always-entitize-attributes", "Replaces all special characters by entities in attributes, even if not necessary."), SORT_ATTRIBUTES("sort-attributes", "Sorts all attributes in alphabetical order."), SCRAMBLE_ATTRIBUTES("scramble-attributes", "Scramble values of all attributes. This can alter schema compliance."), SORT_ELEMENTS("sort-elements", "Sorts all children elements in alphabetical order. This can alter schema compliance."), SCRAMBLE_ELEMENTS("scramble-elements", "Scramble content of all (non-ignorable white space) elements. This can alter schema compliance."), REMOVE_EMPTY_ATTRIBUTES("remove-empty-attributes", "Removes all empty attributes. This can alter schema compliance."), REMOVE_PURE_ELEMENTS("remove-pure-elements", "Removes all elements that are empty and have no attributes. This can alter schema compliance."), REMOVE_COMMENTS("remove-comments", "Removes all comments."), DELAYED_PROCESSING("delayed-processing", "If set, processing is done after all data has been loaded. Otherwise, it is done during loading, if possible."); private final String name; private final String description; private Feature(String name, String description) { this.name = name; this.description = description; } @Override public final String getName() { return name; } @Override public final String getDescription() { return description; } } /** String to use for indentation. */ public String indentString = " "; public File inputFile; public File outputFile; /** List of attributes names to move to first position. Order matters. */ public final List firstAttributes = new ArrayList<>(); /** Set of attributes and elements to scramble. */ public final Set scrambled = new HashSet<>(); /** Set of attributes and elements to remove. */ public final Set removed = new HashSet<>(); /** Renaming of attributes names and elements names. */ public final Map> renamed = new HashMap<>(); /** Setting of attributes values and elements contents. */ public final Map> set = new HashMap<>(); protected final FeatureMask features = new FeatureMask<>(); public void setEnabled(Feature feature, boolean enabled) { features.setEnabled(feature, enabled); } public boolean isEnabled(Feature feature) { return features.isEnabled(feature); } } private XmlNormalizer(MainArgs margs) { this.margs = margs; } private void execute() throws IOException { // Read input file final XmlDataReader reader = new XmlDataReader(); reader.setEnabled(XmlDataReader.Feature.ALLOW_MIXED_CONTENT, true); reader.setEnabled(XmlDataReader.Feature.LOAD_COMMENTS, !margs.isEnabled(Feature.REMOVE_COMMENTS)); reader.setEnabled(XmlDataReader.Feature.LOAD_SPACES, !margs.isEnabled(Feature.PRETTY_PRINT)); final Map> renamedAttributes = MapUtils.filterKeys(margs.renamed, SPath::isAttribute); final Map> renamedElements = MapUtils.filterKeys(margs.renamed, SPath::isElement); final Map> setAttributes = MapUtils.filterKeys(margs.set, SPath::isAttribute); final Map> setElements = MapUtils.filterKeys(margs.set, SPath::isElement); final boolean delayed = margs.isEnabled(Feature.DELAYED_PROCESSING) || renamedAttributes.size() * renamedElements.size() != 0; if (delayed != margs.isEnabled(Feature.DELAYED_PROCESSING)) { LOGGER.warn("Activate {}", Feature.DELAYED_PROCESSING); } if (!delayed) { // If delayed processing is disabled, do as many processing as possible during loading // Build the element pre filter used to remove specific elements ElementPredicate elementPreFilter = ElementPredicate.ANY_ELEMENT; if (!margs.removed.isEmpty()) { elementPreFilter = ElementPredicate.fromPaths(margs.removed).not(); } reader.setElementPreFilter(elementPreFilter); // Build the element post filter used to remove pure elements ElementPredicate elementPostFilter = ElementPredicate.ANY_ELEMENT; if (margs.isEnabled(Feature.REMOVE_PURE_ELEMENTS)) { elementPostFilter = ElementPredicate.PURE_ELEMENT.not(); } reader.setElementPostFilter(elementPostFilter); // Build the attribute filter used to remove empty and specific attributes AttributePredicate attributeFilter = AttributePredicate.ANY_ATTRIBUTE; if (margs.isEnabled(Feature.REMOVE_EMPTY_ATTRIBUTES)) { attributeFilter = AttributePredicate.IS_NOT_EMPTY_ATTRIBUTE; } if (!margs.removed.isEmpty()) { attributeFilter = attributeFilter.and(AttributePredicate.fromPaths(margs.removed).not()); } reader.setAttributeFilter(attributeFilter); // Build the attributes and elements renaming if (!renamedAttributes.isEmpty()) { reader.setAttributeNameConverter(AttributeNameConverter.fromPathNameFunctionMap(renamedAttributes)); } if (!renamedElements.isEmpty()) { reader.setElementNameConverter(ElementNameConverter.fromPathNameFunctionMap(renamedElements)); } // Build the attribute value converter if (!margs.set.isEmpty()) { reader.setAttributeValueConverter(AttributeValueConverter.fromPathValueFunctionMap(margs.set)); } // Elements content setting can not be done during loading. } final Document document = reader.read(margs.inputFile); // Modify data if (delayed) { // Remove specific elements first if (!margs.removed.isEmpty()) { DataUtils.removeMatchingChildren(document, Element.matching(margs.removed), DataUtils.RECURSE); } // Remove attributes if (margs.isEnabled(Feature.REMOVE_EMPTY_ATTRIBUTES)) { DataUtils.removeEmptyAttributes(document, DataUtils.RECURSE); } if (!margs.removed.isEmpty()) { DataUtils.removeMatchingAttributes(document, AttributePredicate.fromPaths(margs.removed), DataUtils.RECURSE); } // Now remove pure elements // Attributes removal can make some elements empty if (margs.isEnabled(Feature.REMOVE_PURE_ELEMENTS)) { DataUtils.removePureElements(document, DataUtils.RECURSE); } // Set attributes values (before renaming) if (!setAttributes.isEmpty()) { DataUtils.setValueOfMatchingAttributes(document, AttributePredicate.fromPaths(setAttributes.keySet()), AttributeValueConverter.fromPathValueFunctionMap(setAttributes), DataUtils.RECURSE); } // Rename attributes and elements if (!renamedAttributes.isEmpty()) { DataUtils.setNameOfMatchingAttributes(document, AttributePredicate.fromPaths(renamedAttributes.keySet()), AttributeNameConverter.fromPathNameFunctionMap(renamedAttributes), DataUtils.RECURSE); } if (!renamedElements.isEmpty()) { DataUtils.setNameOfMatchingElements(document, SPath.toStandardPredicate(renamedElements.keySet()), ElementNameConverter.fromPathNameFunctionMap(renamedElements), DataUtils.RECURSE); } } // Set elements contents (after renaming) if (!setElements.isEmpty()) { DataUtils.setContentOfMatchingElements(document, ElementPredicate.fromPaths(setElements.keySet()), TextContentConverter.fromPathContentFunctionMap(setElements), DataUtils.RECURSE); } if (margs.isEnabled(Feature.SORT_ATTRIBUTES)) { DataUtils.sortAttributes(document, Attribute.NAME_COMPARATOR, DataUtils.RECURSE); } if (margs.isEnabled(Feature.SORT_ELEMENTS)) { DataUtils.sortChildren(document, Node.ANY_NODE, Node.ELEMENT_NAME_AND_ATTRIBUTES_COMPARATOR, DataUtils.RECURSE); } if (margs.isEnabled(Feature.SCRAMBLE_ATTRIBUTES)) { DataUtils.setValueOfMatchingAttributes(document, AttributePredicate.ANY_ATTRIBUTE, AttributeValueConverter.scramble(true), DataUtils.RECURSE); } if (margs.isEnabled(Feature.SCRAMBLE_ELEMENTS)) { DataUtils.setContentOfMatchingTexts(document, TextPredicate.ANY_TEXT, TextContentConverter.scramble(true), DataUtils.RECURSE); } for (final String name : margs.firstAttributes) { DataUtils.moveAttributeFirst(document, Node.ANY_NODE, name, DataUtils.RECURSE); } // Now we can scramble specific attributes and elements if (!margs.scrambled.isEmpty()) { DataUtils.setValueOfMatchingAttributes(document, AttributePredicate.fromPaths(margs.scrambled), AttributeValueConverter.scramble(true), DataUtils.RECURSE); DataUtils.setContentOfMatchingTexts(document, TextPredicate.fromElementPaths(margs.scrambled), TextContentConverter.scramble(true), DataUtils.RECURSE); } // Write output try (final XmlDataWriter writer = margs.outputFile == null ? new XmlDataWriter(NonCloseableOutputStream.NON_CLOSABLE_SYSTEM_OUT) : new XmlDataWriter(margs.outputFile)) { writer.getXmlWriter().setEnabled(XmlWriter.Feature.PRETTY_PRINT, margs.isEnabled(Feature.PRETTY_PRINT)); writer.getXmlWriter().setEnabled(XmlWriter.Feature.USE_XML_EOL, margs.isEnabled(Feature.USE_XML_EOL)); writer.getXmlWriter().setEnabled(XmlWriter.Feature.APPEND_FINAL_EOL, margs.isEnabled(Feature.APPEND_FINAL_EOL)); writer.getXmlWriter().setEnabled(XmlWriter.Feature.USE_SINGLE_QUOTE, margs.isEnabled(Feature.USE_SINGLE_QUOTE)); writer.getXmlWriter() .setEnabled(XmlWriter.Feature.ALWAYS_ENTITIZE_ATTRIBUTES, margs.isEnabled(Feature.ALWAYS_ENTITIZE_ATTRIBUTES)); writer.getXmlWriter().setIndentString(margs.indentString); writer.write(document); writer.flush(); } } public static void execute(MainArgs margs) throws IOException { final XmlNormalizer instance = new XmlNormalizer(margs); instance.execute(); } public static void main(String[] args) { final MainSupport support = new MainSupport(); support.main(args); } private static class MainSupport extends AbstractMainSupport { private static final String FIRST_ATTRIBUTE = "first-attribute"; private static final String INDENT_STRING = "indent-string"; private static final String REMOVE = "remove"; private static final String RENAME = "rename"; private static final String SCRAMBLE = "scramble"; private static final String SET = "set"; public MainSupport() { super(XmlNormalizer.class, LOGGER); } @Override protected String getVersion() { return Config.VERSION; } @Override protected void addSpecificOptions(Options options) { options.addOption(Option.builder() .longOpt(INPUT) .desc("Xml input file.") .hasArg() .required() .build()); options.addOption(Option.builder() .longOpt(OUTPUT) .desc("Optional Xml output file.") .hasArg() .build()); options.addOption(Option.builder() .longOpt(INDENT_STRING) .desc("Optional indentation string. Used when " + Feature.PRETTY_PRINT.getName() + " is enabled. (Default '')") .hasArg() .build()); options.addOption(Option.builder() .longOpt(FIRST_ATTRIBUTE) .desc("Optional name(s) of attributes to move to first position." + " Order of declarations matters." + " Executed after attributes sorting.") .hasArgs() .build()); options.addOption(Option.builder() .longOpt(SCRAMBLE) .desc("Optional path(s) of attributes and elements to scramble. Have the form: name(/name)* to designate an element or (name(/name)*)?@name to designate an attribute.") .hasArgs() .build()); options.addOption(Option.builder() .longOpt(REMOVE) .desc("Optional path(s) of attributes and elements to remove. Have the form: name(/name)* to designate an element or (name(/name)*)?@name to designate an attribute.") .hasArgs() .build()); options.addOption(Option.builder() .longOpt(RENAME) .desc("Optional pair(s) (path, name) or triplets (path, pattern, replacement) of attributes or elements to rename.\n" + "Have these forms:\n" + "- path" + ALIAS_SEPARATOR + "name.\n" + "- path" + ALIAS_SEPARATOR + "pattern" + ALIAS_SEPARATOR + "replacement") .hasArgs() .build()); options.addOption(Option.builder() .longOpt(SET) .desc("Optional pair(s) (path, text) or triplets (path, pattern, replacement) of attributes or elements to set.\n" + "Have these forms:\n" + "- path" + ALIAS_SEPARATOR + "text.\n" + "- path" + ALIAS_SEPARATOR + "pattern" + ALIAS_SEPARATOR + "replacement") .hasArgs() .build()); addNoArgOptions(options, MainArgs.Feature.class); } private static void analyzePath(String s, Map> map) throws ParseException { final String part0 = getPart(s, ALIAS_SEPARATOR, 0, null); final String part1 = getPart(s, ALIAS_SEPARATOR, 1, null); final String part2 = getPart(s, ALIAS_SEPARATOR, 2, null); if (part1 == null) { throw new ParseException("Invalid alias arg: '" + s + "'"); } if (part2 == null) { map.put(new SPath(part0), n -> part1); } else { map.put(new SPath(part0), new PatternReplacement(part1, part2)); } } private static void fillPaths(CommandLine cl, String optionName, Collection values) { if (cl.hasOption(optionName)) { for (final String s : cl.getOptionValues(optionName)) { values.add(new SPath(s)); } } } @Override protected MainArgs analyze(CommandLine cl) throws ParseException { final MainArgs margs = new MainArgs(); margs.inputFile = getValueAsFile(cl, INPUT, IS_NULL_OR_FILE); margs.outputFile = getValueAsFile(cl, OUTPUT); margs.indentString = cl.getOptionValue(INDENT_STRING); fillValues(cl, FIRST_ATTRIBUTE, margs.firstAttributes); fillPaths(cl, SCRAMBLE, margs.scrambled); fillPaths(cl, REMOVE, margs.removed); if (cl.getOptionValues(RENAME) != null) { for (final String s : cl.getOptionValues(RENAME)) { analyzePath(s, margs.renamed); } } if (cl.getOptionValues(SET) != null) { for (final String s : cl.getOptionValues(SET)) { analyzePath(s, margs.set); } } setMask(cl, MainArgs.Feature.class, margs.features::setEnabled); return margs; } @Override protected Void execute(MainArgs margs) throws Exception { XmlNormalizer.execute(margs); return null; } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy