cdc.io.tools.XmlNormalizer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of cdc-io-tools Show documentation
There is a newer version: 0.52.1
package cdc.io.tools;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.UnaryOperator;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import cdc.io.data.Attribute;
import cdc.io.data.Document;
import cdc.io.data.Element;
import cdc.io.data.Node;
import cdc.io.data.paths.SPath;
import cdc.io.data.util.AttributeNameConverter;
import cdc.io.data.util.AttributePredicate;
import cdc.io.data.util.AttributeValueConverter;
import cdc.io.data.util.DataUtils;
import cdc.io.data.util.ElementNameConverter;
import cdc.io.data.util.ElementPredicate;
import cdc.io.data.util.PatternReplacement;
import cdc.io.data.util.TextContentConverter;
import cdc.io.data.util.TextPredicate;
import cdc.io.data.xml.XmlDataReader;
import cdc.io.data.xml.XmlDataWriter;
import cdc.io.tools.XmlNormalizer.MainArgs.Feature;
import cdc.io.utils.NonCloseableOutputStream;
import cdc.io.xml.XmlWriter;
import cdc.util.cli.AbstractMainSupport;
import cdc.util.cli.FeatureMask;
import cdc.util.cli.OptionEnum;
import cdc.util.lang.MapUtils;

/**
 * Toy utility that can be used to "normalize" or modify an XML file.
 * <p>
 * <b>WARNING:</b> This is not standard XML normalization, but it can be useful.
 * <p>
 * This can:
 * <ul>
 * <li>Pretty print file (indentation string can be specified).
 * <li>Sorting
 * <ul>
 * <li>Sort all attributes.
 * <li>Move some attributes to first position.
 * <li>Sort all elements.
 * </ul>
 * <li>Scrambling
 * <ul>
 * <li>Scramble values of all attributes.
 * <li>Scramble values of some attributes.
 * <li>Scramble content of all elements.
 * <li>Scramble content of some elements.
 * </ul>
 * <li>Removal
 * <ul>
 * <li>Remove all empty attributes.
 * <li>Remove all pure elements.
 * <li>Remove all comments.
 * <li>Remove all elements that have a given name (before any local renaming).
 * <li>Remove all attributes that have a given name (before any local renaming).
 * </ul>
 * <li>Renaming
 * <ul>
 * <li>Rename attributes names.
 * <li>Rename elements names.
 * </ul>
 * <li>Setting
 * <ul>
 * <li>Set attributes values (of existing attributes).
 * <li>Set elements contents (of existing elements that have a text child or no children).
 * </ul>
 * </ul>
 * <p>
 * <b>WARNING:</b> Renaming may interact with other actions. It may be safer to rename in a separate step.
 *
 * @author Damien Carbonne
 */
public final class XmlNormalizer {
    /** Logger used by this tool; it is also handed to the command-line support class. */
    private static final Logger LOGGER = LogManager.getLogger(XmlNormalizer.class);
    /** Separator between the parts of a rename or set command-line argument. */
    private static final String ALIAS_SEPARATOR = "::";
    /** Arguments driving this normalization run. */
    private final MainArgs margs;

    public static class MainArgs {
        public enum Feature implements OptionEnum {
            PRETTY_PRINT("pretty-print", "Pretty prints."),
            USE_XML_EOL("use-xml-eol", "Use xml eol instead of platform eol."),
            APPEND_FINAL_EOL("append-final-eol", "Appends a final eol."),
            USE_SINGLE_QUOTE("use-single-quote", "Use single quote (') instead of quote (\") for attributes delimiters."),
            ALWAYS_ENTITIZE_ATTRIBUTES("always-entitize-attributes",
                                       "Replaces all special characters by entities in attributes, even if not necessary."),
            SORT_ATTRIBUTES("sort-attributes", "Sorts all attributes in alphabetical order."),
            SCRAMBLE_ATTRIBUTES("scramble-attributes", "Scramble values of all attributes. This can alter schema compliance."),
            SORT_ELEMENTS("sort-elements", "Sorts all children elements in alphabetical order. This can alter schema compliance."),
            SCRAMBLE_ELEMENTS("scramble-elements",
                              "Scramble content of all (non-ignorable white space) elements. This can alter schema compliance."),
            REMOVE_EMPTY_ATTRIBUTES("remove-empty-attributes", "Removes all empty attributes. This can alter schema compliance."),
            REMOVE_PURE_ELEMENTS("remove-pure-elements",
                                 "Removes all elements that are empty and have no attributes. This can alter schema compliance."),
            REMOVE_COMMENTS("remove-comments", "Removes all comments."),
            DELAYED_PROCESSING("delayed-processing",
                               "If set, processing is done after all data has been loaded. Otherwise, it is done during loading, if possible.");

            private final String name;
            private final String description;

            private Feature(String name,
                            String description) {
                this.name = name;
                this.description = description;
            }

            @Override
            public final String getName() {
                return name;
            }

            @Override
            public final String getDescription() {
                return description;
            }
        }

        /** String to use for indentation. */
        public String indentString = "  ";

        public File inputFile;

        public File outputFile;

        /** List of attributes names to move to first position. Order matters. */
        public final List firstAttributes = new ArrayList<>();

        /** Set of attributes and elements to scramble. */
        public final Set scrambled = new HashSet<>();

        /** Set of attributes and elements to remove. */
        public final Set removed = new HashSet<>();

        /** Renaming of attributes names and elements names. */
        public final Map> renamed = new HashMap<>();

        /** Setting of attributes values and elements contents. */
        public final Map> set = new HashMap<>();

        protected final FeatureMask features = new FeatureMask<>();

        public void setEnabled(Feature feature,
                               boolean enabled) {
            features.setEnabled(feature, enabled);
        }

        public boolean isEnabled(Feature feature) {
            return features.isEnabled(feature);
        }
    }

    /**
     * Creates a normalizer bound to the given arguments.
     *
     * @param margs The arguments driving the normalization.
     */
    private XmlNormalizer(MainArgs margs) {
        this.margs = margs;
    }

    private void execute() throws IOException {
        // Read input file
        final XmlDataReader reader = new XmlDataReader();
        reader.setEnabled(XmlDataReader.Feature.ALLOW_MIXED_CONTENT, true);
        reader.setEnabled(XmlDataReader.Feature.LOAD_COMMENTS, !margs.isEnabled(Feature.REMOVE_COMMENTS));
        reader.setEnabled(XmlDataReader.Feature.LOAD_SPACES, !margs.isEnabled(Feature.PRETTY_PRINT));

        final Map> renamedAttributes = MapUtils.filterKeys(margs.renamed, SPath::isAttribute);
        final Map> renamedElements = MapUtils.filterKeys(margs.renamed, SPath::isElement);
        final Map> setAttributes = MapUtils.filterKeys(margs.set, SPath::isAttribute);
        final Map> setElements = MapUtils.filterKeys(margs.set, SPath::isElement);

        final boolean delayed =
                margs.isEnabled(Feature.DELAYED_PROCESSING)
                        || renamedAttributes.size() * renamedElements.size() != 0;
        if (delayed != margs.isEnabled(Feature.DELAYED_PROCESSING)) {
            LOGGER.warn("Activate {}", Feature.DELAYED_PROCESSING);
        }

        if (!delayed) {
            // If delayed processing is disabled, do as many processing as possible during loading

            // Build the element pre filter used to remove specific elements
            ElementPredicate elementPreFilter = ElementPredicate.ANY_ELEMENT;
            if (!margs.removed.isEmpty()) {
                elementPreFilter = ElementPredicate.fromPaths(margs.removed).not();
            }
            reader.setElementPreFilter(elementPreFilter);

            // Build the element post filter used to remove pure elements
            ElementPredicate elementPostFilter = ElementPredicate.ANY_ELEMENT;
            if (margs.isEnabled(Feature.REMOVE_PURE_ELEMENTS)) {
                elementPostFilter = ElementPredicate.PURE_ELEMENT.not();
            }
            reader.setElementPostFilter(elementPostFilter);

            // Build the attribute filter used to remove empty and specific attributes
            AttributePredicate attributeFilter = AttributePredicate.ANY_ATTRIBUTE;
            if (margs.isEnabled(Feature.REMOVE_EMPTY_ATTRIBUTES)) {
                attributeFilter = AttributePredicate.IS_NOT_EMPTY_ATTRIBUTE;
            }
            if (!margs.removed.isEmpty()) {
                attributeFilter = attributeFilter.and(AttributePredicate.fromPaths(margs.removed).not());
            }
            reader.setAttributeFilter(attributeFilter);

            // Build the attributes and elements renaming
            if (!renamedAttributes.isEmpty()) {
                reader.setAttributeNameConverter(AttributeNameConverter.fromPathNameFunctionMap(renamedAttributes));
            }
            if (!renamedElements.isEmpty()) {
                reader.setElementNameConverter(ElementNameConverter.fromPathNameFunctionMap(renamedElements));
            }

            // Build the attribute value converter
            if (!margs.set.isEmpty()) {
                reader.setAttributeValueConverter(AttributeValueConverter.fromPathValueFunctionMap(margs.set));
            }

            // Elements content setting can not be done during loading.
        }

        final Document document = reader.read(margs.inputFile);

        // Modify data

        if (delayed) {
            // Remove specific elements first
            if (!margs.removed.isEmpty()) {
                DataUtils.removeMatchingChildren(document,
                                                 Element.matching(margs.removed),
                                                 DataUtils.RECURSE);
            }

            // Remove attributes
            if (margs.isEnabled(Feature.REMOVE_EMPTY_ATTRIBUTES)) {
                DataUtils.removeEmptyAttributes(document, DataUtils.RECURSE);
            }
            if (!margs.removed.isEmpty()) {
                DataUtils.removeMatchingAttributes(document,
                                                   AttributePredicate.fromPaths(margs.removed),
                                                   DataUtils.RECURSE);
            }

            // Now remove pure elements
            // Attributes removal can make some elements empty
            if (margs.isEnabled(Feature.REMOVE_PURE_ELEMENTS)) {
                DataUtils.removePureElements(document, DataUtils.RECURSE);
            }

            // Set attributes values (before renaming)
            if (!setAttributes.isEmpty()) {
                DataUtils.setValueOfMatchingAttributes(document,
                                                       AttributePredicate.fromPaths(setAttributes.keySet()),
                                                       AttributeValueConverter.fromPathValueFunctionMap(setAttributes),
                                                       DataUtils.RECURSE);
            }

            // Rename attributes and elements
            if (!renamedAttributes.isEmpty()) {
                DataUtils.setNameOfMatchingAttributes(document,
                                                      AttributePredicate.fromPaths(renamedAttributes.keySet()),
                                                      AttributeNameConverter.fromPathNameFunctionMap(renamedAttributes),
                                                      DataUtils.RECURSE);
            }
            if (!renamedElements.isEmpty()) {
                DataUtils.setNameOfMatchingElements(document,
                                                    SPath.toStandardPredicate(renamedElements.keySet()),
                                                    ElementNameConverter.fromPathNameFunctionMap(renamedElements),
                                                    DataUtils.RECURSE);
            }
        }

        // Set elements contents (after renaming)
        if (!setElements.isEmpty()) {
            DataUtils.setContentOfMatchingElements(document,
                                                   ElementPredicate.fromPaths(setElements.keySet()),
                                                   TextContentConverter.fromPathContentFunctionMap(setElements),
                                                   DataUtils.RECURSE);
        }

        if (margs.isEnabled(Feature.SORT_ATTRIBUTES)) {
            DataUtils.sortAttributes(document,
                                     Attribute.NAME_COMPARATOR,
                                     DataUtils.RECURSE);
        }

        if (margs.isEnabled(Feature.SORT_ELEMENTS)) {
            DataUtils.sortChildren(document,
                                   Node.ANY_NODE,
                                   Node.ELEMENT_NAME_AND_ATTRIBUTES_COMPARATOR,
                                   DataUtils.RECURSE);
        }

        if (margs.isEnabled(Feature.SCRAMBLE_ATTRIBUTES)) {
            DataUtils.setValueOfMatchingAttributes(document,
                                                   AttributePredicate.ANY_ATTRIBUTE,
                                                   AttributeValueConverter.scramble(true),
                                                   DataUtils.RECURSE);
        }
        if (margs.isEnabled(Feature.SCRAMBLE_ELEMENTS)) {

            DataUtils.setContentOfMatchingTexts(document,
                                                TextPredicate.ANY_TEXT,
                                                TextContentConverter.scramble(true),
                                                DataUtils.RECURSE);
        }
        for (final String name : margs.firstAttributes) {
            DataUtils.moveAttributeFirst(document, Node.ANY_NODE, name, DataUtils.RECURSE);
        }

        // Now we can scramble specific attributes and elements
        if (!margs.scrambled.isEmpty()) {
            DataUtils.setValueOfMatchingAttributes(document,
                                                   AttributePredicate.fromPaths(margs.scrambled),
                                                   AttributeValueConverter.scramble(true),
                                                   DataUtils.RECURSE);
            DataUtils.setContentOfMatchingTexts(document,
                                                TextPredicate.fromElementPaths(margs.scrambled),
                                                TextContentConverter.scramble(true),
                                                DataUtils.RECURSE);
        }

        // Write output
        try (final XmlDataWriter writer = margs.outputFile == null
                ? new XmlDataWriter(NonCloseableOutputStream.NON_CLOSABLE_SYSTEM_OUT)
                : new XmlDataWriter(margs.outputFile)) {
            writer.getXmlWriter().setEnabled(XmlWriter.Feature.PRETTY_PRINT, margs.isEnabled(Feature.PRETTY_PRINT));
            writer.getXmlWriter().setEnabled(XmlWriter.Feature.USE_XML_EOL, margs.isEnabled(Feature.USE_XML_EOL));
            writer.getXmlWriter().setEnabled(XmlWriter.Feature.APPEND_FINAL_EOL, margs.isEnabled(Feature.APPEND_FINAL_EOL));
            writer.getXmlWriter().setEnabled(XmlWriter.Feature.USE_SINGLE_QUOTE, margs.isEnabled(Feature.USE_SINGLE_QUOTE));
            writer.getXmlWriter()
                  .setEnabled(XmlWriter.Feature.ALWAYS_ENTITIZE_ATTRIBUTES, margs.isEnabled(Feature.ALWAYS_ENTITIZE_ATTRIBUTES));
            writer.getXmlWriter().setIndentString(margs.indentString);
            writer.write(document);
            writer.flush();
        }
    }

    /**
     * Runs the normalizer with the given arguments.
     *
     * @param margs The arguments describing what to read, transform and write.
     * @throws IOException When the normalization run raises it.
     */
    public static void execute(MainArgs margs) throws IOException {
        new XmlNormalizer(margs).execute();
    }

    /**
     * Command-line entry point; delegates to a new {@code MainSupport} instance.
     *
     * @param args The command-line arguments.
     */
    public static void main(String[] args) {
        new MainSupport().main(args);
    }

    private static class MainSupport extends AbstractMainSupport {
        private static final String FIRST_ATTRIBUTE = "first-attribute";
        private static final String INDENT_STRING = "indent-string";
        private static final String REMOVE = "remove";
        private static final String RENAME = "rename";
        private static final String SCRAMBLE = "scramble";
        private static final String SET = "set";

        public MainSupport() {
            super(XmlNormalizer.class, LOGGER);
        }

        @Override
        protected String getVersion() {
            return Config.VERSION;
        }

        @Override
        protected void addSpecificOptions(Options options) {
            options.addOption(Option.builder()
                                    .longOpt(INPUT)
                                    .desc("Xml input file.")
                                    .hasArg()
                                    .required()
                                    .build());

            options.addOption(Option.builder()
                                    .longOpt(OUTPUT)
                                    .desc("Optional Xml output file.")
                                    .hasArg()
                                    .build());

            options.addOption(Option.builder()
                                    .longOpt(INDENT_STRING)
                                    .desc("Optional indentation string. Used when " + Feature.PRETTY_PRINT.getName()
                                            + " is enabled. (Default '')")
                                    .hasArg()
                                    .build());

            options.addOption(Option.builder()
                                    .longOpt(FIRST_ATTRIBUTE)
                                    .desc("Optional name(s) of attributes to move to first position."
                                            + " Order of declarations matters."
                                            + " Executed after attributes sorting.")
                                    .hasArgs()
                                    .build());
            options.addOption(Option.builder()
                                    .longOpt(SCRAMBLE)
                                    .desc("Optional path(s) of attributes and elements to scramble. Have the form: name(/name)* to designate an element or (name(/name)*)?@name to designate an attribute.")
                                    .hasArgs()
                                    .build());
            options.addOption(Option.builder()
                                    .longOpt(REMOVE)
                                    .desc("Optional path(s) of attributes and elements to remove. Have the form: name(/name)* to designate an element or (name(/name)*)?@name to designate an attribute.")
                                    .hasArgs()
                                    .build());
            options.addOption(Option.builder()
                                    .longOpt(RENAME)
                                    .desc("Optional pair(s) (path, name) or triplets (path, pattern, replacement) of attributes or elements to rename.\n"
                                            + "Have these forms:\n"
                                            + "- path" + ALIAS_SEPARATOR + "name.\n"
                                            + "- path" + ALIAS_SEPARATOR + "pattern" + ALIAS_SEPARATOR + "replacement")
                                    .hasArgs()
                                    .build());
            options.addOption(Option.builder()
                                    .longOpt(SET)
                                    .desc("Optional pair(s) (path, text) or triplets (path, pattern, replacement) of attributes or elements to set.\n"
                                            + "Have these forms:\n"
                                            + "- path" + ALIAS_SEPARATOR + "text.\n"
                                            + "- path" + ALIAS_SEPARATOR + "pattern" + ALIAS_SEPARATOR + "replacement")
                                    .hasArgs()
                                    .build());

            addNoArgOptions(options, MainArgs.Feature.class);
        }

        private static void analyzePath(String s,
                                        Map> map) throws ParseException {
            final String part0 = getPart(s, ALIAS_SEPARATOR, 0, null);
            final String part1 = getPart(s, ALIAS_SEPARATOR, 1, null);
            final String part2 = getPart(s, ALIAS_SEPARATOR, 2, null);

            if (part1 == null) {
                throw new ParseException("Invalid alias arg: '" + s + "'");
            }
            if (part2 == null) {
                map.put(new SPath(part0), n -> part1);
            } else {
                map.put(new SPath(part0), new PatternReplacement(part1, part2));
            }
        }

        private static void fillPaths(CommandLine cl,
                                      String optionName,
                                      Collection values) {
            if (cl.hasOption(optionName)) {
                for (final String s : cl.getOptionValues(optionName)) {
                    values.add(new SPath(s));
                }
            }
        }

        @Override
        protected MainArgs analyze(CommandLine cl) throws ParseException {
            final MainArgs margs = new MainArgs();
            margs.inputFile = getValueAsFile(cl, INPUT, IS_NULL_OR_FILE);
            margs.outputFile = getValueAsFile(cl, OUTPUT);
            margs.indentString = cl.getOptionValue(INDENT_STRING);
            fillValues(cl, FIRST_ATTRIBUTE, margs.firstAttributes);
            fillPaths(cl, SCRAMBLE, margs.scrambled);
            fillPaths(cl, REMOVE, margs.removed);

            if (cl.getOptionValues(RENAME) != null) {
                for (final String s : cl.getOptionValues(RENAME)) {
                    analyzePath(s, margs.renamed);
                }
            }

            if (cl.getOptionValues(SET) != null) {
                for (final String s : cl.getOptionValues(SET)) {
                    analyzePath(s, margs.set);
                }
            }

            setMask(cl, MainArgs.Feature.class, margs.features::setEnabled);

            return margs;
        }

        @Override
        protected Void execute(MainArgs margs) throws Exception {
            XmlNormalizer.execute(margs);
            return null;
        }
    }
}