// All Downloads are FREE. Search and download functionalities are using the official Maven repository.
//
// io.codemodder.plugins.maven.operator.FormatCommand Maven / Gradle / Ivy
//
// There is a newer version: 0.97.4
// Show newest version
package io.codemodder.plugins.maven.operator;

import static io.github.pixee.security.XMLInputFactorySecurity.hardenFactory;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.net.URISyntaxException;
import java.nio.charset.Charset;
import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import javax.xml.stream.*;
import javax.xml.stream.events.*;
import kotlin.ranges.IntRange;
import kotlin.sequences.Sequence;
import kotlin.text.MatchGroupCollection;
import kotlin.text.MatchResult;
import kotlin.text.Regex;
import org.apache.commons.lang3.StringUtils;
import org.mozilla.universalchardet.UniversalDetector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * This Command handles Formatting - particularly storing the original document preamble (the
 * Processing Instruction and the first XML Element contents), which are the only ones which are
 * tricky to format (due to element and its attributes being freeform - thus formatting lost when
 * serializing the DOM and the PI being completely optional for the POM Document)
 */
class FormatCommand extends AbstractCommand {

  private static final Set LINE_ENDINGS = new HashSet<>();
  private static final Regex RE_EMPTY_ELEMENT_NO_ATTRIBUTES;
  private static final Logger LOGGER = LoggerFactory.getLogger(FormatCommand.class);

  /** StAX InputFactory */
  private XMLInputFactory inputFactory = hardenFactory(XMLInputFactory.newInstance());

  /** StAX OutputFactory */
  private XMLOutputFactory outputFactory = XMLOutputFactory.newInstance();

  private List singleElementsWithAttributes = new ArrayList<>();

  static {
    LINE_ENDINGS.add("\r\n");
    LINE_ENDINGS.add("\n");
    LINE_ENDINGS.add("\r");

    RE_EMPTY_ELEMENT_NO_ATTRIBUTES =
        new Regex("<([\\p{Alnum}_\\-.]+)>\\s*|<([\\p{Alnum}_\\-.]+)\\s*/>");
  }

  /**
   * Execute the formatting operation.
   *
   * @param pm The ProjectModel to operate on.
   * @return `true` if the formatting operation was successful, `false` otherwise.
   * @throws XMLStreamException if there is an issue with XML processing.
   * @throws IOException if there is an issue with I/O operations.
   * @throws URISyntaxException if there is an issue with URI syntax.
   */
  @Override
  public boolean execute(ProjectModel pm)
      throws XMLStreamException, IOException, URISyntaxException {
    for (POMDocument pomFile : pm.allPomFiles()) {
      parseXmlAndCharset(pomFile);

      pomFile.setEndl(parseLineEndings(pomFile));
      pomFile.setIndent(guessIndent(pomFile));
    }

    return super.execute(pm);
  }

  /**
   * Perform post-processing after formatting. When doing the opposite, render the XML using the
   * optionally supplied encoding (defaults to UTF8 obviously) but apply the original formatting as
   * well
   *
   * @param pm The ProjectModel to operate on.
   * @return `true` if the post-processing was successful, `false` otherwise.
   * @throws XMLStreamException if there is an issue with XML processing.
   */
  @Override
  public boolean postProcess(ProjectModel pm) throws XMLStreamException {
    for (POMDocument pomFile : pm.allPomFiles()) {
      /** Serializes it back */
      byte[] content = serializePomFile(pomFile);
      pomFile.setResultPomBytes(content);
    }
    return super.postProcess(pm);
  }

  /**
   * This one is quite fun yet important. Let me explain:
   *
   * 

The DOM doesn't track records if empty elements are either `` or ``. * Therefore we need to scan all ocurrences of singleton elements. * *

Therefore we use a bitSet to keep track of each element and offset, scanning it forward when * serializing we pick backwards and rewrite tags accordingly * * @param doc Raw Document Bytes * @see RE_EMPTY_ELEMENT_NO_ATTRIBUTES * @return bitSet of */ private BitSet elementBitSet(byte[] doc) throws XMLStreamException { BitSet result = new BitSet(); XMLEventReader eventReader = inputFactory.createXMLEventReader(new ByteArrayInputStream(doc)); StringWriter eventContent = new StringWriter(); XMLEventWriter xmlEventWriter = outputFactory.createXMLEventWriter(eventContent); while (eventReader.hasNext()) { XMLEvent next = eventReader.nextEvent(); if (next instanceof StartElement || next instanceof EndElement) { int startIndex = next.getLocation().getCharacterOffset(); eventContent.getBuffer().setLength(0); xmlEventWriter.add(next); xmlEventWriter.flush(); int endIndex = startIndex + eventContent.getBuffer().length(); result.set(startIndex, startIndex + endIndex); } } return result; } /** * A Slight variation on writeAsUnicode from stax which writes as a regex string so we could * rewrite its output */ private String writeAsRegex(StartElement element) { StringWriter writer = new StringWriter(); writer.write("<"); writer.write(Pattern.quote(element.getName().getLocalPart())); Iterator attrIter = element.getAttributes(); while (attrIter.hasNext()) { Attribute attr = (Attribute) attrIter.next(); writer.write("\\s+"); writer.write(Pattern.quote(attr.getName().getLocalPart())); writer.write("=[\\\"\']"); writer.write(Pattern.quote(attr.getValue())); writer.write("[\\\"\']"); } writer.write("\\s*\\/>"); return writer.toString(); } private String parseLineEndings(POMDocument pomFile) throws IOException { InputStream inputStream = new ByteArrayInputStream(pomFile.getOriginalPom()); byte[] bytes = inputStream.readAllBytes(); String str = new String(bytes, pomFile.getCharset()); Map lineEndingCounts = new HashMap<>(); for (String lineEnding : LINE_ENDINGS) { 
lineEndingCounts.put(lineEnding, str.split(lineEnding).length); } return Collections.max(lineEndingCounts.entrySet(), Map.Entry.comparingByValue()).getKey(); } /** * Guesses the indent character (spaces / tabs) and length from the original document formatting * settings * * @param pomFile (project model) where it takes its input pom * @return indent string */ private String guessIndent(POMDocument pomFile) throws XMLStreamException { InputStream inputStream = new ByteArrayInputStream(pomFile.getOriginalPom()); XMLEventReader eventReader = inputFactory.createXMLEventReader(inputStream); Map freqMap = new HashMap<>(); Map charFreqMap = new HashMap<>(); /** Parse, while grabbing whitespace sequences and examining it */ while (eventReader.hasNext()) { XMLEvent event = eventReader.nextEvent(); if (event instanceof Characters) { Characters characters = (Characters) event; String data = characters.getData(); if (StringUtils.isWhitespace(data)) { String lineEndingsPattern = String.join("|", LINE_ENDINGS.toArray(new String[0])); String[] patterns = data.split(lineEndingsPattern); /** Updates space / character frequencies found */ for (String pattern : patterns) { if (!pattern.isEmpty() && StringUtils.isAllBlank(pattern)) { int length = pattern.length(); freqMap.merge(length, 1, Integer::sum); char firstChar = pattern.charAt(0); charFreqMap.merge(firstChar, 1, Integer::sum); } } } } } // Assign the most frequent indent char char indentCharacter = getMostFrequentIndentChar(charFreqMap); // Cast it as a String String indentCharacterAsString = String.valueOf(indentCharacter); // Pick the length int indentLength = getMinimumIndentLength(freqMap); // Build the standard indent string (length vs char) String indentString = StringUtils.repeat(indentCharacterAsString, indentLength); // Return it return indentString; } private char getMostFrequentIndentChar(Map charFreqMap) { char mostFrequentChar = '\0'; int maxFrequency = Integer.MIN_VALUE; for (Map.Entry entry : 
charFreqMap.entrySet()) { if (entry.getValue() > maxFrequency) { maxFrequency = entry.getValue(); mostFrequentChar = entry.getKey(); } } return mostFrequentChar; } private int getMinimumIndentLength(Map freqMap) { int minIndentLength = Integer.MAX_VALUE; for (Map.Entry entry : freqMap.entrySet()) { if (entry.getKey() < minIndentLength) { minIndentLength = entry.getKey(); } } return minIndentLength; } private void parseXmlAndCharset(POMDocument pomFile) throws XMLStreamException, IOException { InputStream inputStream = new ByteArrayInputStream(pomFile.getOriginalPom()); /** Performs a StAX Parsing to Grab the first element */ XMLEventReader eventReader = inputFactory.createXMLEventReader(inputStream); Charset charset = null; /** Parse, while grabbing its preamble and encoding */ int elementIndex = 0; boolean mustTrack = false; boolean hasPreamble = false; int elementStart = 0; List prevEvents = new ArrayList<>(); while (eventReader.hasNext()) { XMLEvent event = eventReader.nextEvent(); if (event.isStartDocument() && ((StartDocument) event).encodingSet()) { /** Processing Instruction Found - Store its Character Encoding */ charset = Charset.forName(((StartDocument) event).getCharacterEncodingScheme()); } else if (event.isStartElement()) { StartElement asStartElement = event.asStartElement(); String name = asStartElement.getName().getLocalPart(); List attributes = new ArrayList<>(); Iterator attrIter = asStartElement.getAttributes(); while (attrIter.hasNext()) { attributes.add((Attribute) attrIter.next()); } if (elementIndex > 0 && !attributes.isEmpty()) { // record this guy mustTrack = true; Characters lastCharacterEvent = null; for (int i = prevEvents.size() - 1; i >= 0; i--) { if (prevEvents.get(i).isCharacters()) { lastCharacterEvent = prevEvents.get(i).asCharacters(); break; } } if (lastCharacterEvent != null) { elementStart = lastCharacterEvent.getLocation().getCharacterOffset() - lastCharacterEvent.getData().length(); } } else if (mustTrack) { // turn it off 
mustTrack = false; } elementIndex++; } else if (event.isEndElement()) { /** First End of Element ("Tag") found - store its offset */ EndElement endElementEvent = event.asEndElement(); Location location = endElementEvent.getLocation(); int offset = location.getCharacterOffset(); if (mustTrack) { mustTrack = false; String localPart = event.asEndElement().getName().getLocalPart(); String originalPomCharsetString = new String(pomFile.getOriginalPom(), pomFile.getCharset()); String untrimmedOriginalContent = originalPomCharsetString.substring(elementStart, offset); String trimmedOriginalContent = untrimmedOriginalContent.trim(); int realElementStart = originalPomCharsetString.indexOf(trimmedOriginalContent, elementStart); IntRange contentRange = new IntRange( realElementStart, realElementStart + 1 + trimmedOriginalContent.length()); String contentRe = writeAsRegex(getLastStartElement(prevEvents)); Regex modifiedContentRE = new Regex(contentRe); singleElementsWithAttributes.add( new MatchData( contentRange, trimmedOriginalContent, localPart, true, modifiedContentRE)); } mustTrack = false; /** Sets Preamble - keeps parsing anyway */ if (!hasPreamble) { pomFile.setPreamble( new String(pomFile.getOriginalPom(), pomFile.getCharset()).substring(0, offset)); hasPreamble = true; } } prevEvents.add(event); while (prevEvents.size() > 4) { prevEvents.remove(0); } if (!eventReader.hasNext()) if (!hasPreamble) throw new IllegalStateException("Couldn't find document start"); } if (null == charset) { InputStream inputStream2 = new ByteArrayInputStream(pomFile.getOriginalPom()); String detectedCharsetName = UniversalDetector.detectCharset(inputStream2); charset = Charset.forName(detectedCharsetName); } pomFile.setCharset(charset); String lastLine = new String(pomFile.getOriginalPom(), pomFile.getCharset()); String lastLineTrimmed = lastLine.replaceAll("\\s+$", ""); pomFile.setSuffix(lastLine.substring(lastLineTrimmed.length())); } private StartElement getLastStartElement(List 
prevEvents) { for (int i = prevEvents.size() - 1; i >= 0; i--) { XMLEvent event = prevEvents.get(i); if (event.isStartElement()) { return (StartElement) event; } } return null; // Handle the case where no StartElement event is found. } /** * Returns a reverse-ordered list of all the single element matches from the pom document raw * string * *

this is important so we can mix and match offsets and apply formatting accordingly * * @param xmlDocumentString Rendered POM Document Contents (string-formatted) * @return map of (index, matchData object) reverse ordered */ private LinkedHashMap findSingleElementMatchesFrom(String xmlDocumentString) { Sequence allFoundMatchesSequence = RE_EMPTY_ELEMENT_NO_ATTRIBUTES.findAll(xmlDocumentString, 0); List emptyMappedTags = new ArrayList<>(); Iterator iterator = allFoundMatchesSequence.iterator(); while (iterator.hasNext()) { MatchResult matchResult = iterator.next(); MatchGroupCollection groups = matchResult.getGroups(); String value1 = (groups.get(1) != null) ? groups.get(1).getValue() : null; String value2 = (groups.get(2) != null) ? groups.get(2).getValue() : null; MatchData matchDataJ = new MatchData( matchResult.getRange(), matchResult.getValue(), (value1 != null) ? value1 : value2, false, null); emptyMappedTags.add(matchDataJ); } List> allTags = emptyMappedTags.stream() .flatMap(data -> Stream.of(data)) .map(data -> new Pair<>(data.getRange().getFirst(), data)) .collect(Collectors.toList()); allTags.sort(Comparator.comparing(Pair::getFirst, Comparator.reverseOrder())); LinkedHashMap linkedHashMap = new LinkedHashMap<>(); for (Pair pair : allTags) { linkedHashMap.put(pair.getFirst(), pair.getSecond()); } return linkedHashMap; } private List getElementsToReplace(BitSet originalElementMap, POMDocument pom) { // Let's find out the original empty elements from the original pom and store them in a stack List elementsToReplace = new ArrayList<>(); Map singleElementMatches = findSingleElementMatchesFrom(new String(pom.getOriginalPom(), pom.getCharset())); for (MatchData match : singleElementMatches.values()) { if (!match.getHasAttributes() && originalElementMap.get(match.getRange().getFirst())) { elementsToReplace.add(match); } } return elementsToReplace; } private Map getEmptyElements( BitSet targetElementMap, String xmlRepresentation) { LinkedHashMap emptyElements = 
new LinkedHashMap<>(); for (Map.Entry entry : findSingleElementMatchesFrom(xmlRepresentation).entrySet()) { Integer key = entry.getKey(); MatchData value = entry.getValue(); if (targetElementMap.get(value.getRange().getFirst())) { emptyElements.put(key, value); } } return emptyElements; } private String replaceRange(String xmlRepresentation, IntRange range, String replacement) { StringBuilder sb = new StringBuilder(); sb.append(xmlRepresentation.substring(0, range.getStart())); sb.append(replacement); sb.append(xmlRepresentation.substring(range.getEndInclusive() + 1, xmlRepresentation.length())); return sb.toString(); } /** * Serialize a POM Document * * @param pom pom document * @return bytes for the pom document */ private byte[] serializePomFile(POMDocument pom) throws XMLStreamException { // Generate a String representation. We'll need to patch it up and apply back // differences we recorded previously on the pom (see the pom member variables) String xmlRepresentation = pom.getResultPom().asXML().toString(); BitSet originalElementMap = elementBitSet(pom.getOriginalPom()); BitSet targetElementMap = elementBitSet(xmlRepresentation.getBytes()); // Let's find out the original empty elements from the original pom and store into a stack List elementsToReplace = getElementsToReplace(originalElementMap, pom); // Lets to the replacements backwards on the existing, current pom Map emptyElements = getEmptyElements(targetElementMap, xmlRepresentation); for (Map.Entry entry : emptyElements.entrySet()) { Integer key = entry.getKey(); MatchData match = entry.getValue(); MatchData nextMatch = elementsToReplace.remove(0); xmlRepresentation = replaceRange(xmlRepresentation, match.getRange(), nextMatch.getContent()); } int lastIndex = 0; singleElementsWithAttributes.sort( Comparator.comparingInt(matchDataJ -> matchDataJ.getRange().getFirst())); for (MatchData match : singleElementsWithAttributes) { MatchResult representationMatch = 
match.getModifiedContent().find(xmlRepresentation, lastIndex); if (null == representationMatch) { LOGGER.warn("Failure on quoting: {}", match); } else { xmlRepresentation = replaceRange(xmlRepresentation, representationMatch.getRange(), match.getContent()); lastIndex = representationMatch.getRange().getFirst() + match.getContent().length(); } } /** * We might need to replace the beginning of the POM with the same content from the very * beginning * *

Grab the same initial offset from the formatted element like we did */ XMLInputFactory inputFactory = hardenFactory(XMLInputFactory.newInstance()); XMLEventReader eventReader = inputFactory.createXMLEventReader( new ByteArrayInputStream(xmlRepresentation.getBytes(pom.getCharset()))); while (true) { XMLEvent event = eventReader.nextEvent(); if (event.isEndElement()) { /** Apply the formatting and tweak its XML Representation */ EndElement endElementEvent = (EndElement) event; int offset = endElementEvent.getLocation().getCharacterOffset(); xmlRepresentation = pom.getPreamble() + xmlRepresentation.substring(offset) + pom.getSuffix(); break; } /** This code shouldn't be unreachable at all */ if (!eventReader.hasNext()) { throw new IllegalStateException("Couldn't find document start"); } } /** Serializes it back from (string to ByteArray) */ byte[] serializedContent = xmlRepresentation.getBytes(pom.getCharset()); return serializedContent; } }





// © 2015 - 2024 Weber Informatics LLC | Privacy Policy