com.overzealous.remark.convert.TextCleaner Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of remark Show documentation
Markdown generator from HTML; updated, but based on the original Apache 2.0 licensed code from https://bitbucket.org/OverZealous/remark/src/default/
There is a newer version: 2.0.18
Show newest version
/**
 * (c) Copyright 2019-2020 IBM Corporation
 * 1 New Orchard Road, 
 * Armonk, New York, 10504-1722
 * United States
 * +1 914 499 1900
 * support: Nathaniel Mills [email protected]
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

/*
 * Copyright 2011 OverZealous Creations, LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.overzealous.remark.convert;

import com.overzealous.remark.Options;
import com.overzealous.remark.util.StringUtils;
import org.apache.commons.text.StringEscapeUtils;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;

import java.util.*;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * This class is used to clean up plain text fields based on the selected set of
 * options. It optionally escapes certain special characters, as well as
 * replacing various HTML and Unicode entities with their plaintext equivalents.
 *
 * @author Phil DeJarnett
 * @author Nathaniel Mills modifications for provenance and level tracking
 */
public class TextCleaner {

   /**
    * Internal class simply used to hold the various escape regexes.
    */
   private class Escape {

      final Pattern pattern;
      final String replacement;

      public Escape(String pattern, String replacement) {
         this.pattern = Pattern.compile(pattern);
         this.replacement = replacement;
      }
   }

   /** Used to track the replacements based on matched groups. */
   private Map replacements;
   /** Compiled entity replacement pattern. */
   private Pattern entityReplacementsPattern;
   /** Compiled unicode replacement pattern. */
   private Pattern unicodeReplacementsPattern = null;
   /** List of possible escapes */
   private List escapes;
   private Pattern unescapeLeadingChars;

   private static final Pattern EMPTY_MATCHER = Pattern.compile("\\s+",
      Pattern.DOTALL);
   private static final Pattern LINEBREAK_REMOVER = Pattern
      .compile("(\\s*\\r?+\\n)+");

   private static final Pattern URL_CLEANER = Pattern.compile("([\\(\\) ])");

   /**
    * Create a new TextCleaner based on the configured options.
    * 
    * @param options
    *           Options that will affect what is cleaned.
    */
   public TextCleaner(Options options) {
      setupReplacements(options);
      setupEscapes(options);
   }

   /**
    * Configures the basic replacements based on the configured options.
    * 
    * @param options
    *           Options that will affect what is replaced.
    */
   private void setupReplacements(Options options) {
      this.replacements = new HashMap();

      // build replacement regex
      StringBuilder entities = new StringBuilder(replacements.size() * 5);

      // this is a special case for double-encoded HTML entities.
      entities.append("&(?>amp;([#a-z0-9]++;)|(?>");
      addRepl(entities, "&", "&");
      addRepl(entities, "<", "<");
      addRepl(entities, ">", ">");
      addRepl(entities, """, "\"");
      if (options.reverseHtmlSmartQuotes) {
         addRepl(entities, "“", "\"");
         addRepl(entities, "”", "\"");
         addRepl(entities, "‘", "\'");
         addRepl(entities, "’", "\'");
         addRepl(entities, "'", "\'");
         addRepl(entities, "«", "<<");
         addRepl(entities, "»", ">>");
      }
      if (options.reverseHtmlSmartPunctuation) {
         addRepl(entities, "–", "--");
         addRepl(entities, "—", "---");
         addRepl(entities, "…", "...");
      }
      entities.replace(entities.length() - 1, entities.length(), ");)");

      entityReplacementsPattern = Pattern.compile(entities.toString(),
         Pattern.CASE_INSENSITIVE);

      if (options.reverseUnicodeSmartPunctuation
         || options.reverseUnicodeSmartQuotes) {
         StringBuilder unicode = new StringBuilder("[\\Q");
         if (options.reverseUnicodeSmartQuotes) {
            addRepl(unicode, "\u201c", "\""); // left double quote: “
            addRepl(unicode, "\u201d", "\""); // right double quote: ”
            addRepl(unicode, "\u2018", "\'"); // left single quote: ‘
            addRepl(unicode, "\u2019", "\'"); // right single quote: ’
            addRepl(unicode, "\u00ab", "<<"); // left angle quote: «
            addRepl(unicode, "\u00bb", ">>"); // right angle quote: »
         }
         if (options.reverseUnicodeSmartPunctuation) {
            addRepl(unicode, "\u2013", "--"); // en-dash: –
            addRepl(unicode, "\u2014", "---"); // em-dash: —
            addRepl(unicode, "\u2026", "..."); // ellipsis: …
         }
         unicode.append("\\E]");
         unicodeReplacementsPattern = Pattern.compile(unicode.toString());
      }
   }

   /**
    * Utility method to make the code above easier to read.
    * 
    * @param regex
    *           A character buffer to append the replacement to
    * @param original
    *           Original character or string.
    * @param replacement
    *           Replacement character or string.
    */
   private void addRepl(StringBuilder regex, String original,
      String replacement) {
      replacements.put(original, replacement);
      if (original.charAt(0) == '&') {
         // add entity
         regex.append(original.substring(1, original.length() - 1));
         regex.append('|');
      } else {
         // add single character
         regex.append(original);
      }
   }

   /**
    * Configures the basic escapes based on the configured options.
    * 
    * @param options
    *           Options that will affect what is escaped.
    */
   private void setupEscapes(Options options) {
      escapes = new ArrayList();

      // confusingly, this replaces single backslashes with double backslashes.
      // Man, I miss Groovy's slashy strings in these moments...
      escapes.add(new Escape("\\\\", "\\\\\\\\"));

      // creates an set of characters that are universally escaped.
      // these characters are wrapped in \Q...\E to ensure they aren't treated
      // as special characters.
      // wnm3 added -+.!
      StringBuilder chars = new StringBuilder("([\\Q`*_{}[]#-+.!");
      if (options.tables.isConvertedToText()
         && !options.tables.isRenderedAsCode()) {
         chars.append('|');
      }
      chars.append("\\E])");
      escapes.add(new Escape(chars.toString(), "\\\\$1"));
      // wnm3 took these out for general escape of -+.!
      // // finally, escape certain characters only if they are leading
      // characters
      // StringBuilder leadingChars = new StringBuilder("^( ?+)([\\Q-+");
      // if (options.definitionLists) {
      // leadingChars.append(':');
      // }
      // leadingChars.append("\\E])");
      // escapes.add(new Escape(leadingChars.toString(), "$1\\\\$2"));
      //
      // // setup the leading character reverser
      // // this is a bit of a hack to undo leading character escapes.
      // unescapeLeadingChars = Pattern
      // .compile(leadingChars.insert(6, "\\\\").toString());
   }

   /**
    * Clean the given input text based on the original configuration Options.
    * Newlines are also replaced with a single space.
    *
    * @param input
    *           The text to be cleaned. Can be any object. JSoup nodes are
    *           handled specially.
    * @return The cleaned text.
    */
   public String clean(Object input) {
      return clean(input, true);
   }

   /**
    * Clean the given input text based on the original configuration Options.
    * The text is treat as code, so it is not escaped, and newlines are
    * preserved.
    *
    * @param input
    *           The text to be cleaned. Can be any object. JSoup nodes are
    *           handled specially.
    * @return The cleaned text.
    */
   public String cleanCode(Object input) {
      return clean(input, false);
   }

   /**
    * Clean the given input text based on the original configuration Options.
    * Optionally, don't escape special characters.
    *
    * @param oinput
    *           The text to be cleaned. Can be any object. JSoup nodes are
    *           handled specially.
    * @param normalText
    *           If false, don't escape special characters. This is usually only
    *           used for inline code or code blocks, because they don't need to
    *           be escaped.
    * @return The cleaned text.
    */
   private String clean(Object oinput, boolean normalText) {
      String input;
      if (oinput instanceof TextNode) {
         input = getTextNodeText((TextNode) oinput, normalText);
      } else if (oinput instanceof Element) {
         if (normalText) {
            input = ((Element) oinput).text();
         } else {
            input = getPreformattedText((Element) oinput);
         }
      } else {
         input = oinput.toString();
      }
      String result;
      if (input.length() == 0) {
         // not seen, so just return an empty string.
         result = "";
      } else if (normalText) {
         // For non-code text, newlines are _never_ allowed.
         // Replace one or more set of whitespace chars followed by a newline
         // with a single space.
         input = LINEBREAK_REMOVER.matcher(input).replaceAll(" ");

         // now escape special characters.
         for (final Escape rep : escapes) {
            input = rep.pattern.matcher(input).replaceAll(rep.replacement);
         }
         StringBuffer output = doReplacements(input, entityReplacementsPattern);
         if (unicodeReplacementsPattern != null) {
            output = doReplacements(output, unicodeReplacementsPattern);
         }
         result = output.toString();
      } else {
         // we have to revert ALL HTML entities for code, because they will end
         // up
         // double-encoded by markdown
         // we also don't need to worry about escaping anything
         // note: we have to manually replace ' because it is ignored by
         // StringEscapeUtils for some reason.
         result = StringEscapeUtils.unescapeHtml4(input.replace("'", "'"));
      }
      return result;
   }

   /**
    * Replaces all {@code 

    * } tags with a newline in a copy of the input node, and returns the
    * resulting innter text. This is necessary to ensure that manual linebreaks
    * are supported in preformatted code.
    * 
    * @param oinput
    *           Preformatted node to process
    * @return inner text of the node.
    */
   private String getPreformattedText(Element oinput) {
      Element el = oinput.clone();
      fixLineBreaks(el);
      return el.text();
   }

   // recursively processes the element to replace 
's with \n
   private void fixLineBreaks(Element el) {
      for (final Element e : el.children()) {
         if (e.tagName().equals("br")) {
            e.before("\n");
            e.remove();
         } else {
            fixLineBreaks(e);
         }
      }
   }

   /**
    * Handles running the regex-based replacements in the input
    * 
    * @param input
    *           String to process
    * @param regex
    *           Pattern to use
    * @return cleaned up input string
    */
   private StringBuffer doReplacements(CharSequence input, Pattern regex) {
      StringBuffer output = new StringBuffer();

      Matcher m = regex.matcher(input);
      while (m.find()) {
         String repString;
         // if we have a hard match, do a simple replacement.
         String replacementKey = m.group().toLowerCase(Locale.ENGLISH);
         if (replacements.containsKey(replacementKey)) {
            repString = replacements.get(replacementKey);
         } else {
            // special case for escaped HTML entities.
            repString = "\\\\&$1";
         }
         m.appendReplacement(output, repString);
      }
      m.appendTail(output);

      return output;
   }

   /**
    * Method to clean inline code, and, if necessary, add spaces to make sure
    * that internal, leading, or trailing {@code '`'} characters don't break the
    * inline code. Newlines are also replaced with spaces.
    *
    * This method also adds the leading and trailing {@code '`'} or
    * {@code '```'} as necessary.
    *
    * @param input
    *           String to clean. Can be any object. JSoup nodes are handled
    *           specially.
    * @return The cleaned text.
    */
   public String cleanInlineCode(Object input) {
      String output = clean(input, false).replace('\n', ' ');
      if (output.indexOf('`') != -1) {
         String prepend = "";
         if (output.charAt(0) == '`') {
            prepend = " ";
         }
         String append = "";
         if (output.charAt(output.length() - 1) == '`') {
            append = " ";
         }
         String delim = getDelimiter(output);
         output = String.format("%s%s%s%s%s", delim, prepend, output, append,
            delim);
      } else {
         output = String.format("`%s`", output);
      }
      return output;
   }

   /**
    * Removes the escaping on leading characters, for example, when they are
    * going to be rendered inside another node, such as a table.
    * 
    * @param input
    *           String to process
    * @return Cleaned string.
    */
   public String unescapeLeadingCharacters(String input) {
      // removes any leading escapes...
      // wnm3 added below for geneal escape of -+.! charadcters
      if (unescapeLeadingChars == null) {
         return input;
      }
      return unescapeLeadingChars.matcher(input).replaceAll("$1$2");
   }

   /**
    * Handles escaping special characters in URLs to avoid issues when they are
    * rendered out (ie: spaces, parentheses)
    * 
    * @param input
    *           URL to process
    * @return Cleaned URL
    */
   public String cleanUrl(String input) {
      StringBuffer output = new StringBuffer();

      Matcher m = URL_CLEANER.matcher(input);
      while (m.find()) {
         char c = m.group().charAt(0);
         m.appendReplacement(output, String.format("%%%02x", (int) c));
      }
      m.appendTail(output);
      return output.toString();
   }

   String getDelimiter(String input) {
      int max = 0;
      int counter = 0;
      for (int i = 0; i < input.length(); i++) {
         if (input.charAt(i) == '`') {
            counter++;
         } else {
            max = Math.max(max, counter);
            counter = 0;
         }
      }
      // check in case the last tick was at the end.
      max = Math.max(max, counter);
      return StringUtils.multiply('`', max + 1);
   }

   private String getTextNodeText(TextNode tn, boolean normalText) {
      String input = normalText ? tn.text() : tn.getWholeText();
      Node prev = tn.previousSibling();
      Node next = tn.nextSibling();
      boolean parentIsBlock = isBlock(tn.parent());
      if (isBlock(prev)) {
         input = ltrim(input);
      } else if (prev == null && parentIsBlock) {
         input = ltrim(input);
      } else if (normalText && prev instanceof TextNode) {
         TextNode tprev = (TextNode) prev;
         if (EMPTY_MATCHER.matcher(tprev.text()).matches()) {
            input = ltrim(input);
         }
      }
      if (input.length() > 0) {
         if (isBlock(next)) {
            input = rtrim(input);
         } else if (next == null && parentIsBlock) {
            input = rtrim(input);
         } else if (normalText && next instanceof TextNode) {
            TextNode tnext = (TextNode) next;
            if (EMPTY_MATCHER.matcher(tnext.text()).matches()) {
               input = rtrim(input);
            }
         }
      }
      return input;
   }

   private boolean isBlock(Node n) {
      boolean block = false;
      if (n != null && n instanceof Element) {
         Element el = (Element) n;
         block = el.isBlock() || el.tagName().equals("br");
      }
      return block;
   }

   private String ltrim(String s) {
      int start = 0;
      while ((start + 1 <= s.length())
         && EMPTY_MATCHER.matcher(s.substring(start, start + 1)).matches()) {
         start++;
      }
      String ret = "";
      if (start != s.length()) {
         ret = s.substring(start);
      }
      return ret;
   }

   private String rtrim(String s) {
      int end = s.length();
      while ((end - 1 >= 0)
         && EMPTY_MATCHER.matcher(s.substring(end - 1, end)).matches()) {
         end--;
      }
      String ret = "";
      if (end != 0) {
         ret = s.substring(0, end);
      }
      return ret;
   }

}