All downloads are free. Search and download functionality uses the official Maven repository.

com.overzealous.remark.convert.TextCleaner Maven / Gradle / Ivy

Go to download

Markdown generator from HTML; an updated version based on the original Apache 2.0 licensed code from https://bitbucket.org/OverZealous/remark/src/default/

There is a newer version: 2.0.18
Show newest version
/**
 * (c) Copyright 2019-2020 IBM Corporation
 * 1 New Orchard Road, 
 * Armonk, New York, 10504-1722
 * United States
 * +1 914 499 1900
 * support: Nathaniel Mills [email protected]
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

/*
 * Copyright 2011 OverZealous Creations, LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.overzealous.remark.convert;

import com.overzealous.remark.Options;
import com.overzealous.remark.util.StringUtils;
import org.apache.commons.text.StringEscapeUtils;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;

import java.util.*;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * This class is used to clean up plain text fields based on the selected set of
 * options. It optionally escapes certain special characters, as well as
 * replacing various HTML and Unicode entities with their plaintext equivalents.
 *
 * @author Phil DeJarnett
 * @author Nathaniel Mills modifications for provenance and level tracking
 */
public class TextCleaner {

   /**
    * Internal class simply used to hold the various escape regexes.
    */
   private class Escape {

      final Pattern pattern;
      final String replacement;

      public Escape(String pattern, String replacement) {
         this.pattern = Pattern.compile(pattern);
         this.replacement = replacement;
      }
   }

   /** Used to track the replacements based on matched groups. */
   private Map replacements;
   /** Compiled entity replacement pattern. */
   private Pattern entityReplacementsPattern;
   /** Compiled unicode replacement pattern. */
   private Pattern unicodeReplacementsPattern = null;
   /** List of possible escapes */
   private List escapes;
   private Pattern unescapeLeadingChars;

   private static final Pattern EMPTY_MATCHER = Pattern.compile("\\s+",
      Pattern.DOTALL);
   private static final Pattern LINEBREAK_REMOVER = Pattern
      .compile("(\\s*\\r?+\\n)+");

   private static final Pattern URL_CLEANER = Pattern.compile("([\\(\\) ])");

   /**
    * Create a new TextCleaner based on the configured options.
    * 
    * @param options
    *           Options that will affect what is cleaned.
    */
   public TextCleaner(Options options) {
      setupReplacements(options);
      setupEscapes(options);
   }

   /**
    * Configures the basic replacements based on the configured options.
    * 
    * @param options
    *           Options that will affect what is replaced.
    */
   private void setupReplacements(Options options) {
      this.replacements = new HashMap();

      // build replacement regex
      StringBuilder entities = new StringBuilder(replacements.size() * 5);

      // this is a special case for double-encoded HTML entities.
      entities.append("&(?>amp;([#a-z0-9]++;)|(?>");
      addRepl(entities, "&", "&");
      addRepl(entities, "<", "<");
      addRepl(entities, ">", ">");
      addRepl(entities, """, "\"");
      if (options.reverseHtmlSmartQuotes) {
         addRepl(entities, "“", "\"");
         addRepl(entities, "”", "\"");
         addRepl(entities, "‘", "\'");
         addRepl(entities, "’", "\'");
         addRepl(entities, "'", "\'");
         addRepl(entities, "«", "<<");
         addRepl(entities, "»", ">>");
      }
      if (options.reverseHtmlSmartPunctuation) {
         addRepl(entities, "–", "--");
         addRepl(entities, "—", "---");
         addRepl(entities, "…", "...");
      }
      entities.replace(entities.length() - 1, entities.length(), ");)");

      entityReplacementsPattern = Pattern.compile(entities.toString(),
         Pattern.CASE_INSENSITIVE);

      if (options.reverseUnicodeSmartPunctuation
         || options.reverseUnicodeSmartQuotes) {
         StringBuilder unicode = new StringBuilder("[\\Q");
         if (options.reverseUnicodeSmartQuotes) {
            addRepl(unicode, "\u201c", "\""); // left double quote: “
            addRepl(unicode, "\u201d", "\""); // right double quote: ”
            addRepl(unicode, "\u2018", "\'"); // left single quote: ‘
            addRepl(unicode, "\u2019", "\'"); // right single quote: ’
            addRepl(unicode, "\u00ab", "<<"); // left angle quote: «
            addRepl(unicode, "\u00bb", ">>"); // right angle quote: »
         }
         if (options.reverseUnicodeSmartPunctuation) {
            addRepl(unicode, "\u2013", "--"); // en-dash: –
            addRepl(unicode, "\u2014", "---"); // em-dash: —
            addRepl(unicode, "\u2026", "..."); // ellipsis: …
         }
         unicode.append("\\E]");
         unicodeReplacementsPattern = Pattern.compile(unicode.toString());
      }
   }

   /**
    * Utility method to make the code above easier to read.
    * 
    * @param regex
    *           A character buffer to append the replacement to
    * @param original
    *           Original character or string.
    * @param replacement
    *           Replacement character or string.
    */
   private void addRepl(StringBuilder regex, String original,
      String replacement) {
      replacements.put(original, replacement);
      if (original.charAt(0) == '&') {
         // add entity
         regex.append(original.substring(1, original.length() - 1));
         regex.append('|');
      } else {
         // add single character
         regex.append(original);
      }
   }

   /**
    * Configures the basic escapes based on the configured options.
    * 
    * @param options
    *           Options that will affect what is escaped.
    */
   private void setupEscapes(Options options) {
      escapes = new ArrayList();

      // confusingly, this replaces single backslashes with double backslashes.
      // Man, I miss Groovy's slashy strings in these moments...
      escapes.add(new Escape("\\\\", "\\\\\\\\"));

      // creates an set of characters that are universally escaped.
      // these characters are wrapped in \Q...\E to ensure they aren't treated
      // as special characters.
      // wnm3 added -+.!
      StringBuilder chars = new StringBuilder("([\\Q`*_{}[]#-+.!");
      if (options.tables.isConvertedToText()
         && !options.tables.isRenderedAsCode()) {
         chars.append('|');
      }
      chars.append("\\E])");
      escapes.add(new Escape(chars.toString(), "\\\\$1"));
      // wnm3 took these out for general escape of -+.!
      // // finally, escape certain characters only if they are leading
      // characters
      // StringBuilder leadingChars = new StringBuilder("^( ?+)([\\Q-+");
      // if (options.definitionLists) {
      // leadingChars.append(':');
      // }
      // leadingChars.append("\\E])");
      // escapes.add(new Escape(leadingChars.toString(), "$1\\\\$2"));
      //
      // // setup the leading character reverser
      // // this is a bit of a hack to undo leading character escapes.
      // unescapeLeadingChars = Pattern
      // .compile(leadingChars.insert(6, "\\\\").toString());
   }

   /**
    * Clean the given input text based on the original configuration Options.
    * Newlines are also replaced with a single space.
    *
    * @param input
    *           The text to be cleaned. Can be any object. JSoup nodes are
    *           handled specially.
    * @return The cleaned text.
    */
   public String clean(Object input) {
      return clean(input, true);
   }

   /**
    * Clean the given input text based on the original configuration Options.
    * The text is treat as code, so it is not escaped, and newlines are
    * preserved.
    *
    * @param input
    *           The text to be cleaned. Can be any object. JSoup nodes are
    *           handled specially.
    * @return The cleaned text.
    */
   public String cleanCode(Object input) {
      return clean(input, false);
   }

   /**
    * Clean the given input text based on the original configuration Options.
    * Optionally, don't escape special characters.
    *
    * @param oinput
    *           The text to be cleaned. Can be any object. JSoup nodes are
    *           handled specially.
    * @param normalText
    *           If false, don't escape special characters. This is usually only
    *           used for inline code or code blocks, because they don't need to
    *           be escaped.
    * @return The cleaned text.
    */
   private String clean(Object oinput, boolean normalText) {
      String input;
      if (oinput instanceof TextNode) {
         input = getTextNodeText((TextNode) oinput, normalText);
      } else if (oinput instanceof Element) {
         if (normalText) {
            input = ((Element) oinput).text();
         } else {
            input = getPreformattedText((Element) oinput);
         }
      } else {
         input = oinput.toString();
      }
      String result;
      if (input.length() == 0) {
         // not seen, so just return an empty string.
         result = "";
      } else if (normalText) {
         // For non-code text, newlines are _never_ allowed.
         // Replace one or more set of whitespace chars followed by a newline
         // with a single space.
         input = LINEBREAK_REMOVER.matcher(input).replaceAll(" ");

         // now escape special characters.
         for (final Escape rep : escapes) {
            input = rep.pattern.matcher(input).replaceAll(rep.replacement);
         }
         StringBuffer output = doReplacements(input, entityReplacementsPattern);
         if (unicodeReplacementsPattern != null) {
            output = doReplacements(output, unicodeReplacementsPattern);
         }
         result = output.toString();
      } else {
         // we have to revert ALL HTML entities for code, because they will end
         // up
         // double-encoded by markdown
         // we also don't need to worry about escaping anything
         // note: we have to manually replace ' because it is ignored by
         // StringEscapeUtils for some reason.
         result = StringEscapeUtils.unescapeHtml4(input.replace("'", "'"));
      }
      return result;
   }

   /**
    * Replaces all {@code 
* } tags with a newline in a copy of the input node, and returns the * resulting innter text. This is necessary to ensure that manual linebreaks * are supported in preformatted code. * * @param oinput * Preformatted node to process * @return inner text of the node. */ private String getPreformattedText(Element oinput) { Element el = oinput.clone(); fixLineBreaks(el); return el.text(); } // recursively processes the element to replace
's with \n private void fixLineBreaks(Element el) { for (final Element e : el.children()) { if (e.tagName().equals("br")) { e.before("\n"); e.remove(); } else { fixLineBreaks(e); } } } /** * Handles running the regex-based replacements in the input * * @param input * String to process * @param regex * Pattern to use * @return cleaned up input string */ private StringBuffer doReplacements(CharSequence input, Pattern regex) { StringBuffer output = new StringBuffer(); Matcher m = regex.matcher(input); while (m.find()) { String repString; // if we have a hard match, do a simple replacement. String replacementKey = m.group().toLowerCase(Locale.ENGLISH); if (replacements.containsKey(replacementKey)) { repString = replacements.get(replacementKey); } else { // special case for escaped HTML entities. repString = "\\\\&$1"; } m.appendReplacement(output, repString); } m.appendTail(output); return output; } /** * Method to clean inline code, and, if necessary, add spaces to make sure * that internal, leading, or trailing {@code '`'} characters don't break the * inline code. Newlines are also replaced with spaces. * * This method also adds the leading and trailing {@code '`'} or * {@code '```'} as necessary. * * @param input * String to clean. Can be any object. JSoup nodes are handled * specially. * @return The cleaned text. */ public String cleanInlineCode(Object input) { String output = clean(input, false).replace('\n', ' '); if (output.indexOf('`') != -1) { String prepend = ""; if (output.charAt(0) == '`') { prepend = " "; } String append = ""; if (output.charAt(output.length() - 1) == '`') { append = " "; } String delim = getDelimiter(output); output = String.format("%s%s%s%s%s", delim, prepend, output, append, delim); } else { output = String.format("`%s`", output); } return output; } /** * Removes the escaping on leading characters, for example, when they are * going to be rendered inside another node, such as a table. 
* * @param input * String to process * @return Cleaned string. */ public String unescapeLeadingCharacters(String input) { // removes any leading escapes... // wnm3 added below for geneal escape of -+.! charadcters if (unescapeLeadingChars == null) { return input; } return unescapeLeadingChars.matcher(input).replaceAll("$1$2"); } /** * Handles escaping special characters in URLs to avoid issues when they are * rendered out (ie: spaces, parentheses) * * @param input * URL to process * @return Cleaned URL */ public String cleanUrl(String input) { StringBuffer output = new StringBuffer(); Matcher m = URL_CLEANER.matcher(input); while (m.find()) { char c = m.group().charAt(0); m.appendReplacement(output, String.format("%%%02x", (int) c)); } m.appendTail(output); return output.toString(); } String getDelimiter(String input) { int max = 0; int counter = 0; for (int i = 0; i < input.length(); i++) { if (input.charAt(i) == '`') { counter++; } else { max = Math.max(max, counter); counter = 0; } } // check in case the last tick was at the end. max = Math.max(max, counter); return StringUtils.multiply('`', max + 1); } private String getTextNodeText(TextNode tn, boolean normalText) { String input = normalText ? 
tn.text() : tn.getWholeText(); Node prev = tn.previousSibling(); Node next = tn.nextSibling(); boolean parentIsBlock = isBlock(tn.parent()); if (isBlock(prev)) { input = ltrim(input); } else if (prev == null && parentIsBlock) { input = ltrim(input); } else if (normalText && prev instanceof TextNode) { TextNode tprev = (TextNode) prev; if (EMPTY_MATCHER.matcher(tprev.text()).matches()) { input = ltrim(input); } } if (input.length() > 0) { if (isBlock(next)) { input = rtrim(input); } else if (next == null && parentIsBlock) { input = rtrim(input); } else if (normalText && next instanceof TextNode) { TextNode tnext = (TextNode) next; if (EMPTY_MATCHER.matcher(tnext.text()).matches()) { input = rtrim(input); } } } return input; } private boolean isBlock(Node n) { boolean block = false; if (n != null && n instanceof Element) { Element el = (Element) n; block = el.isBlock() || el.tagName().equals("br"); } return block; } private String ltrim(String s) { int start = 0; while ((start + 1 <= s.length()) && EMPTY_MATCHER.matcher(s.substring(start, start + 1)).matches()) { start++; } String ret = ""; if (start != s.length()) { ret = s.substring(start); } return ret; } private String rtrim(String s) { int end = s.length(); while ((end - 1 >= 0) && EMPTY_MATCHER.matcher(s.substring(end - 1, end)).matches()) { end--; } String ret = ""; if (end != 0) { ret = s.substring(0, end); } return ret; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy