All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.mdfromhtml.markdown.transform.GetTextFromMarkdown Maven / Gradle / Ivy

Go to download

Generate markdown (.md) files from the HTML and URLs provided in JSON files. Each generated file is named after the JSON file it came from, followed by an incrementing number (starting at 1) for each JSON file read and for each HTML reference within that file.

There is a newer version: 2.0.18
Show newest version
/**
 * (c) Copyright 2019-2020 IBM Corporation
 * 1 New Orchard Road, 
 * Armonk, New York, 10504-1722
 * United States
 * +1 914 499 1900
 * support: Nathaniel Mills [email protected]
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package com.mdfromhtml.markdown.transform;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.file.FileSystems;
import java.nio.file.InvalidPathException;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Stack;
import com.api.json.JSONObject;
import com.mdfromhtml.core.MDfromHTMLUtils;
import com.overzealous.remark.Options;
import com.overzealous.remark.Remark;
import com.overzealous.remark.convert.ProvenanceWriter;

/**
 * Utility class to transform multimarkdown generated from HTML into text files.
 * Files are read from an input directory, and written to an output directory.
 * 
 * @author Nathaniel Mills
 */
public class GetTextFromMarkdown {

   /**
    * Command line entry point. Tries to load "HTML_Filters.json" (a warning is
    * printed if that fails), gathers the run parameters via
    * {@link #getParams(String[])}, and then calls {@link #doWork(Path)} for
    * each file with the configured extension found in the input directory,
    * stopping at the first failure. A non-zero result is reported as the
    * process exit status.
    * 
    * @param args
    *           inputPath, outputPath, includeLinks (y/n) and an optional
    *           verbose flag (true/false); missing leading values are prompted
    *           for
    */
   public static void main(String[] args) {
      int exitVal = 0;
      JSONObject HTMLFilters = null;
      try {
         HTMLFilters = MDfromHTMLUtils.loadJSONFile("HTML_Filters.json");
      } catch (Exception e1) {
         System.out.println(
            "Warning: Using no HTML Filters -- can not find \"HTML_Filters.json\": "
               + e1.getLocalizedMessage());
      }
      GetTextFromMarkdown pgm = new GetTextFromMarkdown(Options.multiMarkdown(),
         HTMLFilters);
      if (pgm.getParams(args)) {
         if (pgm._thumbsucker) {
            System.out.println("\nFiles ending with ." + pgm._ext
               + " will be read from " + pgm._inputPath //
               + "\nand the generated text files (." + pgm._txtext
               + ") will be " + "saved in " + pgm._outputPath
               + "\nIt is "+_includeLinks+" that links will be included in the text output."); //
         }
         if (pgm._interactive) {
            // an empty answer (just Enter) means "continue"; anything else
            // leaves _interactive set and the run is skipped
            if (MDfromHTMLUtils
               .prompt("Press q to quit or press Enter to continue...")
               .length() == 0) {
               pgm._interactive = false;
            }
         }
         if (!pgm._interactive) {
            try {
               // _inputPath is already a Path on the default file system
               List<Path> files = MDfromHTMLUtils
                  .listSourceFiles(pgm._inputPath, pgm._ext);
               for (Path file : files) {
                  exitVal = pgm.doWork(file);
                  if (exitVal != 0) {
                     break;
                  }
               }
            } catch (Exception e) {
               System.out
                  .println("Error: Can not reference files with extension "
                     + pgm._ext + " in directory " + pgm._inputPath
                     + " reason: " + e.getLocalizedMessage());
               exitVal = -1;
            }
         }
         if (pgm._thumbsucker) {
            System.out.println();
         }
      } else {
         exitVal = -1;
      }
      if (pgm._thumbsucker) {
         System.out.println("Goodbye");
      }
      // surface failures to the calling shell; a normal return already yields
      // exit status 0
      if (exitVal != 0) {
         System.exit(exitVal);
      }
   }

   // extension (without the dot) of the markdown files that are read
   String _ext = "md";
   // HTML filters passed to the ProvenanceWriter; doWork replaces this with
   // the "htmlFilters" entry of each file's provenance JSON
   JSONObject _HTMLFilters = new JSONObject();
   // directory the markdown files are read from (set by getParams)
   Path _inputPath = null;
   // set by getParams when the input or output directory had to be prompted
   // for; main then asks for confirmation before processing
   boolean _interactive = false;
   // Remark options supplied at construction
   Options _options = Options.multiMarkdown();
   // directory the text files are written to; getParams stores it with a
   // trailing File.separator
   String _outputPath = ".";
   // when true, progress messages are printed to System.out
   boolean _thumbsucker = false;
   // extension (without the dot) of the generated text files
   String _txtext = "txt";
   // whether removeReferencesAndLinks decorates labels with braces and appends
   // URLs; static, so shared by every instance (set by getParams)
   static boolean _includeLinks = false;

   /**
    * Constructor using the default multimarkdown options and an empty set of
    * HTML filters.
    */
   public GetTextFromMarkdown() {
   }

   /**
    * Constructor
    * 
    * @param options
    *           Remark options to retain; when null the default multimarkdown
    *           options are kept
    * @param HTMLFilters
    *           HTML filters to retain; when null (e.g., the filter file could
    *           not be loaded) the default empty filter object is kept so the
    *           field is never null
    */
   public GetTextFromMarkdown(Options options, JSONObject HTMLFilters) {
      if (options != null) {
         _options = options;
      }
      if (HTMLFilters != null) {
         _HTMLFilters = HTMLFilters;
      }
   }

   /**
    * Process the specified markdown file to transform its content into text
    * and save it to a text file in the output directory. The companion
    * "&lt;name&gt;_html2md.json" provenance file is read from the input
    * directory to obtain the HTML filters and base URI, and a
    * "&lt;name&gt;_md2txt.json" provenance file is written next to the text
    * output. Lines are processed until the line
    * "###### Doc2Dial Provenance ######" is reached.
    * 
    * @param file
    *           the markdown file to be transformed
    * @return exit value (0 indicates success, otherwise -1 for failure)
    */
   int doWork(Path file) {
      int exitVal = 0;
      try {
         String fqFileName = file.toString();
         if (_thumbsucker) {
            System.out.println("Processing: " + fqFileName);
         }
         List<String> markdownList = MDfromHTMLUtils.loadTextFile(fqFileName);
         String shortFileName = fqFileName
            .substring(fqFileName.lastIndexOf(File.separator) + 1);
         int index = shortFileName.lastIndexOf("." + _ext);
         if (index < 1) {
            System.out.println(
               "Error: " + shortFileName + " doesn't end with ." + _ext);
            return -1;
         }
         String baseName = shortFileName.substring(0, index);
         // _inputPath is a Path (no trailing separator once converted to a
         // String) whereas _outputPath already ends with a separator
         String html2mdProvenanceFileName = _inputPath + File.separator
            + baseName + "_html2md.json";
         String provenanceOutputFileName = _outputPath + baseName
            + "_md2txt.json";
         String textOutputFileName = _outputPath + baseName + "." + _txtext;

         JSONObject provenance = MDfromHTMLUtils
            .loadJSONFile(html2mdProvenanceFileName);
         _HTMLFilters = (JSONObject) provenance.get("htmlFilters");
         String baseURI = (String) provenance.get("baseURI");
         String domain = Remark.getDomain(baseURI);

         // start from a fresh provenance file; the writer opens it for append
         File provenanceOutputFile = new File(provenanceOutputFileName);
         if (provenanceOutputFile.exists()) {
            provenanceOutputFile.delete();
         }
         ProvenanceWriter provenanceWriter = new ProvenanceWriter(fqFileName,
            textOutputFileName, _HTMLFilters, baseURI, domain,
            new FileWriter(provenanceOutputFile, true));
         try {
            StringBuilder sb = new StringBuilder();
            int lineNum = 0;
            Map<String, String> refURLs = findRefURLs(markdownList);
            for (String mdLine : markdownList) {
               lineNum++;
               // truncate at provenance so it isn't included
               if (mdLine.equals("###### Doc2Dial Provenance ######")) {
                  break;
               }
               String testLine = generateTextFromMarkdown(mdLine, refURLs);
               // null signals the line produced nothing to keep
               if (testLine != null) {
                  provenanceWriter.saveMD2Text("" + lineNum, mdLine,
                     testLine);
                  sb.append(testLine);
                  sb.append("\n");
               }
            }

            MDfromHTMLUtils.saveTextFile(textOutputFileName, sb.toString());
         } catch (Exception e) {
            e.printStackTrace();
            exitVal = -1;
         } finally {
            try {
               provenanceWriter.close();
            } catch (IOException e) {
               e.printStackTrace();
            }
         }
      } catch (Exception e) {
         e.printStackTrace();
         exitVal = -1;
      }
      return exitVal;
   }

   /**
    * Collect the reference definitions found in the markdown lines. A
    * definition is a line that (after trimming) starts with "[" and contains
    * "]: "; the text between the brackets becomes the key and the trimmed
    * remainder of the line after "]: " becomes the value. Definitions with an
    * empty reference ("[]: ...") are ignored. When a reference is defined more
    * than once the last definition wins.
    * 
    * @param mdLines
    *           markdown lines to scan (may be null)
    * @return map of reference to the text that follows its definition; empty
    *         when there are no definitions
    */
   static public Map<String, String> findRefURLs(List<String> mdLines) {
      Map<String, String> refURLs = new HashMap<String, String>();
      if (mdLines == null) {
         return refURLs;
      }
      for (String mdLine : mdLines) {
         if (mdLine == null) {
            continue;
         }
         String candidate = mdLine.trim();
         if (!candidate.startsWith("[")) {
            continue;
         }
         int offset = candidate.indexOf("]: ");
         // offset 1 would mean empty brackets, -1 means no definition
         if (offset > 1) {
            String ref = candidate.substring(1, offset).trim();
            String url = candidate.substring(offset + 3).trim();
            refURLs.put(ref, url);
         }
      }
      return refURLs;
   }

   /**
    * Transform a single markdown line into text: the line is trimmed, passed
    * through TextUtils.filterComments and processText, and the result is
    * trimmed again.
    * 
    * @param mdLine
    *           markdown line to be transformed
    * @param refURLs
    *           map of a reference to its corresponding URL
    * @return the text for the line; an empty string for a blank line; or null
    *         when nothing from this line should be saved
    */
   static public String generateTextFromMarkdown(String mdLine,
      Map<String, String> refURLs) {
      String test = mdLine.trim();
      if (test.length() == 0) {
         // blank lines are preserved as empty text
         return test;
      }
      test = TextUtils.filterComments(test);
      if (test == null) {
         // NOTE(review): filterComments is not visible here; guard in case it
         // can return null so that null is not handed to processText
         return null;
      }
      test = processText(test, refURLs);
      return (test == null) ? null : test.trim();
   }

   /**
    * Get the parameters necessary for program execution: input directory,
    * output directory, whether links are included in the text output, and
    * (optionally) whether progress messages are printed. Missing values are
    * prompted for; answering "q" to any prompt cancels. Prompting for the input
    * or output directory switches the program into interactive, verbose mode.
    * 
    * @param args
    *           inputPath, outputPath, includeLinks (anything starting with "y"
    *           means yes), thumbsucker flag ("true" enables progress messages)
    * @return true if we have sufficient parameters to execute the program
    */
   boolean getParams(String[] args) {
      if (args == null) {
         args = new String[0];
      }
      String inputPath = "./src/test/resources";
      String outputPath = "./src/test/resources";
      String tmp = "";

      try {
         if (args.length >= 1) {
            inputPath = args[0];
         } else {
            _interactive = true;
            _thumbsucker = true;
            tmp = MDfromHTMLUtils.prompt(
               "Enter the fully qualified path to directory containing " + _ext
                  + " multimarkdown files, or q to exit (" + inputPath + "):");
            if (tmp == null || tmp.length() == 0) {
               tmp = inputPath;
            }
            if ("q".equalsIgnoreCase(tmp)) {
               return false;
            }
            inputPath = tmp;
         }
         if (!inputPath.endsWith(File.separator)) {
            inputPath += File.separator;
         }
         _inputPath = FileSystems.getDefault().getPath(inputPath);
      } catch (InvalidPathException ipe) {
         // report the value actually used; args[0] does not exist when the
         // path was typed at the prompt
         System.out.println("Error: " + inputPath
            + " is not a valid directory to form a path.");
         return false;
      }
      if (args.length >= 2) {
         outputPath = args[1];
      } else {
         _interactive = true;
         _thumbsucker = true;
         tmp = MDfromHTMLUtils.prompt(
            "Enter the fully qualified path to the text file output directory, or q to exit ("
               + outputPath + "):");
         if (tmp == null || tmp.length() == 0) {
            tmp = outputPath;
         }
         if ("q".equalsIgnoreCase(tmp)) {
            return false;
         }
         outputPath = tmp;
      }
      if (!outputPath.endsWith(File.separator)) {
         outputPath += File.separator;
      }
      File testOutput = new File(outputPath);
      if (!testOutput.exists()) {
         System.out.println(
            "Error: The output directory \"" + outputPath + "\" must exist.");
         return false;
      }
      if (!testOutput.isDirectory()) {
         System.out.println("Error: The output directory \"" + outputPath
            + "\" must be a directory.");
         return false;
      }
      _outputPath = outputPath;

      String includeLinks = "n";
      if (args.length < 3) {
         tmp = MDfromHTMLUtils.prompt(
            "Include links in text output (y=yes, n=no), or q to exit ("
               + includeLinks + "):");
         if (tmp == null || tmp.length() == 0) {
            tmp = includeLinks;
         }
         if ("q".equalsIgnoreCase(tmp)) {
            return false;
         }
         includeLinks = tmp;
      } else {
         includeLinks = args[2];
      }
      // only the first non-blank character matters; a blank value means "no"
      includeLinks = includeLinks.trim();
      _includeLinks = includeLinks.length() > 0
         && Character.toLowerCase(includeLinks.charAt(0)) == 'y';

      if (args.length >= 4) {
         _thumbsucker = Boolean.parseBoolean(args[3]);
      }

      return true;
   }

   /**
    * Get the text inside the first pair of square brackets in the line, e.g.,
    * "][1] more" yields "1" and "see [ref] here" yields "ref".
    * 
    * @param line
    *           text to be searched (may be null)
    * @return the trimmed text between the first "[" and the next "]" that
    *         follows it, or an empty string if there is no such pair
    */
   static public String getRef(String line) {
      if (line == null) {
         return "";
      }
      int startOffset = line.indexOf('[');
      if (startOffset < 0) {
         return "";
      }
      // search in the original string so both offsets are absolute
      int endOffset = line.indexOf(']', startOffset + 1);
      if (endOffset < 0) {
         return "";
      }
      return line.substring(startOffset + 1, endOffset).trim();
   }

   /**
    * Strip markdown syntax from a line. The line is scanned from the left;
    * ordinary characters are copied to the output, a backslash copies the
    * character that follows it, and a markdown trigger character hands the
    * remainder of the line to the matching cleanser (links and images, table
    * separators, task lists, code fencing, headings, underscores, emphasis)
    * before scanning resumes on the cleansed remainder.
    * 
    * @param line
    *           markdown line to be cleansed (may be null)
    * @param refURLs
    *           map of a reference to its corresponding URL
    * @return the cleansed text, or null if the input is null or a "[" link
    *         turned out to be a reference definition so nothing from this
    *         line should be saved
    */
   static public String processText(String line, Map<String, String> refURLs) {
      if (line == null) {
         return null;
      }
      StringBuilder sb = new StringBuilder();
      int lineLen = line.length();
      int offset = 0;
      scan: while (offset < lineLen) {
         char startChar = line.charAt(offset);
         switch (startChar) {
            // handle escaped characters first
            case '\\': {
               // skip the backslash and save the next char
               offset++;
               if (offset < lineLen) {
                  sb.append(line.charAt(offset));
                  offset++;
               }
               if (offset < lineLen) {
                  line = line.substring(offset);
                  lineLen = line.length();
               } else {
                  // reached end of line
                  lineLen = 0;
               }
               offset = 0;
               break;
            }
            case '!': { // exclamation or image link
               if (offset < lineLen - 1) {
                  if (line.charAt(offset + 1) == '[') {
                     // image link
                     line = removeReferencesAndLinks(line.substring(offset),
                        refURLs);
                     if (line == null) {
                        // nothing more to keep; return what was gathered
                        break scan;
                     }
                     offset = 0;
                     lineLen = line.length();
                  } else { // just an exclamation point
                     sb.append("!");
                     line = line.substring(offset + 1);
                     offset = 0;
                     lineLen = line.length();
                  }
               } else { // just an exclamation point at end
                  sb.append("!");
                  offset = line.length();
               }
               break;
            }
            case '[': { // left bracket == link
               line = removeReferencesAndLinks(line.substring(offset), refURLs);
               // check for complete line deletion
               if (line == null) {
                  return null;
               }
               offset = 0;
               lineLen = line.length();
               break;
            }
            // Table lines
            case '|': { // pipe == table column separator
               line = removeTableLines(line.substring(offset));
               offset = 0;
               lineLen = line.length();
               break;
            }
            /**
             * Note: these simple cleansers should be below link cases "!" and
             * "[" and tables
             */
            case '-': { // hyphen or task list "- [x]" or "- [ ]"
               String remainder = line.substring(offset);
               String testChanged = removeTaskList(remainder);
               if (testChanged.equals(remainder)) {
                  // just a hyphen, no change
                  sb.append("-");
                  line = testChanged.substring(1);
               } else {
                  line = testChanged;
               }
               offset = 0;
               lineLen = line.length();
               break;
            }
            case '`': { // back tick == fenced code blocks or code
               line = removeFencing(line.substring(offset));
               offset = 0;
               lineLen = line.length();
               break;
            }
            case '#': { // hash tag == headings
               line = removeHeading(line.substring(offset));
               offset = 0;
               lineLen = line.length();
               break;
            }
            case '_': { // underscore
               line = removeUnderscore(line.substring(offset));
               offset = 0;
               lineLen = line.length();
               break;
            }
            case '~': { // tilde == strike through or fencing
               // first remove fencing
               line = removeFencing(line.substring(offset));
               line = removeEmphasis(line);
               offset = 0;
               lineLen = line.length();
               break;
            }
            case '*': { // asterisk == bold, italic
               line = removeEmphasis(line.substring(offset));
               offset = 0;
               lineLen = line.length();
               break;
            }
            default: { // just text
               sb.append(startChar);
               offset++;
               break;
            }
         }
      }
      String temp = sb.toString();
      temp = temp.replaceAll("\\\\:", ":");
      temp = temp.replaceAll("\\\\-", "-");
      return temp;
   }

   /**
    * Remove every underscore that is not escaped with a backslash. A backslash
    * protects only the single character that immediately follows it, and both
    * the backslash and the protected character are kept, so the line
    * <code>__\_abc\_def__ more stuff</code> becomes
    * <code>\_abc\_def more stuff</code>.
    * 
    * @param line
    *           input to be cleansed of non-escaped underscores
    * @return cleansed input
    */
   static public String removeUnderscore(String line) {
      StringBuilder sb = new StringBuilder(line.length());
      boolean isEscaped = false;
      for (int i = 0; i < line.length(); i++) {
         char current = line.charAt(i);
         if (isEscaped) {
            // character protected by the preceding backslash: keep as is
            sb.append(current);
            isEscaped = false;
         } else if (current == '\\') {
            sb.append(current);
            isEscaped = true;
         } else if (current != '_') {
            sb.append(current);
         }
         // else: unescaped underscore is dropped
      }
      return sb.toString();
   }

   /**
    * Remove the asterisks and tildes used for bold, italics and strikethrough.
    * Characters escaped with a backslash (e.g., <code>\*</code>) are literal
    * text and are left untouched together with their backslash.
    * 
    * @param line
    *           input to be cleansed
    * @return cleansed version of input
    */
   static public String removeEmphasis(String line) {
      StringBuilder sb = new StringBuilder(line.length());
      boolean isEscaped = false;
      for (int i = 0; i < line.length(); i++) {
         char current = line.charAt(i);
         if (isEscaped) {
            // protected by the preceding backslash
            sb.append(current);
            isEscaped = false;
         } else if (current == '\\') {
            sb.append(current);
            isEscaped = true;
         } else if (current != '*' && current != '~') {
            sb.append(current);
         }
      }
      return sb.toString();
   }

   /**
    * Strip code fencing markers from a line: an opening fence with a language
    * tag (three back ticks or three tildes directly followed by word
    * characters), any remaining run of three tildes, and finally every back
    * tick (which also covers bare fences and inline code).
    * 
    * @param line
    *           input to be cleansed
    * @return cleansed version of input
    */
   static public String removeFencing(String line) {
      // applied in this order: tagged fences must go before the bare markers
      String[] fencePatterns = { "```\\w+", "~~~\\w+", "~~~", "`" };
      String cleansed = line;
      for (String fencePattern : fencePatterns) {
         cleansed = cleansed.replaceAll(fencePattern, "");
      }
      return cleansed;
   }

   /**
    * Remove the hash characters used to mark headings. Every "#" in the line
    * is dropped; all other characters, including surrounding whitespace, are
    * kept.
    * 
    * @param line
    *           text to be cleansed of headers
    * @return text without hash characters
    */
   static public String removeHeading(String line) {
      return line.replace("#", "");
   }

   /**
    * Replace the links and image links in a markdown line with their label
    * text. Text outside of brackets is copied unchanged, the label of a link
    * is kept, and the reference id that follows it is dropped. When links are
    * being included the label is wrapped in braces and, if the reference is
    * known, followed by ":{url} ". A label starting with "^" (footnote) has
    * the "^" removed.
    * 
    * @param line
    *           markdown line to be examined (it is trimmed first)
    * @param refURLs
    *           map of a reference to its corresponding URL to enable the URL to
    *           be added where a reference is made (may be null)
    * @return the revised line stripped of links, or null if nothing from this
    *         line should be saved (a non-empty [...] directly followed by ":",
    *         i.e., a reference definition), or if the line is null
    */
   static public String removeReferencesAndLinks(String line,
      Map<String, String> refURLs) {
      if (line == null) {
         return null;
      }
      StringBuilder sb = new StringBuilder();
      String test = line.trim();
      int testLen = test.length();
      // offsets of the "[" characters that have not been closed yet
      Stack<Integer> startOffsets = new Stack<Integer>();
      boolean isImageRef = false;
      boolean needLabel = false;
      for (int offset = 0; offset < testLen; offset++) {
         char testChar = test.charAt(offset);
         switch (testChar) {
            case '[': {
               if (!isImageRef) {
                  // flipflop need for label to skip refLink
                  needLabel = !needLabel;
               } // else in an image link so don't capture anything
               startOffsets.push(offset);
               break;
            }
            case ']': {
               if (startOffsets.empty()) {
                  // stray closing bracket: ignored
                  break;
               }
               int startOffset = startOffsets.pop();
               if ((offset - startOffset) > 1) {
                  boolean atEnd = (offset + 1) >= testLen;
                  if (!atEnd && test.charAt(offset + 1) == ':') {
                     // reference definition: signal line deletion with null
                     return null;
                  }
                  if (needLabel) {
                     String refLink = test.substring(startOffset + 1, offset)
                        .trim();
                     // clean footnote starting with carat
                     if (refLink.startsWith("^")) {
                        refLink = refLink.substring(1);
                     }
                     if (_includeLinks) {
                        // braces are padded with a space except for a link
                        // that ends the line
                        String pad = atEnd ? "" : " ";
                        sb.append("{").append(pad).append(refLink).append(pad)
                           .append("}");
                        // the reference id, if any, directly follows the
                        // label: "][id]" or "] [id]"
                        String ref = "";
                        int next = offset + 1;
                        while (next < testLen && (test.charAt(next) == ' '
                           || test.charAt(next) == '\t')) {
                           next++;
                        }
                        if (next < testLen && test.charAt(next) == '[') {
                           ref = getRef(test.substring(next)).trim();
                        }
                        if (ref.equals("")) {
                           ref = refLink;
                        }
                        String url = (refURLs == null) ? null
                           : refURLs.get(ref);
                        if (url != null) {
                           sb.append(":{").append(pad).append(url).append(pad)
                              .append("} ");
                        }
                     } else {
                        sb.append(refLink);
                     }
                  }
                  if (isImageRef) {
                     // set up so next [ makes this false
                     needLabel = true;
                  }
               }
               if (startOffsets.empty()) {
                  // left the outermost bracket
                  isImageRef = false;
               }
               break;
            }
            case '!': { // may be an image
               boolean startsImage = (offset + 1) < testLen
                  && test.charAt(offset + 1) == '[';
               if (startsImage) {
                  isImageRef = true;
                  // set up so next [ will grab the label
                  needLabel = true;
               } else if (startOffsets.empty()) {
                  // just an exclamation point; inside brackets it is already
                  // part of the label so it must not be emitted twice
                  sb.append(testChar);
               }
               break;
            }
            default: {
               // capture all characters not inside a link
               if (startOffsets.empty()) {
                  sb.append(testChar);
               }
               break;
            }
         }
      }
      return sb.toString();
   }

   /**
    * Remove table markup from a line: the column separator pipes and the
    * hyphen/colon runs used for the header separator row and its alignment
    * markers.
    * 
    * @param line
    *           input to be cleansed
    * @return cleansed input
    */
   static public String removeTableLines(String line) {
      // order matters: pipes first, then aligned rules, then plain rules
      String[] tablePatterns = { "\\|", ":-{3,}", "-{3,}:", ":-*:", "-{3,}" };
      String cleansed = line;
      for (String tablePattern : tablePatterns) {
         cleansed = cleansed.replaceAll(tablePattern, "");
      }
      return cleansed;
   }

   /**
    * Replace task list markers with a parenthesized form: "- [ ]" becomes
    * "( )" and both "- [x]" and "- [ x ]" become "(x)". The markers are
    * matched literally (the square brackets are not treated as a regular
    * expression character class).
    * 
    * @param line
    *           input to be cleansed
    * @return cleansed input
    */
   static public String removeTaskList(String line) {
      line = line.replace("- [ ]", "( )");
      line = line.replace("- [x]", "(x)");
      line = line.replace("- [ x ]", "(x)");
      return line;
   }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy