com.mdfromhtml.markdown.transform.GetMarkdownFromHTML Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of MarkdownGenerator Show documentation
Generates markdown (.md) files from the HTML and URLs provided in JSON files. The name of each generated file uses the name of the JSON file plus an incrementing number, starting with 1 for each JSON file read, for each HTML reference within the files.
There is a newer version: 2.0.18
Show newest version
/**
 * (c) Copyright 2019-2020 IBM Corporation
 * 1 New Orchard Road, 
 * Armonk, New York, 10504-1722
 * United States
 * +1 914 499 1900
 * support: Nathaniel Mills [email protected]
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package com.mdfromhtml.markdown.transform;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import java.nio.file.FileSystems;
import java.nio.file.InvalidPathException;
import java.nio.file.Path;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.Stack;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Entities.EscapeMode;

import com.api.json.JSON;
import com.api.json.JSONArray;
import com.api.json.JSONObject;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.JsonNodeFactory;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.mdfromhtml.core.MDfromHTMLUtils;
import com.mdfromhtml.remark.utils.CleanupMarkdown;
import com.overzealous.remark.Options;
import com.overzealous.remark.Remark;
import com.overzealous.remark.convert.DocumentConverter;
import com.overzealous.remark.convert.ProvenanceWriter;

/**
 * Given an input directory containing json files with an array of objects, each
 * object containing an html entry and a url from which that html was captured,
 * parse the html and generate markdown (.md) files for each html / url pair. If
 * there is no url specified or it is empty or null then no relative url
 * mappings will be provided in the resulting markdown file. Markdown files
 * retain the name of the json file and are appended with the index within the
 * JSONArray for the object from which the html was used to generate the
 * markdown.
 * 
 * @author Nathaniel Mills
 */
public class GetMarkdownFromHTML {

   /**
    * Convenience overload for callers working with {@link JSONObject}: the
    * request is serialized, re-read as a Jackson tree, handed to
    * {@link #getMarkdownFromHTML(ObjectNode)} and the response is converted
    * back into a {@link JSONObject}.
    * 
    * @param htmlObject
    *           the request object (see the ObjectNode overload for the keys
    *           that are read)
    * @return the response object; when the request or the response can not be
    *         parsed (or the request is null) the returned object only carries
    *         an "errorMsg" entry describing the problem
    */
   public static JSONObject getMarkdownFromHTML(JSONObject htmlObject) {
      JSONObject result = new JSONObject();
      if (htmlObject == null) {
         result.put("errorMsg",
            "Error: Can not parse the object: the request is null");
         return result;
      }
      ObjectMapper mapper = new ObjectMapper();
      JsonNode jsonnode;
      try {
         jsonnode = mapper.readTree(htmlObject.toString());
      } catch (IOException e) {
         // the message must be passed as a format argument: concatenating it
         // to the pattern leaves %s without a value and makes String.format
         // throw instead of reporting the error
         result.put("errorMsg", String.format(
            "Error: Can not parse the object: %s", e.getLocalizedMessage()));
         return result;
      }

      jsonnode = getMarkdownFromHTML((ObjectNode) jsonnode);
      try {
         result = (JSONObject) JSON.parse(jsonnode.toString());
      } catch (IOException e) {
         // result still refers to the empty object created above
         result.put("errorMsg",
            String.format("Error: Can not parse the response object: %s",
               e.getLocalizedMessage()));
      }
      return result;
   }

   /**
    * Generates markdown for a single html capture described by a Jackson
    * object.
    * <p>
    * Keys read from the request:
    * <ul>
    * <li>"returnProvenance" - optional boolean, defaults to true</li>
    * <li>"HTMLFilters" - optional filter rules; when absent they are read from
    * a file named "HTML_Filters.json" resolved against the working
    * directory</li>
    * <li>"html" - the html to convert; when absent the "html" of the first
    * element of "captureArray" is used</li>
    * <li>"url" - required, used as the base URI while parsing</li>
    * </ul>
    * Keys written to a successful response: "HTMLFilters", "provenance" (only
    * when requested and a provenance writer could be created), "captureArray"
    * (one element holding "content", "html" and "url"), "markdown",
    * "returnProvenance" and "url". On failure the response only contains
    * "errorMsg".
    * 
    * @param htmlObject
    *           the request object
    * @return the response object described above
    * @throws ClassCastException
    *            if "HTMLFilters" is not a JSON object, "captureArray" is not
    *            an array or its first element is not an object (the values
    *            are cast without checking)
    */
   public static ObjectNode getMarkdownFromHTML(ObjectNode htmlObject) {
      ObjectNode result = JsonNodeFactory.instance.objectNode();
      boolean returnProvenance = true;
      JsonNode testProv = htmlObject.get("returnProvenance");
      if (testProv != null && testProv.isBoolean()) {
         returnProvenance = testProv.asBoolean();
      }

      // initialize environment for this call
      ObjectNode HTMLFiltersObj = (ObjectNode) htmlObject.get("HTMLFilters");
      if (HTMLFiltersObj == null) {
         try {
            ObjectMapper mapper = new ObjectMapper();
            JsonNode filters = mapper.readTree(new File("HTML_Filters.json"));
            HTMLFiltersObj = (ObjectNode) filters;
         } catch (Exception e1) {
            result.put("errorMsg",
               "Error: Can not find \"HTML_Filters\" in the request, nor can a file named \"HTML_Filters.json\" be found: "
                  + e1.getLocalizedMessage());
            return result;
         }
      }
      JSONObject HTMLFilters;
      try {
         // fold everything to lowercase to match later during filtering
         String htmlFilters = HTMLFiltersObj.toString().toLowerCase();
         HTMLFilters = (JSONObject) JSON.parse(htmlFilters);
      } catch (IOException e) {
         result.put("errorMsg",
            "Error: Can not find parse the content of the \"HTML_Filters.json\" file: "
               + e.getLocalizedMessage());
         return result;
      }
      Options options = Options.multiMarkdown();
      options.hardwraps = true;
      GetMarkdownFromHTML pgm = new GetMarkdownFromHTML(options, HTMLFilters);
      JsonNode temp = htmlObject.get("html");
      if (temp == null) {
         // try to get this from the captureArray
         ArrayNode captureArray = (ArrayNode) htmlObject.get("captureArray");
         if (captureArray != null && captureArray.size() > 0) {
            ObjectNode obj = (ObjectNode) captureArray.get(0);
            temp = obj.get("html");
            if (temp == null) {
               result.put("errorMsg", "The request captureArray"
                  + " is missing the required \"html\" key so there is nothing to process.");
               return result;
            }
         } else {
            result.put("errorMsg", "The request"
               + " is missing the required \"html\" or \"captureArray\" key so there is nothing to process.");
            return result;
         }
      }
      String html = temp.asText();
      temp = htmlObject.get("url");
      if (temp == null) {
         result.put("errorMsg",
            "The request" + " is missing the required \"url\" key.");
         return result;
      }

      String baseURI = temp.asText();

      // (disabled) testing for hidden tags:
      // html = html.replaceAll("&lt;", "<");
      // html = html.replaceAll("&gt;", ">");
      Document doc = Jsoup.parse(html, baseURI);
      doc.outputSettings().escapeMode(EscapeMode.extended);

      // determine if we should skip markdown until first header is encountered
      boolean seekHeaders = true; // default is true so only special sites need
                                  // override this
      String domain = Remark.getDomain(baseURI);
      int testindex = baseURI.indexOf(domain);
      // need to find actual domain for proper filters: when another "http"
      // occurs after the first domain, the remainder starting there becomes
      // the base URI and its domain selects the filters
      String workingURI = baseURI.substring(testindex + domain.length());
      testindex = workingURI.toLowerCase().indexOf("http");
      if (testindex >= 0) {
         workingURI = workingURI.substring(testindex);
         domain = Remark.getDomain(workingURI);
         baseURI = workingURI;
      }
      JSONObject domainFilters = (JSONObject) HTMLFilters.get(domain);
      if (domainFilters != null) {
         Boolean test = (Boolean) domainFilters
            .get(DocumentConverter.SEEK_HEADERS);
         if (test != null) {
            seekHeaders = test;
         }
      }

      // create a provenance writer using a string writer so we can return the
      // provenance in the response
      ProvenanceWriter provWriter = null;
      StringWriter sw = null;
      BufferedWriter bw = null;
      if (returnProvenance) {
         sw = new StringWriter();
         bw = new BufferedWriter(sw);
         try {
            provWriter = new ProvenanceWriter("", "", HTMLFilters, baseURI,
               domain, bw);
         } catch (IOException e) {
            // best effort: continue without provenance
            e.printStackTrace();
            provWriter = null;
         }
      }
      String markdown = pgm.generateMarkdownFromHTML(doc, provWriter, baseURI,
         seekHeaders);

      result.set("HTMLFilters", HTMLFiltersObj);
      if (returnProvenance && provWriter != null) {
         // note: close finished the JSON object in sw
         try {
            provWriter.close();
         } catch (IOException e1) {
            e1.printStackTrace();
         }
         String provenance = sw.getBuffer().toString();
         ObjectMapper mapper = new ObjectMapper();
         JsonNode provObj = null;
         try {
            provObj = mapper.readTree(provenance);
         } catch (IOException e) {
            e.printStackTrace();
         }

         result.set("provenance", provObj);
         // override the HTMLFilters with the one reported by provenance.
         // provObj stays null when the provenance could not be parsed, so it
         // must be checked before it is dereferenced
         if (provObj != null) {
            JsonNode testHTMLFilters = provObj.get("HTMLFilters");
            if (testHTMLFilters instanceof ObjectNode) {
               result.set("HTMLFilters", testHTMLFilters);
            }
         }
      }

      // add in the html used for generation in a captureArray
      ArrayNode captureArray = JsonNodeFactory.instance.arrayNode();
      ObjectNode htmlInfo = JsonNodeFactory.instance.objectNode();
      htmlInfo.put("content", doc.text());
      htmlInfo.put("html", html);
      htmlInfo.put("url", baseURI);
      captureArray.add(htmlInfo);
      result.set("captureArray", captureArray);
      result.set("markdown", JsonNodeFactory.instance.textNode(markdown));
      result.put("returnProvenance", returnProvenance);
      result.put("url", baseURI);
      return result;
   }

   /**
    * Command line entry point. Loads the filter rules from
    * ./properties/HTML_Filters.json (folded to lowercase), resolves the run
    * parameters through {@code getParams}, then processes every file with the
    * configured extension found in the input directory by calling
    * {@link #doWork(Path, JSONObject)}. Processing stops at the first file
    * for which doWork reports a non-zero exit value.
    * <p>
    * When the filter file can not be loaded or parsed, a message is printed
    * and the method returns without calling {@code System.exit}; otherwise
    * the JVM is terminated with the resulting exit value (0 for success, -1
    * for failure).
    * 
    * @param args
    *           inputPath, outputPath, showAnnotationsFlag (if not supplied, the
    *           program prompts for their values)
    */
   public static void main(String[] args) {
      int exitVal = 0;
      JSONObject HTMLFilters = null;
      try {
         HTMLFilters = MDfromHTMLUtils.loadJSONFile("." + File.separator
            + "properties" + File.separator + "HTML_Filters.json");
         // fold to lowercase
         try {
            HTMLFilters = (JSONObject) JSON
               .parse(HTMLFilters.toString().toLowerCase());
         } catch (Exception e) {
            System.out.println("Error: \"." + File.separator + "properties"
               + File.separator + "HTML_Filters.json\" has a parsing error: "
               + e.getLocalizedMessage());
            return;
         }
      } catch (Exception e1) {
         System.out.println("Error: No HTML Filters -- can not find \"."
            + File.separator + "properties" + File.separator
            + "HTML_Filters.json\": " + e1.getLocalizedMessage());
         return;
      }
      Options options = Options.multiMarkdown();
      options.hardwraps = true;
      GetMarkdownFromHTML pgm = new GetMarkdownFromHTML(options, HTMLFilters);
      if (pgm.getParams(args)) {
         if (pgm._thumbsucker) {
            System.out.println("\nFiles ending with ." + pgm._ext
               + " will be read from " + pgm._inputPath //
               + "\nand the generated markdown (.md), and html (.html and _formatted.html) "
               + "saved in " + pgm._outputPath); //
         }
         if (pgm._interactive) {
            // an empty answer (just Enter) means "continue"; anything else
            // leaves the interactive flag set and skips processing
            if (MDfromHTMLUtils
               .prompt("Press q to quit or press Enter to continue...")
               .length() == 0) {
               pgm._interactive = false;
            }
         }
         if (!pgm._interactive) {
            try {
               // typed list: the loop below iterates the entries as Path
               List<Path> files = MDfromHTMLUtils.listSourceFiles(
                  FileSystems.getDefault().getPath(pgm._inputPath.toString()),
                  pgm._ext);
               for (Path file : files) {
                  try {
                     exitVal = pgm.doWork(file, HTMLFilters);
                     if (exitVal != 0) {
                        break;
                     }
                  } catch (Exception e) {
                     e.printStackTrace();
                  }
               }
            } catch (Exception e) {
               System.out
                  .println("Error: Can not reference files with extension "
                     + pgm._ext + " in directory " + pgm._inputPath
                     + " reason: " + e.getLocalizedMessage());
               exitVal = -1;
            }
         }
         if (pgm._thumbsucker) {
            System.out.println();
         }
      } else {
         exitVal = -1;
      }
      if (pgm._thumbsucker) {
         System.out.println("Goodbye");
      }
      System.exit(exitVal);
   }

   // Extension (without the leading dot) of the input files; used to list the
   // files to process and to strip the extension from output file names.
   String _ext = "json";
   // Directory the input files are listed from.
   // NOTE(review): presumably populated by getParams (not visible here).
   Path _inputPath = null;
   // While true, main prompts before processing and only processes files
   // once the flag has been cleared.
   boolean _interactive = false;
   // Prefix concatenated directly in front of every output file name in
   // doWork (no separator is inserted there).
   String _outputPath = ".";
   // Converter used by generateMarkdownFromHTML; this default instance is
   // replaced by the constructor taking options and filters.
   Remark _remark = new Remark(Options.multiMarkdown());
   // When true, progress messages are printed to System.out.
   boolean _thumbsucker = false;
   // When true, doWork appends the "Doc2Dial Provenance" link section to the
   // generated markdown.
   boolean _keepProvenanceLinks = true;

   // public GetMarkdownFromHTML() {
   // this(Options.multiMarkdown());
   // }
   //
   // public GetMarkdownFromHTML(Options options) {
   // this(options, (JSONObject) null);
   // }

   public GetMarkdownFromHTML(Options options, JSONObject HTMLFilters) {
      _remark = new Remark(options, HTMLFilters);
   }

   /**
    * Process the specified file to transform its content into formatted text
    * and save it to a txt file in the specified output directory.
    * 
    * @param file
    *           the file containing the annotation json from ICCC
    * @param HTMLFilters
    *           object containing global and domain specific filter rules to
    *           control markdown generation
    * @return exit value (0 indicates success, otherwise -1 for failure)
    */
   int doWork(Path file, JSONObject HTMLFilters) {
      int exitVal = 0;
      ProvenanceWriter provenanceWriter = null;
      String provenanceOutputFileName = "unknown";
      try {
         String fqFileName = file.toString();
         if (_thumbsucker) {
            System.out.println("Processing: " + fqFileName);
         }
         ObjectMapper mapper = new ObjectMapper();
         File jsonTestFile = new File(file.toString());
         JsonNode tempJSON = mapper.readTree(jsonTestFile);
         String shortFileName = fqFileName
            .substring(fqFileName.lastIndexOf(File.separator) + 1);
         int index = shortFileName.lastIndexOf("." + _ext);
         if (index < 1) {
            System.out.println(shortFileName + "doesn't end with ." + _ext);
            exitVal = -1;
         } else {
            ObjectNode inputJson = (ObjectNode) tempJSON;
            int htmlCounter = 0;
            ArrayNode htmlList = (ArrayNode) inputJson.get("captureArray");
            if (htmlList == null) {
               JsonNode htmlObj = (JsonNode) inputJson.get("captureDict");
               if (htmlObj == null) {
                  System.err.println(fqFileName
                     + " is missing the \"captureArray\" and the \"captureDict\" tag. Please fix and retry.");
                  System.exit(-1);
               }
               htmlList = JsonNodeFactory.instance.arrayNode();
               htmlList.add(htmlObj);
            }
            String baseURI = null;
            String html = null;
            JsonNode temp = null;
            for (Object obj : htmlList) {
               boolean seekHeaders = true; // default is true so only special
                                           // sites need override this
               JSONObject globalFilters = (JSONObject) HTMLFilters.get("*");
               if (globalFilters != null) {
                  Boolean test = (Boolean) globalFilters
                     .get(DocumentConverter.SEEK_HEADERS);
                  if (test != null) {
                     seekHeaders = test;
                  }
               }

               try {
                  htmlCounter++;
                  ObjectNode htmlObj = (ObjectNode) obj;
                  temp = htmlObj.get("html");
                  if (temp == null) {
                     System.err.println(fqFileName
                        + " is missing the \"html\" key in the ["
                        + (htmlCounter - 1) + "] element of the captureArray.");
                     System.exit(-1);
                  }
                  html = temp.asText();
                  temp = htmlObj.get("url");
                  if (temp == null) {
                     System.err.println(fqFileName
                        + " is missing the \"utl\" key in the ["
                        + (htmlCounter - 1) + "] element of the captureArray.");
                     System.exit(-1);
                  }
                  baseURI = temp.asText();
                  String htmlOutputFileName = _outputPath
                     + shortFileName.substring(0, index) + "_"
                     + MDfromHTMLUtils.padLeftZero(htmlCounter, 3) + ".html";
                  MDfromHTMLUtils.saveTextFile(htmlOutputFileName, html);
                  // testing for hidden tags
                  // html = html.replaceAll("<", "<");
                  // html = html.replaceAll(">", ">");
                  Document doc = Jsoup.parse(html, baseURI);
                  doc.outputSettings().escapeMode(EscapeMode.extended);

                  // TODO: process iframe elements in a loop making below a
                  // routine passing an Element
                  // Elements elements = document.select("iframe");
                  // Document iframeDoc = Jsoup.parse(elements.get(0).data());
                  // String iframeSrc = iframeDoc.attr("src");
                  /**
                   * 
                   * 
                   */

                  String formattedHTML = doc.toString();
                  formattedHTML = formattedHTML.replaceAll("&", "&");
                  // formattedHTML = formattedHTML.replaceAll("<", "<");
                  // formattedHTML = formattedHTML.replaceAll(">", ">");
                  formattedHTML = formattedHTML.replaceAll(""", "\"");

                  String formattedHTMLOutputFileName = _outputPath
                     + shortFileName.substring(0, index) + "_"
                     + MDfromHTMLUtils.padLeftZero(htmlCounter, 3)
                     + "_formatted.html";
                  MDfromHTMLUtils.saveTextFile(formattedHTMLOutputFileName,
                     formattedHTML);

                  String domain = Remark.getDomain(baseURI);
                  int testindex = baseURI.indexOf(domain);
                  // need to find actual domain for proper filters
                  String workingURI = baseURI
                     .substring(testindex + domain.length());
                  testindex = workingURI.toLowerCase().indexOf("http");
                  if (testindex >= 0) {
                     workingURI = workingURI.substring(testindex);
                     domain = Remark.getDomain(workingURI);
                     baseURI = workingURI;
                  }

                  String markdownOutputFileName = _outputPath
                     + shortFileName.substring(0, index) + "_"
                     + MDfromHTMLUtils.padLeftZero(htmlCounter, 3) + ".md";

                  provenanceOutputFileName = _outputPath
                     + shortFileName.substring(0, index) + "_"
                     + MDfromHTMLUtils.padLeftZero(htmlCounter, 3)
                     + "_html2md.json";

                  File provenanceOutputFile = new File(
                     provenanceOutputFileName);
                  if (provenanceOutputFile.exists()) {
                     provenanceOutputFile.delete();
                  }
                  provenanceWriter = new ProvenanceWriter(
                     formattedHTMLOutputFileName, markdownOutputFileName,
                     _remark.getHTMLFilters(), baseURI, domain,
                     new FileWriter(provenanceOutputFile, true));

                  // determine if we should skip markdown until first header is
                  // encountered
                  JSONObject domainFilters = (JSONObject) HTMLFilters
                     .get(domain);
                  if (domainFilters != null) {
                     Boolean test = (Boolean) domainFilters
                        .get(DocumentConverter.SEEK_HEADERS);
                     if (test != null) {
                        seekHeaders = test;
                     }
                  }

                  String markdown = generateMarkdownFromHTML(doc,
                     provenanceWriter, baseURI, seekHeaders);

                  if (_keepProvenanceLinks) {
                     markdown += "\n###### Doc2Dial Provenance ######\n\n"
                        + " * [Doc2Dial Original URL][]\n"
                        + " * [Doc2Dial File Processed][]\n\n[Doc2Dial Original URL]: "
                        + baseURI.replaceAll(" ", "%20")
                        // + .replaceAll("#", "%23").replaceAll("&", "%26")
                        + "\n[Doc2Dial File Processed]: file://"
                        + file.toAbsolutePath().toString();
                  }

                  MDfromHTMLUtils.saveTextFile(markdownOutputFileName,
                     markdown);
               } catch (Exception e) {
                  e.printStackTrace();
                  exitVal = -1;
               } finally {
                  if (provenanceWriter != null) {
                     try {
                        provenanceWriter.close();
                        // Note: leave all provenance with seekHeaders explicit
                        // in the HTMLFilters
                        // if (seekHeaders) {
                        // cleanUpAnnotations(provenanceOutputFileName);
                        // }
                     } catch (IOException e) {
                        e.printStackTrace();
                     }
                     provenanceWriter = null;
                  }
               }
            }
         }
      } catch (Exception e) {
         e.printStackTrace();
         exitVal = -1;
      }
      return exitVal;
   }

   /**
    * Converts the parsed document to markdown with the configured Remark
    * instance, passes the result through {@code CleanupMarkdown.cleanAll} and
    * finally through {@code removeUnusedReferences}.
    * 
    * @param doc
    *           the parsed html document
    * @param provenanceWriter
    *           writer handed to the converter (callers may pass null)
    * @param baseUri
    *           base URI handed to the converter
    * @param seekHeaders
    *           flag forwarded to the markdown cleanup
    * @return the cleaned markdown
    */
   String generateMarkdownFromHTML(Document doc,
      ProvenanceWriter provenanceWriter, String baseUri, boolean seekHeaders) {
      final String converted = _remark.convert(doc, provenanceWriter, baseUri);
      final String cleaned = CleanupMarkdown.cleanAll(converted, seekHeaders);
      return removeUnusedReferences(cleaned);
   }

   void cleanUpAnnotations(String provenanceFileName) throws Exception {
      JSONObject provenanceObj = MDfromHTMLUtils
         .loadJSONFile(provenanceFileName);
      JSONArray provenanceArray = (JSONArray) provenanceObj.get("provenance");
      for (Iterator