All downloads are free. Search and download functionality uses the official Maven repository.

com.mdfromhtml.markdown.transform.GetMarkdownFromHTML Maven / Gradle / Ivy

Go to download

Generate markdown (.md) files from html and url provided in JSON files. The name of the generated files will use the name of the JSON file, and an incrementing number starting with 1 for each JSON file read, and for each html reference within the files.

There is a newer version: 2.0.18
Show newest version
/**
 * (c) Copyright 2019-2020 IBM Corporation
 * 1 New Orchard Road, 
 * Armonk, New York, 10504-1722
 * United States
 * +1 914 499 1900
 * support: Nathaniel Mills [email protected]
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package com.mdfromhtml.markdown.transform;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import java.nio.file.FileSystems;
import java.nio.file.InvalidPathException;
import java.nio.file.Path;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.Stack;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Entities.EscapeMode;

import com.api.json.JSON;
import com.api.json.JSONArray;
import com.api.json.JSONObject;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.JsonNodeFactory;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.mdfromhtml.core.MDfromHTMLUtils;
import com.mdfromhtml.remark.utils.CleanupMarkdown;
import com.overzealous.remark.Options;
import com.overzealous.remark.Remark;
import com.overzealous.remark.convert.DocumentConverter;
import com.overzealous.remark.convert.ProvenanceWriter;

/**
 * Given an input directory containing json files with an array of objects, each
 * object containing an html entry and a url from which that html was captured,
 * parse the html and generate markdown (.md) files for each html / url pair. If
 * there is no url specified or it is empty or null then no relative url
 * mappings will be provided in the resulting markdown file. Markdown files
 * retain the name of the json file and are appended with the index within the
 * JSONArray for the object from which the html was used to generate the
 * markdown.
 * 
 * @author Nathaniel Mills
 */
public class GetMarkdownFromHTML {

   /**
    * Convenience overload working on the {@code com.api.json} object model.
    * The request is serialized to text, re-parsed as a Jackson tree, passed to
    * {@link #getMarkdownFromHTML(ObjectNode)}, and the Jackson response is
    * parsed back into a {@link JSONObject}.
    *
    * @param htmlObject
    *           the request object
    * @return the response object; if either conversion fails, the returned
    *         object carries an "errorMsg" entry describing the failure
    */
   public static JSONObject getMarkdownFromHTML(JSONObject htmlObject) {
      JSONObject result = new JSONObject();
      JsonNode requestNode;
      try {
         requestNode = new ObjectMapper().readTree(htmlObject.toString());
      } catch (IOException e) {
         // The message is passed as a format ARGUMENT. Concatenating it onto
         // the format string left "%s" without an argument, which made
         // String.format throw instead of reporting the parse error.
         result.put("errorMsg", String.format(
            "Error: Can not parse the object: %s", e.getLocalizedMessage()));
         return result;
      }

      ObjectNode response = getMarkdownFromHTML((ObjectNode) requestNode);
      try {
         result = (JSONObject) JSON.parse(response.toString());
      } catch (IOException e) {
         result.put("errorMsg",
            String.format("Error: Can not parse the response object: %s",
               e.getLocalizedMessage()));
      }
      return result;
   }

   /**
    * Transforms the HTML carried in a request object into markdown.
    * <p>
    * Request keys read by this method:
    * <ul>
    * <li>"html" - the HTML to convert; when absent, the "html" entry of the
    * first element of "captureArray" is used instead</li>
    * <li>"url" - required; used as the base URI when parsing the HTML</li>
    * <li>"HTMLFilters" - optional filter rules; when absent (or not a JSON
    * object) the file "HTML_Filters.json" in the working directory is read</li>
    * <li>"returnProvenance" - optional boolean, defaults to true</li>
    * </ul>
    * Response keys written on success: "HTMLFilters", "provenance" (only when
    * provenance was requested and a provenance writer could be created),
    * "captureArray", "markdown", "returnProvenance" and "url".
    *
    * @param htmlObject
    *           the request object
    * @return the response object; when the request can not be processed the
    *         response contains only an "errorMsg" entry
    */
   public static ObjectNode getMarkdownFromHTML(ObjectNode htmlObject) {
      ObjectNode result = JsonNodeFactory.instance.objectNode();
      boolean returnProvenance = true;
      JsonNode testProv = htmlObject.get("returnProvenance");
      if (testProv != null && testProv.isBoolean()) {
         returnProvenance = testProv.asBoolean();
      }

      // initialize environment for this call; a value that is present but
      // is not a JSON object is treated like a missing one instead of
      // failing with a ClassCastException
      JsonNode requestFilters = htmlObject.get("HTMLFilters");
      ObjectNode HTMLFiltersObj = requestFilters instanceof ObjectNode
         ? (ObjectNode) requestFilters
         : null;
      if (HTMLFiltersObj == null) {
         try {
            ObjectMapper mapper = new ObjectMapper();
            JsonNode filters = mapper.readTree(new File("HTML_Filters.json"));
            HTMLFiltersObj = (ObjectNode) filters;
         } catch (Exception e1) {
            result.put("errorMsg",
               "Error: Can not find \"HTML_Filters\" in the request, nor can a file named \"HTML_Filters.json\" be found: "
                  + e1.getLocalizedMessage());
            return result;
         }
      }
      JSONObject HTMLFilters;
      try {
         // fold everything to lowercase to match later during filtering
         String htmlFilters = HTMLFiltersObj.toString().toLowerCase();
         HTMLFilters = (JSONObject) JSON.parse(htmlFilters);
      } catch (IOException e) {
         result.put("errorMsg",
            "Error: Can not find parse the content of the \"HTML_Filters.json\" file: "
               + e.getLocalizedMessage());
         return result;
      }
      Options options = Options.multiMarkdown();
      options.hardwraps = true;
      GetMarkdownFromHTML pgm = new GetMarkdownFromHTML(options, HTMLFilters);
      JsonNode temp = htmlObject.get("html");
      if (temp == null) {
         // try to get this from the captureArray
         JsonNode captureNode = htmlObject.get("captureArray");
         if (captureNode instanceof ArrayNode && captureNode.size() > 0) {
            // JsonNode.get(String) returns null for non-object nodes, so a
            // malformed first element is reported as a missing "html" key
            temp = captureNode.get(0).get("html");
            if (temp == null) {
               result.put("errorMsg", "The request captureArray"
                  + " is missing the required \"html\" key so there is nothing to process.");
               return result;
            }
         } else {
            result.put("errorMsg", "The request"
               + " is missing the required \"html\" or \"captureArray\" key so there is nothing to process.");
            return result;
         }
      }
      String html = temp.asText();
      temp = htmlObject.get("url");
      if (temp == null) {
         result.put("errorMsg",
            "The request" + " is missing the required \"url\" key.");
         return result;
      }

      String baseURI = temp.asText();

      // testing for hidden tags
      // html = html.replaceAll("<", "<");
      // html = html.replaceAll(">", ">");
      Document doc = Jsoup.parse(html, baseURI);
      doc.outputSettings().escapeMode(EscapeMode.extended);

      // determine if we should skip markdown until first header is encountered
      boolean seekHeaders = true; // default is true so only special sites need
                                  // override this
      String domain = Remark.getDomain(baseURI);
      int testindex = baseURI.indexOf(domain);
      // need to find actual domain for proper filters: when another "http"
      // occurs after the leading domain, the text from there on becomes the
      // effective URI
      String workingURI = baseURI.substring(testindex + domain.length());
      testindex = workingURI.toLowerCase().indexOf("http");
      if (testindex >= 0) {
         workingURI = workingURI.substring(testindex);
         domain = Remark.getDomain(workingURI);
         baseURI = workingURI;
      }
      JSONObject domainFilters = (JSONObject) HTMLFilters.get(domain);
      if (domainFilters != null) {
         Boolean test = (Boolean) domainFilters
            .get(DocumentConverter.SEEK_HEADERS);
         if (test != null) {
            seekHeaders = test;
         }
      }

      // create a provenance writer using a string writer so we can return the
      // provenance in the response
      ProvenanceWriter provWriter = null;
      StringWriter sw = null;
      BufferedWriter bw = null;
      if (returnProvenance) {
         sw = new StringWriter();
         bw = new BufferedWriter(sw);
         try {
            provWriter = new ProvenanceWriter("", "", HTMLFilters, baseURI,
               domain, bw);
         } catch (IOException e) {
            e.printStackTrace();
            provWriter = null;
         }
      }
      String markdown = pgm.generateMarkdownFromHTML(doc, provWriter, baseURI,
         seekHeaders);

      result.set("HTMLFilters", HTMLFiltersObj);
      if (returnProvenance && provWriter != null) {
         // note: close finished the JSON object in sw
         try {
            provWriter.close();
         } catch (IOException e1) {
            e1.printStackTrace();
         }
         String provenance = sw.getBuffer().toString();
         JsonNode provObj = null;
         try {
            provObj = new ObjectMapper().readTree(provenance);
         } catch (IOException e) {
            e.printStackTrace();
         }

         result.set("provenance", provObj);
         // override the HTMLFilters with the one reported by provenance; the
         // null check covers the case where the provenance text could not be
         // parsed, which previously ended in a NullPointerException
         if (provObj != null) {
            JsonNode testHTMLFilters = provObj.get("HTMLFilters");
            if (testHTMLFilters instanceof ObjectNode) {
               result.set("HTMLFilters", testHTMLFilters);
            }
         }
      }

      // add in the html used for generation in a captureArray
      ArrayNode captureArray = JsonNodeFactory.instance.arrayNode();
      ObjectNode htmlInfo = JsonNodeFactory.instance.objectNode();
      htmlInfo.put("content", doc.text());
      htmlInfo.put("html", html);
      htmlInfo.put("url", baseURI);
      captureArray.add(htmlInfo);
      result.set("captureArray", captureArray);
      result.set("markdown", JsonNodeFactory.instance.textNode(markdown));
      result.put("returnProvenance", returnProvenance);
      result.put("url", baseURI);
      return result;
   }

   /**
    * Command-line entry point. Loads the filter rules from
    * ./properties/HTML_Filters.json (folded to lowercase), obtains the run
    * parameters, and then processes every file with the configured extension
    * found in the input directory, stopping at the first file for which
    * {@code doWork} reports a failure. The JVM is terminated with the
    * resulting exit value (0 for success, -1 for failure); if the filter
    * rules can not be loaded the method prints an error and simply returns.
    * 
    * @param args
    *           inputPath, outputPath, a flag enabling progress messages, and a
    *           flag controlling whether provenance links are appended to the
    *           markdown (if inputPath or outputPath is not supplied, the
    *           program prompts for the value)
    */
   public static void main(String[] args) {
      int exitVal = 0;
      JSONObject HTMLFilters = null;
      final String filtersFileName = "." + File.separator + "properties"
         + File.separator + "HTML_Filters.json";
      try {
         HTMLFilters = MDfromHTMLUtils.loadJSONFile(filtersFileName);
         // fold to lowercase
         try {
            HTMLFilters = (JSONObject) JSON
               .parse(HTMLFilters.toString().toLowerCase());
         } catch (Exception e) {
            System.out.println("Error: \"" + filtersFileName
               + "\" has a parsing error: " + e.getLocalizedMessage());
            return;
         }
      } catch (Exception e1) {
         System.out.println("Error: No HTML Filters -- can not find \""
            + filtersFileName + "\": " + e1.getLocalizedMessage());
         return;
      }
      Options options = Options.multiMarkdown();
      options.hardwraps = true;
      GetMarkdownFromHTML pgm = new GetMarkdownFromHTML(options, HTMLFilters);
      if (pgm.getParams(args)) {
         if (pgm._thumbsucker) {
            System.out.println("\nFiles ending with ." + pgm._ext
               + " will be read from " + pgm._inputPath //
               + "\nand the generated markdown (.md), and html (.html and _formatted.html) "
               + "saved in " + pgm._outputPath); //
         }
         if (pgm._interactive) {
            // an empty answer (just Enter) means "continue"; anything else
            // leaves _interactive set and the processing below is skipped
            if (MDfromHTMLUtils
               .prompt("Press q to quit or press Enter to continue...")
               .length() == 0) {
               pgm._interactive = false;
            }
         }
         if (!pgm._interactive) {
            try {
               // typed as List<Path> so the for-each below compiles; the
               // element type is the one doWork expects
               List<Path> files = MDfromHTMLUtils.listSourceFiles(
                  FileSystems.getDefault().getPath(pgm._inputPath.toString()),
                  pgm._ext);
               for (Path file : files) {
                  try {
                     exitVal = pgm.doWork(file, HTMLFilters);
                     if (exitVal != 0) {
                        break;
                     }
                  } catch (Exception e) {
                     e.printStackTrace();
                  }
               }
            } catch (Exception e) {
               System.out
                  .println("Error: Can not reference files with extension "
                     + pgm._ext + " in directory " + pgm._inputPath
                     + " reason: " + e.getLocalizedMessage());
               exitVal = -1;
            }
         }
         if (pgm._thumbsucker) {
            System.out.println();
         }
      } else {
         exitVal = -1;
      }
      if (pgm._thumbsucker) {
         System.out.println("Goodbye");
      }
      System.exit(exitVal);
   }

   // Extension (without the dot) of the input files; used to list the input
   // files and to strip the extension when building output file names.
   String _ext = "json";
   // Input directory; assigned by getParams.
   Path _inputPath = null;
   // Set by getParams whenever it had to prompt for a value; main then asks
   // for confirmation before processing.
   boolean _interactive = false;
   // Output directory; getParams stores it with a trailing file separator.
   String _outputPath = ".";
   // HTML-to-markdown converter; this initial instance is replaced by the
   // constructor.
   Remark _remark = new Remark(Options.multiMarkdown());
   // When true, progress messages are printed to standard output.
   boolean _thumbsucker = false;
   // When true, doWork appends a "Doc2Dial Provenance" section to the
   // generated markdown.
   boolean _keepProvenanceLinks = true;

   // public GetMarkdownFromHTML() {
   // this(Options.multiMarkdown());
   // }
   //
   // public GetMarkdownFromHTML(Options options) {
   // this(options, (JSONObject) null);
   // }

   /**
    * Creates a converter whose {@link Remark} engine is configured with the
    * supplied options and filter rules.
    * 
    * @param options
    *           markdown generation options handed to the Remark engine
    * @param HTMLFilters
    *           filter rules handed to the Remark engine
    */
   public GetMarkdownFromHTML(Options options, JSONObject HTMLFilters) {
      this._remark = new Remark(options, HTMLFilters);
   }

   /**
    * Process the specified file to transform its content into formatted text
    * and save it to a txt file in the specified output directory.
    * 
    * @param file
    *           the file containing the annotation json from ICCC
    * @param HTMLFilters
    *           object containing global and domain specific filter rules to
    *           control markdown generation
    * @return exit value (0 indicates success, otherwise -1 for failure)
    */
   int doWork(Path file, JSONObject HTMLFilters) {
      int exitVal = 0;
      ProvenanceWriter provenanceWriter = null;
      String provenanceOutputFileName = "unknown";
      try {
         String fqFileName = file.toString();
         if (_thumbsucker) {
            System.out.println("Processing: " + fqFileName);
         }
         ObjectMapper mapper = new ObjectMapper();
         File jsonTestFile = new File(file.toString());
         JsonNode tempJSON = mapper.readTree(jsonTestFile);
         String shortFileName = fqFileName
            .substring(fqFileName.lastIndexOf(File.separator) + 1);
         int index = shortFileName.lastIndexOf("." + _ext);
         if (index < 1) {
            System.out.println(shortFileName + "doesn't end with ." + _ext);
            exitVal = -1;
         } else {
            ObjectNode inputJson = (ObjectNode) tempJSON;
            int htmlCounter = 0;
            ArrayNode htmlList = (ArrayNode) inputJson.get("captureArray");
            if (htmlList == null) {
               JsonNode htmlObj = (JsonNode) inputJson.get("captureDict");
               if (htmlObj == null) {
                  System.err.println(fqFileName
                     + " is missing the \"captureArray\" and the \"captureDict\" tag. Please fix and retry.");
                  System.exit(-1);
               }
               htmlList = JsonNodeFactory.instance.arrayNode();
               htmlList.add(htmlObj);
            }
            String baseURI = null;
            String html = null;
            JsonNode temp = null;
            for (Object obj : htmlList) {
               boolean seekHeaders = true; // default is true so only special
                                           // sites need override this
               JSONObject globalFilters = (JSONObject) HTMLFilters.get("*");
               if (globalFilters != null) {
                  Boolean test = (Boolean) globalFilters
                     .get(DocumentConverter.SEEK_HEADERS);
                  if (test != null) {
                     seekHeaders = test;
                  }
               }

               try {
                  htmlCounter++;
                  ObjectNode htmlObj = (ObjectNode) obj;
                  temp = htmlObj.get("html");
                  if (temp == null) {
                     System.err.println(fqFileName
                        + " is missing the \"html\" key in the ["
                        + (htmlCounter - 1) + "] element of the captureArray.");
                     System.exit(-1);
                  }
                  html = temp.asText();
                  temp = htmlObj.get("url");
                  if (temp == null) {
                     System.err.println(fqFileName
                        + " is missing the \"utl\" key in the ["
                        + (htmlCounter - 1) + "] element of the captureArray.");
                     System.exit(-1);
                  }
                  baseURI = temp.asText();
                  String htmlOutputFileName = _outputPath
                     + shortFileName.substring(0, index) + "_"
                     + MDfromHTMLUtils.padLeftZero(htmlCounter, 3) + ".html";
                  MDfromHTMLUtils.saveTextFile(htmlOutputFileName, html);
                  // testing for hidden tags
                  // html = html.replaceAll("<", "<");
                  // html = html.replaceAll(">", ">");
                  Document doc = Jsoup.parse(html, baseURI);
                  doc.outputSettings().escapeMode(EscapeMode.extended);

                  // TODO: process iframe elements in a loop making below a
                  // routine passing an Element
                  // Elements elements = document.select("iframe");
                  // Document iframeDoc = Jsoup.parse(elements.get(0).data());
                  // String iframeSrc = iframeDoc.attr("src");
                  /**
                   * 
                   */

                  String formattedHTML = doc.toString();
                  formattedHTML = formattedHTML.replaceAll("&", "&");
                  // formattedHTML = formattedHTML.replaceAll("<", "<");
                  // formattedHTML = formattedHTML.replaceAll(">", ">");
                  formattedHTML = formattedHTML.replaceAll(""", "\"");

                  String formattedHTMLOutputFileName = _outputPath
                     + shortFileName.substring(0, index) + "_"
                     + MDfromHTMLUtils.padLeftZero(htmlCounter, 3)
                     + "_formatted.html";
                  MDfromHTMLUtils.saveTextFile(formattedHTMLOutputFileName,
                     formattedHTML);

                  String domain = Remark.getDomain(baseURI);
                  int testindex = baseURI.indexOf(domain);
                  // need to find actual domain for proper filters
                  String workingURI = baseURI
                     .substring(testindex + domain.length());
                  testindex = workingURI.toLowerCase().indexOf("http");
                  if (testindex >= 0) {
                     workingURI = workingURI.substring(testindex);
                     domain = Remark.getDomain(workingURI);
                     baseURI = workingURI;
                  }

                  String markdownOutputFileName = _outputPath
                     + shortFileName.substring(0, index) + "_"
                     + MDfromHTMLUtils.padLeftZero(htmlCounter, 3) + ".md";

                  provenanceOutputFileName = _outputPath
                     + shortFileName.substring(0, index) + "_"
                     + MDfromHTMLUtils.padLeftZero(htmlCounter, 3)
                     + "_html2md.json";

                  File provenanceOutputFile = new File(
                     provenanceOutputFileName);
                  if (provenanceOutputFile.exists()) {
                     provenanceOutputFile.delete();
                  }
                  provenanceWriter = new ProvenanceWriter(
                     formattedHTMLOutputFileName, markdownOutputFileName,
                     _remark.getHTMLFilters(), baseURI, domain,
                     new FileWriter(provenanceOutputFile, true));

                  // determine if we should skip markdown until first header is
                  // encountered
                  JSONObject domainFilters = (JSONObject) HTMLFilters
                     .get(domain);
                  if (domainFilters != null) {
                     Boolean test = (Boolean) domainFilters
                        .get(DocumentConverter.SEEK_HEADERS);
                     if (test != null) {
                        seekHeaders = test;
                     }
                  }

                  String markdown = generateMarkdownFromHTML(doc,
                     provenanceWriter, baseURI, seekHeaders);

                  if (_keepProvenanceLinks) {
                     markdown += "\n###### Doc2Dial Provenance ######\n\n"
                        + " * [Doc2Dial Original URL][]\n"
                        + " * [Doc2Dial File Processed][]\n\n[Doc2Dial Original URL]: "
                        + baseURI.replaceAll(" ", "%20")
                        // + .replaceAll("#", "%23").replaceAll("&", "%26")
                        + "\n[Doc2Dial File Processed]: file://"
                        + file.toAbsolutePath().toString();
                  }

                  MDfromHTMLUtils.saveTextFile(markdownOutputFileName,
                     markdown);
               } catch (Exception e) {
                  e.printStackTrace();
                  exitVal = -1;
               } finally {
                  if (provenanceWriter != null) {
                     try {
                        provenanceWriter.close();
                        // Note: leave all provenance with seekHeaders explicit
                        // in the HTMLFilters
                        // if (seekHeaders) {
                        // cleanUpAnnotations(provenanceOutputFileName);
                        // }
                     } catch (IOException e) {
                        e.printStackTrace();
                     }
                     provenanceWriter = null;
                  }
               }
            }
         }
      } catch (Exception e) {
         e.printStackTrace();
         exitVal = -1;
      }
      return exitVal;
   }

   /**
    * Runs the three conversion stages in sequence: the Remark engine turns
    * the document into markdown, {@code CleanupMarkdown.cleanAll} post
    * processes it, and finally reference definitions that nothing links to
    * are removed.
    * 
    * @param doc
    *           parsed html document
    * @param provenanceWriter
    *           writer handed to the Remark engine (callers in this class may
    *           pass null)
    * @param baseUri
    *           base URI handed to the Remark engine
    * @param seekHeaders
    *           flag handed to the markdown cleanup
    * @return the final markdown
    */
   String generateMarkdownFromHTML(Document doc,
      ProvenanceWriter provenanceWriter, String baseUri, boolean seekHeaders) {
      final String converted = _remark.convert(doc, provenanceWriter, baseUri);
      final String cleaned = CleanupMarkdown.cleanAll(converted, seekHeaders);
      return removeUnusedReferences(cleaned);
   }

   /**
    * Rewrites a provenance file so that its "provenance" array no longer
    * starts with entries that precede the first header: entries are removed
    * from the front until one is found whose "md" value is missing or starts
    * with "#".
    * 
    * @param provenanceFileName
    *           name of the provenance json file to load and save again
    * @throws Exception
    *            if the file can not be loaded or saved
    */
   void cleanUpAnnotations(String provenanceFileName) throws Exception {
      JSONObject provenanceObj = MDfromHTMLUtils
         .loadJSONFile(provenanceFileName);
      JSONArray entries = (JSONArray) provenanceObj.get("provenance");
      Iterator<?> cursor = entries.iterator();
      while (cursor.hasNext()) {
         JSONObject entry = (JSONObject) cursor.next();
         String md = (String) entry.get("md");
         if (md == null || md.startsWith("#")) {
            // found first header so stop removing
            break;
         }
         cursor.remove();
      }
      provenanceObj.put("provenance", entries);
      MDfromHTMLUtils.saveJSONFile(provenanceFileName, provenanceObj);
   }

   /**
    * Get the parameters necessary for program execution. Missing input or
    * output directories are prompted for, which also switches the program to
    * interactive mode and enables progress messages. The output directory
    * must already exist.
    * 
    * @param args
    *           inputPath, outputPath, progress message flag,
    *           keepProvenanceLinks flag (all optional; the flags are parsed
    *           as booleans)
    * @return true if we have sufficient parameters to execute the program
    */
   boolean getParams(String[] args) {
      String inputPath = "." + File.separator + "data" + File.separator
         + "htmljson";
      String outputPath = "." + File.separator + "data" + File.separator + "md";
      String tmp = "";

      try {
         if (args.length >= 1) {
            inputPath = args[0];
         } else {
            _interactive = true;
            _thumbsucker = true;
            tmp = MDfromHTMLUtils.prompt(
               "Enter the fully qualified path to directory containing " + _ext
                  + " html capture files, or q to exit (" + inputPath + "):");
            if (tmp == null || tmp.length() == 0) {
               tmp = inputPath;
            }
            if (tmp.equalsIgnoreCase("q")) {
               return false;
            }
            inputPath = tmp;
         }
         if (!inputPath.endsWith(File.separator)) {
            inputPath += File.separator;
         }
         _inputPath = FileSystems.getDefault().getPath(inputPath);
      } catch (InvalidPathException ipe) {
         // report the value actually used; args[0] does not exist when the
         // path was entered at the prompt
         System.out.println("Error: " + inputPath
            + " is not a valid directory to form a path.");
         return false;
      }
      if (args.length >= 2) {
         outputPath = args[1];
      } else {
         _interactive = true;
         _thumbsucker = true;
         tmp = MDfromHTMLUtils.prompt(
            "Enter the fully qualified path to the markdown output directory, or q to exit ("
               + outputPath + "):");
         if (tmp == null || tmp.length() == 0) {
            tmp = outputPath;
         }
         if (tmp.equalsIgnoreCase("q")) {
            return false;
         }
         outputPath = tmp;
      }
      if (!outputPath.endsWith(File.separator)) {
         outputPath += File.separator;
      }
      File testOutput = new File(outputPath);
      if (!testOutput.exists()) {
         System.out.println(
            "Error: The output directory \"" + outputPath + "\" must exist.");
         return false;
      }
      if (!testOutput.isDirectory()) {
         System.out.println("Error: The output directory \"" + outputPath
            + "\" must be a directory.");
         return false;
      }
      _outputPath = outputPath;
      if (args.length >= 3) {
         _thumbsucker = Boolean.parseBoolean(args[2]);
      }

      if (args.length >= 4) {
         _keepProvenanceLinks = Boolean.parseBoolean(args[3]);
      }

      return true;
   }

   /**
    * Search for references in the markdown line. A reference is the text from
    * a "[" up to and including the "]" of the next "]: " sequence, provided
    * there is at least one character between the brackets. Scanning resumes
    * after each "]: " so several references on one line are all reported.
    * 
    * @param line
    *           markdown line to be examined
    * @return set of references found in the line (including the brackets);
    *         empty if there are none
    */
   Set<String> getReferences(String line) {
      Set<String> results = new HashSet<>();
      String test = line.trim();
      int index = test.indexOf("[");
      while (index != -1) {
         // Check that we have found a reference containing [...]: ...
         int index2 = test.indexOf("]: ");
         if (index2 == -1) {
            break;
         }
         // "> 1" accepts single character labels such as "[1]" while still
         // rejecting the empty "[]"
         if ((index2 - index) > 1) {
            results.add(test.substring(index, index2 + 1));
         }
         test = test.substring(index2 + 3);
         index = test.indexOf("[");
      }
      return results;
   }

   /**
    * Collects every bracketed span "[...]" of the (trimmed) markdown line
    * that has at least one character between its brackets. A span directly
    * followed by ":" is recorded as a reference definition; any other span is
    * recorded as a reference link. Nested brackets are matched pairwise, so
    * both the inner and the outer span are reported. A "]" without a
    * preceding unmatched "[" is ignored.
    * 
    * @param line
    *           markdown line to be examined
    * @param refLinks
    *           set receiving the reference links found (brackets included)
    * @param references
    *           set receiving the references found (brackets included)
    */
   static public void getReferencesAndLinks(String line, Set<String> refLinks,
      Set<String> references) {
      String test = line.trim();
      // offsets of the "[" characters that have not been closed yet
      Stack<Integer> startOffsets = new Stack<>();
      for (int offset = 0; offset < test.length(); offset++) {
         char current = test.charAt(offset);
         if (current == '[') {
            startOffsets.push(offset);
         } else if (current == ']' && !startOffsets.isEmpty()) {
            int startOffset = startOffsets.pop();
            if ((offset - startOffset) > 1) {
               String bracketed = test.substring(startOffset, offset + 1);
               boolean followedByColon = (offset + 1) < test.length()
                  && test.charAt(offset + 1) == ':';
               if (followedByColon) {
                  references.add(bracketed);
               } else {
                  refLinks.add(bracketed);
               }
            }
         }
      }
   }

   /**
    * Removes reference definitions that no reference link in the markdown
    * points to. The markdown is read twice: once to collect the reference
    * links and references, and, only if unreferenced references remain, a
    * second time to drop their definition lines.
    * <p>
    * When nothing has to be removed the markdown is returned unchanged.
    * Otherwise the result is rebuilt line by line with "\n" after every line;
    * in that pass lines that start with "[" and end with "]:" without
    * containing "]: " are dropped as invalid references, and only the first
    * definition line of each unreferenced reference is removed.
    * 
    * @param markdown
    *           the markdown to be filtered
    * @return the markdown without the unreferenced reference definitions
    */
   static public String removeUnusedReferences(String markdown) {
      Set<String> refLinks = new HashSet<>();
      Set<String> references = new HashSet<>();
      StringBuilder sb = new StringBuilder();
      try {
         BufferedReader br = new BufferedReader(new StringReader(markdown));
         String line;
         while ((line = br.readLine()) != null) {
            getReferencesAndLinks(line, refLinks, references);
         }
         // remove referenced links from references leaving only unreferenced
         // references
         references.removeAll(refLinks);
         // if all are accounted for, we are done
         if (references.isEmpty()) {
            // return original markdown as-is
            return markdown;
         }
         // run through markdown to remove unreferenced references
         br = new BufferedReader(new StringReader(markdown));
         while ((line = br.readLine()) != null) {
            String test = line.trim();
            if (test.startsWith("[")) {
               int index = test.indexOf("]: ");
               if (index != -1) {
                  // remove() reports whether the label was still pending, so
                  // only its first definition line is filtered out
                  if (references.remove(test.substring(0, index + 1))) {
                     continue;
                  }
               } else if (test.endsWith("]:")) {
                  // this is an invalid reference "[blah]: "
                  continue;
               }
            }
            sb.append(line);
            sb.append("\n");
         }
      } catch (IOException ioe) {
         // Reading from an in-memory string is not expected to fail; if it
         // does, hand back the unmodified input rather than a truncated
         // result.
         return markdown;
      }
      return sb.toString();
   }

}