com.mdfromhtml.markdown.transform.GetTextFromMarkdown Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of MarkdownGenerator Show documentation
Generate markdown (.md) files from html and url provided in JSON files. The name of the generated files will use the name of the JSON file, and an incrementing number starting with 1 for each JSON file read, and for each html reference within the files.
There is a newer version: 2.0.18
Show newest version
/**
 * (c) Copyright 2019-2020 IBM Corporation
 * 1 New Orchard Road, 
 * Armonk, New York, 10504-1722
 * United States
 * +1 914 499 1900
 * support: Nathaniel Mills [email protected]
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package com.mdfromhtml.markdown.transform;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.net.URLDecoder;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileSystems;
import java.nio.file.InvalidPathException;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.api.json.JSONObject;
import com.mdfromhtml.core.MDfromHTMLUtils;
import com.overzealous.remark.Options;
import com.overzealous.remark.Remark;
import com.overzealous.remark.convert.ProvenanceWriter;

/**
 * Utility class to transform multimarkdown generated from HTML into text files.
 * Files are read from an input directory, and written to an output directory.
 * 
 * @author Nathaniel Mills
 */
public class GetTextFromMarkdown {

    /**
     * Controls how links are rendered in the generated text. When
     * {@code true}, link labels are written wrapped in the bracket
     * placeholders and the resolved URL is appended in parentheses; when
     * {@code false} only the label text is written. The current value is also
     * echoed in the start-up summary printed by {@code main}.
     */
    static public boolean _includeLinks = false;

    /**
     * Collects the reference-style link definitions found in a markdown
     * document. A definition is a line that, once trimmed, starts with
     * {@code [} and contains {@code "]: "}; the text between the brackets is
     * the reference name and the text after {@code "]: "} is its URL.
     * Underscores in the URL are swapped for the {@code _urlUnderscore}
     * placeholder so that later underscore (emphasis) handling leaves the URL
     * intact.
     *
     * @param mdLines
     *           the markdown lines to scan
     * @return map of trimmed reference name to its (placeholder-protected)
     *         URL; empty when no definitions are present. A later definition
     *         of the same name replaces an earlier one.
     */
    static public Map<String, String> findRefURLs(List<String> mdLines) {
        Map<String, String> refURLs = new HashMap<>();
        for (String mdLine : mdLines) {
            String candidate = mdLine.trim();
            if (!candidate.startsWith("[")) {
                continue;
            }
            int offset = candidate.indexOf("]: ");
            // offset > 1 guarantees at least one character between '[' and ']'
            if (offset > 1) {
                String ref = candidate.substring(1, offset).trim();
                String url = candidate.substring(offset + 3).trim();
                url = url.replaceAll("_", _urlUnderscore);
                refURLs.put(ref, url);
            }
        }
        return refURLs;
    }

    /**
     * Converts a single markdown line to plain text. The line is trimmed;
     * a blank line is returned as the empty string, otherwise comments are
     * filtered out and the remainder is run through
     * {@link #processText(String, Map)}.
     *
     * @param mdLine
     *           the markdown line to convert
     * @param refURLs
     *           reference name to URL map handed on to {@code processText}
     * @return whatever {@code processText} yields for the filtered line (which
     *         may be {@code null}), or the empty string for a blank line
     */
    static public String generateTextFromMarkdown(String mdLine, Map refURLs) {
        String trimmed = mdLine.trim();
        if (trimmed.isEmpty()) {
            return trimmed;
        }
        String withoutComments = TextUtils.filterComments(trimmed);
        return processText(withoutComments, refURLs);
    }

    /**
     * Extracts the trimmed text of the first {@code [...]} group in the line.
     * Nothing is extracted when the line has no {@code [}, when a {@code )}
     * occurs before the first {@code [}, or when no {@code ]} follows.
     * <p>
     * NOTE(review): the closing bracket's distance from the opening bracket
     * is compared against the opening bracket's absolute position, so a group
     * is only accepted when that distance exceeds the number of characters
     * preceding the {@code [}. Callers may depend on this; confirm before
     * changing it.
     *
     * @param line
     *           text to examine
     * @return the trimmed bracket content, or the empty string
     */
    static public String getRef(String line) {
        final int open = line.indexOf('[');
        if (open < 0) {
            return "";
        }
        final int paren = line.indexOf(')');
        if (paren != -1 && paren <= open) {
            // a closing parenthesis precedes the bracket
            return "";
        }
        final int close = line.indexOf(']', open);
        final int closeFromOpen = (close < 0) ? -1 : close - open;
        if (closeFromOpen <= open) {
            return "";
        }
        return line.substring(open + 1, close).trim();
    }

    /**
     * Extracts the trimmed text of the first {@code (...)} group in the line,
     * i.e. the target of an inline link.
     * <p>
     * NOTE(review): the closing parenthesis' distance from the character
     * after {@code (} is compared against the absolute position of
     * {@code (}, so a group is only accepted when its content is longer than
     * the number of characters preceding the {@code (}; in particular an
     * empty {@code ()} yields nothing. Confirm before changing it.
     *
     * @param line
     *           text to examine
     * @return the trimmed parenthesised content, or the empty string
     */
    static public String getDirectRef(String line) {
        final int open = line.indexOf('(');
        if (open < 0) {
            return "";
        }
        final int contentStart = open + 1;
        final int close = line.indexOf(')', contentStart);
        final int contentLength = (close < 0) ? -1 : close - contentStart;
        if (contentLength <= open) {
            return "";
        }
        return line.substring(contentStart, close).trim();
    }

    /**
     * Command-line entry point. Loads the optional HTML filter definitions
     * from {@code ./properties/HTML_Filters.json} (continuing without filters
     * if they cannot be loaded), parses the arguments via {@code getParams},
     * optionally prints a summary and prompts for confirmation, and then runs
     * {@code doWork} on every file with the configured extension found in the
     * input directory. A failure on one file is reported and does not stop
     * the remaining files from being processed.
     *
     * @param args
     *           command-line arguments, interpreted by {@code getParams}
     * @throws Exception
     *            if constructing the program or reading its parameters fails
     */
    public static void main(String[] args) throws Exception {
        final String filtersPath = "." + File.separator + "properties"
            + File.separator + "HTML_Filters.json";
        JSONObject HTMLFilters = null;
        try {
            HTMLFilters = MDfromHTMLUtils.loadJSONFile(filtersPath);
        } catch (Exception e1) {
            // best effort: run without filters rather than abort
            System.out.println(
                "Warning: Using no HTML Filters -- can not find \"" + filtersPath + "\": "
                    + e1.getLocalizedMessage());
        }
        GetTextFromMarkdown pgm = new GetTextFromMarkdown(Options.multiMarkdown(),
            HTMLFilters);
        if (pgm.getParams(args)) {
            if (pgm._thumbsucker) {
                System.out.println("\nFiles ending with ." + pgm._ext
                    + " will be read from " + pgm._inputPath //
                    + "\nand the generated text files (." + pgm._txtext
                    + ") will be " + "saved in " + pgm._outputPath
                    + "\nIt is " + _includeLinks + " that links will be included in the text output."); //
            }
            if (pgm._interactive) {
                // an empty answer (just Enter) means "continue"; anything
                // else leaves _interactive set and so skips the processing
                if (MDfromHTMLUtils
                    .prompt("Press q to quit or press Enter to continue...")
                    .length() == 0) {
                    pgm._interactive = false;
                }
            }
            if (!pgm._interactive) {
                try {
                    List<Path> files = MDfromHTMLUtils.listSourceFiles(
                        FileSystems.getDefault().getPath(pgm._inputPath.toString()),
                        pgm._ext);
                    for (Path file : files) {
                        try {
                            pgm.doWork(file);
                        } catch (Exception e) {
                            // report and carry on with the next file
                            e.printStackTrace();
                        }
                    }
                } catch (Exception e) {
                    System.out
                        .println("Error: Can not reference files with extension "
                            + pgm._ext + " in directory " + pgm._inputPath
                            + " reason: " + e.getLocalizedMessage());
                }
            }
            if (pgm._thumbsucker) {
                System.out.println();
            }
        }
        if (args.length == 0) {
            System.out.println("Goodbye");
        }
    }

    /**
     * Strips markdown syntax from a single line and returns the plain text.
     * The line is consumed from the left: each markdown control character
     * (backslash escape, image/link brackets, table pipes, task-list hyphens,
     * back ticks, heading hashes, underscores, tildes, asterisks) hands the
     * remainder of the line to the matching cleanser and scanning restarts on
     * its result; every other character is copied to the output. Afterwards
     * the bracket and underscore placeholders are turned back into real
     * characters, bullets become {@code *}, and the HTML comparison entities
     * are decoded.
     *
     * @param line
     *           the markdown line to cleanse
     * @param refURLs
     *           reference name to URL map used when resolving links
     * @return the cleansed text, or {@code null} when link processing
     *         signals that the whole line should be dropped (e.g. it is a
     *         reference definition {@code [name]: url})
     */
    static public String processText(String line, Map<String, String> refURLs) {
        StringBuilder sb = new StringBuilder();
        String testChanged = "";
        int lineLen = line.length();
        TextTracker tt = new TextTracker();
        int offset = 0;
        while (offset < lineLen) {
            // the image-link case may have nulled the line to stop scanning
            if (line == null) {
                break;
            }
            char startChar = line.charAt(offset);
            switch (startChar) {
                // handle escaped characters first
                case 0x005c: { // backslash
                    if (tt._inTable) {
                        tt._colSkipped++;
                    }
                    // skip first backslash and save next char
                    offset++;
                    if (offset < lineLen) {
                        sb.append(line.charAt(offset));
                        offset++;
                    }
                    if (offset < lineLen) {
                        line = line.substring(offset);
                        lineLen = line.length();
                    } else {
                        // reached end of line (breaks out of while loop)
                        lineLen = 0;
                        tt._inTable = false;
                        tt._colSkipped = 0;
                    }
                    offset = 0;
                    break;
                }
                case 0x0021: { // exclamation or image link
                    if (offset < lineLen - 1) {
                        if ("[".equals(line.substring(offset + 1, offset + 2))) {
                            // image link
                            line = removeReferencesAndLinks(line.substring(offset),
                                refURLs, tt);
                            if (line == null) { // signal to break from while loop
                                tt._inTable = false;
                                tt._colSkipped = 0;
                                break;
                            }
                            offset = 0;
                            lineLen = line.length();
                        } else { // just an exclamation point
                            line = line.substring(offset + 1);
                            offset = 0;
                            lineLen = line.length();
                        }
                    } else { // just an exclamation point at end of line
                        offset = line.length(); // signal end of while loop
                        tt._inTable = false;
                        tt._colSkipped = 0;
                    }
                    // NOTE(review): this also runs after an image link was
                    // processed, so "!" is emitted for images too -- confirm
                    // that this is intended.
                    sb.append("!");
                    break;
                }
                case 0x005b: { // left bracket == link
                    line = removeReferencesAndLinks(line.substring(offset), refURLs, tt);
                    // check for complete line deletion
                    tt._inTable = false;
                    tt._colSkipped = 0;
                    if (line == null) {
                        return null;
                    }
                    offset = 0;
                    lineLen = line.length();
                    break;
                }
                // Table lines
                case 0x007c: { // pipe == table column separator
                    if (tt._inTable) {
                        // append _colSkipped spaces to text
                        while (tt._colSkipped > 0) {
                            sb.append(" ");
                            tt._colSkipped--;
                        }
                    }
                    tt._inTable = true;
                    line = removeTableLines(line.substring(offset));
                    offset = 0;
                    lineLen = line.length();
                    if (lineLen == 0) {
                        if (tt._inTable) {
                            tt._colSkipped = 0;
                            tt._inTable = false;
                        }
                    }
                    break;
                }
                /**
                 * Note: these simple cleansers should be below link cases "!" and
                 * "[" and tables
                 */
                case 0x002d: { // hyphen or task list "- [x]" or "- [ ]"
                    testChanged = removeTaskList(line.substring(offset), tt);
                    if (testChanged.equals(line.substring(offset))) {
                        // just a hyphen, no change
                        sb.append("-");
                        line = testChanged.substring(1);
                    } else {
                        line = testChanged;
                    }
                    offset = 0; // restart scanning on the remaining text
                    lineLen = line.length();
                    break;
                }
                case 0x0060: { // back tick == fenced code blocks or code
                    line = removeFencing(line.substring(offset), tt);
                    offset = 0;
                    lineLen = line.length();
                    break;
                }
                case 0x0023: { // hash tag == headings
                    line = removeHeading(line.substring(offset), tt);
                    offset = 0;
                    lineLen = line.length();
                    break;
                }
                case 0x005f: { // underscore
                    line = removeUnderscore(line.substring(offset), tt);
                    offset = 0;
                    lineLen = line.length();
                    break;
                }
                case 0x007e: { // tilde == strike through or fencing
                    // first remove fencing
                    line = removeFencing(line.substring(offset), tt);
                    line = removeEmphasis(line, tt);
                    offset = 0;
                    lineLen = line.length();
                    break;
                }
                case 0x002a: { // asterisk == bold, italic
                    line = removeEmphasis(line.substring(offset), tt);
                    offset = 0;
                    lineLen = line.length();
                    break;
                }
                default: { // just text
                    sb.append(startChar);
                    offset++;
                    break;
                }
            }
        }
        String temp = sb.toString();
        if (temp.contains("\\\\:")) {
            temp = temp.replaceAll("\\\\:", ":");
        }
        if (temp.contains("\\\\-")) {
            temp = temp.replaceAll("\\\\-", "-");
        }
        // special processing to replace placeholders with brackets
        temp = temp.replaceAll(_openBracket, "[");
        temp = temp.replaceAll(_closeBracket, "]");
        temp = temp.replaceAll(_urlUnderscore, "_");
        // #202
        temp = temp.replace("\u2022", "*");
        // #84 -- decode HTML comparison entities left in the text. The
        // semicolon-terminated forms are handled first so no stray ';' is
        // left behind; the bare "&le" match is retained for compatibility.
        temp = temp.replace("&gt;", ">");
        temp = temp.replace("&lt;", "<");
        temp = temp.replace("&ge;", ">=");
        temp = temp.replace("\u2265", ">=");
        temp = temp.replace("&le;", "<=");
        temp = temp.replace("&le", "<=");
        return temp;
    }

    /**
     * Removes emphasis markers (asterisks for bold/italic, tildes for
     * strike-through) from the text. An asterisk that starts the line or
     * follows a space and is itself followed by a space is treated as a list
     * bullet and replaced by {@code \u2022}; an asterisk directly preceded by
     * a backslash is kept. While inside a table every dropped marker is
     * counted in the tracker, and processing stops at the first pipe, with the
     * pipe and everything after it appended unchanged.
     *
     * @param line
     *           input to be cleansed
     * @param tt
     *           tracks whether we are in a table and the number of columns
     *           skipped
     * @return cleansed version of input
     */
    static public String removeEmphasis(String line, TextTracker tt) {
        StringBuilder cleaned = new StringBuilder();
        final int length = line.length();
        char previous = 0x00;
        for (int idx = 0; idx < length; idx++) {
            final char current = line.charAt(idx);
            // 0x00 stands in for "no following character"
            final char following = (idx + 1 < length) ? line.charAt(idx + 1) : 0x00;
            if (current == '~') {
                if (tt._inTable) {
                    tt._colSkipped++;
                }
            } else if (current == '*') {
                final boolean atTokenStart = (idx == 0 || previous == ' ');
                if (atTokenStart && following == ' ') {
                    cleaned.append('\u2022');
                } else if (previous == '\\') {
                    cleaned.append(current);
                } else if (tt._inTable) {
                    tt._colSkipped++;
                }
            } else if (current == '|' && tt._inTable) {
                // keep the pipe and abort removing emphasis
                cleaned.append(line.substring(idx));
                return cleaned.toString();
            } else {
                cleaned.append(current);
            }
            previous = current;
        }
        return cleaned.toString();
    }

    /**
     * Removes code fencing from the text: fence openers that carry a language
     * tag (such as ```json or ~~~java) are removed together with the tag, and
     * every remaining back tick is dropped. Inside a table only the text
     * before the first pipe is cleansed; the pipe and what follows are
     * re-attached untouched, and the width of what was removed is added to
     * the tracker's skipped-column count.
     *
     * @param line
     *           input to be cleansed
     * @param tt
     *           track whether we are in a table and the number of columns
     *           skipped
     * @return cleansed version of input
     */
    static private String removeFencing(String line, TextTracker tt) {
        String afterPipe = "";
        if (tt._inTable) {
            int pipeAt = line.indexOf('|');
            if (pipeAt >= 0) {
                // only work on the part before the pipe
                afterPipe = line.substring(pipeAt);
                line = line.substring(0, pipeAt);
            }
        }

        // pass 0 handles back-tick fences, pass 1 tilde fences; each pass
        // first measures (tables only) and then strips its fence openers
        Pattern[] widthCounters = { _fencePattern1, _fencePattern2 };
        String[] fenceRegexes = { "```\\w+", "~~~\\w+" };
        for (int pass = 0; pass < fenceRegexes.length; pass++) {
            if (tt._inTable) {
                Matcher fence = widthCounters[pass].matcher(line);
                while (fence.find()) {
                    tt._colSkipped += fence.end() - fence.start();
                }
            }
            line = line.replaceAll(fenceRegexes[pass], "");
        }

        // while fencing is three ticks, single ticks connote inline code
        StringBuilder kept = new StringBuilder();
        for (char current : line.toCharArray()) {
            if (current != '`') {
                kept.append(current);
            } else if (tt._inTable) {
                tt._colSkipped++;
            }
        }
        return kept.toString() + afterPipe;
    }

    /**
     * Drops every {@code #} from the text so that heading markers do not
     * appear in the output. Inside a table each dropped {@code #} is counted
     * in the tracker, and cleansing stops at the first pipe: the pipe and the
     * rest of the line are appended as they are.
     *
     * @param line
     *           text to be cleansed of headers
     * @param tt
     *           track whether we are in a table and the number of columns
     *           skipped
     * @return text without headers
     */
    static private String removeHeading(String line, TextTracker tt) {
        StringBuilder kept = new StringBuilder();
        final int length = line.length();
        for (int pos = 0; pos < length; pos++) {
            final char current = line.charAt(pos);
            if (current == '#') {
                if (tt._inTable) {
                    tt._colSkipped++;
                }
                continue;
            }
            if (current == '|' && tt._inTable) {
                // leave the rest of the row for the table handling
                return kept.toString() + line.substring(pos);
            }
            kept.append(current);
        }
        return kept.toString();
    }

    /**
    * Search for reference links in the markdown line. A reference link contains
    * a pattern with [...] without a following ": "
    * 
    * @param line
    *           markdown line to be examined
    * @param refURLs
    *           map of a reference to its corresponding URL to enable the URL to
    *           be added where a reference is made. 
    * @param tt track whether we are in a table and the number of columns skipped
    * @return the revised line stripped of links, or null if nothing from this
    *         line should be saved (e.g., for a reference with pattern [...]:...
    */
    static public String removeReferencesAndLinks(String line, Map refURLs, TextTracker tt) {
        StringBuffer sb = new StringBuffer();
        String test = line.trim();
        boolean foundBracket = false;
        int bracketCnt = 0;
        int startOffset = -1;
        String refLink = "";
        String url = "";
        String ref = "";
        // String reference = "";
        Stack startOffsets = new Stack();
        boolean isImageRef = false;
        boolean needLabel = false;
        int charOffset = 0;
        while (charOffset < test.length()) {
            char testChar = test.charAt(charOffset);
            switch (testChar) {
                case 0x005b: { // "["
                    if (!isImageRef) {
                        // flipflop need for label to skip refLink
                        needLabel = !needLabel;
                    } // else in an image link so don't capture anything

                    if (!foundBracket) {
                        foundBracket = true;
                    }
                    startOffsets.push(charOffset);
                    bracketCnt++;
                    break;
                }
                case 0x005d: { // "]"
                    if (foundBracket) {
                        bracketCnt--;
                        startOffset = startOffsets.pop();
                        if ((charOffset - startOffset) > 1) {
                            if ((charOffset + 1) < test.length()) {
                                if (0x003a == test.charAt(charOffset + 1)) { // ":"
                                    /**
                                     * Don't save any reference information (signal line
                                     * deletion with null
                                     */
                                    return null;
                                } else {
                                    refLink = test.substring(startOffset + 1, charOffset).trim();
                                    refLink = refLink.replaceAll("_", _urlUnderscore);
                                    if (needLabel && startOffsets.isEmpty()) {
                                        if (_includeLinks) {
                                            sb.append(_openBracket+refLink+_closeBracket);
                                            if (tt._inTable) {
                                              tt._colSkipped-=2;;
                                              if (tt._colSkipped < 0) {
                                                tt._colSkipped = 0;
                                              }
                                            }
                                        } else {
                                            sb.append(refLink);
                                        }
                                        // try to insert the corresponding URL
                                        ref = getRef(test.substring(charOffset+1)).trim();
                                        if (ref.equals("")) {
                                            ref = getDirectRef(test.substring(charOffset+1).trim());
                                            if (ref.equals("")) {
                                                if (tt._inTable) {
                                                    tt._colSkipped += 2; // for the [ and ]
                                                }
                                                ref = refLink;
                                                ref = ref.replaceAll("_", _urlUnderscore);
                                            } else {
                                                ref = ref.replaceAll("_", _urlUnderscore);
                                                // account for direct ref found and brackets
                                                charOffset += ref.length()+2;
                                                if (tt._inTable) {
                                                    tt._colSkipped += ref.length()+2;
                                                }
                                                if (_includeLinks) {
                                                    sb.append("(");
                                                    sb.append(ref);
                                                    sb.append(")");
                                                    if (tt._inTable) {
                                                        /**
                                                         * not sure what to do as we are adding a URL that is likely longer than the reference name
                                                         * so appending the url will blow past the column width (so at a minimum we shouldn't pad 
                                                         * with any extra spaces
                                                         */
                                                        tt._colSkipped -= 2 + ref.length();
                                                        if (tt._colSkipped < 0) {
                                                            tt._colSkipped = 0;
                                                        }
                                                    }
                                                }
                                            }
                                        } else {
                                            // skip over the reference found plus brackets [ref]
                                            charOffset+=ref.length()+2;
                                            if (tt._inTable) {
                                                tt._colSkipped += ref.length() + 2;
                                            }
                                            url = refURLs.get(ref);
                                            if (url == null) {
                                                url = getDirectRef(test.substring(charOffset+1).trim());
                                            }
                                            if (_includeLinks) {
                                                url = url.replaceAll("_", _urlUnderscore);
                                                sb.append("(");
                                                sb.append(url);
                                                sb.append(")");
                                                if (tt._inTable) {
                                                    /**
                                                     * not sure what to do as we are adding a URL that is likely longer than the reference name
                                                     * so appending the url will blow past the column width (so at a minimum we shouldn't pad 
                                                     * with any extra spaces
                                                     */
                                                    tt._colSkipped -= 2 + url.length();
                                                    if (tt._colSkipped < 0) {
                                                        tt._colSkipped = 0;
                                                    }
                                                }
                                            }
                                        }
                                        needLabel = false;
                                    }
                                    if (isImageRef) {
                                        // set up so next [ makes this false
                                        needLabel = true;
                                    }
                                }
                            } else { // at the end of the line
                                refLink = test.substring(startOffset + 1, charOffset).trim();
                                if (needLabel) {
                                    if (_includeLinks) {
                                      sb.append(_openBracket+refLink+_closeBracket);
                                      if (tt._inTable) {
                                          tt._colSkipped-=2;
                                        if (tt._colSkipped < 0) {
                                            tt._colSkipped = 0;
                                        }
                                      }
                                    } else {
                                        sb.append(refLink);
                                    }
                                    // try to insert the corresponding URL
                                    ref = getRef(test.substring(charOffset+1)).trim();
                                    if (ref.equals("")) {
                                        ref = getDirectRef(test.substring(charOffset+1).trim());
                                        if (ref.equals("")) {
                                            if (tt._inTable) {
                                                tt._colSkipped += 2; // for the [ and ]
                                            }
                                            ref = refLink;
                                            ref = ref.replaceAll("_", _urlUnderscore);
                                        } else {
                                            ref = ref.replaceAll("_", _urlUnderscore);
                                            // account for direct ref found and brackets
                                            charOffset += ref.length()+2;
                                            if (tt._inTable) {
                                                tt._colSkipped += ref.length()+2;
                                            }
                                            if (_includeLinks) {
                                                sb.append("(");
                                                sb.append(ref);
                                                sb.append(")");
                                                if (tt._inTable) {
                                                    /**
                                                     * not sure what to do as we are adding a URL that is likely longer than the reference name
                                                     * so appending the url will blow past the column width (so at a minimum we shouldn't pad 
                                                     * with any extra spaces
                                                     */
                                                    tt._colSkipped -= 2 + ref.length();
                                                    if (tt._colSkipped < 0) {
                                                        tt._colSkipped = 0;
                                                    }
                                                }
                                            }
                                        }
                                    } else {
                                        // skip over the reference found plus brackets [ref]
                                        charOffset+=ref.length()+2;
                                        if (tt._inTable) {
                                            tt._colSkipped += ref.length() + 2;
                                        }
                                        url = refURLs.get(ref);
                                        if (url != null) {
                                            if (_includeLinks) {
                                                url = url.replaceAll("_", _urlUnderscore);
                                                sb.append("(");
                                                sb.append(url);
                                                sb.append(")");
                                                if (tt._inTable) {
                                                    /**
                                                     * not sure what to do as we are adding a URL that is likely longer than the reference name
                                                     * so appending the url will blow past the column width (so at a minimum we shouldn't pad 
                                                     * with any extra spaces
                                                     */
                                                    tt._colSkipped -= 2 + url.length();
                                                    if (tt._colSkipped < 0) {
                                                        tt._colSkipped = 0;
                                                    }
                                                }
                                            }
                                        }
                                    }
                                    needLabel = false;
                                }
                                if (isImageRef) {
                                    // set up so next [ makes this false
                                    needLabel = true;
                                }
                            }
                        }
                        if (bracketCnt == 0) {
                            startOffset = -1;
                            foundBracket = false;
                            isImageRef = false;
                            needLabel = false;
                        }
                    }
                    break;
                }
                case 0x0021: { // ! (may be an image
                    if (charOffset < test.length() - 1) {
                        // check the next character
                        if (0x005b == test.charAt(charOffset + 1)) { // [
                            isImageRef = true;
                            // set up so next [ will grab the label
                            needLabel = true;
                            if (tt._inTable) {
                                tt._colSkipped++;
                            }
                        } else {
                            // just an exclamation point
                            sb.append(testChar);
                        }
                    } else {
                        // last char so just an exclamation point
                        sb.append(testChar);
                    }
                    break;
                }
                default: {
                    // capture all characters not inside a link
                    if (startOffsets.empty()) {
                        sb.append(testChar);
                    }
                    break;
                }
            }
            charOffset++;
        }
        return sb.toString();
    }

    static public String removeTableLines(String line) {
        // essentially, if the line comprises pipes, spaces, hyphens and colons then remove it
        char test = ' ';
        boolean foundOutlier = false;
        for (int i=0;i markdownList = MDfromHTMLUtils.loadTextFile(fqFileName);
        String shortFileName = fqFileName
            .substring(fqFileName.lastIndexOf(File.separator) + 1);
        int index = shortFileName.lastIndexOf("." + _ext);
        if (index >= 1) {
            html2mdProvenanceFileName = _inputPath + File.separator
                + shortFileName.substring(0, index) + "_html2md.json";
            provenanceOutputFileName = _outputPath
                + shortFileName.substring(0, index) + "_md2txt.json";
            provenance = MDfromHTMLUtils.loadJSONFile(html2mdProvenanceFileName);
            _HTMLFilters = (JSONObject) provenance.get("htmlFilters");
            String baseURI = (String) provenance.get("baseURI");
            textOutputFileName = _outputPath + shortFileName.substring(0, index)
                + "." + _txtext;

            String domain = Remark.getDomain(baseURI);
            int testindex = baseURI.indexOf(domain);
            // need to find actual domain for proper filters
            String workingURI = baseURI.substring(testindex + domain.length());
            testindex = workingURI.toLowerCase().indexOf("http");
            if (testindex >= 0) {
                workingURI = workingURI.substring(testindex);
                String urlDomain = domain;
                try {
                    urlDomain = URLDecoder.decode(domain,
                        StandardCharsets.UTF_8.name());
                } catch (Exception e) {
                    ; // leave as it was
                }
                while (urlDomain.endsWith("/") && urlDomain.length() > 1) {
                    urlDomain = urlDomain.substring(0, urlDomain.length() - 1);
                }
                if (urlDomain.startsWith("http")) {
                    int indexProtocol = urlDomain.lastIndexOf("/");
                    if (indexProtocol >= 0) {
                        domain = urlDomain.substring(indexProtocol);
                    } else {
                        domain = urlDomain;
                    }
                }
                baseURI = workingURI;
            }

            File provenanceOutputFile = new File(provenanceOutputFileName);
            if (provenanceOutputFile.exists()) {
                provenanceOutputFile.delete();
            }
            provenanceWriter = new ProvenanceWriter(fqFileName,
                textOutputFileName, _HTMLFilters, baseURI, domain,
                new FileWriter(provenanceOutputFile, true));
            try {
                StringBuffer sb = new StringBuffer();
                int lineNum = 0;
                Map refURLs = findRefURLs(markdownList);
                for (String mdLine : markdownList) {
                    lineNum++;
                    // truncate at provenance so it isn't included 
                    if (mdLine.equals("###### Provenance ######")) {
                        break;
                    }
                    String testLine = generateTextFromMarkdown(mdLine, refURLs);
                    if (testLine != null) {
                        provenanceWriter.saveMD2Text("" + lineNum, mdLine,
                            testLine);
                        sb.append(testLine);
                        sb.append("\n");
                    }
                }

                if (_thumbsucker) {
                    System.out.println("Writing: " + textOutputFileName);
                }

                MDfromHTMLUtils.saveTextFile(textOutputFileName, sb.toString());
            } finally {
                if (provenanceWriter != null) {
                    try {
                        provenanceWriter.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
        } else {
            throw new Exception(
                "Error: " + shortFileName + "doesn't end with ." + _ext);
        }
        return;
    }

    /**
     * Get the parameters necessary for program execution: the input directory,
     * the output directory, whether links are included in the text output, and
     * (optionally) whether progress messages are printed. The first three
     * values are requested through {@code MDfromHTMLUtils.prompt} when they are
     * not supplied in {@code args}; prompting for either directory also turns
     * on {@code _interactive} and {@code _thumbsucker}.
     * 
     * @param args
     *           inputPath, outputPath, includeLinksFlag (only its first
     *           character is examined; "y" in any case means include links),
     *           thumbsuckerFlag (parsed with {@link Boolean#parseBoolean}). A
     *           {@code null} array is treated as an empty one.
     * @return true if we have sufficient parameters to execute the program;
     *         false if the user answered "q" to a prompt, the input path can
     *         not be turned into a {@code Path}, or the output path does not
     *         exist or is not a directory
     */
    boolean getParams(String[] args) {
        // treat a missing argument array as "no arguments" so every value is prompted
        String[] params = (args == null) ? new String[0] : args;
        String inputPath = "." + File.separator + "data" + File.separator + "md";
        String outputPath = "." + File.separator + "data" + File.separator + "txt";
        String tmp = "";

        if (params.length >= 1) {
            inputPath = params[0];
        } else {
            _interactive = true;
            _thumbsucker = true;
            tmp = MDfromHTMLUtils.prompt(
                "Enter the fully qualified path to directory containing " + _ext
                    + " multimarkdown files, or q to exit (" + inputPath + "):");
            if (tmp == null || tmp.length() == 0) {
                tmp = inputPath;
            }
            if (tmp.equalsIgnoreCase("q")) {
                return false;
            }
            inputPath = tmp;
        }
        if (!inputPath.endsWith(File.separator)) {
            inputPath += File.separator;
        }
        try {
            _inputPath = FileSystems.getDefault().getPath(inputPath);
        } catch (InvalidPathException ipe) {
            // report the value actually used: args[0] does not exist when the
            // path was entered at the prompt
            System.out.println(
                "Error: " + inputPath + " is not a valid directory to form a path.");
            return false;
        }

        if (params.length >= 2) {
            outputPath = params[1];
        } else {
            _interactive = true;
            _thumbsucker = true;
            tmp = MDfromHTMLUtils.prompt(
                "Enter the fully qualified path to the text file output directory, or q to exit ("
                    + outputPath + "):");
            if (tmp == null || tmp.length() == 0) {
                tmp = outputPath;
            }
            if (tmp.equalsIgnoreCase("q")) {
                return false;
            }
            outputPath = tmp;
        }
        if (!outputPath.endsWith(File.separator)) {
            outputPath += File.separator;
        }
        File testOutput = new File(outputPath);
        if (!testOutput.exists()) {
            System.out.println(
                "Error: The output directory \"" + outputPath + "\" must exist.");
            return false;
        }
        if (!testOutput.isDirectory()) {
            System.out.println("Error: The output directory \"" + outputPath
                + "\" must be a directory.");
            return false;
        }
        _outputPath = outputPath;

        String includeLinks = "y";
        if (params.length >= 3) {
            // a null or blank flag keeps the default instead of failing in substring
            String flag = (params[2] == null) ? ""
                : params[2].trim().toLowerCase();
            if (!flag.isEmpty()) {
                includeLinks = flag.substring(0, 1);
            }
        } else {
            tmp = MDfromHTMLUtils.prompt(
                "Include links in text output (y=yes, n=no), or q to exit ("
                    + includeLinks + "):");
            if (tmp == null || tmp.length() == 0) {
                tmp = includeLinks;
            }
            if (tmp.equalsIgnoreCase("q")) {
                return false;
            }
            includeLinks = tmp.toLowerCase().substring(0, 1);
        }
        _includeLinks = ("y".equals(includeLinks));

        if (params.length >= 4) {
            // parseBoolean replaces the deprecated Boolean(String) constructor
            _thumbsucker = Boolean.parseBoolean(params[3]);
        }

        return true;
    }

}