All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.mdfromhtml.markdown.transform.GetTextFromMarkdown Maven / Gradle / Ivy

/**
 * (c) Copyright 2019-2020 IBM Corporation
 * 1 New Orchard Road, 
 * Armonk, New York, 10504-1722
 * United States
 * +1 914 499 1900
 * support: Nathaniel Mills [email protected]
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package com.mdfromhtml.markdown.transform;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.net.URLDecoder;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileSystems;
import java.nio.file.InvalidPathException;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.api.json.JSONObject;
import com.mdfromhtml.core.MDfromHTMLUtils;
import com.overzealous.remark.Options;
import com.overzealous.remark.Remark;
import com.overzealous.remark.convert.ProvenanceWriter;

/**
 * Utility class to transform multimarkdown generated from HTML into text files.
 * Files are read from an input directory, and written to an output directory.
 * 
 * @author Nathaniel Mills
 */
public class GetTextFromMarkdown {

    /**
    * Controls how links are rendered in the generated text. When true, the
    * link label is written wrapped in the bracket placeholders and followed by
    * the link target in parentheses; when false only the label text is written.
    */
    static public boolean _includeLinks = false;

    /**
    * Collects the reference-style link definitions found in the supplied
    * markdown lines. A definition is a (trimmed) line that starts with "[" and
    * contains "]: " after at least one character of reference name, e.g.
    * <code>[name]: http://example.com</code>.
    * 
    * @param mdLines
    *           markdown lines to scan; a null list or null entries are skipped
    * @return map of trimmed reference name to trimmed URL, where every
    *         underscore in the URL has been replaced by the underscore
    *         placeholder so later underscore processing leaves it alone. When
    *         a name is defined more than once the last definition wins.
    */
    static public Map<String, String> findRefURLs(List<String> mdLines) {
        Map<String, String> refURLs = new HashMap<>();
        if (mdLines == null) {
            return refURLs;
        }
        for (String mdLine : mdLines) {
            if (mdLine == null) {
                continue;
            }
            String candidate = mdLine.trim();
            if (!candidate.startsWith("[")) {
                continue;
            }
            int offset = candidate.indexOf("]: ");
            // offset > 1 guarantees a non-empty name between "[" and "]"
            if (offset > 1) {
                String ref = candidate.substring(1, offset).trim();
                String url = candidate.substring(offset + 3).trim();
                url = url.replaceAll("_", _urlUnderscore);
                refURLs.put(ref, url);
            }
        }
        return refURLs;
    }

    /**
    * Converts a single markdown line into plain text. The line is trimmed; a
    * line that is empty after trimming is returned as the empty string without
    * further processing. Otherwise comments are filtered out and the remainder
    * is passed through {@link #processText(String, Map)}.
    * 
    * @param mdLine
    *           markdown line to transform
    * @param refURLs
    *           map of reference name to URL handed on to processText
    * @return the text for the line; this is whatever processText returns, which
    *         is null when processText signals that the line should be dropped
    */
    static public String generateTextFromMarkdown(String mdLine, Map refURLs) {
        String trimmed = mdLine.trim();
        if (trimmed.isEmpty()) {
            return trimmed;
        }
        String withoutComments = TextUtils.filterComments(trimmed);
        return processText(withoutComments, refURLs);
    }

    /**
    * Extracts the trimmed text between the first "[" in the line and the next
    * "]" that follows it. Nothing is extracted when the first ")" in the line
    * appears before that "[".
    * 
    * NOTE(review): the closing bracket position is measured relative to the
    * opening bracket but is compared against the opening bracket's absolute
    * position, so a reference is only returned when that relative distance
    * exceeds the index of "[". Callers skip ahead by the length of the
    * returned reference, so this is kept as is.
    * 
    * @param line
    *           text to examine
    * @return the trimmed reference name, or an empty string if none qualifies
    */
    static public String getRef(String line) {
        int open = line.indexOf('[');
        if (open < 0) {
            return "";
        }
        int firstCloseParen = line.indexOf(')');
        if (firstCloseParen != -1 && open >= firstCloseParen) {
            return "";
        }
        int closeAbs = line.indexOf(']', open);
        if (closeAbs < 0) {
            return "";
        }
        // distance from "[" to "]"; must exceed the index of "[" (see note)
        int relativeClose = closeAbs - open;
        if (relativeClose <= open) {
            return "";
        }
        return line.substring(open + 1, closeAbs).trim();
    }

    /**
    * Extracts the trimmed text between the first "(" in the line and the next
    * ")" that follows it, i.e. the target of a direct link such as
    * <code>(http://example.com)</code>.
    * 
    * NOTE(review): the closing parenthesis position is measured relative to
    * the character after "(" but is compared against the absolute index of
    * "(", so a target is only returned when its untrimmed length exceeds that
    * index (an empty "()" therefore yields nothing). Callers skip ahead by the
    * length of the returned target, so this is kept as is.
    * 
    * @param line
    *           text to examine
    * @return the trimmed link target, or an empty string if none qualifies
    */
    static public String getDirectRef(String line) {
        int open = line.indexOf('(');
        if (open < 0) {
            return "";
        }
        int closeAbs = line.indexOf(')', open + 1);
        if (closeAbs < 0) {
            return "";
        }
        // number of characters between "(" and ")"
        int relativeClose = closeAbs - (open + 1);
        if (relativeClose <= open) {
            return "";
        }
        return line.substring(open + 1, closeAbs).trim();
    }

    /**
    * Command line entry point. Loads the optional HTML filter definitions
    * (continuing with none when they can not be loaded), parses the arguments,
    * optionally prints a summary and asks for confirmation, then calls
    * doWork for every file with the configured extension found under the
    * input path. A failure on one file is reported and the remaining files are
    * still processed.
    * 
    * @param args
    *           command line arguments, interpreted by getParams; when no
    *           arguments were supplied "Goodbye" is printed before returning
    * @throws Exception
    *            if construction or argument parsing fails
    */
    public static void main(String[] args) throws Exception {
        String filterPath = "." + File.separator + "properties"
            + File.separator + "HTML_Filters.json";
        JSONObject HTMLFilters = null;
        try {
            HTMLFilters = MDfromHTMLUtils.loadJSONFile(filterPath);
        } catch (Exception e1) {
            // best effort: run without filters rather than abort
            System.out.println(
                "Warning: Using no HTML Filters -- can not find \"" + filterPath
                    + "\": " + e1.getLocalizedMessage());
        }
        GetTextFromMarkdown pgm = new GetTextFromMarkdown(Options.multiMarkdown(),
            HTMLFilters);
        if (pgm.getParams(args)) {
            if (pgm._thumbsucker) {
                System.out.println("\nFiles ending with ." + pgm._ext
                    + " will be read from " + pgm._inputPath //
                    + "\nand the generated text files (." + pgm._txtext
                    + ") will be " + "saved in " + pgm._outputPath
                    + "\nIt is " + _includeLinks + " that links will be included in the text output."); //
            }
            if (pgm._interactive) {
                // an empty reply (just Enter) means continue; any other reply
                // leaves _interactive set and no files are processed
                if (MDfromHTMLUtils
                    .prompt("Press q to quit or press Enter to continue...")
                    .length() == 0) {
                    pgm._interactive = false;
                }
            }
            if (!pgm._interactive) {
                try {
                    List<Path> files = MDfromHTMLUtils.listSourceFiles(
                        FileSystems.getDefault().getPath(pgm._inputPath.toString()),
                        pgm._ext);
                    for (Path file : files) {
                        try {
                            pgm.doWork(file);
                        } catch (Exception e) {
                            System.out.println("Error: Could not perform work for file \"" + file.toString() + "\": " + e.getLocalizedMessage());
                        }
                    }
                } catch (Exception e) {
                    System.out
                        .println("Error: Can not reference files with extension "
                            + pgm._ext + " in directory " + pgm._inputPath
                            + " reason: " + e.getLocalizedMessage());
                }
            }
            if (pgm._thumbsucker) {
                System.out.println();
            }
        }
        if (args.length == 0) {
            System.out.println("Goodbye");
        }
    }

    /**
    * Strips markdown syntax from a single line and returns the remaining text.
    * The line is scanned character by character; each markdown trigger
    * character hands the rest of the line to the matching remove* helper, and
    * scanning restarts at the beginning of whatever that helper returns.
    * Ordinary characters are copied to the output unchanged. Afterwards the
    * bracket and underscore placeholders are turned back into "[", "]" and
    * "_", bullets become "*", and the comparison entities (&amp;gt;, &amp;lt;,
    * &amp;ge;, &amp;le;) are replaced by their text equivalents.
    * 
    * @param line
    *           markdown line to be cleansed
    * @param refURLs
    *           map of reference name to URL, passed on to
    *           removeReferencesAndLinks
    * @return the cleansed text, or null when the input is null or when a link
    *         starting with "[" causes removeReferencesAndLinks to signal that
    *         the line should be dropped
    */
    static public String processText(String line, Map<String, String> refURLs) {
        if (line == null) {
            return null;
        }
        StringBuilder sb = new StringBuilder();
        String testChanged = "";
        int lineLen = line.length();
        TextTracker tt = new TextTracker();
        int offset = 0;
        scan:
        while (offset < lineLen) {
            char startChar = line.charAt(offset);
            switch (startChar) {
                // handle escaped characters first
                case '\\': {
                    if (tt._inTable) {
                        tt._colSkipped++;
                    }
                    // skip first backslash and save next char
                    offset++;
                    if (offset < lineLen) {
                        sb.append(line.charAt(offset));
                        offset++;
                    }
                    if (offset < lineLen) {
                        line = line.substring(offset);
                        lineLen = line.length();
                    } else {
                        // reached end of line (ends the while loop)
                        lineLen = 0;
                        tt._inTable = false;
                        tt._colSkipped = 0;
                    }
                    offset = 0;
                    break;
                }
                case '!': { // exclamation or image link
                    if (offset < lineLen - 1) {
                        if (line.charAt(offset + 1) == '[') {
                            // image link
                            line = removeReferencesAndLinks(line.substring(offset),
                                refURLs, tt);
                            if (line == null) {
                                // nothing more to keep from this line; return
                                // what has been collected so far
                                tt._inTable = false;
                                tt._colSkipped = 0;
                                break scan;
                            }
                            offset = 0;
                            lineLen = line.length();
                        } else { // just an exclamation point
                            line = line.substring(offset + 1);
                            offset = 0;
                            lineLen = line.length();
                        }
                    } else { // just an exclamation point at end of line
                        offset = line.length(); // signal end of while loop
                        tt._inTable = false;
                        tt._colSkipped = 0;
                    }
                    sb.append("!");
                    break;
                }
                case '[': { // left bracket == link
                    line = removeReferencesAndLinks(line.substring(offset), refURLs, tt);
                    tt._inTable = false;
                    tt._colSkipped = 0;
                    // check for complete line deletion
                    if (line == null) {
                        return null;
                    }
                    offset = 0;
                    lineLen = line.length();
                    break;
                }
                // Table lines
                case '|': { // pipe == table column separator
                    if (tt._inTable) {
                        // pad with one space per character skipped in the column
                        while (tt._colSkipped > 0) {
                            sb.append(" ");
                            tt._colSkipped--;
                        }
                    }
                    tt._inTable = true;
                    line = removeTableLines(line.substring(offset));
                    offset = 0;
                    lineLen = line.length();
                    if (lineLen == 0) {
                        // end of the line ends the table row
                        tt._colSkipped = 0;
                        tt._inTable = false;
                    }
                    break;
                }
                /**
                 * Note: these simple cleansers should be below link cases "!" and
                 * "[" and tables
                 */
                case '-': { // hyphen or task list "- [x]" or "- [ ]"
                    testChanged = removeTaskList(line.substring(offset), tt);
                    if (testChanged.equals(line.substring(offset))) {
                        // just a hyphen, no change
                        sb.append("-");
                        line = testChanged.substring(1);
                    } else {
                        line = testChanged;
                    }
                    offset = 0;
                    lineLen = line.length();
                    break;
                }
                case '`': { // back tick == fenced code blocks or code
                    line = removeFencing(line.substring(offset), tt);
                    offset = 0;
                    lineLen = line.length();
                    break;
                }
                case '#': { // hash tag == headings
                    line = removeHeading(line.substring(offset), tt);
                    offset = 0;
                    lineLen = line.length();
                    break;
                }
                case '_': { // underscore
                    line = removeUnderscore(line.substring(offset), tt);
                    offset = 0;
                    lineLen = line.length();
                    break;
                }
                case '~': { // tilde == strike through or fencing
                    // first remove fencing
                    line = removeFencing(line.substring(offset), tt);
                    line = removeEmphasis(line, tt);
                    offset = 0;
                    lineLen = line.length();
                    break;
                }
                case '*': { // asterisk == bold, italic
                    line = removeEmphasis(line.substring(offset), tt);
                    offset = 0;
                    lineLen = line.length();
                    break;
                }
                default: { // just text
                    sb.append(startChar);
                    offset++;
                    break;
                }
            }
        }
        String temp = sb.toString();
        if (temp.contains("\\\\:")) {
            temp = temp.replaceAll("\\\\:", ":");
        }
        if (temp.contains("\\\\-")) {
            temp = temp.replaceAll("\\\\-", "-");
        }
        // special processing to replace placeholders with brackets
        temp = temp.replaceAll(_openBracket, "[");
        temp = temp.replaceAll(_closeBracket, "]");
        temp = temp.replaceAll(_urlUnderscore, "_");
        // #202
        temp = temp.replace("\u2022", "*");
        // #84 -- the search strings must be the HTML entity names; searching
        // for ">" or "<" themselves would replace nothing
        temp = temp.replace("&gt;", ">");
        temp = temp.replace("&lt;", "<");
        temp = temp.replace("&ge;", ">=");
        temp = temp.replace("\u2265", ">=");
        // consume the entity with its semicolon first so no stray ";" is left
        temp = temp.replace("&le;", "<=");
        temp = temp.replace("&le", "<=");
        return temp;
    }

    /**
    * Removes the emphasis markers "~" and "*" from the line. An asterisk that
    * starts the line or follows a space, and is itself followed by a space, is
    * treated as a list bullet and replaced by the bullet character; an
    * asterisk directly after a backslash is kept. While in a table, every
    * removed marker is counted in the tracker, and processing stops at the
    * first pipe with the rest of the line (pipe included) returned untouched.
    * 
    * @param line
    *           input to be cleansed
    * @param tt
    *           track whether we are in a table and the number of columns
    *           skipped
    * @return cleansed version of input
    */
    static public String removeEmphasis(String line, TextTracker tt) {
        StringBuilder cleaned = new StringBuilder();
        final int length = line.length();
        char previous = 0x00;
        for (int idx = 0; idx < length; idx++) {
            char current = line.charAt(idx);
            // 0x00 stands in for "no next character"
            char following = (idx + 1 < length) ? line.charAt(idx + 1) : 0x00;
            if (current == '~') {
                if (tt._inTable) {
                    tt._colSkipped++;
                }
            } else if (current == '*') {
                boolean atBulletPosition = (idx == 0 || previous == ' ');
                if (atBulletPosition && following == ' ') {
                    cleaned.append('\u2022');
                } else if (previous == '\\') {
                    cleaned.append(current);
                } else if (tt._inTable) {
                    tt._colSkipped++;
                }
            } else if (current == '|' && tt._inTable) {
                // keep the pipe and everything after it for the table handler
                cleaned.append(line.substring(idx));
                return cleaned.toString();
            } else {
                cleaned.append(current);
            }
            previous = current;
        }
        return cleaned.toString();
    }

    /**
    * Remove patterns like ```json and ~~~java (a fence immediately followed by
    * word characters) and then every remaining back tick. While in a table
    * only the text before the first pipe is cleansed; the pipe and whatever
    * follows are appended back unchanged, and the number of characters removed
    * is added to the tracker.
    * 
    * @param line
    *           input to be cleansed
    * @param tt
    *           track whether we are in a table and the number of columns
    *           skipped
    * @return cleansed version of input
    */
    static private String removeFencing(String line, TextTracker tt) {
        String untouchedTail = "";
        int pipeAt = line.indexOf("|");
        if (tt._inTable && pipeAt >= 0) {
            // only work on the part before the pipe
            untouchedTail = line.substring(pipeAt);
            line = line.substring(0, pipeAt);
        }

        if (tt._inTable) {
            // count what the first removal below will take out
            // (presumably _fencePattern1 mirrors the back tick fence regex)
            Matcher firstPass = _fencePattern1.matcher(line);
            while (firstPass.find()) {
                tt._colSkipped += firstPass.end() - firstPass.start();
            }
        }
        line = line.replaceAll("```\\w+", "");

        if (tt._inTable) {
            // count what the second removal below will take out
            // (presumably _fencePattern2 mirrors the tilde fence regex)
            Matcher secondPass = _fencePattern2.matcher(line);
            while (secondPass.find()) {
                tt._colSkipped += secondPass.end() - secondPass.start();
            }
        }
        line = line.replaceAll("~~~\\w+", "");

        // while fencing is three ticks, single ticks connote inline code
        StringBuilder cleaned = new StringBuilder(line.length());
        for (char current : line.toCharArray()) {
            if (current != '`') {
                cleaned.append(current);
            } else if (tt._inTable) {
                tt._colSkipped++;
            }
        }
        return cleaned.append(untouchedTail).toString();
    }

    /**
    * Drops every "#" from the line so heading text is left without its
    * markers. While in a table each dropped "#" is counted in the tracker,
    * and processing stops at the first pipe: the text cleansed so far is
    * returned followed by the pipe and the rest of the line unchanged.
    * 
    * @param line
    *           text to be cleansed of headers
    * @param tt
    *           track whether we are in a table and the number of columns
    *           skipped
    * @return text without headers
    */
    static private String removeHeading(String line, TextTracker tt) {
        StringBuilder kept = new StringBuilder();
        final int length = line.length();
        for (int pos = 0; pos < length; pos++) {
            char current = line.charAt(pos);
            if (current == '#') {
                if (tt._inTable) {
                    tt._colSkipped++;
                }
                continue;
            }
            if (current == '|' && tt._inTable) {
                // leave the column separator for the table handler
                return kept.toString() + line.substring(pos);
            }
            kept.append(current);
        }
        return kept.toString();
    }

    /**
    * Search for reference links in the markdown line. A reference link contains
    * a pattern with [...] without a following ": "
    * 
    * @param line
    *           markdown line to be examined
    * @param refURLs
    *           map of a reference to its corresponding URL to enable the URL to
    *           be added where a reference is made. 
    * @param tt track whether we are in a table and the number of columns skipped
    * @return the revised line stripped of links, or null if nothing from this
    *         line should be saved (e.g., for a reference with pattern [...]:...
    */
    static public String removeReferencesAndLinks(String line, Map refURLs, TextTracker tt) {
        StringBuffer sb = new StringBuffer();
        String test = line.trim();
        boolean foundBracket = false;
        int bracketCnt = 0;
        int startOffset = -1;
        String refLink = "";
        String url = "";
        String ref = "";
        // String reference = "";
        Stack startOffsets = new Stack();
        boolean isImageRef = false;
        boolean needLabel = false;
        int charOffset = 0;
        while (charOffset < test.length()) {
            char testChar = test.charAt(charOffset);
            switch (testChar) {
                case 0x005b: { // "["
                    if (!isImageRef) {
                        // flipflop need for label to skip refLink
                        needLabel = !needLabel;
                    } // else in an image link so don't capture anything

                    if (!foundBracket) {
                        foundBracket = true;
                    }
                    startOffsets.push(charOffset);
                    bracketCnt++;
                    break;
                }
                case 0x005d: { // "]"
                    if (foundBracket) {
                        bracketCnt--;
                        startOffset = startOffsets.pop();
                        if ((charOffset - startOffset) > 1) {
                            if ((charOffset + 1) < test.length()) {
                                if (0x003a == test.charAt(charOffset + 1)) { // ":"
                                    /**
                                     * Don't save any reference information (signal line
                                     * deletion with null
                                     */
                                    return null;
                                } else {
                                    refLink = test.substring(startOffset + 1, charOffset).trim();
                                    refLink = refLink.replaceAll("_", _urlUnderscore);
                                    if (needLabel && startOffsets.isEmpty()) {
                                        if (_includeLinks) {
                                            sb.append(_openBracket+refLink+_closeBracket);
                                            if (tt._inTable) {
                                              tt._colSkipped-=2;;
                                              if (tt._colSkipped < 0) {
                                                tt._colSkipped = 0;
                                              }
                                            }
                                        } else {
                                            sb.append(refLink);
                                        }
                                        // try to insert the corresponding URL
                                        ref = getRef(test.substring(charOffset+1)).trim();
                                        if (ref.equals("")) {
                                            ref = getDirectRef(test.substring(charOffset+1).trim());
                                            if (ref.equals("")) {
                                                if (tt._inTable) {
                                                    tt._colSkipped += 2; // for the [ and ]
                                                }
                                                ref = refLink;
                                                ref = ref.replaceAll("_", _urlUnderscore);
                                            } else {
                                                ref = ref.replaceAll("_", _urlUnderscore);
                                                // account for direct ref found and brackets
                                                charOffset += ref.length()+2;
                                                if (tt._inTable) {
                                                    tt._colSkipped += ref.length()+2;
                                                }
                                                if (_includeLinks) {
                                                    sb.append("(");
                                                    sb.append(ref);
                                                    sb.append(")");
                                                    if (tt._inTable) {
                                                        /**
                                                         * not sure what to do as we are adding a URL that is likely longer than the reference name
                                                         * so appending the url will blow past the column width (so at a minimum we shouldn't pad 
                                                         * with any extra spaces
                                                         */
                                                        tt._colSkipped -= 2 + ref.length();
                                                        if (tt._colSkipped < 0) {
                                                            tt._colSkipped = 0;
                                                        }
                                                    }
                                                }
                                            }
                                        } else {
                                            // skip over the reference found plus brackets [ref]
                                            charOffset+=ref.length()+2;
                                            if (tt._inTable) {
                                                tt._colSkipped += ref.length() + 2;
                                            }
                                            url = refURLs.get(ref);
                                            if (url == null) {
                                                url = getDirectRef(test.substring(charOffset+1).trim());
                                            }
                                            if (_includeLinks) {
                                                url = url.replaceAll("_", _urlUnderscore);
                                                sb.append("(");
                                                sb.append(url);
                                                sb.append(")");
                                                if (tt._inTable) {
                                                    /**
                                                     * not sure what to do as we are adding a URL that is likely longer than the reference name
                                                     * so appending the url will blow past the column width (so at a minimum we shouldn't pad 
                                                     * with any extra spaces
                                                     */
                                                    tt._colSkipped -= 2 + url.length();
                                                    if (tt._colSkipped < 0) {
                                                        tt._colSkipped = 0;
                                                    }
                                                }
                                            }
                                        }
                                        needLabel = false;
                                    }
                                    if (isImageRef) {
                                        // set up so next [ makes this false
                                        needLabel = true;
                                    }
                                }
                            } else { // at the end of the line
                                refLink = test.substring(startOffset + 1, charOffset).trim();
                                if (needLabel) {
                                    if (_includeLinks) {
                                      sb.append(_openBracket+refLink+_closeBracket);
                                      if (tt._inTable) {
                                          tt._colSkipped-=2;
                                        if (tt._colSkipped < 0) {
                                            tt._colSkipped = 0;
                                        }
                                      }
                                    } else {
                                        sb.append(refLink);
                                    }
                                    // try to insert the corresponding URL
                                    ref = getRef(test.substring(charOffset+1)).trim();
                                    if (ref.equals("")) {
                                        ref = getDirectRef(test.substring(charOffset+1).trim());
                                        if (ref.equals("")) {
                                            if (tt._inTable) {
                                                tt._colSkipped += 2; // for the [ and ]
                                            }
                                            ref = refLink;
                                            ref = ref.replaceAll("_", _urlUnderscore);
                                        } else {
                                            ref = ref.replaceAll("_", _urlUnderscore);
                                            // account for direct ref found and brackets
                                            charOffset += ref.length()+2;
                                            if (tt._inTable) {
                                                tt._colSkipped += ref.length()+2;
                                            }
                                            if (_includeLinks) {
                                                sb.append("(");
                                                sb.append(ref);
                                                sb.append(")");
                                                if (tt._inTable) {
                                                    /**
                                                     * not sure what to do as we are adding a URL that is likely longer than the reference name
                                                     * so appending the url will blow past the column width (so at a minimum we shouldn't pad 
                                                     * with any extra spaces
                                                     */
                                                    tt._colSkipped -= 2 + ref.length();
                                                    if (tt._colSkipped < 0) {
                                                        tt._colSkipped = 0;
                                                    }
                                                }
                                            }
                                        }
                                    } else {
                                        // skip over the reference found plus brackets [ref]
                                        charOffset+=ref.length()+2;
                                        if (tt._inTable) {
                                            tt._colSkipped += ref.length() + 2;
                                        }
                                        url = refURLs.get(ref);
                                        if (url != null) {
                                            if (_includeLinks) {
                                                url = url.replaceAll("_", _urlUnderscore);
                                                sb.append("(");
                                                sb.append(url);
                                                sb.append(")");
                                                if (tt._inTable) {
                                                    /**
                                                     * not sure what to do as we are adding a URL that is likely longer than the reference name
                                                     * so appending the url will blow past the column width (so at a minimum we shouldn't pad 
                                                     * with any extra spaces
                                                     */
                                                    tt._colSkipped -= 2 + url.length();
                                                    if (tt._colSkipped < 0) {
                                                        tt._colSkipped = 0;
                                                    }
                                                }
                                            }
                                        }
                                    }
                                    needLabel = false;
                                }
                                if (isImageRef) {
                                    // set up so next [ makes this false
                                    needLabel = true;
                                }
                            }
                        }
                        if (bracketCnt == 0) {
                            startOffset = -1;
                            foundBracket = false;
                            isImageRef = false;
                            needLabel = false;
                        }
                    }
                    break;
                }
                case 0x0021: { // ! (may be an image
                    if (charOffset < test.length() - 1) {
                        // check the next character
                        if (0x005b == test.charAt(charOffset + 1)) { // [
                            isImageRef = true;
                            // set up so next [ will grab the label
                            needLabel = true;
                            if (tt._inTable) {
                                tt._colSkipped++;
                            }
                        } else {
                            // just an exclamation point
                            sb.append(testChar);
                        }
                    } else {
                        // last char so just an exclamation point
                        sb.append(testChar);
                    }
                    break;
                }
                default: {
                    // capture all characters not inside a link
                    if (startOffsets.empty()) {
                        sb.append(testChar);
                    }
                    break;
                }
            }
            charOffset++;
        }
        return sb.toString();
    }

    static public String removeTableLines(String line) {
        // essentially, if the line comprises pipes, spaces, hyphens and colons then remove it
        char test = ' ';
        boolean foundOutlier = false;
        for (int i=0;i markdownList = MDfromHTMLUtils.loadTextFile(fqFileName);
        String shortFileName = fqFileName
            .substring(fqFileName.lastIndexOf(File.separator) + 1);
        int index = shortFileName.lastIndexOf("." + _ext);
        if (index >= 1) {
            html2mdProvenanceFileName = _inputPath + File.separator
                + shortFileName.substring(0, index) + "_html2md.json";
            provenanceOutputFileName = _outputPath
                + shortFileName.substring(0, index) + "_md2txt.json";
            provenance = MDfromHTMLUtils.loadJSONFile(html2mdProvenanceFileName);
            _HTMLFilters = (JSONObject) provenance.get("htmlFilters");
            String baseURI = (String) provenance.get("baseURI");
            textOutputFileName = _outputPath + shortFileName.substring(0, index)
                + "." + _txtext;

            String domain = Remark.getDomain(baseURI);
            int testindex = baseURI.indexOf(domain);
            // need to find actual domain for proper filters
            String workingURI = baseURI.substring(testindex + domain.length());
            testindex = workingURI.toLowerCase().indexOf("http");
            if (testindex >= 0) {
                workingURI = workingURI.substring(testindex);
                String urlDomain = domain;
                try {
                    urlDomain = URLDecoder.decode(domain,
                        StandardCharsets.UTF_8.name());
                } catch (Exception e) {
                    ; // leave as it was
                }
                while (urlDomain.endsWith("/") && urlDomain.length() > 1) {
                    urlDomain = urlDomain.substring(0, urlDomain.length() - 1);
                }
                if (urlDomain.startsWith("http")) {
                    int indexProtocol = urlDomain.lastIndexOf("/");
                    if (indexProtocol >= 0) {
                        domain = urlDomain.substring(indexProtocol);
                    } else {
                        domain = urlDomain;
                    }
                }
                baseURI = workingURI;
            }

            File provenanceOutputFile = new File(provenanceOutputFileName);
            if (provenanceOutputFile.exists()) {
                provenanceOutputFile.delete();
            }
            provenanceWriter = new ProvenanceWriter(fqFileName,
                textOutputFileName, _HTMLFilters, baseURI, domain,
                new FileWriter(provenanceOutputFile, true));
            try {
                StringBuffer sb = new StringBuffer();
                int lineNum = 0;
                Map refURLs = findRefURLs(markdownList);
                for (String mdLine : markdownList) {
                    lineNum++;
                    // truncate at provenance so it isn't included 
                    if (mdLine.equals("###### Provenance ######")) {
                        break;
                    }
                    String testLine = generateTextFromMarkdown(mdLine, refURLs);
                    if (testLine != null) {
                        provenanceWriter.saveMD2Text("" + lineNum, mdLine,
                            testLine);
                        sb.append(testLine);
                        sb.append("\n");
                    }
                }

                if (_thumbsucker) {
                    System.out.println("Writing: " + textOutputFileName);
                }

                MDfromHTMLUtils.saveTextFile(textOutputFileName, sb.toString());
            } finally {
                if (provenanceWriter != null) {
                    try {
                        provenanceWriter.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
        } else {
            throw new Exception(
                "Error: " + shortFileName + "doesn't end with ." + _ext);
        }
        return;
    }

    /**
     * Collects the settings needed to run the program, taking each one from
     * the command line when supplied and otherwise prompting for it.
     * <p>
     * On success this sets {@code _inputPath}, {@code _outputPath} and
     * {@code _includeLinks}. Prompting for the input or output directory also
     * turns on {@code _interactive} and {@code _thumbsucker}. The output
     * directory must already exist and be a directory; errors are reported on
     * standard output.
     *
     * @param args
     *           positional arguments, all optional: [0] input directory,
     *           [1] output directory, [2] include-links flag (only the first
     *           letter is examined, "y" means yes), [3] thumbsucker flag
     *           ("true" enables it). A null array is treated as empty.
     * @return true if we have sufficient parameters to execute the program;
     *         false if the user entered q at a prompt, the input path could
     *         not be formed, or the output directory is unusable
     */
    boolean getParams(String[] args) {
        // A missing argument array behaves like an empty one: prompt for everything.
        String[] params = (args == null) ? new String[0] : args;
        String inputPath = "." + File.separator + "data" + File.separator + "md";
        String outputPath = "." + File.separator + "data" + File.separator + "txt";
        String tmp = "";

        try {
            if (params.length >= 1) {
                inputPath = params[0];
            } else {
                _interactive = true;
                _thumbsucker = true;
                tmp = MDfromHTMLUtils.prompt(
                    "Enter the fully qualified path to directory containing " + _ext
                        + " multimarkdown files, or q to exit (" + inputPath + "):");
                if (tmp == null || tmp.length() == 0) {
                    // empty reply accepts the default shown in the prompt
                    tmp = inputPath;
                }
                if (tmp.toLowerCase().equals("q")) {
                    return false;
                }
                inputPath = tmp;
            }
            if (inputPath.endsWith(File.separator) == false) {
                inputPath += File.separator;
            }
            _inputPath = FileSystems.getDefault().getPath(inputPath);
        } catch (InvalidPathException ipe) {
            // Report the value actually used; it may have come from the prompt,
            // in which case there is no args[0] to print.
            System.out.println(
                "Error: " + inputPath + " is not a valid directory to form a path.");
            return false;
        }
        if (params.length >= 2) {
            outputPath = params[1];
        } else {
            _interactive = true;
            _thumbsucker = true;
            tmp = MDfromHTMLUtils.prompt(
                "Enter the fully qualified path to the text file output directory, or q to exit ("
                    + outputPath + "):");
            if (tmp == null || tmp.length() == 0) {
                tmp = outputPath;
            }
            if (tmp.toLowerCase().equals("q")) {
                return false;
            }
            outputPath = tmp;
        }
        if (outputPath.endsWith(File.separator) == false) {
            outputPath += File.separator;
        }
        File testOutput = new File(outputPath);
        if (testOutput.exists() == false) {
            System.out.println(
                "Error: The output directory \"" + outputPath + "\" must exist.");
            return false;
        }
        if (testOutput.isDirectory() == false) {
            System.out.println("Error: The output directory \"" + outputPath
                + "\" must be a directory.");
            return false;
        }
        _outputPath = outputPath;

        String includeLinks = "y";
        if (params.length >= 3) {
            String requested = params[2].trim().toLowerCase();
            // a blank argument keeps the default rather than failing on substring
            if (requested.length() > 0) {
                includeLinks = requested.substring(0, 1);
            }
        } else {
            tmp = MDfromHTMLUtils.prompt(
                "Include links in text output (y=yes, n=no), or q to exit ("
                    + includeLinks + "):");
            if (tmp == null || tmp.length() == 0) {
                tmp = includeLinks;
            }
            if (tmp.toLowerCase().equals("q")) {
                return false;
            }
            includeLinks = tmp.toLowerCase().substring(0, 1);
        }
        _includeLinks = ("y".equals(includeLinks));

        if (params.length >= 4) {
            // parseBoolean yields the same value as the deprecated Boolean(String) constructor
            _thumbsucker = Boolean.parseBoolean(params[3]);
        }

        return true;
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy