// All downloads are free. Search and download functionality uses the official Maven repository (Maven / Gradle / Ivy).



import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

@SuppressWarnings({ "WeakerAccess", "SameParameterValue" })
public class FlexmarkHtmlParser {
    // Conversion option keys. Each key is created from an option name and a value (presumably the
    // default used when the option is not set — confirm against DataKey). The type argument of
    // every key below mirrors the type of that value.
    public static final DataKey<Boolean> LIST_CONTENT_INDENT = new DataKey<Boolean>("LIST_CONTENT_INDENT", true);
    public static final DataKey<Boolean> SETEXT_HEADINGS = new DataKey<Boolean>("SETEXT_HEADINGS", true);
    public static final DataKey<Boolean> OUTPUT_UNKNOWN_TAGS = new DataKey<Boolean>("OUTPUT_UNKNOWN_TAGS", false);
    public static final DataKey<Boolean> TYPOGRAPHIC_QUOTES = new DataKey<Boolean>("TYPOGRAPHIC_QUOTES", true);
    public static final DataKey<Boolean> TYPOGRAPHIC_SMARTS = new DataKey<Boolean>("TYPOGRAPHIC_SMARTS", true);
    public static final DataKey<Boolean> EXTRACT_AUTO_LINKS = new DataKey<Boolean>("EXTRACT_AUTO_LINKS", true);
    public static final DataKey<Boolean> WRAP_AUTO_LINKS = new DataKey<Boolean>("WRAP_AUTO_LINKS", false);
    public static final DataKey<Boolean> RENDER_COMMENTS = new DataKey<Boolean>("RENDER_COMMENTS", false);
    public static final DataKey<Boolean> DOT_ONLY_NUMERIC_LISTS = new DataKey<Boolean>("DOT_ONLY_NUMERIC_LISTS", true);
    public static final DataKey<Boolean> PRE_CODE_PRESERVE_EMPHASIS = new DataKey<Boolean>("PRE_CODE_PRESERVE_EMPHASIS", false);
    public static final DataKey<Character> ORDERED_LIST_DELIMITER = new DataKey<Character>("ORDERED_LIST_DELIMITER", '.');
    public static final DataKey<Character> UNORDERED_LIST_DELIMITER = new DataKey<Character>("UNORDERED_LIST_DELIMITER", '*');
    public static final DataKey<Integer> DEFINITION_MARKER_SPACES = new DataKey<Integer>("DEFINITION_MARKER_SPACES", 3);
    public static final DataKey<String> CODE_INDENT = new DataKey<String>("CODE_INDENT", "    ");
    public static final DataKey<String> NBSP_TEXT = new DataKey<String>("NBSP_TEXT", " ");
    public static final DataKey<String> EOL_IN_TITLE_ATTRIBUTE = new DataKey<String>("EOL_IN_TITLE_ATTRIBUTE", " ");
    public static final DataKey<String> THEMATIC_BREAK = new DataKey<String>("THEMATIC_BREAK", "*** ** * ** ***");

    // Table formatting keys re-exported from TableFormatOptions.
    // NOTE(review): their type arguments are declared in TableFormatOptions, which is not visible
    // here, so these four are left raw — parameterize them once the declared types are confirmed.
    public static final DataKey TABLE_MIN_SEPARATOR_COLUMN_WIDTH = TableFormatOptions.MIN_SEPARATOR_COLUMN_WIDTH;
    public static final DataKey TABLE_MIN_SEPARATOR_DASHES = TableFormatOptions.MIN_SEPARATOR_DASHES;
    public static final DataKey TABLE_LEAD_TRAIL_PIPES = TableFormatOptions.LEAD_TRAIL_PIPES;
    public static final DataKey TABLE_SPACE_AROUND_PIPES = TableFormatOptions.SPACE_AROUND_PIPES;
    public static final DataKey<Boolean> LISTS_END_ON_DOUBLE_BLANK = new DataKey<Boolean>("LISTS_END_ON_DOUBLE_BLANK", false);
    public static final DataKey<Boolean> DIV_AS_PARAGRAPH = new DataKey<Boolean>("DIV_AS_PARAGRAPH", false);
    public static final DataKey<Boolean> BR_AS_PARA_BREAKS = new DataKey<Boolean>("BR_AS_PARA_BREAKS", true);
    public static final DataKey<Boolean> BR_AS_EXTRA_BLANK_LINES = new DataKey<Boolean>("BR_AS_EXTRA_BLANK_LINES", true);

    // Lookup table from an alignment marker to the table cell alignment it selects. Keys are either
    // compiled regular expressions or literal strings (hence Object keys); the LinkedHashMap keeps
    // them in insertion order.
    // NOTE(review): presumably the patterns are matched against an alignment/style attribute value
    // and the "text-*" strings against CSS class names — the consumer is not visible here; confirm.
    private static final Map<Object, CellAlignment> tableCellAlignments = new LinkedHashMap<Object, CellAlignment>();
    static {
        tableCellAlignments.put(Pattern.compile("\\bleft\\b"), CellAlignment.LEFT);
        tableCellAlignments.put(Pattern.compile("\\bcenter\\b"), CellAlignment.CENTER);
        tableCellAlignments.put(Pattern.compile("\\bright\\b"), CellAlignment.RIGHT);
        tableCellAlignments.put("text-left", CellAlignment.LEFT);
        tableCellAlignments.put("text-center", CellAlignment.CENTER);
        tableCellAlignments.put("text-right", CellAlignment.RIGHT);
    }

    // Replacement table for typographic characters: every alternative of the two regex alternations
    // below is a key here, mapped to the plain-ASCII text that replaces it.
    // Each typographic character is listed twice on purpose: once as the literal character and once
    // as its HTML entity name. The entity names are spelled out so the two forms stay distinct keys;
    // writing the decoded character for both would make the second put() a silent no-op.
    private static final Map<String, String> specialCharsMap = new HashMap<String, String>();
    // alternation of quote-like characters, selected by the typographicQuotes option
    private static final String typographicQuotes = "“|”|‘|’|«|»|&ldquo;|&rdquo;|&lsquo;|&rsquo;|&apos;|&laquo;|&raquo;";
    // alternation of ellipsis and dash characters, selected by the typographicSmarts option
    private static final String typographicSmarts = "…|–|—|&hellip;|&endash;|&emdash;";
    static {
        specialCharsMap.put("“", "\"");
        specialCharsMap.put("”", "\"");
        specialCharsMap.put("&ldquo;", "\"");
        specialCharsMap.put("&rdquo;", "\"");
        specialCharsMap.put("‘", "'");
        specialCharsMap.put("’", "'");
        specialCharsMap.put("&lsquo;", "'");
        specialCharsMap.put("&rsquo;", "'");
        specialCharsMap.put("&apos;", "'");
        specialCharsMap.put("«", "<<");
        specialCharsMap.put("&laquo;", "<<");
        specialCharsMap.put("»", ">>");
        specialCharsMap.put("&raquo;", ">>");
        specialCharsMap.put("…", "...");
        specialCharsMap.put("&hellip;", "...");
        specialCharsMap.put("–", "--");
        specialCharsMap.put("&endash;", "--");
        specialCharsMap.put("—", "---");
        specialCharsMap.put("&emdash;", "---");
    }

    // Whole-string matchers for list item markers; processSpan() applies them to the text of
    // spans styled "mso-list:Ignore".
    // digits followed by '.', e.g. "12."
    private static final Pattern NUMERIC_DOT_LIST = Pattern.compile("^\\d+\\.$");
    // digits followed by ')', e.g. "12)"
    private static final Pattern NUMERIC_PAREN_LIST = Pattern.compile("^\\d+\\)$");
    // a roman numeral (upper or lower case, patterns supplied by RomanNumeral) or a run of only
    // lower-case or only upper-case ASCII letters, followed by '.'
    private static final Pattern NON_NUMERIC_DOT_LIST = Pattern.compile("^(?:(?:" + RomanNumeral.ROMAN_NUMERAL.pattern() + ")|(?:" + RomanNumeral.LOWERCASE_ROMAN_NUMERAL.pattern() + ")|[a-z]+|[A-Z]+)\\.$");
    // a run of only lower-case or only upper-case ASCII letters followed by ')'
    private static final Pattern NON_NUMERIC_PAREN_LIST = Pattern.compile("^(?:[a-z]+|[A-Z]+)\\)$");

    public static final DataKey> TABLE_CELL_ALIGNMENT_MAP = new DataKey>("TABLE_CELL_ALIGNMENT_MAP", tableCellAlignments);

    // options resolved once in the constructor
    private final HtmlParserOptions myOptions;
    // matcher for the enabled typographic characters; null when both typographic options are off
    private final Pattern specialCharsPattern;

    // per-parse state, (re)created by resetForParse()
    private Stack<State> myStateStack;          // dumpState() pops State instances from it
    private Map<String, String> myAbbreviations; // abbreviation text -> title, written out at the end of parse()
    private State myState;
    // when set, parse() builds a dump of the HTML tree before converting
    private boolean myTrace;
    // true while text inside an inline code element is processed; prepareText() then leaves [ ] unescaped
    private boolean myInlineCode;

    /**
     * Creates a converter configured from {@code options}.
     * <p>
     * Resolves the options and compiles the pattern used to find typographic characters: quotes,
     * smarts, both, or none ({@code null}) depending on which of the two options are enabled.
     *
     * @param options conversion options; {@link #build()} passes {@code null} here, so handling of
     *                {@code null} is left to {@code HtmlParserOptions}
     */
    private FlexmarkHtmlParser(DataHolder options) {
        myOptions = new HtmlParserOptions(options);

        if (myOptions.typographicQuotes && myOptions.typographicSmarts) {
            specialCharsPattern = Pattern.compile(typographicQuotes + "|" + typographicSmarts);
        } else if (myOptions.typographicQuotes) {
            specialCharsPattern = Pattern.compile(typographicQuotes);
        } else if (myOptions.typographicSmarts) {
            specialCharsPattern = Pattern.compile(typographicSmarts);
        } else {
            specialCharsPattern = null;
        }

        //myTrace = true;
    }

    private void resetForParse() {
        myStateStack = new Stack();
        myAbbreviations = new HashMap();
        myState = null;

    public HtmlParserOptions getOptions() {
        return myOptions;

    public boolean isTrace() {
        return myTrace;

    public void setTrace(final boolean trace) {
        myTrace = trace;

     * Parse HTML append to out
     * @param out  formatting appendable to append the resulting Markdown
     * @param html html to be parsed
    public void parse(FormattingAppendable out, String html) {

        Document document = Jsoup.parse(html);

        Element body = document.body();

        if (myTrace) {
            FormattingAppendableImpl trace = new FormattingAppendableImpl(0);
            trace.setIndentPrefix("  ");
            dumpHtmlTree(trace, body);

        processHtmlTree(out, body);

        // output abbreviations if any
        if (!myAbbreviations.isEmpty()) {
            for (Map.Entry entry : myAbbreviations.entrySet()) {
                out.line().append("*[").append(entry.getKey()).append("]: ").append(entry.getValue()).line();

     * Build parser with default options
     * @return html parser instance
    public static FlexmarkHtmlParser build() {
        return new FlexmarkHtmlParser(null);

     * Build parser
     * @param options parser options
     * @return html parser instance
    public static FlexmarkHtmlParser build(DataHolder options) {
        return new FlexmarkHtmlParser(options);

    public void dumpHtmlTree(FormattingAppendable out, Node node) {
        for (Attribute attribute : node.attributes().asList()) {
            out.append(' ').append(attribute.getKey()).append("=\"").append(attribute.getValue()).append("\"");


        for (Node child : node.childNodes()) {
            dumpHtmlTree(out, child);


     * Parse HTML with default options
     * @param html html to be parsed
     * @return resulting markdown string
    public static String parse(String html) {
        return parse(html, 1);

     * Parse HTML with default options and max trailing blank lines
     * @param html html to be parsed
     * @return resulting markdown string
    public static String parse(String html, int maxBlankLines) {
        return parse(html, maxBlankLines, null);

     * Parse HTML with given options and max trailing blank lines
     * @param html          html to be parsed
     * @param maxBlankLines max trailing blank lines, -1 will suppress trailing EOL
     * @param options       data holder for parsing options
     * @return resulting markdown string
    public static String parse(String html, int maxBlankLines, DataHolder options) {
        FormattingAppendableImpl out = new FormattingAppendableImpl(FormattingAppendable.SUPPRESS_TRAILING_WHITESPACE | FormattingAppendable.COLLAPSE_WHITESPACE);
        FlexmarkHtmlParser parser = new FlexmarkHtmlParser(options);
        parser.parse(out, html);
        return out.getText(maxBlankLines);

    private static class State {
        Node myParent;
        List myElements;
        int myIndex;

        State(final Node parent) {
            myParent = parent;
            myElements = parent.childNodes();
            myIndex = 0;

        public String toString() {
            return "State{" +
                    "myParent=" + myParent +
                    ", myElements=" + myElements +
                    ", myIndex=" + myIndex +

    private String dumpState() {
        if (!myStateStack.isEmpty()) {
            StringBuilder sb = new StringBuilder();

            while (!myStateStack.isEmpty()) {
                State state = myStateStack.pop();

            return sb.toString();
        return "";

    private void processHtmlTree(FormattingAppendable out, Node parent) {

        State oldState = myState;
        Node node;

        while ((node = peek()) != null) {
            processElement(out, node);

        if (oldState != myState) {
            throw new IllegalStateException("State not equal after process " + dumpState());


    private void processElement(FormattingAppendable out, Node node) {
        TagParam tagParam = getTagParam(node);
        boolean processed = false;

        if (tagParam != null) {
            switch (tagParam.tagType) {
                // @formatter:off
                case A                       : processed = processA(out, (Element) node); break;
                case BR                      : processed = processBr(out, (Element) node); break;
                case ABBR                    : processed = processAbbr(out, (Element) node); break;
                case ASIDE                   : processed = processAside(out, (Element) node); break;
                case BLOCKQUOTE              : processed = processBlockquote(out, (Element) node); break;
                case CODE                    : processed = processCode(out, (Element) node); break;
                case DEL                     : processed = processDel(out, (Element) node); break;
                case DIV                     : processed = processDiv(out, (Element) node); break;
                case DL                      : processed = processDl(out, (Element) node); break;
                case _EMPHASIS               : processed = processEmphasis(out, (Element) node); break;
                case HR                      : processed = processHr(out, (Element) node); break;
                case IMG                     : processed = processImg(out, (Element) node); break;
                case INPUT                   : processed = processInput(out, (Element)node); break;
                case INS                     : processed = processIns(out, (Element) node); break;
                case OL                      : processed = processOl(out, (Element) node); break;
                case P                       : processed = processP(out, (Element) node); break;
                case PRE                     : processed = processPre(out, (Element) node); break;
                case _STRONG_EMPHASIS        : processed = processStrong(out, (Element) node); break;
                case SUB                     : processed = processSub(out, (Element) node); break;
                case SUP                     : processed = processSup(out, (Element) node); break;
                case SVG                     : processed = processSvg(out, (Element) node); break;
                case UL                      : processed = processUl(out, (Element) node); break;
                case LI                      : processed = processList(out, (Element) node, false, true); break;
                case TABLE                   : processed = processTable(out, (Element) node); break;
                case _UNWRAPPED              : processed = processUnwrapped(out, node); break;
                case _SPAN                   : processed = processSpan(out, (Element) node); break;
                case _WRAPPED                : processed = processWrapped(out, node, tagParam.param == null); break;
                case _COMMENT                : processed = processComment(out, (Comment)node); break;
                case _HEADING                : processed = processHeading(out, (Element) node); break;
                case _TEXT                   : processed = processText(out, (TextNode)node); break;
                // @formatter:on
        } else {
            if (node instanceof Element) {
                processed = processEmoji(out, (Element) node);

        if (!processed) {
            // wrap markdown in HTML tag
            if (myOptions.outputUnknownTags) processWrapped(out, node, null);
            else processUnwrapped(out, node);

    private boolean processWrapped(final FormattingAppendable out, final Node node, Boolean isBlock) {
        if (node instanceof Element && (isBlock == null && ((Element) node).isBlock() || isBlock != null && isBlock)) {
            String s = node.toString();
            int pos = s.indexOf(">");
            out.lineIf(isBlock != null).append(s.substring(0, pos + 1)).lineIf(isBlock != null);

            processHtmlTree(out, node);

            int endPos = s.lastIndexOf("<");
            out.lineIf(isBlock != null).append(s.substring(endPos)).lineIf(isBlock != null);
        } else {
        return true;

    private String prepareText(String text) {
        if (specialCharsPattern != null) {
            Matcher matcher = specialCharsPattern.matcher(text);
            int length = text.length();
            StringBuilder sb = new StringBuilder(length * 2);
            int lastPos = 0;

            while (matcher.find()) {
                if (lastPos < matcher.start()) {
                    sb.append(text, lastPos, matcher.start());
                lastPos = matcher.end();

            if (lastPos < length) {
                sb.append(text, lastPos, length);

            text = sb.toString();
        text = text.replace("\\", "\\\\").replace("\u00A0", myOptions.nbspText);
        if (!myInlineCode) {
            // replace [ ] by escaped versions
            text = text.replace("[", "\\[").replace("]", "\\]");

        return text;

    private boolean processText(FormattingAppendable out, TextNode node) {
        if (out.isPreFormatted()) {
        } else {
        return true;

    private void processTextNodes(FormattingAppendable out, Node node) {
        processTextNodes(out, node, null, null);

    private void processTextNodes(FormattingAppendable out, Node node, CharSequence wrapText) {
        processTextNodes(out, node, wrapText, wrapText);

    private void processTextNodes(FormattingAppendable out, Node node, CharSequence textPrefix, CharSequence textSuffix) {

        Node child;

        while ((child = peek()) != null) {
            if (child instanceof TextNode) {
                if (textPrefix != null && textPrefix.length() > 0) out.append(textPrefix);
                String text = ((TextNode) child).getWholeText();
                String preparedText = prepareText(text);
                if (textSuffix != null && textSuffix.length() > 0) out.append(textSuffix);
            } else if (child instanceof Element) {
                processElement(out, child);
            } else {


    private void wrapTextNodes(FormattingAppendable out, Node node, CharSequence wrapText, boolean needSpaceAround) {
        String text = processTextNodes(node);
        String prefixBefore = null;
        String appendAfter = null;
        boolean addSpaceBefore = false;
        boolean addSpaceAfter = false;

        if (!text.isEmpty() && needSpaceAround) {
            if ("\u00A0 \t\n".indexOf(text.charAt(0)) != -1) {
                prefixBefore = prepareText(text.substring(0, 1));
                text = text.substring(1);
            } else if (text.startsWith(" ")) {
                prefixBefore = " ";
                text = text.substring(prefixBefore.length());
            } else {
                // if we already have space or nothing before us
                addSpaceBefore = !(out.isPendingEOL() || out.isPendingSpace() || out.getModCount() == 0);

            if (!text.isEmpty() && "\u00A0 \t\n".indexOf(text.charAt(text.length() - 1)) != -1) {
                appendAfter = prepareText(text.substring(text.length() - 1));
                text = text.substring(0, text.length() - 1);
            } else if (text.endsWith(" ")) {
                appendAfter = " ";
                text = text.substring(0, text.length() - appendAfter.length());
            } else {
                // if next is not text space
                Node next = peek();
                addSpaceAfter = true;

                if (next instanceof TextNode) {
                    String nextText = ((TextNode) next).getWholeText();
                    if (!nextText.isEmpty() && Character.isWhitespace(nextText.charAt(0))) {
                        addSpaceAfter = false;

        if (!text.isEmpty()) {
            // need to trim end of string
            int pos = text.length() - 1;
            while (pos >= 0 && Character.isWhitespace(text.charAt(pos))) pos--;

            if (pos > 0) {
                if (prefixBefore != null) out.append(prefixBefore);
                if (addSpaceBefore) out.append(' ');

                text = text.substring(0, pos);

                if (appendAfter != null) out.append(appendAfter);
                if (addSpaceAfter) out.append(' ');

    private String processTextNodes(Node node) {

        Node child;
        FormattingAppendable out = new FormattingAppendableImpl(0);

        while ((child = peek()) != null) {
            if (child instanceof TextNode) {
                String text = ((TextNode) child).getWholeText();
            } else if (child instanceof Element) {
                processElement(out, child);

        return out.getText();

    private boolean processA(FormattingAppendable out, Element element) {

        // see if it is an anchor or a link
        if (element.hasAttr("href")) {
            String text = Utils.removeStart(Utils.removeEnd(processTextNodes(element), '\n'), '\n');
            String href = element.attr("href");
            String title = element.hasAttr("title") ? element.attr("title") : null;

            if (!text.isEmpty() || !href.contains("#")) {
                if (myOptions.extractAutoLinks && href.equals(text) && (title == null || title.isEmpty())) {
                    if (myOptions.wrapAutoLinks) out.append('<');
                    if (myOptions.wrapAutoLinks) out.append('>');
                } else {
                    if (title != null) out.append(" \"").append(element.attr("title").replace("\n", myOptions.eolInTitleAttribute).replace("\"", "\\\"")).append('"');
        } else {
            processTextNodes(out, element);
        return true;

    // Handler for <br> elements (dispatched from processElement for tag type BR).
    // For normal (not pre-formatted) output the SUPPRESS_TRAILING_WHITESPACE and COLLAPSE_WHITESPACE
    // options are cleared on `out` while the break is written, so the trailing spaces survive;
    // the saved options are put back before returning.
    // NOTE(review): the pre-formatted branch is empty here, and from the line after
    // `final String s = ...` onwards the rest of this method and all following definitions are
    // collapsed onto a few very long lines, with string literals split across lines and line
    // comments swallowing the code behind them. Restore that region from the upstream source
    // before editing it.
    private boolean processBr(FormattingAppendable out, Element element) {
        if (out.isPreFormatted()) {
        } else {
            int options = out.getOptions();
            out.setOptions(options & ~(FormattingAppendable.SUPPRESS_TRAILING_WHITESPACE | FormattingAppendable.COLLAPSE_WHITESPACE));
            if (out.getPendingEOL() == 0) {
                // hard break
                // (two trailing spaces followed by a line end)
                out.repeat(' ', 2).line();
            } else {
                // a line end is already pending; exactly one pending EOL is examined further
                if (out.getPendingEOL() == 1) {
                    final String s = out.getAppendable().toString();
                    if (!s.endsWith("
")) { // this is a paragraph break if (myOptions.brAsParaBreaks) { out.blankLine(); } } else { // this is blank line insertion via
if (myOptions.brAsExtraBlankLines) { out.append("
").line(); } } } else { // this is blank line insertion via
if (myOptions.brAsExtraBlankLines) { out.append("
").line(); } } } out.setOptions(options); } return true; } private boolean processAbbr(FormattingAppendable out, Element element) { skip(); // see if it is an anchor or a link if (element.hasAttr("title")) { String text = processTextNodes(element).trim(); myAbbreviations.put(text, element.attr("title")); } return true; } private boolean processAside(FormattingAppendable out, Element element) { skip(); out.pushPrefix(); out.addPrefix("| "); processHtmlTree(out, element); out.popPrefix(); return true; } private boolean processBlockquote(FormattingAppendable out, Element element) { skip(); out.pushPrefix(); out.addPrefix("> "); processHtmlTree(out, element); out.popPrefix(); return true; } private boolean processCode(FormattingAppendable out, Element element) { skip(); BasedSequence text = SubSequence.of(element.ownText()); int backTickCount = getMaxRepeatedChars(text, '`', 1); CharSequence backTicks = RepeatedCharSequence.of("`", backTickCount); boolean oldInlineCode = myInlineCode; myInlineCode = true; processTextNodes(out, element, backTicks); myInlineCode = oldInlineCode; return true; } private int getMaxRepeatedChars(final CharSequence text, final char c, int minCount) { BasedSequence chars = BasedSequenceImpl.of(text); int lastPos = 0; while (lastPos < chars.length()) { int pos = chars.indexOf(c, lastPos); if (pos < 0) break; int count = chars.countChars(c, pos); if (minCount <= count) minCount = count + 1; lastPos = pos + count; } return minCount; } private boolean processDel(FormattingAppendable out, Element element) { skip(); if (!myOptions.preCodePreserveEmphasis && out.isPreFormatted()) { wrapTextNodes(out, element, "", false); } else { wrapTextNodes(out, element, "~~", true); } return true; } private boolean processEmphasis(FormattingAppendable out, Element element) { skip(); if (!myOptions.preCodePreserveEmphasis && out.isPreFormatted()) { wrapTextNodes(out, element, "", false); } else { wrapTextNodes(out, element, "*", true); } return true; } private 
boolean processHeading(FormattingAppendable out, Element element) { skip(); TagParam tagParam = getTagParam(element); if (tagParam != null) { int level = (Integer) tagParam.param; if (level >= 1 && level <= 6) { String headingText = processTextNodes(element).trim(); if (!headingText.isEmpty()) { out.blankLine(); if (myOptions.setextHeadings && level <= 2) { out.append(headingText); out.line().repeat(level == 1 ? '=' : '-', Utils.minLimit(headingText.length(), myOptions.minSetextHeadingMarkerLength)); } else { out.repeat('#', level).append(' '); out.append(headingText); } out.blankLine(); } } } return true; } private boolean processHr(FormattingAppendable out, Element element) { skip(); out.blankLine().append(myOptions.thematicBreak).blankLine(); return true; } private boolean processImg(FormattingAppendable out, Element element) { skip(); // see if it is an anchor or a link if (element.hasAttr("src")) { String src = element.attr("src"); EmojiCheatSheet.EmojiShortcut shortcut = EmojiCheatSheet.getImageShortcut(src); if (shortcut != null) { out.append(':').append(':'); } else { out.append("!["); if (element.hasAttr("alt")) out.append(element.attr("alt")); out.append(']'); out.append('('); int pos = src.indexOf('?'); int eol = pos < 0 ? 
pos : src.indexOf("%0A", pos); if (pos > 0 && eol > 0) { out.append(src, 0, pos + 1); String decoded = Utils.urlDecode(src.substring(pos + 1).replace("+", "%2B"), "UTF8"); out.line().append(decoded); } else { out.append(src); } if (element.hasAttr("title")) out.append(" \"").append(element.attr("title").replace("\n", myOptions.eolInTitleAttribute).replace("\"", "\\\"")).append('"'); out.append(")"); } } return true; } private boolean processP(FormattingAppendable out, Element element) { skip(); boolean isItemParagraph = false; boolean isDefinitionItemParagraph = false; Element firstElementSibling = element.firstElementSibling(); if (firstElementSibling == null || element == firstElementSibling) { String tagName = element.parent().tagName(); isItemParagraph = tagName.equalsIgnoreCase("li"); isDefinitionItemParagraph = tagName.equalsIgnoreCase("dd"); } out.blankLineIf(!(isItemParagraph || isDefinitionItemParagraph)); processTextNodes(out, element); out.blankLineIf(isItemParagraph || isDefinitionItemParagraph).line(); return true; } private boolean processInput(FormattingAppendable out, Element element) { boolean isItemParagraph = false; Element firstElementSibling = element.firstElementSibling(); if (firstElementSibling == null || element == firstElementSibling) { String tagName = element.parent().tagName(); isItemParagraph = tagName.equalsIgnoreCase("li"); } if (isItemParagraph) { if (element.hasAttr("type") && "checkbox".equalsIgnoreCase(element.attr("type"))) { skip(); if (element.hasAttr("checked")) { out.append("[x] "); } else { out.append("[ ] "); } return true; } } return false; } private boolean processIns(FormattingAppendable out, Element element) { skip(); if (!myOptions.preCodePreserveEmphasis && out.isPreFormatted()) { wrapTextNodes(out, element, "", false); } else { wrapTextNodes(out, element, "++", true); } return true; } private boolean processStrong(FormattingAppendable out, Element element) { skip(); if (!myOptions.preCodePreserveEmphasis && 
out.isPreFormatted()) { wrapTextNodes(out, element, "", false); } else { wrapTextNodes(out, element, "**", true); } return true; } private boolean processSub(FormattingAppendable out, Element element) { skip(); if (!myOptions.preCodePreserveEmphasis && out.isPreFormatted()) { wrapTextNodes(out, element, "", false); } else { wrapTextNodes(out, element, "~", false); } return true; } private boolean processSup(FormattingAppendable out, Element element) { skip(); if (!myOptions.preCodePreserveEmphasis && out.isPreFormatted()) { wrapTextNodes(out, element, "", false); } else { wrapTextNodes(out, element, "^", false); } return true; } private boolean processSvg(FormattingAppendable out, Element element) { if (element.hasClass("octicon")) { skip(); return true; } return false; } private boolean processPre(FormattingAppendable out, Element element) { pushState(element); Node next = peek(); String text; boolean hadCode = false; String className = ""; if (next != null && next.nodeName().equalsIgnoreCase("code")) { hadCode = true; Element code = (Element) next; //text = code.toString(); FormattingAppendable preText = new FormattingAppendableImpl(out.getOptions() & ~(FormattingAppendable.COLLAPSE_WHITESPACE | FormattingAppendable.SUPPRESS_TRAILING_WHITESPACE)); preText.openPreFormatted(false); processHtmlTree(preText, code); preText.closePreFormatted(); text = preText.getText(2); skip(1); className = Utils.removeStart(code.className(), "language-"); } else { FormattingAppendable preText = new FormattingAppendableImpl(out.getOptions() & ~(FormattingAppendable.COLLAPSE_WHITESPACE | FormattingAppendable.SUPPRESS_TRAILING_WHITESPACE)); preText.openPreFormatted(false); processHtmlTree(preText, element); preText.closePreFormatted(); text = preText.getText(2); } //int start = text.indexOf('>'); //int end = text.lastIndexOf('<'); //text = text.substring(start + 1, end); //text = Escaping.unescapeHtml(text); int backTickCount = getMaxRepeatedChars(text, '`', 3); CharSequence backTicks 
= RepeatedCharSequence.of("`", backTickCount); if (!className.isEmpty() || text.trim().isEmpty() || !hadCode) { out.blankLine().append(backTicks); if (!className.isEmpty()) { out.append(className); } out.line(); out.openPreFormatted(true); out.append(text.isEmpty() ? "\n" : text); out.closePreFormatted(); out.line().append(backTicks).blankLine(); } else { // we indent the whole thing by 4 spaces out.blankLine(); out.pushPrefix(); out.addPrefix(myOptions.codeIndent); out.openPreFormatted(true); out.append(text.isEmpty() ? "\n" : text); out.closePreFormatted(); out.blankLine(); out.popPrefix(); } popState(); next(); return true; } private static class ListState { boolean isNumbered; int itemCount; ListState(final boolean isNumbered) { this.isNumbered = isNumbered; itemCount = 0; } String getItemPrefix(final HtmlParserOptions options) { if (isNumbered) { return String.format("%d%c ", itemCount, options.orderedListDelimiter); } else { return String.format("%c ", options.unorderedListDelimiter); } } } private void processListItem(FormattingAppendable out, Element item, ListState listState) { skip(); pushState(item); listState.itemCount++; CharSequence itemPrefix = listState.getItemPrefix(this.myOptions); CharSequence childPrefix = RepeatedCharSequence.of(" ", myOptions.listContentIndent ? 
itemPrefix.length() : 4); out.line().append(itemPrefix); out.pushPrefix(); out.addPrefix(childPrefix); int offset = out.offset(); processHtmlTree(out, item); if (offset == out.offset()) { // completely empty, add space and make sure it is not suppressed out.append(' '); out.flushWhitespaces(); } out.line(); out.popPrefix(); popState(); } private boolean hasListItemParent(Element element) { Element parent = element.parent(); while (parent != null) { if (parent.tagName().equalsIgnoreCase("li")) { return true; } parent = parent.parent(); } return false; } private boolean processList(FormattingAppendable out, Element element, boolean isNumbered, boolean isFakeList) { if (!isFakeList) { skip(); pushState(element); } final Element previousElementSibling = element.previousElementSibling(); final String tag = previousElementSibling == null ? null : previousElementSibling.tagName().toUpperCase(); if (tag != null && tag.equals(element.tagName().toUpperCase()) && (tag.equals("UL") || tag.equals("OL"))) { if (myOptions.listsEndOnDoubleBlank) { out.blankLine(2); } else { out.line().append("").line(); } } ListState listState = new ListState(isNumbered); Node item; while ((item = peek()) != null) { TagParam tagParam = getTagParam(item); if (tagParam != null) { switch (tagParam.tagType) { case LI: processListItem(out, (Element) item, listState); break; case P: processListItem(out, (Element) item, listState); break; default: skip(); break; } } else { processWrapped(out, item, true); } } out.blankLine(); if (!isFakeList) { popState(); } return true; } private boolean processOl(FormattingAppendable out, Element element) { return processList(out, element, true, false); } private boolean processUl(FormattingAppendable out, Element element) { return processList(out, element, false, false); } private void processDefinition(FormattingAppendable out, Element item) { pushState(item); int options = out.getOptions(); Elements children = item.children(); boolean firstIsPara = false; if 
(!children.isEmpty() && children.get(0).tagName().equalsIgnoreCase("p")) { // we need a blank line out.blankLine(); firstIsPara = true; } CharSequence childPrefix = RepeatedCharSequence.of(" ", myOptions.listContentIndent ? myOptions.definitionMarkerSpaces + 1 : 4); out.line().setOptions(options & ~FormattingAppendable.COLLAPSE_WHITESPACE); out.append(':').repeat(' ', myOptions.definitionMarkerSpaces); out.pushPrefix(); out.addPrefix(childPrefix); out.setOptions(options); if (firstIsPara) { processHtmlTree(out, item); } else { processTextNodes(out, item); } out.popPrefix(); popState(); } private boolean processDl(FormattingAppendable out, Element element) { skip(); pushState(element); Node item; boolean lastWasDefinition = true; boolean firstItem = true; while ((item = next()) != null) { TagParam tagParam = getTagParam(item); if (tagParam != null) { switch (tagParam.tagType) { case DT: out.blankLineIf(lastWasDefinition).lineIf(!firstItem); processTextNodes(out, item); out.line(); lastWasDefinition = false; firstItem = false; break; case DD: processDefinition(out, (Element) item); lastWasDefinition = true; firstItem = false; break; default: break; } } else { processWrapped(out, item, true); } } popState(); return true; } private boolean processDiv(FormattingAppendable out, Element element) { // unwrap and process content skip(); processHtmlTree(out, element); out.line(); if (myOptions.divAsParagraph) out.blankLine(); return true; } private boolean processUnwrapped(FormattingAppendable out, Node element) { // unwrap and process content skip(); processHtmlTree(out, element); return true; } private boolean processSpan(FormattingAppendable out, Element element) { // unwrap and process content if (element.hasAttr("style")) { String style = element.attr("style"); if (style.equals("mso-list:Ignore")) { skip(); String text = processTextNodes(element); if (NUMERIC_DOT_LIST.matcher(text).matches()) { out.append(text).append(' '); } else if 
(NUMERIC_PAREN_LIST.matcher(text).matches()) { if (myOptions.dotOnlyNumericLists) { out.append(text, 0, text.length() - 1).append(". "); } else { out.append(text).append(' '); } } else if (NON_NUMERIC_DOT_LIST.matcher(text).matches()) { out.append("1. "); } else if (NON_NUMERIC_PAREN_LIST.matcher(text).matches()) { if (myOptions.dotOnlyNumericLists) { out.append("1. "); } else { out.append("1) "); } } else if (text.equals("·")) { out.append("* "); } else { out.append("* ").append(text); } return true; } } skip(); processHtmlTree(out, element); return true; } private boolean processEmoji(FormattingAppendable out, Element element) { if (element.tagName().equalsIgnoreCase("g-emoji")) { if (element.hasAttr("alias")) { skip(); out.append(':').append(element.attr("alias")).append(':'); return true; } if (element.hasAttr("fallback-src")) { EmojiCheatSheet.EmojiShortcut shortcut = EmojiCheatSheet.getImageShortcut(element.attr("fallback-src")); if (shortcut != null) { skip(); out.append(':').append(':'); return true; } } } return false; } private boolean processComment(FormattingAppendable out, Comment element) { skip(); if (myOptions.renderComments) { out.append(""); } return true; } private TagParam getTagParam(final Node node) { return ourTagProcessors.get(node.nodeName().toLowerCase()); } private Table myTable; private boolean processTable(FormattingAppendable out, Element table) { Table oldTable = myTable; skip(); pushState(table); myTable = new Table(myOptions.tableOptions); Node item; while ((item = next()) != null) { TagParam tagParam = getTagParam(item); if (tagParam != null) { switch (tagParam.tagType) { case CAPTION: processTableCaption(out, (Element) item); break; case TBODY: myTable.setHeading(false); processTableSection(out, (Element) item); break; case THEAD: myTable.setHeading(true); processTableSection(out, (Element) item); break; case TR: Element tableRow = (Element) item; Elements children = tableRow.children(); myTable.setHeading(!children.isEmpty() && 
children.get(0).tagName().equalsIgnoreCase("th")); processTableRow(out, (Element) item); break; } } else { // nothing //processWrapped(out, item, null); } } myTable.finalizeTable(); int sepColumns = myTable.getMaxColumns(); if (sepColumns > 0) { out.blankLine(); myTable.appendTable(out); out.blankLine(); } myTable = oldTable; popState(); return true; } private void processTableSection(FormattingAppendable out, Element element) { pushState(element); Node node; while ((node = next()) != null) { TagParam tagParam = getTagParam(node); if (tagParam != null) { switch (tagParam.tagType) { case TR: { Element tableRow = (Element) node; Elements children = tableRow.children(); boolean wasHeading = myTable.isHeading(); if (!children.isEmpty()) { if (children.get(0).tagName().equalsIgnoreCase("th")) { myTable.setHeading(true); } } processTableRow(out, tableRow); myTable.setHeading(wasHeading); break; } } } else { processWrapped(out, element, true); } } popState(); } private void processTableRow(FormattingAppendable out, Element element) { pushState(element); Node node; while ((node = next()) != null) { TagParam tagParam = getTagParam(node); if (tagParam != null) { switch (tagParam.tagType) { // @formatter:off case TH : processTableCell(out, (Element) node); break; case TD : processTableCell(out, (Element) node); break; // @formatter:on } } else { processWrapped(out, element, true); } } myTable.nextRow(); popState(); } private void processTableCaption(FormattingAppendable out, Element element) { myTable.setCaption(processTextNodes(element).trim()); } private void processTableCell(FormattingAppendable out, Element element) { String cellText = processTextNodes(element).trim().replaceAll("\\s*\n\\s*", " "); int colSpan = 1; int rowSpan = 1; CellAlignment alignment = null; if (element.hasAttr("colSpan")) { try { colSpan = Integer.parseInt(element.attr("colSpan")); } catch (NumberFormatException ignored) { } } if (element.hasAttr("rowSpan")) { try { rowSpan = 
Integer.parseInt(element.attr("rowSpan")); } catch (NumberFormatException ignored) { } } if (element.hasAttr("align")) { alignment = CellAlignment.getAlignment(element.attr("align")); } else { // see if has class that matches final Set classNames = element.classNames(); if (!classNames.isEmpty()) { for (String clazz : classNames) { CellAlignment cellAlignment = myOptions.tableCellAlignmentMap.get(clazz); if (cellAlignment != null) { alignment = cellAlignment; break; } } if (alignment == null) { // see if we have matching patterns for (Object key : myOptions.tableCellAlignmentMap.keySet()) { if (key instanceof Pattern) { Pattern pattern = (Pattern) key; for (String clazz : classNames) { if (pattern.matcher(clazz).find()) { // have a match alignment = myOptions.tableCellAlignmentMap.get(key); break; } } if (alignment != null) break; } } } } } // skip cells defined by row spans in previous rows myTable.addCell(new Table.TableCell(SubSequence.NULL, cellText, BasedSequence.NULL, rowSpan, colSpan, alignment)); } private void pushState(Node parent) { myStateStack.push(myState); myState = new State(parent); } private void popState() { if (myStateStack.isEmpty()) throw new IllegalStateException("popState with an empty stack"); myState = myStateStack.pop(); } private Node peek() { if (myState.myIndex < myState.myElements.size()) return myState.myElements.get(myState.myIndex); return null; } private Node peek(int skip) { if (myState.myIndex + skip >= 0 && myState.myIndex + skip < myState.myElements.size()) return myState.myElements.get(myState.myIndex + skip); return null; } private Node next() { Node next = peek(); if (next != null) myState.myIndex++; return next; } private void skip() { Node next = peek(); if (next != null) myState.myIndex++; } private Node next(int skip) { if (skip > 0) { Node next = peek(skip - 1); if (next != null) myState.myIndex += skip; return next; } return peek(); } @SuppressWarnings("SameParameterValue") private void skip(int skip) { if (skip > 0) 
{ Node next = peek(skip - 1); if (next != null) myState.myIndex += skip; } } enum TagType { A, ABBR, ASIDE, BR, BLOCKQUOTE, CAPTION, CODE, DEL, DIV, DL, DD, DT, _EMPHASIS, HR, IMG, INPUT, INS, LI, OL, P, PRE, _STRONG_EMPHASIS, SUB, SUP, SVG, TABLE, TBODY, TD, TH, THEAD, TR, UL, _HEADING, _UNWRAPPED, _SPAN, _WRAPPED, _COMMENT, _TEXT, } private static class TagParam { final TagType tagType; final Object param; TagParam(final TagType tagType, final Object param) { this.tagType = tagType; this.param = param; } static TagParam tag(TagType tagType, Object param) { return new TagParam(tagType, param); } } private final static Map ourTagProcessors = new HashMap(); static { ourTagProcessors.put("a", TagParam.tag(TagType.A, null)); ourTagProcessors.put("abbr", TagParam.tag(TagType.ABBR, null)); ourTagProcessors.put("aside", TagParam.tag(TagType.ASIDE, null)); ourTagProcessors.put("br", TagParam.tag(TagType.BR, null)); ourTagProcessors.put("blockquote", TagParam.tag(TagType.BLOCKQUOTE, null)); ourTagProcessors.put("caption", TagParam.tag(TagType.CAPTION, null)); ourTagProcessors.put("code", TagParam.tag(TagType.CODE, null)); ourTagProcessors.put("del", TagParam.tag(TagType.DEL, null)); ourTagProcessors.put("div", TagParam.tag(TagType.DIV, null)); ourTagProcessors.put("dd", TagParam.tag(TagType.DD, null)); ourTagProcessors.put("dl", TagParam.tag(TagType.DL, null)); ourTagProcessors.put("dt", TagParam.tag(TagType.DT, null)); ourTagProcessors.put("i", TagParam.tag(TagType._EMPHASIS, null)); ourTagProcessors.put("em", TagParam.tag(TagType._EMPHASIS, null)); ourTagProcessors.put("h1", TagParam.tag(TagType._HEADING, 1)); ourTagProcessors.put("h2", TagParam.tag(TagType._HEADING, 2)); ourTagProcessors.put("h3", TagParam.tag(TagType._HEADING, 3)); ourTagProcessors.put("h4", TagParam.tag(TagType._HEADING, 4)); ourTagProcessors.put("h5", TagParam.tag(TagType._HEADING, 5)); ourTagProcessors.put("h6", TagParam.tag(TagType._HEADING, 6)); ourTagProcessors.put("hr", TagParam.tag(TagType.HR, 
null)); ourTagProcessors.put("img", TagParam.tag(TagType.IMG, null)); ourTagProcessors.put("input", TagParam.tag(TagType.INPUT, null)); ourTagProcessors.put("ins", TagParam.tag(TagType.INS, null)); ourTagProcessors.put("u", TagParam.tag(TagType.INS, null)); ourTagProcessors.put("li", TagParam.tag(TagType.LI, null)); ourTagProcessors.put("ol", TagParam.tag(TagType.OL, null)); ourTagProcessors.put("p", TagParam.tag(TagType.P, null)); ourTagProcessors.put("pre", TagParam.tag(TagType.PRE, null)); ourTagProcessors.put("b", TagParam.tag(TagType._STRONG_EMPHASIS, null)); ourTagProcessors.put("strong", TagParam.tag(TagType._STRONG_EMPHASIS, null)); ourTagProcessors.put("strike", TagParam.tag(TagType.DEL, null)); ourTagProcessors.put("sub", TagParam.tag(TagType.SUB, null)); ourTagProcessors.put("sup", TagParam.tag(TagType.SUP, null)); ourTagProcessors.put("svg", TagParam.tag(TagType.SVG, null)); ourTagProcessors.put("table", TagParam.tag(TagType.TABLE, null)); ourTagProcessors.put("tbody", TagParam.tag(TagType.TBODY, null)); ourTagProcessors.put("td", TagParam.tag(TagType.TD, null)); ourTagProcessors.put("th", TagParam.tag(TagType.TH, null)); ourTagProcessors.put("thead", TagParam.tag(TagType.THEAD, null)); ourTagProcessors.put("tr", TagParam.tag(TagType.TR, null)); ourTagProcessors.put("ul", TagParam.tag(TagType.UL, null)); ourTagProcessors.put("#comment", TagParam.tag(TagType._COMMENT, null)); ourTagProcessors.put("#text", TagParam.tag(TagType._TEXT, null)); // keep wrapped ourTagProcessors.put("kbd", TagParam.tag(TagType._WRAPPED, false)); ourTagProcessors.put("var", TagParam.tag(TagType._WRAPPED, false)); // tags ignored, contents processed ourTagProcessors.put("article", TagParam.tag(TagType._UNWRAPPED, null)); ourTagProcessors.put("address", TagParam.tag(TagType._UNWRAPPED, null)); ourTagProcessors.put("article", TagParam.tag(TagType._UNWRAPPED, null)); ourTagProcessors.put("frameset", TagParam.tag(TagType._UNWRAPPED, null)); ourTagProcessors.put("section", 
TagParam.tag(TagType._UNWRAPPED, null)); ourTagProcessors.put("span", TagParam.tag(TagType._SPAN, null)); ourTagProcessors.put("small", TagParam.tag(TagType._UNWRAPPED, null)); ourTagProcessors.put("iframe", TagParam.tag(TagType._UNWRAPPED, null)); } }

© 2015 - 2024 Weber Informatics LLC | Privacy Policy