All downloads are free. The search and download functionality uses the official Maven repository.

com.composum.ai.backend.base.service.chat.impl.HtmlToMarkdownConverter Maven / Gradle / Ivy

package com.composum.ai.backend.base.service.chat.impl;

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;

import org.jsoup.Jsoup;
import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.Comment;
import org.jsoup.nodes.DataNode;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.DocumentType;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.nodes.XmlDeclaration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A quick HTML markdown converter that handles the tags [a, strong, code, em, p, br, u, ul, li, ol] used in rich text editor.
 * Not threadsafe, use only once.
 * 

* We do not want to use a library since the libraries doing this have many parts and are quite some work to deploy, and * we only need to convert a few tags from richtext editors. * Original generated by ChatGPT with "Please make an HTML to Markdown converter that handles the tags [a, strong, code, em, p, br, u, ul, li, ol] . Use the jsoup library for that." * but some heavy rewrite. */ public class HtmlToMarkdownConverter { private static final Logger LOG = LoggerFactory.getLogger(HtmlToMarkdownConverter.class); private static Set missingTags = new ConcurrentSkipListSet<>(); private static final Map HEADER_TAGS = new HashMap<>(); {{ HEADER_TAGS.put("title", "# "); HEADER_TAGS.put("h1", "# "); HEADER_TAGS.put("h2", "## "); HEADER_TAGS.put("h3", "### "); HEADER_TAGS.put("h4", "#### "); HEADER_TAGS.put("h5", "##### "); HEADER_TAGS.put("h6", "###### "); }} /** * Important table attributes we need to keep. */ private static final List TABLE_ATTRIBUTES = Arrays.asList("border", "colspan", "rowspan", "align", "valign", "scope", "cellpadding", "cellspacing", "width", "height", "bgcolor"); // continued indentation. 
Two spaces since four would be code block private final String indentStep = " "; // continued indentation that is inserted before a continuation line private String continuedIndentation = ""; protected StringBuilder sb = new StringBuilder(); @Nonnull public String convert(@Nullable String html) { sb.setLength(0); continuedIndentation = ""; if (html != null) { Document doc = Jsoup.parseBodyFragment(html); convertElement(doc.body()); } return sb.toString(); } private void convertNode(Node node) { if (node instanceof TextNode) { TextNode textNode = (TextNode) node; insertText(textNode.text()); } else if (node instanceof Element) { Element element = (Element) node; convertElement(element); } else if (node instanceof Comment || node instanceof DocumentType || node instanceof XmlDeclaration) { // no text content } else if (node instanceof DataNode) { // not quite sure what to do with this, but this is very likely not text content. } else { throw new UnsupportedOperationException("Unknown node type " + node.getClass()); } } /** * Split text into lines to add indentation before each line. */ protected void insertText(String text) { if (text != null) { String splitText = Stream.of(text.split("\n")) .collect(Collectors.joining(continuedIndentation + "\n")); if (sb.length() > 0 && sb.charAt(sb.length() - 1) == '\n') { // only happens if we have mixed text and block level elements within a block level element sb.append(continuedIndentation); } sb.append(splitText); } } protected void convertChildren(Node node) { for (Node child : node.childNodes()) { convertNode(child); } } /** * Convention: a block level element has to print a newline before itself, also after itself. 
*/ private void convertElement(Element element) { String tagName = element.tagName().toLowerCase(); String oldindentation; switch (tagName) { case "a": sb.append("["); convertChildren(element); sb.append("]("); sb.append(element.attr("href")); String title = element.attr("title"); if (StringUtil.isBlank(title)) { title = element.attr("alt"); } if (!StringUtil.isBlank(title)) { sb.append(" \""); sb.append(title.replaceAll("\"", "\\\"")); sb.append("\""); } sb.append(")"); break; case "em": case "u": sb.append("_"); convertChildren(element); sb.append("_"); break; case "b": case "strong": sb.append("**"); convertChildren(element); sb.append("**"); break; case "i": sb.append("*"); convertChildren(element); sb.append("*"); break; case "del": case "s": sb.append("~~"); convertChildren(element); sb.append("~~"); break; case "code": sb.append("`"); convertChildren(element); sb.append("`"); break; case "pre": // TODO: a pre code nesting would be wrong. sb.append("\n```\n"); sb.append(element.html().replaceAll("\\s+$", "")); sb.append("\n```\n"); break; case "p": sb.append("\n"); convertChildren(element); sb.append("\n"); break; case "br": sb.append("\n"); break; case "ul": oldindentation = continuedIndentation; continuedIndentation += indentStep; for (Element li : element.children()) { sb.append("\n" + oldindentation + "- "); convertChildren(li); } sb.append("\n"); continuedIndentation = oldindentation; break; case "ol": oldindentation = continuedIndentation; int i = 1; for (Element li : element.children()) { String prefix = (i++) + ". "; continuedIndentation = oldindentation + prefix.replaceAll(".", " "); sb.append("\n" + oldindentation + prefix); convertChildren(li); } sb.append("\n"); continuedIndentation = oldindentation; break; case "li": throw new UnsupportedOperationException("Bug: li outside of ul or ol"); case "img": case "video": // video is not quite properly treated here, but likely not very relevant. 
sb.append("!["); sb.append(element.attr("alt")); sb.append("]("); sb.append(element.attr("src")); sb.append(")"); break; case "h1": case "h2": case "h3": case "h4": case "h5": case "h6": case "title": ensureEmptyOrEndsWith("\n\n"); String prefix = HEADER_TAGS.get(tagName); sb.append(prefix); convertChildren(element); sb.append("\n"); break; case "hr": sb.append("\n"); sb.append(continuedIndentation); sb.append("---\n"); break; case "input": String type = element.attr("type"); String placeholder = element.attr("placeholder"); sb.append("[Input: Type="); sb.append(type.isEmpty() ? "text" : type); if (!placeholder.isEmpty()) { sb.append(", Placeholder="); sb.append(placeholder); } sb.append("]"); break; case "dl": // there is no markdown for dl, so we just use embedded HTML. Possibly it could also be // **term** // definition // oldindentation = continuedIndentation; continuedIndentation += indentStep; sb.append("\n" + oldindentation + "

"); convertChildren(element); sb.append("\n" + oldindentation + "
\n"); continuedIndentation = oldindentation; break; case "dt": sb.append("\n" + continuedIndentation + "
"); convertChildren(element); sb.append("
"); break; case "dd": sb.append("\n" + continuedIndentation + "
"); convertChildren(element); sb.append("
"); break; case "blockquote": oldindentation = continuedIndentation; continuedIndentation += "> "; sb.append("\n"); sb.append(continuedIndentation); convertChildren(element); sb.append("\n"); continuedIndentation = oldindentation; break; case "mark": case "small": case "ins": case "sub": case "sup": // use embedded HTML syntax sb.append("<").append(tagName).append(">"); convertChildren(element); sb.append(""); break; case "#root": case "html": case "body": case "span": case "div": case "section": case "button": // ignore the tag convertChildren(element); break; case "noscript": case "meta": case "nav": case "script": case "link": // ignore the content, too break; // rudimentary support for tables case "table": case "thead": case "tbody": case "tfoot": case "tr": case "td": case "th": // use embedded HTML syntax sb.append("<").append(tagName); for (String attr : TABLE_ATTRIBUTES) { String value = element.attr(attr); if (!value.trim().isEmpty()) { sb.append(" ").append(attr).append("=\"").append(value).append("\""); } } sb.append(">"); convertChildren(element); sb.append(""); break; default: // ignore tags we do not know LOG.debug("Unknown tag {}", tagName); missingTags.add(tagName); LOG.warn("Currently unsupported tags: {}", missingTags); convertChildren(element); break; } } /** * We ensure sb is either empty or that it ends with the given suffix. */ protected void ensureEmptyOrEndsWith(@Nonnull String suffix) { if (sb.length() == 0) { return; } // find the longest prefix of suffix (incl. suffix itself) that sb already ends with String alreadyEnding = null; for (int i = 0; i <= suffix.length(); i++) { String prefix = suffix.substring(0, i); if (sb.length() >= prefix.length() && prefix.equals(sb.substring(sb.length() - i))) { alreadyEnding = prefix; } } if (alreadyEnding != null) { sb.append(suffix.substring(alreadyEnding.length())); } else { sb.append(suffix); } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy