
// com.composum.ai.backend.base.service.chat.impl.HtmlToMarkdownConverter (Maven / Gradle / Ivy)
package com.composum.ai.backend.base.service.chat.impl;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import org.jsoup.Jsoup;
import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.Comment;
import org.jsoup.nodes.DataNode;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.DocumentType;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.nodes.XmlDeclaration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* A quick HTML markdown converter that handles the tags [a, strong, code, em, p, br, u, ul, li, ol] used in rich text editor.
* Not threadsafe, use only once.
*
* We do not want to use a library since the libraries doing this have many parts and are quite some work to deploy, and
* we only need to convert a few tags from richtext editors.
* Original generated by ChatGPT with "Please make an HTML to Markdown converter that handles the tags [a, strong, code, em, p, br, u, ul, li, ol] . Use the jsoup library for that."
* but some heavy rewrite.
*/
public class HtmlToMarkdownConverter {
private static final Logger LOG = LoggerFactory.getLogger(HtmlToMarkdownConverter.class);
private static Set missingTags = new ConcurrentSkipListSet<>();
private static final Map HEADER_TAGS = new HashMap<>();
{{
HEADER_TAGS.put("title", "# ");
HEADER_TAGS.put("h1", "# ");
HEADER_TAGS.put("h2", "## ");
HEADER_TAGS.put("h3", "### ");
HEADER_TAGS.put("h4", "#### ");
HEADER_TAGS.put("h5", "##### ");
HEADER_TAGS.put("h6", "###### ");
}}
/**
* Important table attributes we need to keep.
*/
private static final List TABLE_ATTRIBUTES = Arrays.asList("border", "colspan", "rowspan",
"align", "valign", "scope", "cellpadding", "cellspacing", "width", "height", "bgcolor");
// continued indentation. Two spaces since four would be code block
private final String indentStep = " ";
// continued indentation that is inserted before a continuation line
private String continuedIndentation = "";
protected StringBuilder sb = new StringBuilder();
@Nonnull
public String convert(@Nullable String html) {
sb.setLength(0);
continuedIndentation = "";
if (html != null) {
Document doc = Jsoup.parseBodyFragment(html);
convertElement(doc.body());
}
return sb.toString();
}
private void convertNode(Node node) {
if (node instanceof TextNode) {
TextNode textNode = (TextNode) node;
insertText(textNode.text());
} else if (node instanceof Element) {
Element element = (Element) node;
convertElement(element);
} else if (node instanceof Comment || node instanceof DocumentType || node instanceof XmlDeclaration) {
// no text content
} else if (node instanceof DataNode) {
// not quite sure what to do with this, but this is very likely not text content.
} else {
throw new UnsupportedOperationException("Unknown node type " + node.getClass());
}
}
/**
* Split text into lines to add indentation before each line.
*/
protected void insertText(String text) {
if (text != null) {
String splitText = Stream.of(text.split("\n"))
.collect(Collectors.joining(continuedIndentation + "\n"));
if (sb.length() > 0 && sb.charAt(sb.length() - 1) == '\n') {
// only happens if we have mixed text and block level elements within a block level element
sb.append(continuedIndentation);
}
sb.append(splitText);
}
}
protected void convertChildren(Node node) {
for (Node child : node.childNodes()) {
convertNode(child);
}
}
/**
* Convention: a block level element has to print a newline before itself, also after itself.
*/
private void convertElement(Element element) {
String tagName = element.tagName().toLowerCase();
String oldindentation;
switch (tagName) {
case "a":
sb.append("[");
convertChildren(element);
sb.append("](");
sb.append(element.attr("href"));
String title = element.attr("title");
if (StringUtil.isBlank(title)) {
title = element.attr("alt");
}
if (!StringUtil.isBlank(title)) {
sb.append(" \"");
sb.append(title.replaceAll("\"", "\\\""));
sb.append("\"");
}
sb.append(")");
break;
case "em":
case "u":
sb.append("_");
convertChildren(element);
sb.append("_");
break;
case "b":
case "strong":
sb.append("**");
convertChildren(element);
sb.append("**");
break;
case "i":
sb.append("*");
convertChildren(element);
sb.append("*");
break;
case "del":
case "s":
sb.append("~~");
convertChildren(element);
sb.append("~~");
break;
case "code":
sb.append("`");
convertChildren(element);
sb.append("`");
break;
case "pre": // TODO: a pre code nesting would be wrong.
sb.append("\n```\n");
sb.append(element.html().replaceAll("\\s+$", ""));
sb.append("\n```\n");
break;
case "p":
sb.append("\n");
convertChildren(element);
sb.append("\n");
break;
case "br":
sb.append("\n");
break;
case "ul":
oldindentation = continuedIndentation;
continuedIndentation += indentStep;
for (Element li : element.children()) {
sb.append("\n" + oldindentation + "- ");
convertChildren(li);
}
sb.append("\n");
continuedIndentation = oldindentation;
break;
case "ol":
oldindentation = continuedIndentation;
int i = 1;
for (Element li : element.children()) {
String prefix = (i++) + ". ";
continuedIndentation = oldindentation + prefix.replaceAll(".", " ");
sb.append("\n" + oldindentation + prefix);
convertChildren(li);
}
sb.append("\n");
continuedIndentation = oldindentation;
break;
case "li":
throw new UnsupportedOperationException("Bug: li outside of ul or ol");
case "img":
case "video": // video is not quite properly treated here, but likely not very relevant.
sb.append(";
sb.append(element.attr("src"));
sb.append(")");
break;
case "h1":
case "h2":
case "h3":
case "h4":
case "h5":
case "h6":
case "title":
ensureEmptyOrEndsWith("\n\n");
String prefix = HEADER_TAGS.get(tagName);
sb.append(prefix);
convertChildren(element);
sb.append("\n");
break;
case "hr":
sb.append("\n");
sb.append(continuedIndentation);
sb.append("---\n");
break;
case "input":
String type = element.attr("type");
String placeholder = element.attr("placeholder");
sb.append("[Input: Type=");
sb.append(type.isEmpty() ? "text" : type);
if (!placeholder.isEmpty()) {
sb.append(", Placeholder=");
sb.append(placeholder);
}
sb.append("]");
break;
case "dl": // there is no markdown for dl, so we just use embedded HTML. Possibly it could also be
// **term**
// definition
//
oldindentation = continuedIndentation;
continuedIndentation += indentStep;
sb.append("\n" + oldindentation + "");
convertChildren(element);
sb.append("\n" + oldindentation + "
\n");
continuedIndentation = oldindentation;
break;
case "dt":
sb.append("\n" + continuedIndentation + "");
convertChildren(element);
sb.append(" ");
break;
case "dd":
sb.append("\n" + continuedIndentation + "");
convertChildren(element);
sb.append(" ");
break;
case "blockquote":
oldindentation = continuedIndentation;
continuedIndentation += "> ";
sb.append("\n");
sb.append(continuedIndentation);
convertChildren(element);
sb.append("\n");
continuedIndentation = oldindentation;
break;
case "mark":
case "small":
case "ins":
case "sub":
case "sup":
// use embedded HTML syntax
sb.append("<").append(tagName).append(">");
convertChildren(element);
sb.append("").append(tagName).append(">");
break;
case "#root":
case "html":
case "body":
case "span":
case "div":
case "section":
case "button":
// ignore the tag
convertChildren(element);
break;
case "noscript":
case "meta":
case "nav":
case "script":
case "link":
// ignore the content, too
break;
// rudimentary support for tables
case "table":
case "thead":
case "tbody":
case "tfoot":
case "tr":
case "td":
case "th":
// use embedded HTML syntax
sb.append("<").append(tagName);
for (String attr : TABLE_ATTRIBUTES) {
String value = element.attr(attr);
if (!value.trim().isEmpty()) {
sb.append(" ").append(attr).append("=\"").append(value).append("\"");
}
}
sb.append(">");
convertChildren(element);
sb.append("").append(tagName).append(">");
break;
default:
// ignore tags we do not know
LOG.debug("Unknown tag {}", tagName);
missingTags.add(tagName);
LOG.warn("Currently unsupported tags: {}", missingTags);
convertChildren(element);
break;
}
}
/**
* We ensure sb is either empty or that it ends with the given suffix.
*/
protected void ensureEmptyOrEndsWith(@Nonnull String suffix) {
if (sb.length() == 0) {
return;
}
// find the longest prefix of suffix (incl. suffix itself) that sb already ends with
String alreadyEnding = null;
for (int i = 0; i <= suffix.length(); i++) {
String prefix = suffix.substring(0, i);
if (sb.length() >= prefix.length() && prefix.equals(sb.substring(sb.length() - i))) {
alreadyEnding = prefix;
}
}
if (alreadyEnding != null) {
sb.append(suffix.substring(alreadyEnding.length()));
} else {
sb.append(suffix);
}
}
}