All downloads are free. Search and download functionality uses the official Maven repository.

com.composum.ai.backend.slingbase.impl.ApproximateMarkdownServiceImpl Maven / Gradle / Ivy

Go to download

Common functionality for Composum AI that is specific to Sling but usable in Composum, AEM, and similar platforms.

The newest version!
package com.composum.ai.backend.slingbase.impl;

import static com.composum.ai.backend.slingbase.ApproximateMarkdownServicePlugin.PluginResult.HANDLED_ATTRIBUTES;
import static com.composum.ai.backend.slingbase.ApproximateMarkdownServicePlugin.PluginResult.NOT_HANDLED;
import static com.composum.ai.backend.slingbase.impl.AllowDenyMatcherUtil.allowDenyCheck;
import static org.apache.commons.lang3.StringUtils.isNotBlank;

import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.jackrabbit.JcrConstants;
import org.apache.sling.api.SlingHttpServletRequest;
import org.apache.sling.api.SlingHttpServletResponse;
import org.apache.sling.api.resource.Resource;
import org.apache.sling.api.resource.ResourceUtil;
import org.jetbrains.annotations.NotNull;
import org.osgi.service.component.annotations.Activate;
import org.osgi.service.component.annotations.Component;
import org.osgi.service.component.annotations.Deactivate;
import org.osgi.service.component.annotations.Modified;
import org.osgi.service.component.annotations.Reference;
import org.osgi.service.component.annotations.ReferenceCardinality;
import org.osgi.service.component.annotations.ReferencePolicy;
import org.osgi.service.metatype.annotations.AttributeDefinition;
import org.osgi.service.metatype.annotations.Designate;
import org.osgi.service.metatype.annotations.ObjectClassDefinition;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.composum.ai.backend.base.service.chat.GPTChatCompletionService;
import com.composum.ai.backend.slingbase.ApproximateMarkdownService;
import com.composum.ai.backend.slingbase.ApproximateMarkdownServicePlugin;
import com.composum.ai.backend.slingbase.ApproximateMarkdownServicePlugin.PluginResult;

/**
 * Implementation for {@link ApproximateMarkdownService}.
 */
@Component
@Designate(ocd = ApproximateMarkdownServiceImpl.Config.class)
public class ApproximateMarkdownServiceImpl implements ApproximateMarkdownService {

    /**
     * Markdown prefixes that are put in front of the value of a text attribute, keyed by attribute name.
     * Attributes without an entry are printed without a prefix.
     */
    public static final Map<String, String> ATTRIBUTE_TO_MARKDOWN_PREFIX = new HashMap<>();

    // Populated in a static initializer so that the map is filled exactly once when the class is loaded,
    // independently of whether (and how often) the service is instantiated.
    static {
        ATTRIBUTE_TO_MARKDOWN_PREFIX.put("jcr:title", "## ");
        ATTRIBUTE_TO_MARKDOWN_PREFIX.put("title", "## ");
        ATTRIBUTE_TO_MARKDOWN_PREFIX.put("subtitle", "### ");
        ATTRIBUTE_TO_MARKDOWN_PREFIX.put("cq:panelTitle", "#### ");
        // , "code", "```" handled in extra method
    }

    /**
     * Attribute values that are not printed as labelled attributes. A value is ignored if it matches this pattern
     * as a whole, i.e. if it is one of: "true" / "false", a number of one to three digits, an absolute path below
     * /conf, /content, /etc, /apps, /libs, /var, /preview, /public or /mnt, "{Boolean}true" / "{Boolean}false",
     * "inherit", "blank", "target", or one of the tag names h0-h9, div, p.
     */
    protected final static Pattern IGNORED_VALUE_PATTERN = Pattern.compile("true|false|[0-9][0-9]?[0-9]?|/(conf|content|etc|apps|libs|var|preview|public|mnt)/.*|\\{Boolean\\}(true|false)|inherit|blank|target|h[0-9]|div|p");

    /**
     * We ignore nodes named i18n or renditions and nodes whose name starts with rep:, dam: or cq: .
     */
    protected final static Pattern IGNORED_NODE_NAMES = Pattern.compile("i18n|renditions|rep:.*|dam:.*|cq:.*");

    /**
     * Finds an image file extension (png, jpg, jpeg, gif, svg; case insensitive) that is followed by a slash
     * or the end of the string.
     */
    protected final static Pattern IMAGE_PATTERN = Pattern.compile("\\.(png|jpg|jpeg|gif|svg)(/|$)", Pattern.CASE_INSENSITIVE);

    /**
     * Finds a video file extension (mp4, mov; case insensitive) that is followed by a slash or the end of the string.
     */
    protected final static Pattern VIDEO_PATTERN = Pattern.compile("\\.(mp4|mov)(/|$)", Pattern.CASE_INSENSITIVE);

    /**
     * We allow generating markdown only for paths below /content, /public and /preview that contain at least
     * one further slash after that first segment, e.g. /content/site/page .
     */
    public static final Pattern ADMISSIBLE_PATH_PATTERN = Pattern.compile("/(content|preview|public)/.*/.*");

    /**
     * Finds two consecutive words that are surrounded by whitespace, that is, three whitespace runs separated by
     * non-whitespace. If that occurs in a string it has several words.
     */
    public static final Pattern THREE_WHITESPACE_PATTERN = Pattern.compile("\\s\\S+\\s+\\S+\\s");

    /**
     * A list of attributes that are output (in that ordering) without any label, each on a line for itself.
     */
    @Nonnull
    protected List<String> textAttributes;

    /**
     * A list of labelled attributes that come first if they are present, in the given order.
     */
    protected List<String> labelledAttributeOrder;

    /**
     * A pattern which attributes have to be output with a label: the attribute name, a colon and a space and then the
     * trimmed attribute value followed by newline.
     */
    @Nullable
    protected Pattern labeledAttributePatternAllow;

    /**
     * A pattern matching exceptions for {@link #labeledAttributePatternAllow}.
     */
    @Nullable
    protected Pattern labeledAttributePatternDeny;

    /**
     * Blacklist for URLs we can connect to get the markdown. The URL must not match any of the patterns.
     */
    protected List<Pattern> urlBlacklist;

    /**
     * Whitelist for URLs we can connect to get the markdown. Required - the URL has to match one of the patterns.
     */
    protected List<Pattern> urlWhitelist;


    private static final Logger LOG = LoggerFactory.getLogger(ApproximateMarkdownServiceImpl.class);

    /** Service used to convert HTML fragments to markdown. */
    @Reference
    protected GPTChatCompletionService chatCompletionService;

    // List of ApproximateMarkdownServicePlugin dynamically injected by OSGI
    @Nonnull
    @Reference(cardinality = ReferenceCardinality.MULTIPLE, policy = ReferencePolicy.DYNAMIC, service = ApproximateMarkdownServicePlugin.class)
    protected volatile List<ApproximateMarkdownServicePlugin> plugins;

    /**
     * Diagnostic aid: logs (at INFO level) every String attribute of the resource that looks like real text
     * (it contains several whitespace separated words) but is neither a configured text attribute nor admitted
     * by the labelled attribute allow / deny patterns, so that forgotten attributes can be spotted.
     *
     * @param resource the resource whose attributes are checked
     */
    protected void logUnhandledAttributes(Resource resource) {
        if (!LOG.isInfoEnabled()) {
            return; // the loop below has no effect besides logging
        }
        for (Map.Entry<String, Object> entry : resource.getValueMap().entrySet()) {
            if (!(entry.getValue() instanceof String)) {
                continue;
            }
            String value = (String) entry.getValue();
            // the cheap contains(" ") check avoids running the regex on single word values
            boolean looksLikeText = value.contains(" ") && THREE_WHITESPACE_PATTERN.matcher(value).find();
            if (looksLikeText && !textAttributes.contains(entry.getKey())
                    && !allowDenyCheck(entry.getKey(), labeledAttributePatternAllow, labeledAttributePatternDeny)) {
                // check whether we forgot something
                LOG.info("Ignoring text attribute {} in {}", entry.getKey(), resource.getPath());
            }
        }
    }

    /**
     * Renders the approximate markdown for the resource into a String.
     *
     * @param resource the resource to render; may be null
     * @param request  the current request, passed on to the plugins
     * @param response the current response, passed on to the plugins
     * @return the markdown collected by the writer based variant of this method
     */
    @Nonnull
    @Override
    public String approximateMarkdown(@Nullable Resource resource, SlingHttpServletRequest request, SlingHttpServletResponse response) {
        final String path = resource != null ? resource.getPath() : null;
        LOG.debug(">>> approximateMarkdown for {}", path);
        try (StringWriter buffer = new StringWriter();
             PrintWriter writer = new PrintWriter(buffer)) {
            approximateMarkdown(resource, writer, request, response);
            writer.flush();
            return buffer.toString();
        } catch (IOException e) {
            // closing a StringWriter does not really throw; there is no sensible handling.
            throw new IllegalStateException(e);
        } finally {
            LOG.debug("<<< approximateMarkdown for {}", path);
        }
    }

    /**
     * Writes the approximate markdown for the resource and its descendants to {@code realOutput}.
     * The output of this call is collected in a buffer first, so that the complete markdown of the resource
     * can be handed to the plugins for caching before it is written out.
     *
     * @param resource   the resource to render; nothing is written if it is null or has an ignored node name
     * @param realOutput the writer that receives the markdown
     * @param request    the current request, passed on to the plugins
     * @param response   the current response, passed on to the plugins
     * @throws IllegalArgumentException if the resource path is not admissible
     */
    @Override
    public void approximateMarkdown(
            @Nullable Resource resource, @Nonnull PrintWriter realOutput,
            @Nonnull SlingHttpServletRequest request, @Nonnull SlingHttpServletResponse response) {
        // The content of i18n nodes would be a duplication as it was already printed as "text" attribute in the parent node.
        // TODO(hps,26.05.23) this might lead to trouble if the user edits a non-default language first. Join with translations?
        // Also, it'd be not quite clear what language we should take. That's Composum only, though.
        if (resource == null || IGNORED_NODE_NAMES.matcher(resource.getName()).matches()) {
            return;
        }
        String normalizedPath = ResourceUtil.normalize(resource.getPath());
        if (!ADMISSIBLE_PATH_PATTERN.matcher(normalizedPath).matches()) {
            throw new IllegalArgumentException("For security reasons the resource must be in /content,/preview,/public but is: " + resource.getPath());
        }

        // For a page node we render its jcr:content child instead.
        Resource target = resource;
        if (!resource.getPath().contains("/jcr:content")) {
            Resource content = resource.getChild("jcr:content");
            if (content != null) {
                target = content;
            }
        }

        StringWriter buffer = new StringWriter();
        PrintWriter out = new PrintWriter(buffer, false);
        PluginResult pluginResult = executePlugins(target, out, request, response);
        boolean needsEmptyLine = false;
        if (pluginResult == NOT_HANDLED) {
            boolean isAttributeResource = target.getValueMap().isEmpty() && !target.hasChildren();
            if (isAttributeResource) {
                String attributeName = target.getName();
                String attributeValue = target.getParent().getValueMap().get(attributeName, String.class);
                out.println(attributeToMarkdown(target.getParent(), attributeName, attributeValue));
                // no need for empty line since there are no children.
            } else {
                needsEmptyLine = handleResource(target, out, false);
            }
        }
        if (needsEmptyLine) {
            out.println();
        }
        if (pluginResult == NOT_HANDLED || pluginResult == HANDLED_ATTRIBUTES) {
            for (Resource child : target.getChildren()) {
                approximateMarkdown(child, out, request, response);
            }
        }
        logUnhandledAttributes(target);

        out.close();
        String markdown = buffer.toString();
        if (markdown.isEmpty()) {
            return;
        }
        realOutput.print(markdown);
        for (ApproximateMarkdownServicePlugin plugin : plugins) {
            plugin.cacheMarkdown(target, markdown);
        }
    }

    /**
     * Prints the non-blank configured text attributes of the resource (with their markdown prefix, if any),
     * then the code block and the labelled attributes.
     *
     * @param resource       the resource whose attributes are printed
     * @param out            the writer receiving the markdown
     * @param printEmptyLine whether something was printed before that needs a separating empty line
     * @return the updated "empty line needed" state as returned by the labelled attribute handling
     */
    protected boolean handleResource(@NotNull Resource resource, @NotNull PrintWriter out, boolean printEmptyLine) {
        boolean emptyLinePending = printEmptyLine;
        for (String attributename : textAttributes) {
            String value = resource.getValueMap().get(attributename, String.class);
            if (StringUtils.isBlank(value)) {
                continue;
            }
            String prefix = ATTRIBUTE_TO_MARKDOWN_PREFIX.getOrDefault(attributename, "");
            out.println(prefix + attributeToMarkdown(resource, attributename, value));
            emptyLinePending = true;
        }
        boolean afterCodeblock = handleCodeblock(resource, out, emptyLinePending);
        return handleLabeledAttributes(resource, out, afterCodeblock);
    }

    /**
     * Converts an attribute value to markdown. For the "text" attribute of a resource that has a "textIsRich"
     * property, that property decides: if it is "true" (ignoring case) the value is converted from HTML,
     * otherwise it is returned unchanged. All other values go through {@link #getMarkdown(String)}.
     *
     * @param resource      the resource carrying the attribute
     * @param attributename the name of the attribute
     * @param value         the attribute value
     * @return the markdown for the value
     */
    protected String attributeToMarkdown(@NotNull Resource resource, String attributename, String value) {
        boolean richFlagPresent = "text".equals(attributename)
                && resource.getValueMap().get("textIsRich") != null;
        if (!richFlagPresent) {
            return getMarkdown(value);
        }
        String textIsRich = resource.getValueMap().get("textIsRich", String.class);
        if ("true".equalsIgnoreCase(textIsRich)) {
            return chatCompletionService.htmlToMarkdown(value).trim();
        }
        return value;
    }

    /**
     * Asks the plugins in order whether they handle the resource and stops at the first one that does.
     *
     * @return the first plugin result that is neither null nor {@code NOT_HANDLED}, or {@code NOT_HANDLED}
     * if there is none
     */
    @Nonnull
    protected PluginResult executePlugins(
            @Nonnull Resource resource, @Nonnull PrintWriter out,
            @Nonnull SlingHttpServletRequest request, @Nonnull SlingHttpServletResponse response) {
        // The stream is lazy: plugins after the first one that handled the resource are not called.
        return plugins.stream()
                .map(plugin -> plugin.maybeHandle(resource, out, this, request, response))
                .filter(result -> result != null && result != NOT_HANDLED)
                .findFirst()
                .orElse(NOT_HANDLED);
    }

    /**
     * Turns a text value into markdown: null becomes the empty string, a value in which one of the known
     * HTML tags is found is converted from HTML, and anything else is only trimmed.
     *
     * @param value the text to convert, possibly null
     * @return the trimmed markdown, never null
     */
    @Override
    @Nonnull
    public String getMarkdown(@Nullable String value) {
        if (value == null) {
            return "";
        }
        boolean containsHtml = PATTERN_HTML_TAG.matcher(value).find();
        String converted = containsHtml ? chatCompletionService.htmlToMarkdown(value) : value;
        return converted.trim();
    }

    /** Finds the charset parameter in a Content-Type header value like {@code text/html; charset=UTF-8}. */
    private static final Pattern CONTENT_TYPE_CHARSET_PATTERN =
            Pattern.compile("charset\\s*=\\s*\"?([^\\s;\"]+)", Pattern.CASE_INSENSITIVE);

    /**
     * Fetches the content of the URL and returns it as markdown. HTML and XHTML are converted to markdown;
     * plain text is returned as it is.
     *
     * @param uri the URL to read; it has to pass {@link #checkUrlAdmissible(URI)}
     * @return the markdown for the content of the URL
     * @throws MalformedURLException    if the URI cannot be turned into a URL
     * @throws IOException              if connecting or reading fails
     * @throws IllegalArgumentException if the URL is not admissible or its content type is not supported
     */
    @NotNull
    @Override
    public String getMarkdown(@Nonnull URI uri) throws MalformedURLException, IOException, IllegalArgumentException {
        URLConnection conn = checkUrlAdmissible(uri).toURL().openConnection();
        conn.setConnectTimeout(3000);
        conn.setReadTimeout(3000);
        try (InputStream in = conn.getInputStream()) {
            String contentType = conn.getContentType() == null ? "" : conn.getContentType();
            boolean isHtml = contentType.contains("text/html") || contentType.contains("application/xhtml+xml");
            boolean isPlainText = contentType.contains("text/plain");
            if (!isHtml && !isPlainText) {
                throw new IllegalArgumentException("Unsupported content type " + contentType + " for " + uri);
            }
            String content = IOUtils.toString(in, charsetOf(contentType));
            // plain text is not quite markdown, but there is no sensible conversion and that's likely OK, anyway.
            return isHtml ? chatCompletionService.htmlToMarkdown(content) : content;
        }
    }

    /**
     * Determines the charset from the charset parameter of the Content-Type header. The Content-Encoding
     * header must not be used for this since it names a compression like gzip, not a charset.
     * Falls back to the platform default charset if the parameter is missing or names an unknown charset.
     */
    private static Charset charsetOf(String contentType) {
        Matcher matcher = CONTENT_TYPE_CHARSET_PATTERN.matcher(contentType);
        if (matcher.find()) {
            try {
                return Charset.forName(matcher.group(1));
            } catch (IllegalArgumentException e) { // illegal or unsupported charset name
                LOG.debug("Ignoring unknown charset in content type {}", contentType, e);
            }
        }
        return Charset.defaultCharset();
    }

    /**
     * Verifies that the URL may be contacted: no blacklist pattern may be found in it, and at least one
     * whitelist pattern has to be found in it. The blacklist is checked first.
     *
     * @param uri the URL to check
     * @return the unchanged uri, if it is admissible
     * @throws IllegalArgumentException if the URL is blacklisted or not whitelisted
     */
    protected URI checkUrlAdmissible(URI uri) {
        for (Pattern forbidden : urlBlacklist) {
            if (forbidden.matcher(uri.toString()).find()) {
                throw new IllegalArgumentException("URL " + uri + " is blacklisted " +
                        "in OSGI configuration 'Composum AI Approximate Markdown Service Configuration'");
            }
        }
        boolean whitelisted = false;
        for (Pattern allowed : urlWhitelist) {
            if (allowed.matcher(uri.toString()).find()) {
                whitelisted = true;
                break;
            }
        }
        if (!whitelisted) { // also the case if the whitelist is empty
            throw new IllegalArgumentException("URL " + uri + " is not whitelisted " +
                    "in OSGI configuration 'Composum AI Approximate Markdown Service Configuration'");
        }
        return uri;
    }

    /**
     * Prints the "code" attribute of the resource, if it is not blank, as a fenced code block.
     *
     * @param resource       the resource that might have a code attribute
     * @param out            the writer receiving the markdown
     * @param printEmptyLine the current "empty line needed" state
     * @return true if a code block was printed, otherwise the unchanged printEmptyLine
     */
    protected boolean handleCodeblock(Resource resource, PrintWriter out, boolean printEmptyLine) {
        String code = resource.getValueMap().get("code", String.class);
        if (StringUtils.isBlank(code)) {
            return printEmptyLine;
        }
        out.println("```\n");
        out.println(code.trim());
        out.println("\n```\n");
        return true;
    }

    protected boolean handleLabeledAttributes(Resource resource, PrintWriter out, boolean printEmptyLine) {
        if (labeledAttributePatternAllow == null) {
            return false;
        }
        boolean firstline = true;
        for (String attributename : labelledAttributeOrder) {
            String value = resource.getValueMap().get(attributename, String.class);
            if (isNotBlank(value)) {
                if (printEmptyLine && firstline) {
                    out.println();
                    firstline = false;
                }
                out.println(attributename + ": " + getMarkdown(value) + " 
"); printEmptyLine = true; } } for (Map.Entry entry : resource.getValueMap().entrySet()) { if (labelledAttributeOrder.contains(entry.getKey()) || textAttributes.contains(entry.getKey())) { continue; } if (entry.getValue() instanceof String) { String value = (String) entry.getValue(); if (isNotBlank(value) && admissibleValue(value) && allowDenyCheck(entry.getKey(), labeledAttributePatternAllow, labeledAttributePatternDeny)) { if (printEmptyLine && firstline) { out.println(); firstline = false; } out.println(entry.getKey() + ": " + getMarkdown(value) + "
"); printEmptyLine = true; } } } return printEmptyLine; } /** * We do not print pure numbers, booleans and some special strings since those are likely attributes determining the component layout, not actual text that is printed. * all "true"/ "false" / single number (int / float) attributes or array of numbers attributes, or shorter than 3 digits or path, or array or type date or boolean or {Date} or {Boolean} , inherit, blank, html tags, target . */ protected boolean admissibleValue(Object object) { if (object instanceof String) { String value = (String) object; return !IGNORED_VALUE_PATTERN.matcher(value).matches(); } return false; } @Activate @Modified protected void activate(Config config) { LOG.info("Activated with configuration {}", config); textAttributes = Stream.of(config.textAttributes()) .filter(StringUtils::isNotBlank).collect(Collectors.toList()); labeledAttributePatternAllow = AllowDenyMatcherUtil.joinPatternsIntoAnyMatcher(config.labelledAttributePatternAllow()); labeledAttributePatternDeny = AllowDenyMatcherUtil.joinPatternsIntoAnyMatcher(config.labelledAttributePatternDeny()); labelledAttributeOrder = Stream.of(config.labelledAttributeOrder()) .filter(StringUtils::isNotBlank).collect(Collectors.toList()); urlBlacklist = Stream.of(config.urlSourceBlacklist() != null ? config.urlSourceBlacklist() : new String[0]) .filter(StringUtils::isNotBlank).map(Pattern::compile).collect(Collectors.toList()); urlWhitelist = Stream.of(config.urlSourceWhitelist() != null ? config.urlSourceWhitelist() : new String[0]) .filter(StringUtils::isNotBlank).map(Pattern::compile).collect(Collectors.toList()); } @Deactivate protected void deactivate() { LOG.info("Deactivated."); } /** * Configuration class Config that allows us to configure TEXT_ATTRIBUTES. 
*/ @ObjectClassDefinition(name = "Composum AI Approximate Markdown Service Configuration", description = "Configuration for the Approximate Markdown Service used to get a text representation of a page or component for use with the AI.") public @interface Config { @AttributeDefinition(name = "URL Source Whitelist Regex", description = "Only if using URLs as external source: Whitelist for URLs that can be read and turned into markdown. If not set, reading URLs is turned off." + "For security reasons you might want to prevent local addresses to be contacted." + "To allow everything you might use https?://.* , but make sure you have a good blacklist in that case.") String[] urlSourceWhitelist(); @AttributeDefinition(name = "URL Source Blacklist Regex", description = "Only if using URLs as external source: Blacklist for URLs that can be read and turned into markdown. Has precendence over whitelist.") String[] urlSourceBlacklist() default { ".*localhost.*", "^(?!https?://).*", // URLs that are not http / https are not allowed // URLs where the host name is only digits / periods (numeric IPs) ".*://[0-9.]*/.*", // IPv6 hostnames in URLs are not allowed: ".*://\\[[0-9a-fA-F:]*\\].*", }; @AttributeDefinition(name = "Text Attributes", description = "List of attributes that are treated as text and converted to markdown. If not present, no attributes are treated as text.") String[] textAttributes() default { "jcr:title", "title", "subtitle", "linkTitle", "jcr:description", "text", "cq:panelTitle", /* "code", */ "copyright", // code component; code is handled in extra method "defaultValue", "exampleCode", "suffix", "exampleResult", "footer" // for servlet component }; // these will be joined with | and then compiled as a pattern @AttributeDefinition(name = "Labeled Attribute Pattern Allow", description = "Regular expressions for attributes that are output with a label. 
If not present, none will be output except the text attributes.") String[] labelledAttributePatternAllow() default {".*"}; @AttributeDefinition(name = "Labeled Attribute Pattern Deny", description = "Regular expressions for attributes that are not output with a label. Takes precedence over the corresponding allow regexp list.") String[] labelledAttributePatternDeny() default {".*:.*", "layout", "backgroundColor", "color", "textColor", "template", "theme", "variation", "buttonSymbol", "columns", "icon", "elementType", "textAlignment", "alignment", "linkTarget", "interval", "fileReference", "height", "width", "textIsRich", "style", "padding.*", ".*[cC]ss[cC]lass.*"}; @AttributeDefinition(name = "Labelled Attribute Order", description = "List of labelled attributes that come first if they are present, in the given order.") String[] labelledAttributeOrder() default {}; } /** * {@inheritDoc} * We traverse the attributes of resource and all children and collect everything that starts with /content. * If there are less than 5 links, we continue with the parent resource until jcr:content is reached. * The link title will be the jcr:title or title attribute. 
*/ @NotNull @Override public List getComponentLinks(@NotNull Resource resource) { List resourceLinks = new ArrayList<>(); if (resource == null) { return resourceLinks; } plugins.stream().forEach(plugin -> resourceLinks.addAll(plugin.getMasterLinks(resource))); Resource searchResource = resource; if (resource.getValueMap().isEmpty()) { // attribute resource, use parent searchResource = resource.getParent(); } while (searchResource != null && resourceLinks.size() < 5 && searchResource.getPath().contains("/jcr:content/")) { List resourceLinkCandidates = new ArrayList<>(); collectLinks(searchResource, resourceLinkCandidates); Iterator iterator = resourceLinkCandidates.iterator(); while (resourceLinks.size() < 5 && iterator.hasNext()) { Link link = iterator.next(); if (!resourceLinks.contains(link)) { resourceLinks.add(link); } } searchResource = searchResource.getParent(); } return resourceLinks; } /** * Collects links from a resource and its children. The link title will be the jcr:title or title attribute. 
* * @param resource the resource to collect links from * @param resourceLinks the list to store the collected links */ protected void collectLinks(@NotNull Resource resource, List resourceLinks) { resource.getValueMap().entrySet().stream() .filter(entry -> entry.getValue() instanceof String) .filter(entry -> ((String) entry.getValue()).startsWith("/content/")) .forEach(entry -> { String path = (String) entry.getValue(); Resource targetResource = resource.getResourceResolver().getResource(path); if (targetResource != null) { if (targetResource.getChild(JcrConstants.JCR_CONTENT) != null) { targetResource = targetResource.getChild(JcrConstants.JCR_CONTENT); } String title = targetResource.getValueMap().get("jcr:title", String.class); if (title == null) { title = targetResource.getValueMap().get("title", String.class); } if (title == null) { title = targetResource.getName(); if (JcrConstants.JCR_CONTENT.equals(title)) { title = targetResource.getParent().getName(); } } boolean needsVision = isNeedsVision(targetResource); if (!VIDEO_PATTERN.matcher(targetResource.getPath()).find()) { Link link = new Link(path, title, needsVision); if (!resourceLinks.contains(link)) { resourceLinks.add(link); } } } }); resource.getChildren().forEach(child -> { collectLinks(child, resourceLinks); }); } private static boolean isNeedsVision(Resource targetResource) { if (IMAGE_PATTERN.matcher(targetResource.getPath()).find()) { return true; } if (targetResource.getValueMap().get("jcr:content/jcr:mimeType", String.class) != null) { return true; } return false; } @Override public String getImageUrl(Resource imageResource) { if (imageResource == null) { return null; } for (ApproximateMarkdownServicePlugin plugin : plugins) { String imageUrl = plugin.getImageUrl(imageResource); if (imageUrl != null) { return imageUrl; } } return null; } // debugging code; remove after it works. 
protected Pattern PATTERN_HTML_TAG = Pattern.compile("<\\s*(ext|a|sly|strong|code|em|language|type|p|br|div|path|u|ul|attributes|li|ol|h[1-6]|b|i)(\\s+[^>]*)?>", Pattern.CASE_INSENSITIVE); protected final Set htmltags = new HashSet<>(); /** * This is debugging code we needed to gather information for the implementation; we keep it around for now. * out.println("Approximated markdown for " + path); * traverseTreeForStructureGathering(resource, out, null, null); * out.println("DONE"); * out.println("HTML tags found:" + htmltags); */ protected void traverseTreeForStructureGathering(Resource resource, PrintWriter out, String outerResourceType, String subpath) { String resourceType = resource.getValueMap().get("sling:resourceType", String.class); final String resourceTypeForChildren = resourceType != null ? resourceType : outerResourceType; final String pathForChildren = resourceType != null ? "" : subpath + resource.getName(); final String subpathForAttributes = resourceType != null ? "" : subpath; // iterate over all attributes of map resource.getValueMap() and write them to out if they contain several spaces. for (Map.Entry entry : resource.getValueMap().entrySet()) { if (entry.getValue() instanceof String) { String value = (String) entry.getValue(); if (value.matches(".*\\s+.*\\s+.*\\s+.*")) { // out.println(resource.getPath() + " [" + resourceTypeForChildren + "] " + subpathForAttributes + entry.getKey() + ": " + value); // out.println("[" + resourceTypeForChildren + "] " + subpathForAttributes + entry.getKey() + ": " + value); out.println("[" + resourceTypeForChildren + "] " + subpathForAttributes + entry.getKey()); // out.println(entry.getKey() + " [" + resourceTypeForChildren + "] " + subpathForAttributes + entry.getKey()); captureHtmlTags(value); } } } // iterate over child resources: call traverseTree with the child resource and the resourceTypeForChildren // and pathForChildren. 
resource.getChildren().forEach(child -> traverseTreeForStructureGathering(child, out, resourceTypeForChildren, pathForChildren + "/")); } protected void captureHtmlTags(String value) { Matcher m = PATTERN_HTML_TAG.matcher(value); while (m.find()) { htmltags.add(m.group(1)); } // -> found: [ext, a, sly, strong, code, em, language, type, p, br, div, path, u, ul, attributes, li, ol] } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy