com.composum.ai.backend.slingbase.impl.HtmlToApproximateMarkdownServicePlugin Maven / Gradle / Ivy
package com.composum.ai.backend.slingbase.impl;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import javax.annotation.Nonnull;
import javax.servlet.AsyncContext;
import javax.servlet.RequestDispatcher;
import javax.servlet.ServletException;
import javax.servlet.ServletOutputStream;
import javax.servlet.ServletRequest;
import javax.servlet.ServletResponse;
import javax.servlet.http.Cookie;
import javax.servlet.http.HttpServletResponse;
import javax.servlet.http.HttpSession;
import javax.servlet.http.HttpUpgradeHandler;
import org.apache.commons.io.output.StringBuilderWriter;
import org.apache.commons.lang3.StringUtils;
import org.apache.sling.api.SlingHttpServletRequest;
import org.apache.sling.api.SlingHttpServletResponse;
import org.apache.sling.api.adapter.AdapterManager;
import org.apache.sling.api.request.RequestDispatcherOptions;
import org.apache.sling.api.request.RequestParameter;
import org.apache.sling.api.request.RequestParameterMap;
import org.apache.sling.api.resource.Resource;
import org.apache.sling.api.resource.ResourceUtil;
import org.apache.sling.api.wrappers.SlingHttpServletRequestWrapper;
import org.apache.sling.api.wrappers.SlingHttpServletResponseWrapper;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.osgi.framework.Constants;
import org.osgi.service.component.annotations.Activate;
import org.osgi.service.component.annotations.Component;
import org.osgi.service.component.annotations.ConfigurationPolicy;
import org.osgi.service.component.annotations.Deactivate;
import org.osgi.service.component.annotations.Modified;
import org.osgi.service.component.annotations.Reference;
import org.osgi.service.metatype.annotations.AttributeDefinition;
import org.osgi.service.metatype.annotations.Designate;
import org.osgi.service.metatype.annotations.ObjectClassDefinition;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.composum.ai.backend.slingbase.ApproximateMarkdownService;
import com.composum.ai.backend.slingbase.ApproximateMarkdownServicePlugin;
/**
* A plugin for the {@link com.composum.ai.backend.slingbase.ApproximateMarkdownService} that transforms the rendered
* HTML to markdown.
* That doesn't work for all components, but might more easily capture the text content of certain components than
* trying to guess it from the JCR representation, as is the default.
*/
@Designate(ocd = HtmlToApproximateMarkdownServicePlugin.Config.class)
@Component(configurationPolicy = ConfigurationPolicy.REQUIRE,
property = Constants.SERVICE_RANKING + ":Integer=10000"
)
public class HtmlToApproximateMarkdownServicePlugin implements ApproximateMarkdownServicePlugin {
private static final Logger LOG = LoggerFactory.getLogger(HtmlToApproximateMarkdownServicePlugin.class);
@Reference
private AdapterManager adapterManager;
protected Pattern allowedResourceTypePattern;
protected Pattern deniedResourceTypePattern;
/**
* ResourceTypes we ignore since their rendering uses unsupported methods.
* Blacklisting for only 1h since there might be a deployment in the meantime.
* Maps the resource type to the time (ms) until it is blacklisted.
*/
protected Map blacklistedResourceType = new ConcurrentHashMap<>();
protected volatile Long blacklistedResourceTypeCleanupTime = Long.MIN_VALUE;
@NotNull
@Override
public PluginResult maybeHandle(
@NotNull Resource resource, @NotNull PrintWriter out,
@NotNull ApproximateMarkdownService service,
@Nonnull SlingHttpServletRequest request, @Nonnull SlingHttpServletResponse response) {
if (isIgnoredNode(resource)) {
return PluginResult.NOT_HANDLED;
}
String resourceType = resource.getResourceType();
if (blacklistedResourceType.containsKey(resourceType)) {
Long timeout = blacklistedResourceType.get(resourceType);
if (timeout != null && timeout > System.currentTimeMillis()) {
return PluginResult.NOT_HANDLED;
} else if (timeout != null) {
blacklistedResourceType.remove(resourceType);
}
}
cleanupBlacklist();
if (allowedResourceTypePattern != null && allowedResourceTypePattern.matcher(resourceType).matches()) {
if (deniedResourceTypePattern != null && deniedResourceTypePattern.matcher(resourceType).matches()) {
LOG.debug("Resourcetype {} denied", resourceType);
return PluginResult.NOT_HANDLED;
}
LOG.debug("Resourcetype {} allowed", resourceType);
try {
String html = renderedAsHTML(resource, request, response);
if (StringUtils.isBlank(html)) {
LOG.debug("No HTML generated for {} with resource type {}", resource.getPath(), resource.getResourceType());
return PluginResult.NOT_HANDLED;
}
String markdown = service.getMarkdown(html);
if (StringUtils.isBlank(markdown)) {
LOG.debug("No markdown generated for {} with resource type {}", resource.getPath(), resource.getResourceType());
} else {
LOG.debug("Markdown generated for {} with resource type {}:\n{}", resource.getPath(), resource.getResourceType(), markdown);
out.println(markdown);
out.println();
}
return PluginResult.HANDLED_ALL;
} catch (ServletException | IOException | RuntimeException e) {
if (isBecauseOfUnsupportedOperation(e)) {
LOG.warn("Blacklisting because of using unsupported operations: resource type {} (at {})", resource.getResourceType(), resource.getPath());
blacklistedResourceType.put(resourceType, System.currentTimeMillis() + TimeUnit.HOURS.toMillis(1));
return PluginResult.NOT_HANDLED;
}
LOG.error("Error rendering resource {} with resource type {}", resource.getPath(), resource.getResourceType(), e);
return PluginResult.NOT_HANDLED;
}
}
return PluginResult.NOT_HANDLED;
}
protected void cleanupBlacklist() {
if (System.currentTimeMillis() > blacklistedResourceTypeCleanupTime) {
synchronized (blacklistedResourceType) {
if (System.currentTimeMillis() > blacklistedResourceTypeCleanupTime) {
long currentTime = System.currentTimeMillis();
blacklistedResourceTypeCleanupTime = currentTime + TimeUnit.HOURS.toMillis(1);
List entriesToRemove = blacklistedResourceType.entrySet().stream()
.filter(entry -> entry.getValue() < currentTime)
.map(Map.Entry::getKey)
.collect(Collectors.toList());
entriesToRemove.forEach(blacklistedResourceType::remove);
}
}
}
}
@Nullable
@Override
public String getImageUrl(@Nullable Resource imageResource) {
return null;
}
protected boolean isBecauseOfUnsupportedOperation(Throwable e) {
if (e instanceof UnsupportedOperationCalled) {
return true;
}
if (e.getCause() != null && e.getCause() != e && isBecauseOfUnsupportedOperation(e.getCause())) {
return true;
}
for (Throwable throwable : e.getSuppressed()) {
if (throwable != e && isBecauseOfUnsupportedOperation(throwable)) {
return true;
}
}
return false;
}
/**
* We start with depth 3 since the higher nodes often contain headers, navigation and such that don't help for ChatGPT.
*/
protected boolean isIgnoredNode(@Nonnull Resource resource) {
if (ResourceUtil.getParent(resource.getPath(), 2) == null) {
return true;
}
if (resource.getName().equals("jcr:content") || resource.getParent().getName().equals("jcr:content")) {
return true;
}
return false;
}
/**
* We render the resource into a mock response and capture and return the generated HTML.
* The response is wrapped so that the real response cannot be modified.
* We don't do that for the request, because that would be more complicated and probably not needed.
*/
protected String renderedAsHTML(Resource resource, SlingHttpServletRequest request, SlingHttpServletResponse response) throws ServletException, IOException {
StringBuilderWriter writer = new StringBuilderWriter();
try (PrintWriter printWriter = new PrintWriter(writer)) {
SlingHttpServletResponse wrappedResponse = new CapturingResponse(response, printWriter, resource.getPath());
NonModifyingRequestWrapper wrappedRequest = new NonModifyingRequestWrapper(request, resource.getPath());
Object oldWcmAttribute = request.getAttribute("com.day.cq.wcm.api.WCMMode");
try { // for AEM we have to avoid that edit mode introduces artifacts.
request.removeAttribute("com.day.cq.wcm.api.WCMMode");
request.getRequestDispatcher(resource.getPath() + ".html").include(wrappedRequest, wrappedResponse);
} finally {
if (oldWcmAttribute != null) {
request.setAttribute("com.day.cq.wcm.api.WCMMode", oldWcmAttribute);
}
}
if (wrappedRequest.hadInvalidOperation) { // if that exception has been swallowed
throw new UnsupportedOperationCalled();
}
}
if (writer.toString().contains("Resource dumped by HtmlRenderer")) {
return null;
}
return writer.toString();
}
@Activate
@Modified
protected void activate(Config config) {
this.allowedResourceTypePattern = AllowDenyMatcherUtil.joinPatternsIntoAnyMatcher(config.allowedResourceTypes());
this.deniedResourceTypePattern = AllowDenyMatcherUtil.joinPatternsIntoAnyMatcher(config.deniedResourceTypes());
LOG.info("Allowed HTML to Markdown resource types: {}", this.allowedResourceTypePattern);
LOG.info("Denied HTML to Markdown resource types: {}", this.deniedResourceTypePattern);
}
@Deactivate
protected void deactivate() {
this.allowedResourceTypePattern = null;
this.deniedResourceTypePattern = null;
}
@ObjectClassDefinition(name = "Composum AI Html To Approximate Markdown Service Plugin", description = "A plugin for the ApproximateMarkdownService that transforms the rendered HTML of components to markdown, which can work better than trying to guess the text content from the JCR representation (as is the default) but probably doesn't work for all components. So it can be enabled for some sling resource types by regex. We will not use this for the first two levels below the page, as that could include unwanted stuff like headers and footers.")
protected @interface Config {
@AttributeDefinition(name = "Allowed resource types", description = "Regular expressions for allowed resource types. If not present, no resource types are allowed.") String[] allowedResourceTypes() default {".*"};
@AttributeDefinition(name = "Denied resource types", description = "Regular expressions for denied resource types. Takes precedence over allowed resource types.") String[] deniedResourceTypes() default {};
}
/**
* We wrap a response to capture the content, forwarding all but modifying methods to the original response.
*/
protected static class CapturingResponse extends SlingHttpServletResponseWrapper {
private final PrintWriter writer;
private final String debuginfo;
public CapturingResponse(SlingHttpServletResponse response, PrintWriter printWriter, String debuginfo) {
super(response);
this.writer = printWriter;
this.debuginfo = debuginfo;
}
@Override
public PrintWriter getWriter() {
return writer;
}
protected UnsupportedOperationException logAndThrow(String error) {
LOG.warn("Unsupported method called for {} : {}", debuginfo, error);
throw new UnsupportedOperationException(error);
}
// The following methods are likely not needed; we mostly throw an exception to find whether this assumption is right.
@Override
public ServletOutputStream getOutputStream() throws IOException {
throw logAndThrow("Not implemented: CapturingResponse.getOutputStream");
}
@Override
public void addCookie(Cookie cookie) {
// that might actually get a problem later, so we at least log it. Not to be expected, though.
LOG.warn("Not implemented: CapturingResponse.addCookie {}", cookie.getName());
}
@Override
public void sendError(int sc, String msg) throws IOException {
throw logAndThrow("Not implemented: CapturingResponse.sendError");
}
@Override
public void sendError(int sc) throws IOException {
throw logAndThrow("Not implemented: CapturingResponse.sendError");
}
@Override
public void sendRedirect(String location) throws IOException {
throw logAndThrow("Not implemented: CapturingResponse.sendRedirect");
}
@Override
public void setDateHeader(String name, long date) {
// ignore
}
@Override
public void setHeader(String name, String value) {
// ignore
}
@Override
public void setIntHeader(String name, int value) {
// ignore
}
@Override
public void setStatus(int sc) {
if (sc != 200) {
throw logAndThrow("Status other than 200 not supported: CapturingResponse.setStatus " + sc);
}
// just ignore OK, that's fine
}
@Override
public void setStatus(int sc, String sm) {
if (sc != 200) {
throw logAndThrow("Not implemented: CapturingResponse.setStatus " + sc + " : " + sm);
}
// just ignore 200, that's fine
}
@Override
public int getStatus() {
return 200;
}
@Override
public void setCharacterEncoding(String charset) {
// ignore
}
@Override
public void setContentLength(int len) {
// ignore
}
@Override
public void setContentLengthLong(long len) {
// ignore
}
@Override
public void setContentType(String type) {
// ignore
}
@Override
public void setBufferSize(int size) {
// ignore
}
@Override
public void flushBuffer() throws IOException {
// ignore
}
@Override
public void reset() {
throw logAndThrow("Not implemented: CapturingResponse.reset");
}
@Override
public void resetBuffer() {
throw logAndThrow("Not implemented: CapturingResponse.resetBuffer");
}
@Override
public void setLocale(Locale loc) {
// ignore
}
}
/**
* Wraps the request to make sure nothing is modified.
*/
protected class NonModifyingRequestWrapper extends SlingHttpServletRequestWrapper {
private final String debuginfo;
protected boolean inAdaptTo;
protected boolean hadInvalidOperation;
/**
* Either Object[0] for a removed attribute or new Object{attributevalue} for changed object.
*/
private Map changedAttributes = new HashMap<>();
public NonModifyingRequestWrapper(SlingHttpServletRequest wrappedRequest, String debuginfo) {
super(wrappedRequest);
this.debuginfo = debuginfo;
}
protected UnsupportedOperationCalled logAndThrow(String error) {
LOG.warn("Unsupported method called for {} : {}", debuginfo, error);
hadInvalidOperation = true;
throw new UnsupportedOperationCalled();
}
@Override
public RequestParameter getRequestParameter(String name) {
return null;
}
@Override
public RequestParameterMap getRequestParameterMap() {
return new EmptyRequestParameterMap();
}
@Override
public List getRequestParameterList() {
return Collections.emptyList();
}
@Override
public RequestParameter[] getRequestParameters(String name) {
return new RequestParameter[0];
}
/** We render with GET, obviously. */
@Override
public String getMethod() {
return "GET";
}
// Methods we think are too dangerous to use since they might modify the request, so we mostly throw an exception.
// Possibly we'll have to rethink this.
@Nullable
@Override
public RequestDispatcher getRequestDispatcher(@NotNull String path, RequestDispatcherOptions options) {
// This actually happens on some components, but we rather don't want to support it to avoid side effects.
throw logAndThrow("Not implemented: NonModifyingRequestWrapper.getRequestDispatcher");
}
@Nullable
@Override
public RequestDispatcher getRequestDispatcher(@NotNull Resource resource, RequestDispatcherOptions options) {
// This actually happens on some components, but we rather don't want to support it to avoid side effects.
throw logAndThrow("Not implemented: NonModifyingRequestWrapper.getRequestDispatcher");
}
@Nullable
@Override
public RequestDispatcher getRequestDispatcher(@NotNull Resource resource) {
// This actually happens on some components, but we rather don't want to support it to avoid side effects.
throw logAndThrow("Not implemented: NonModifyingRequestWrapper.getRequestDispatcher");
}
@Override
public HttpSession getSession(boolean create) {
throw logAndThrow("Not implemented: NonModifyingRequestWrapper.getSession");
}
@Override
public HttpSession getSession() {
throw logAndThrow("Not implemented: NonModifyingRequestWrapper.getSession");
}
@Override
public String changeSessionId() {
throw logAndThrow("Not implemented: NonModifyingRequestWrapper.changeSessionId");
}
@Override
public boolean authenticate(HttpServletResponse response) throws IOException, ServletException {
throw logAndThrow("Not implemented: NonModifyingRequestWrapper.authenticate");
}
@Override
public void login(String username, String password) throws ServletException {
throw logAndThrow("Not implemented: NonModifyingRequestWrapper.login");
}
@Override
public void logout() throws ServletException {
throw logAndThrow("Not implemented: NonModifyingRequestWrapper.logout");
}
@Override
public T upgrade(Class handlerClass) throws IOException, ServletException {
throw logAndThrow("Not implemented: NonModifyingRequestWrapper.upgrade");
}
@Override
public void setCharacterEncoding(String env) {
LOG.debug("ignoring NonModifyingRequestWrapper.setCharacterEncoding {}", env);
// ignore, though somewhat doubtfully
}
@Override
public void setAttribute(String name, Object o) {
LOG.trace("emulating NonModifyingRequestWrapper.setAttribute {} for {}", name, debuginfo);
changedAttributes.put(name, new Object[]{o});
}
@Override
public void removeAttribute(String name) {
LOG.trace("emulating NonModifyingRequestWrapper.removeAttribute {} for {}", name, debuginfo);
changedAttributes.put(name, new Object[0]);
}
@Override
public Object getAttribute(String name) {
Object[] change = changedAttributes.get(name);
if (change != null) {
if (change.length == 0) {
return null;
} else {
return change[0];
}
}
return super.getAttribute(name);
}
@Override
public AsyncContext startAsync() throws IllegalStateException {
throw logAndThrow("Not implemented: NonModifyingRequestWrapper.startAsync");
}
@Override
public AsyncContext startAsync(ServletRequest servletRequest, ServletResponse servletResponse) throws IllegalStateException {
throw logAndThrow("Not implemented: NonModifyingRequestWrapper.startAsync");
}
@Override
public AsyncContext getAsyncContext() {
throw logAndThrow("Not implemented: NonModifyingRequestWrapper.getAsyncContext");
}
@Override
public AdapterType adaptTo(Class type) {
if (inAdaptTo) {
throw logAndThrow("Loop in NonModifyingRequestWrapper.adaptTo " + type);
}
try {
inAdaptTo = true; // make sure the adaptermanager doesn't just call adaptTo again - we'll have to give up then.
return adapterManager.getAdapter(this, type);
} finally {
inAdaptTo = false;
}
}
}
/**
* Thrown when unsupported operation was called that requires blacklisting.
*/
protected static class UnsupportedOperationCalled extends RuntimeException {
// empty
}
protected static class EmptyRequestParameterMap extends AbstractMap implements RequestParameterMap {
@Override
public Set> entrySet() {
return Collections.emptySet();
}
@Nullable
@Override
public RequestParameter[] getValues(@NotNull String name) {
return null;
}
@Nullable
@Override
public RequestParameter getValue(String name) {
return null;
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy