
com.offerready.xslt.HtmlBodyExtractor Maven / Gradle / Ivy
Show all versions of xslt-library Show documentation
package com.offerready.xslt;
import com.databasesandlife.util.Timer;
import javax.annotation.CheckForNull;
import javax.annotation.Nonnull;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
/**
* Extracts the body of an HTML report document.
*
* An "HTML document generator" generates HTML, suitable for viewing in a browser, or inclusion in an iframe.
* However, this HTML contains e.g. <html> tags which are not suitable for display within a <div>.
* Objects of this class extract the relevant information from the HTML document, and return an HTML
* string suitable for inclusion in a <div>.
*
* The entire contents of the <body> are returned.
* Any <style> tags in the <head> are extracted and prepended to this body.
*/
public class HtmlBodyExtractor {
/** Substrings registered through {@link #addScriptToIgnore}; consulted when scripts are filtered out. */
protected List<String> scriptsToIgnore = new ArrayList<>();
/** A span of character offsets; {@code startIncl} is the first offset, {@code endExcl} is one past the last. */
protected static class Range {
    int startIncl;
    int endExcl;

    Range(int s, int e) {
        this.startIncl = s;
        this.endExcl = e;
    }

    /**
     * Two spans overlap unless one of them ends at or before the offset where the other starts.
     *
     * @param x the span to compare against this one
     * @return {@code true} if neither span lies entirely before the other
     */
    boolean overlaps(Range x) {
        return x.endExcl > startIncl && x.startIncl < endExcl;
    }
}
/**
 * Registers a substring identifying a <script> tag that should be left out of the result.
 * For example, if JQuery should not be included, pass "jquery" here.
 *
 * @param scriptSubstring text that marks the <script> tag to be ignored
 * @return this extractor, so that calls can be chained
 */
public @Nonnull HtmlBodyExtractor addScriptToIgnore(@Nonnull String scriptSubstring) {
    this.scriptsToIgnore.add(scriptSubstring);
    return this;
}
/**
 * Removes every {@code <script>} element whose opening tag contains one of the substrings
 * held in {@code scriptsToIgnore}.
 *
 * @param html HTML fragment to filter
 * @return the fragment with the matching script elements removed; the same text if nothing matches
 */
protected @Nonnull String ignoreScripts(@Nonnull String html) {
    for (var s : scriptsToIgnore) {
        // Quote the substring so characters such as '.' in "jquery.min.js" are matched literally.
        // String.valueOf keeps this valid whether or not the list declares an element type.
        var scriptElement = Pattern.compile(
            "<(?i:script)\\b[^>]*" + Pattern.quote(String.valueOf(s)) + "[^>]*>.*?</(?i:script)\\s*>",
            Pattern.DOTALL); // DOTALL: an inline script body may span several lines
        html = scriptElement.matcher(html).replaceAll("");
    }
    return html;
}
/**
 * Finds each occurrence of {@code start} ... {@code end} in {@code input}, passes the extracted text
 * through {@link #ignoreScripts}, and prepends everything found, followed by a newline, to {@code result}.
 *
 * @param result            buffer that receives the extracted text at its front
 * @param input             text to search
 * @param ranges            spans already extracted; a candidate overlapping any of them is skipped,
 *                          and each span extracted here is appended to this list
 * @param start             text marking the beginning of an element
 * @param endOfStartOrNull  if {@code null}, the extracted span runs from {@code start} through the end of
 *                          {@code end}; otherwise it runs from just after the first occurrence of this
 *                          text (searched from {@code start}) up to, but excluding, {@code end}
 * @param end               text marking the end of an element
 */
protected void extractElements(
    @Nonnull StringBuilder result, @Nonnull String input,
    @Nonnull List<Range> ranges,
    @Nonnull String start, @CheckForNull String endOfStartOrNull, @Nonnull String end
) {
    var ourResult = new StringBuilder();
    var startIdx = -1;
    while ((startIdx = input.indexOf(start, startIdx + 1)) >= 0) {
        var idxOfEndTag = input.indexOf(end, startIdx);
        // No terminator from here on means no later start can be terminated either.
        if (idxOfEndTag < 0) break;

        Range range;
        if (endOfStartOrNull == null) {
            range = new Range(startIdx, idxOfEndTag + end.length());
        } else {
            var idxOfEndOfStart = input.indexOf(endOfStartOrNull, startIdx);
            // Same reasoning as above: a later search cannot succeed where this one failed.
            if (idxOfEndOfStart < 0) break;
            var contentStart = idxOfEndOfStart + endOfStartOrNull.length();
            // Malformed markup: the start delimiter is only closed after the end delimiter.
            if (contentStart > idxOfEndTag) continue;
            range = new Range(contentStart, idxOfEndTag);
        }

        var rangeOverlaps = false;
        for (var r : ranges) {
            if (r.overlaps(range)) {
                rangeOverlaps = true;
                break;
            }
        }
        if (rangeOverlaps) continue;

        ourResult.append(ignoreScripts(input.substring(range.startIncl, range.endExcl)));
        ranges.add(range);
    }
    ourResult.append("\n");
    // Inserted at the front, so text from later calls ends up before text from earlier calls.
    result.insert(0, ourResult);
}
public @Nonnull String extractBody(@Nonnull String htmlText) {
try (var ignored = new Timer("HtmlBodyExtractor.extractBody")) {
var result = new StringBuilder(); // 2.5 seconds at 1k iterations on i3
var ranges = new ArrayList();
extractElements(result, htmlText, ranges, "", "");
extractElements(result, htmlText, ranges, "");
extractElements(result, htmlText, ranges, "");
extractElements(result, htmlText, ranges, "