All downloads are free. The search and download functionality uses the official Maven repository.

com.offerready.xslt.HtmlBodyExtractor Maven / Gradle / Ivy

The newest version!
package com.offerready.xslt;

import com.databasesandlife.util.Timer;

import javax.annotation.CheckForNull;
import javax.annotation.Nonnull;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

/**
 * Extracts the body of an HTML report document.
 *    

* An "HTML document generator" generates HTML, suitable for viewing in a browser, or inclusion in an iframe. * However, this HTML contains e.g. <html> tags which are not suitable for display within a <div>. * Objects of this class extract the relevant information from the HTML document, and return an HTML * string suitable for inclusion in a <div>. *

* The entire contents of the <body> are returned. * Any <style> tags in the <head> are extracted are prepended to this body. */ public class HtmlBodyExtractor { protected List scriptsToIgnore = new ArrayList<>(); protected static class Range { int startIncl, endExcl; Range(int s, int e) { startIncl = s; endExcl = e; } boolean overlaps(Range x) { if (x.endExcl <= startIncl) return false; if (x.startIncl >= endExcl) return false; return true; } } /** * For example if JQuery should not be included; simply add "jquery" to this method and this <script> * tag will not be included in the result. */ public @Nonnull HtmlBodyExtractor addScriptToIgnore(@Nonnull String scriptSubstring) { scriptsToIgnore.add(scriptSubstring); return this; } protected @Nonnull String ignoreScripts(@Nonnull String html) { for (var s : scriptsToIgnore) html = html.replaceAll( "", ""); return html; } protected void extractElements( @Nonnull StringBuilder result, @Nonnull String input, @Nonnull List ranges, @Nonnull String start, @CheckForNull String endOfStartOrNull, @Nonnull String end ) { var startIdx = -1; var ourResult = new StringBuilder(); while ((startIdx = input.indexOf(start, startIdx+1)) >= 0) { var idxOfEndTag = input.indexOf(end, startIdx); Range range = endOfStartOrNull == null ? 
new Range(startIdx, idxOfEndTag + end.length()) : new Range(input.indexOf(endOfStartOrNull, startIdx) + endOfStartOrNull.length(), idxOfEndTag); var rangeOverlaps = false; for (var r : ranges) if (r.overlaps(range)) rangeOverlaps = true; if (rangeOverlaps) continue; ourResult.append(ignoreScripts(input.substring(range.startIncl, range.endExcl))); ranges.add(range); } ourResult.append("\n"); result.insert(0, ourResult); } public @Nonnull String extractBody(@Nonnull String htmlText) { try (var ignored = new Timer("HtmlBodyExtractor.extractBody")) { var result = new StringBuilder(); // 2.5 seconds at 1k iterations on i3 var ranges = new ArrayList(); extractElements(result, htmlText, ranges, "", ""); extractElements(result, htmlText, ranges, ""); extractElements(result, htmlText, ranges, ""); extractElements(result, htmlText, ranges, ""); return result.toString(); } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy