// lt.velykis.maven.skins.reflow.HtmlTool Maven / Gradle / Ivy
/*
* Copyright 2012 Andrius Velykis
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package lt.velykis.maven.skins.reflow;
import java.text.Normalizer;
import java.text.Normalizer.Form;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.Stack;
import java.util.regex.Pattern;
import org.apache.velocity.tools.ToolContext;
import org.apache.velocity.tools.config.DefaultKey;
import org.apache.velocity.tools.generic.SafeConfig;
import org.apache.velocity.tools.generic.ValueParser;
import org.jsoup.Jsoup;
import org.jsoup.helper.StringUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.parser.Tag;
/**
* An Apache Velocity tool that provides utility methods to manipulate HTML code using
* jsoup HTML5 parser.
*
* The methods utilise CSS
* selectors to refer to specific elements for manipulation.
*
*
* @author Andrius Velykis
* @since 1.0
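* @see <a href="https://jsoup.org/">jsoup HTML parser</a>
* @see <a href="https://jsoup.org/cookbook/extracting-data/selector-syntax">jsoup CSS selectors</a>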
*/
@DefaultKey("htmlTool")
public class HtmlTool extends SafeConfig {
/** Tag names of all HTML heading elements ({@code h1}-{@code h6}), in rank order. */
private static final List<String> HEADINGS = Collections.unmodifiableList(
        Arrays.asList("h1", "h2", "h3", "h4", "h5", "h6"));
/**
 * Strategy that decides what happens to a separator element when HTML content is partitioned
 * around it.
 */
public enum JoinSeparator {

    /**
     * The separator opens the partition that follows it, i.e. it becomes the first element of
     * the next partition. Consequently the very first partition carries no separator.
     */
    AFTER,

    /**
     * The separator closes the partition that precedes it, i.e. it becomes the last element of
     * the previous partition. Consequently the very last partition carries no separator.
     */
    BEFORE,

    /** The separator is discarded and appears in no partition. */
    NO
}
/**
 * Charset name applied to the output settings of parsed documents. Defaults to UTF-8 and is
 * replaced in {@code configure(ValueParser)} when the Velocity context supplies a String
 * {@code outputEncoding} value.
 */
private String outputEncoding = "UTF-8";
/**
 * {@inheritDoc}
 *
 * <p>
 * Looks up the {@code velocityContext} configuration value and, when it is a
 * {@link ToolContext} holding a String under the {@code outputEncoding} key, adopts that
 * string as the output encoding of this tool. In every other case the current encoding is left
 * untouched.
 * </p>
 *
 * @param values
 *            the tool configuration values
 * @see SafeConfig#configure(ValueParser)
 */
@Override
protected void configure(ValueParser values) {
    Object context = values.get("velocityContext");
    if (context instanceof ToolContext) {
        // the tool context may carry the encoding the site is rendered with
        Object encoding = ((ToolContext) context).get("outputEncoding");
        if (encoding instanceof String) {
            this.outputEncoding = (String) encoding;
        }
    }
}
/**
 * Splits the given HTML content into partitions based on the given separator selector. The
 * separators themselves are dropped from the results.
 *
 * @param content
 *            HTML content to split
 * @param separatorCssSelector
 *            CSS selector for separators
 * @return a list of HTML partitions split on separator locations, but without the separators
 * @since 1.0
 * @see #split(String, String, JoinSeparator)
 */
public List<String> split(String content, String separatorCssSelector) {
    // delegate with the "drop separators" strategy
    return split(content, separatorCssSelector, JoinSeparator.NO);
}
/**
 * Splits the given HTML content into partitions based on the given separator selector. The
 * separators are kept as first elements of the partitions.
 *
 * <p>
 * Note that the first part is removed if the split was successful. This is because the first
 * part does not include the separator.
 * </p>
 *
 * @param content
 *            HTML content to split
 * @param separatorCssSelector
 *            CSS selector for separators
 * @return a list of HTML partitions split on separator locations (except the first one), with
 *         separators at the beginning of each partition. If the split yields at most one part,
 *         that result is returned unchanged.
 * @since 1.0
 * @see #split(String, String, JoinSeparator)
 */
public List<String> splitOnStarts(String content, String separatorCssSelector) {
    List<String> result = split(content, separatorCssSelector, JoinSeparator.AFTER);
    if (result == null || result.size() <= 1) {
        // no result or just one part - return what we have
        return result;
    }

    // otherwise, drop the first part - the first split will be the first 'start'
    // e.g. if we split on headings, the first part will contain everything
    // before the first heading.
    return result.subList(1, result.size());
}
/**
 * Splits the given HTML content into partitions based on the given separator selector. The
 * separators are either dropped or joined with before/after depending on the indicated
 * separator strategy.
 *
 * @param content
 *            HTML content to split
 * @param separatorCssSelector
 *            CSS selector for separators
 * @param separatorStrategy
 *            strategy to drop or keep separators, one of "after", "before" or "no". Any other
 *            value (including {@code null}) is treated as "no".
 * @return a list of HTML partitions split on separator locations
 * @since 1.0
 * @see #split(String, String, JoinSeparator)
 */
public List<String> split(String content, String separatorCssSelector,
        String separatorStrategy) {

    // map the textual strategy to the enum; unknown names fall back to dropping separators
    JoinSeparator sepStrategy;
    if ("before".equals(separatorStrategy)) {
        sepStrategy = JoinSeparator.BEFORE;
    } else if ("after".equals(separatorStrategy)) {
        sepStrategy = JoinSeparator.AFTER;
    } else {
        sepStrategy = JoinSeparator.NO;
    }

    return split(content, separatorCssSelector, sepStrategy);
}
/**
 * Splits the given HTML content into partitions based on the given separator selector. The
 * separators are either dropped or joined with before/after depending on the indicated
 * separator strategy.
 *
 * <p>
 * Note that splitting algorithm tries to resolve nested elements so that returned partitions
 * are self-contained HTML elements. The nesting is normally contained within the first
 * applicable partition.
 * </p>
 *
 * @param content
 *            HTML content to split
 * @param separatorCssSelector
 *            CSS selector for separators
 * @param separatorStrategy
 *            strategy to drop or keep separators
 * @return a list of HTML partitions split on separator locations. If no splitting occurs,
 *         returns the original content as the single element of the list
 * @since 1.0
 */
public List<String> split(String content, String separatorCssSelector,
        JoinSeparator separatorStrategy) {

    Element body = parseContent(content);

    List<Element> separators = body.select(separatorCssSelector);
    if (separators.isEmpty()) {
        // nothing to split
        return Collections.singletonList(content);
    }

    // partition the element tree, then render every partition back to HTML
    List<List<Element>> partitions = split(separators, separatorStrategy, body);

    List<String> sectionHtml = new ArrayList<String>();
    for (List<Element> partition : partitions) {
        sectionHtml.add(outerHtml(partition));
    }

    return sectionHtml;
}
/**
* Recursively splits the {@code parent} element based on the given {@code separators}. If a
* separator is encountered in the parent, it is split on that position. The outstanding nested
* elements go with the first of the partitions in each case.
*
* @param separators
* @param separatorStrategy
* @param parent
* @return list of partitions (as lists of root elements for each partition). Partition can be
* an empty list, e.g. if the separator is at the start of the content.
*/
private static List> split(Collection separators,
JoinSeparator separatorStrategy, Element parent) {
List> partitions = new LinkedList>();
for (Element child : parent.children()) {
if (separators.contains(child)) {
// split here and do not go deeper
// first ensure there was a partition before
// otherwise the split is not recognised on an outer level
getLastPartition(partitions);
if (separatorStrategy == JoinSeparator.BEFORE) {
// add to the last partition
getLastPartition(partitions).add(child);
}
// add an empty new partition
List newPartition = new LinkedList();
partitions.add(newPartition);
if (separatorStrategy == JoinSeparator.AFTER) {
// add to the new partition
newPartition.add(child);
}
} else {
// go deeper
List> childPartitions = split(separators, separatorStrategy, child);
// add the child to the last partition
getLastPartition(partitions).add(child);
if (childPartitions.size() > 1) {
// more than one partition:
// only keep the first partition elements in the child
// so for all other partitions, remove them from their parents
List allChildren = child.children();
List firstPartition = childPartitions.get(0);
allChildren.removeAll(firstPartition);
for (Element removeChild : allChildren) {
removeChild.remove();
}
// add the remaining partitions
for (List nextPartition : childPartitions.subList(1, childPartitions.size())) {
partitions.add(nextPartition);
}
}
}
}
return partitions;
}
/**
* Retrieves the last partition (as list of elements) or creates a new one if there was none
* before.
*
* @param partitions
* @return
*/
private static List getLastPartition(List> partitions) {
if (partitions.isEmpty()) {
List newPartition = new LinkedList();
partitions.add(newPartition);
return newPartition;
} else {
return partitions.get(partitions.size() - 1);
}
}
/**
 * Outputs the list of partition root elements to HTML.
 *
 * <p>
 * When there is more than one element, the elements are appended to a temporary {@code div},
 * which moves them under that new parent.
 * </p>
 *
 * @param elements
 *            root elements of a single partition
 * @return the HTML of the elements: an empty string for no elements, the outer HTML of a
 *         single element, or the concatenated HTML of several elements
 */
private static String outerHtml(List<Element> elements) {
    switch (elements.size()) {
        case 0:
            return "";
        case 1:
            return elements.get(0).outerHtml();
        default: {
            // more than one element
            // wrap into a <div>, of which only the inner HTML is output
            Element root = new Element(Tag.valueOf("div"), "");
            for (Element elem : elements) {
                root.appendChild(elem);
            }

            return root.html();
        }
    }
}
/**
 * Moves the elements matched by the selector to the top of the HTML content, keeping the rest
 * of the content as it is (no wrapping of the remainder). The number of moved elements can be
 * capped, e.g. to bring only the first match to the top.
 *
 * @param content
 *            HTML content to reorder
 * @param selector
 *            CSS selector for elements to bring to top of the content
 * @param amount
 *            Maximum number of elements to reorder
 * @return HTML content with reordered elements, or the original content if no such elements
 *         found.
 * @since 1.0
 */
public String reorderToTop(String content, String selector, int amount) {
    // no wrapper for the remaining content
    final String noWrapper = null;
    return reorderToTop(content, selector, amount, noWrapper);
}
/**
 * Reorders elements in HTML content so that selected elements are found at the top of the
 * content. Can be limited to a certain amount, e.g. to bring just the first of selected
 * elements to the top.
 *
 * @param content
 *            HTML content to reorder
 * @param selector
 *            CSS selector for elements to bring to top of the content
 * @param amount
 *            Maximum number of elements to reorder
 * @param wrapRemaining
 *            HTML to wrap the remaining (non-reordered) part, or {@code null} for no wrapping
 * @return HTML content with reordered elements, or the original content if no such elements
 *         found.
 * @since 1.0
 */
public String reorderToTop(String content, String selector, int amount,
        String wrapRemaining) {

    // extract the elements and then prepend them to the remaining body
    List<Element> extracted = extractElements(content, selector, amount);

    if (extracted.size() <= 1) {
        // only the body came back - nothing to reorder
        return content;
    }

    // first element is the remaining body, the rest are the extracted elements
    Element body = extracted.get(0);
    if (wrapRemaining != null) {
        wrapInner(body, wrapRemaining);
    }

    List<Element> elements = extracted.subList(1, extracted.size());

    // now prepend extracted elements to the body (in backwards to preserve original order)
    for (int index = elements.size() - 1; index >= 0; index--) {
        body.prependChild(elements.get(index));
    }

    return body.html();
}
/**
 * Wraps the child elements of the given element with the given HTML.
 *
 * <p>
 * The child elements are first moved into a temporary {@code div}; the {@code div} is wrapped
 * with {@code html} and then unwrapped again, so that it does not remain in the hierarchy.
 * </p>
 *
 * @param element
 *            the element whose child elements get wrapped
 * @param html
 *            the HTML to wrap with
 * @return the given {@code element}
 */
private static Element wrapInner(Element element, String html) {

    // temporary holder that receives all child elements
    Element holder = new Element(Tag.valueOf("div"), "");

    for (Element current : element.children()) {
        // detach from the element and re-attach under the holder
        current.remove();
        holder.appendChild(current);
    }

    // put the holder into the element and wrap it with the requested HTML
    element.appendChild(holder);
    holder.wrap(html);

    // unwrapping removes the holder itself from the hierarchy
    holder.unwrap();

    return element;
}
/**
 * Extracts elements from the HTML content.
 *
 * <p>
 * Matching elements nested inside other matching elements are not extracted separately. The
 * extracted elements are detached from the parsed body.
 * </p>
 *
 * @param content
 *            HTML content to extract elements from
 * @param selector
 *            CSS selector for elements to extract
 * @param amount
 *            maximum number of elements to extract; a negative value means no limit
 * @return the remainder and a list of extracted elements. The main body (remainder after
 *         extraction) is always returned as the first element of the list.
 */
private List<Element> extractElements(String content, String selector, int amount) {

    Element body = parseContent(content);

    List<Element> elements = body.select(selector);
    if (!elements.isEmpty()) {

        elements = filterParents(elements);

        if (amount >= 0) {
            // limit to the indicated amount
            elements = elements.subList(0, Math.min(amount, elements.size()));
        }

        // remove all from their parents
        for (Element element : elements) {
            element.remove();
        }
    }

    List<Element> results = new ArrayList<Element>();
    // first element is the body
    results.add(body);
    results.addAll(elements);
    return results;
}
/**
 * Filters the list of elements to only contain parent elements. This is to avoid both parent
 * and child being in the list of elements.
 *
 * @param elements
 *            the selected elements
 * @return a new list with those elements that have no ancestor in {@code elements}, in their
 *         original order
 */
private static List<Element> filterParents(List<Element> elements) {
    List<Element> filtered = new ArrayList<Element>();
    for (Element element : elements) {
        // Keep the element only when none of its ancestors is selected as well.
        // disjoint() only reads the ancestor list, so the document cannot be affected.
        if (Collections.disjoint(element.parents(), elements)) {
            filtered.add(element);
        }
    }

    return filtered;
}
/**
 * Extracts HTML elements from the main HTML content. The result consists of the extracted HTML
 * elements and the remainder of HTML content, with these elements removed. Can be limited to a
 * certain amount, e.g. to extract just the first of selected elements.
 *
 * @param content
 *            HTML content to extract elements from
 * @param selector
 *            CSS selector for elements to extract
 * @param amount
 *            Maximum number of elements to extract
 * @return HTML content of the extracted elements together with the remainder of the original
 *         content. If no elements are found, the remainder contains the original content.
 * @since 1.0
 */
public ExtractResult extract(String content, String selector, int amount) {

    List<Element> extracted = extractElements(content, selector, amount);

    if (extracted.size() <= 1) {
        // only the body came back - nothing to extract
        return new DefaultExtractResult(Collections.<String> emptyList(), content);
    }

    // first element is the remaining body, the rest are extracted
    Element body = extracted.get(0);

    // convert to HTML
    List<String> elementStr = new ArrayList<String>();
    for (Element el : extracted.subList(1, extracted.size())) {
        elementStr.add(el.outerHtml());
    }

    return new DefaultExtractResult(elementStr, body.html());
}
/**
 * A container to carry element extraction results. Contains the extracted element HTML
 * code and the remainder of the body content with elements removed.
 *
 * @author Andrius Velykis
 * @since 1.0
 */
public interface ExtractResult {

    /**
     * Retrieves the extracted HTML elements.
     *
     * @return List of HTML of extracted elements. Can be empty if no elements found.
     */
    List<String> getExtracted();

    /**
     * Retrieves the content from which elements were extracted.
     *
     * @return The HTML content with extracted elements removed.
     */
    String getRemainder();
}
/** Simple immutable-view implementation of {@link ExtractResult}. */
private static class DefaultExtractResult implements ExtractResult {

    /** HTML of each extracted element. */
    private final List<String> extracted;

    /** HTML of the content that remains after extraction. */
    private final String remainder;

    public DefaultExtractResult(List<String> extracted, String remainder) {
        this.extracted = extracted;
        this.remainder = remainder;
    }

    /** {@inheritDoc} The returned list is an unmodifiable view. */
    @Override
    public List<String> getExtracted() {
        return Collections.unmodifiableList(extracted);
    }

    @Override
    public String getRemainder() {
        return remainder;
    }
}
/**
 * Sets attribute to the given value on elements in HTML.
 *
 * @param content
 *            HTML content to set attributes on
 * @param selector
 *            CSS selector for elements to modify
 * @param attributeKey
 *            Attribute name
 * @param value
 *            Attribute value
 * @return HTML content with modified elements. If no elements are found, the original content
 *         is returned.
 * @since 1.0
 */
public String setAttr(String content, String selector, String attributeKey, String value) {

    Element body = parseContent(content);

    List<Element> elements = body.select(selector);
    if (elements.isEmpty()) {
        // nothing to update
        return content;
    }

    for (Element element : elements) {
        element.attr(attributeKey, value);
    }

    return body.html();
}
/**
 * Parses the HTML as a body fragment and hands back the resulting {@code <body>} element. The
 * output charset of the parsed document is set to the encoding configured for this tool.
 *
 * @param content
 *            HTML body fragment to parse
 * @return the {@code body} element of the parsed content
 */
private Element parseContent(String content) {
    Document parsed = Jsoup.parseBodyFragment(content);
    // render with the configured output encoding
    parsed.outputSettings().charset(outputEncoding);
    Element bodyElement = parsed.body();
    return bodyElement;
}
/**
 * Retrieves attribute value on elements in HTML. Will return all attribute values for the
 * selector, since there can be more than one element.
 *
 * @param content
 *            HTML content to read attributes from
 * @param selector
 *            CSS selector for elements to find
 * @param attributeKey
 *            Attribute name
 * @return Attribute values for all matching elements, one entry per element. If no elements
 *         are found, empty list is returned.
 * @since 1.0
 */
public List<String> getAttr(String content, String selector, String attributeKey) {

    Element body = parseContent(content);

    List<Element> elements = body.select(selector);
    List<String> attrs = new ArrayList<String>();

    for (Element element : elements) {
        attrs.add(element.attr(attributeKey));
    }

    return attrs;
}
/**
 * Adds given class names to the elements in HTML.
 *
 * @param content
 *            HTML content to modify
 * @param selector
 *            CSS selector for elements to add classes to
 * @param classNames
 *            Names of classes to add to the selected elements
 * @param amount
 *            Maximum number of elements to modify; a negative value means no limit
 * @return HTML content with modified elements. If no elements are found, the original content
 *         is returned.
 * @since 1.0
 */
public String addClass(String content, String selector, List<String> classNames, int amount) {

    Element body = parseContent(content);

    List<Element> elements = body.select(selector);
    if (amount >= 0) {
        // limit to the indicated amount
        elements = elements.subList(0, Math.min(amount, elements.size()));
    }

    if (elements.isEmpty()) {
        // nothing to update
        return content;
    }

    for (Element element : elements) {
        for (String className : classNames) {
            element.addClass(className);
        }
    }

    return body.html();
}
/**
 * Adds given class names to the elements in HTML. All matching elements are modified.
 *
 * @param content
 *            HTML content to modify
 * @param selector
 *            CSS selector for elements to add classes to
 * @param classNames
 *            Names of classes to add to the selected elements
 * @return HTML content with modified elements. If no elements are found, the original content
 *         is returned.
 * @since 1.0
 */
public String addClass(String content, String selector, List<String> classNames) {
    // a negative amount means "no limit"
    return addClass(content, selector, classNames, -1);
}
/**
 * Adds a single class to every element in the HTML that matches the selector.
 *
 * @param content
 *            HTML content to modify
 * @param selector
 *            CSS selector for elements to add the class to
 * @param className
 *            Name of class to add to the selected elements
 * @return HTML content with modified elements. If no elements are found, the original content
 *         is returned.
 * @since 1.0
 */
public String addClass(String content, String selector, String className) {
    // reuse the list-based variant with a one-element list
    List<String> singleClass = Collections.singletonList(className);
    return addClass(content, selector, singleClass);
}
/**
 * Wraps elements in HTML with the given HTML.
 *
 * @param content
 *            HTML content to modify
 * @param selector
 *            CSS selector for elements to wrap
 * @param wrapHtml
 *            HTML to use for wrapping the selected elements
 * @param amount
 *            Maximum number of elements to modify; a negative value means no limit
 * @return HTML content with modified elements. If no elements are found, the original content
 *         is returned.
 * @since 1.0
 */
public String wrap(String content, String selector, String wrapHtml, int amount) {

    Element body = parseContent(content);

    List<Element> elements = body.select(selector);
    if (amount >= 0) {
        // limit to the indicated amount
        elements = elements.subList(0, Math.min(amount, elements.size()));
    }

    if (elements.isEmpty()) {
        // nothing to update
        return content;
    }

    for (Element element : elements) {
        element.wrap(wrapHtml);
    }

    return body.html();
}
/**
 * Removes elements from HTML.
 *
 * @param content
 *            HTML content to modify
 * @param selector
 *            CSS selector for elements to remove
 * @return HTML content with removed elements. If no elements are found, the original content is
 *         returned.
 * @since 1.0
 */
public String remove(String content, String selector) {

    Element body = parseContent(content);

    List<Element> elements = body.select(selector);
    if (elements.isEmpty()) {
        // nothing changed
        return content;
    }

    for (Element element : elements) {
        element.remove();
    }

    return body.html();
}
/**
 * Replaces every element in the HTML that matches the selector with the given replacement.
 *
 * @param content
 *            HTML content to modify
 * @param selector
 *            CSS selector for elements to replace
 * @param replacement
 *            HTML replacement (must parse to a single element)
 * @return HTML content with replaced elements. If no elements are found, the original content is
 *         returned.
 * @since 1.0
 */
public String replace(String content, String selector, String replacement) {
    // reuse the map-based variant with a single selector-to-replacement mapping
    Map<String, String> singleReplacement = Collections.singletonMap(selector, replacement);
    return replaceAll(content, singleReplacement);
}
/**
 * Replaces elements in HTML.
 *
 * @param content
 *            HTML content to modify
 * @param replacements
 *            Map of CSS selectors to their replacement HTML texts. CSS selectors find elements
 *            to be replaced with the HTML in the mapping. The HTML must parse to a single
 *            element; only its first element is used. A replacement that contains no element
 *            at all (e.g. empty or plain text) is skipped.
 * @return HTML content with replaced elements. If no elements are found, the original content
 *         is returned.
 * @since 1.0
 */
public String replaceAll(String content, Map<String, String> replacements) {

    Element body = parseContent(content);

    boolean modified = false;
    for (Entry<String, String> replacementEntry : replacements.entrySet()) {
        String selector = replacementEntry.getKey();
        String replacement = replacementEntry.getValue();

        List<Element> elements = body.select(selector);
        if (elements.isEmpty()) {
            continue;
        }

        // take the first child - but check there is one, since child(0) would throw
        // for a replacement without any element
        List<Element> replacementRoots = parseContent(replacement).children();
        if (replacementRoots.isEmpty()) {
            continue;
        }
        Element replacementElem = replacementRoots.get(0);

        for (Element element : elements) {
            // every replaced element gets its own copy of the replacement
            element.replaceWith(replacementElem.clone());
        }

        modified = true;
    }

    if (modified) {
        return body.html();
    } else {
        // nothing changed
        return content;
    }
}
/**
 * Retrieves text content of the selected elements in HTML. Renders the element's text as it
 * would be displayed on the web page (including its children).
 *
 * @param content
 *            HTML content with the elements
 * @param selector
 *            CSS selector for elements to extract contents
 * @return A list of element texts as rendered to display. Empty list if no elements are found.
 * @since 1.0
 */
public List<String> text(String content, String selector) {

    Element body = parseContent(content);

    List<Element> elements = body.select(selector);
    List<String> texts = new ArrayList<String>();

    for (Element element : elements) {
        texts.add(element.text());
    }

    return texts;
}
/**
 * Transforms the given HTML content by moving anchor ({@code <a name="myheading">}) names to
 * IDs for heading elements.
 *
 * <p>
 * The anchors are used to indicate positions within a HTML page. In HTML5, however, the
 * {@code name} attribute is no longer supported on {@code <a>} tag. The positions within pages
 * are indicated using {@code id} attribute instead, e.g. {@code <h1 id="myheading">}.
 * </p>
 *
 * <p>
 * The method finds anchors inside, immediately before or after the heading tags and uses their
 * name as heading {@code id} instead. The anchors themselves are removed. Only headings
 * without an {@code id} and anchors with a {@code name} but no {@code href} are considered.
 * </p>
 *
 * @param content
 *            HTML content to modify
 * @return HTML content with modified elements. Anchor names are used for adjacent headings, and
 *         anchor tags are removed. If no elements are found, the original content is returned.
 * @since 1.0
 */
public String headingAnchorToId(String content) {

    Element body = parseContent(content);

    // selectors for headings without IDs
    List<String> headNoIds = concat(HEADINGS, ":not([id])", true);

    // selector for anchor with name attribute only
    String nameA = "a[name]:not([href])";

    // select all headings that have inner named anchor
    List<Element> headingsInnerA = body.select(StringUtil.join(
            concat(headNoIds, ":has(" + nameA + ")", true), ", "));

    boolean modified = false;
    for (Element heading : headingsInnerA) {
        List<Element> anchors = heading.select(nameA);
        // take first
        if (!anchors.isEmpty()) {
            anchorToId(heading, anchors.get(0));
            modified = true;
        }
    }

    // select all headings that have a preceding named anchor
    List<Element> headingsPreA = body.select(StringUtil.join(
            concat(headNoIds, nameA + " + ", false), ", "));

    for (Element heading : headingsPreA) {
        Element anchor = heading.previousElementSibling();
        if (anchor != null) {
            anchorToId(heading, anchor);
            modified = true;
        }
    }

    // select all headings that are followed by a named anchor
    // no selector available for that, so first select the anchors
    // then retrieve the headings
    List<Element> anchorsPreH = body.select(StringUtil.join(
            concat(headNoIds, " + " + nameA, true), ", "));

    for (Element anchor : anchorsPreH) {
        Element heading = anchor.previousElementSibling();
        if (heading != null) {
            anchorToId(heading, anchor);
            modified = true;
        }
    }

    if (modified) {
        return body.html();
    } else {
        // nothing to update
        return content;
    }
}
/**
 * Transfers the {@code name} of an anchor to the {@code id} of a heading and removes the
 * anchor. Nothing happens when the given anchor is not an {@code a} tag, when the heading
 * already has a non-empty ID, or when the anchor name is empty.
 *
 * @param heading
 *            the heading to receive the ID
 * @param anchor
 *            the anchor element providing the name
 */
private static void anchorToId(Element heading, Element anchor) {

    boolean isAnchorTag = "a".equals(anchor.tagName());
    if (!isAnchorTag || !heading.id().isEmpty()) {
        return;
    }

    String anchorName = anchor.attr("name");
    if (anchorName.isEmpty()) {
        return;
    }

    // the anchor name becomes the heading ID, the anchor itself is dropped
    heading.attr("id", anchorName);
    anchor.remove();
}
/**
 * Utility method to concatenate a String to a list of Strings. The text can be either appended
 * or prepended.
 *
 * @param elements
 *            list of elements to append/prepend the text to
 * @param text
 *            the given text to append/prepend
 * @param append
 *            if {@code true}, text will be appended to the elements. If {@code false}, it will
 *            be prepended
 * @return a new list of elements with the text appended/prepended, in the order of the input;
 *         the input list is not modified
 * @since 1.0
 */
public static List<String> concat(List<String> elements, String text, boolean append) {
    List<String> concats = new ArrayList<String>(elements.size());

    for (String element : elements) {
        concats.add(append ? element + text : text + element);
    }

    return concats;
}
/**
 * Transforms the given HTML content by adding IDs to all heading elements ({@code h1-6}) that
 * do not have one.
 *
 * <p>
 * IDs on heading elements are used to indicate positions within a HTML page in HTML5. If a
 * heading tag without an {@code id} is found, its "slug" is generated automatically based on
 * the heading contents and used as the ID. Generated slugs are limited to 50 characters and
 * get a number appended if the ID is already taken.
 * </p>
 *
 * <p>
 * Note that the algorithm also modifies existing IDs that have symbols not allowed in CSS
 * selectors, e.g. ":", ".", etc. The symbols are removed.
 * </p>
 *
 * @param content
 *            HTML content to modify
 * @param idSeparator
 *            separator that replaces whitespace in generated and adapted IDs
 * @return HTML content with all heading elements having {@code id} attributes. If no existing
 *         ID needed fixing and all headings were with IDs already, the original content is
 *         returned.
 * @since 1.0
 */
public String ensureHeadingIds(String content, String idSeparator) {

    Element body = parseContent(content);

    // first find all existing IDs (to avoid generating duplicates)
    List<Element> idElems = body.select("*[id]");
    Set<String> ids = new HashSet<String>();
    boolean modified = false;
    for (Element idElem : idElems) {

        // fix all existing IDs - remove colon and other symbols which mess up jQuery
        String id = idElem.id();
        String fixedId = adaptSlug(id, idSeparator);
        if (!fixedId.equals(id)) {
            // only count as a modification when the ID really changes, so that
            // untouched content can be returned as it was given
            idElem.attr("id", fixedId);
            modified = true;
        }

        ids.add(idElem.id());
    }

    List<String> headNoIds = concat(HEADINGS, ":not([id])", true);

    // select all headings that do not have an ID
    List<Element> headingsNoId = body.select(StringUtil.join(headNoIds, ", "));

    if (headingsNoId.isEmpty() && !modified) {
        // nothing to update
        return content;
    }

    for (Element heading : headingsNoId) {

        String headingText = heading.text();
        String headingSlug = slug(headingText, idSeparator);
        // also limit slug to 50 symbols
        if (headingSlug.length() > 50) {
            headingSlug = headingSlug.substring(0, 50);
        }
        String headingId = generateUniqueId(ids, headingSlug);

        heading.attr("id", headingId);
    }

    return body.html();
}
/**
 * Generates a unique ID within the given set of IDs. Appends an incrementing number (starting
 * at 1) for duplicates.
 *
 * @param ids
 *            the IDs already in use; the generated ID is added to this set
 * @param idBase
 *            the preferred ID
 * @return {@code idBase} if it is not in use yet, otherwise {@code idBase} followed by the
 *         first number that makes it unique
 */
private static String generateUniqueId(Set<String> ids, String idBase) {
    String id = idBase;

    int counter = 1;
    while (ids.contains(id)) {
        id = idBase + counter++;
    }

    // put the newly generated one into the set
    ids.add(id);
    return id;
}
/**
 * Fixes table heads: wraps rows with {@code <th>} (table heading) elements into {@code <thead>}
 * element if they are currently in {@code <tbody>}.
 *
 * <p>
 * Every matching row is moved into a newly created {@code <thead>} of its own, which is
 * inserted as the first child of the table.
 * </p>
 *
 * @param content
 *            HTML content to modify
 * @return HTML content with all table heads fixed. If all heads were correct, the original
 *         content is returned.
 * @since 1.0
 */
public String fixTableHeads(String content) {

    Element body = parseContent(content);

    // select rows with <th> tags within <tbody>
    List<Element> tableHeadRows = body.select("table > tbody > tr:has(th)");
    if (tableHeadRows.isEmpty()) {
        // nothing changed
        return content;
    }

    for (Element row : tableHeadRows) {

        // get the row's table (row -> tbody -> table, as guaranteed by the selector)
        Element table = row.parent().parent();

        // remove row from its original position
        row.remove();

        // create table header element with the row
        Element thead = new Element(Tag.valueOf("thead"), "");
        thead.appendChild(row);
        // add at the beginning of the table
        table.prependChild(thead);
    }

    return body.html();
}
/** Matches any single character that is neither a word character ({@code \w}) nor a hyphen. */
private static final Pattern NONLATIN = Pattern.compile("[^\\w-]");
/** Matches any single whitespace character. */
private static final Pattern WHITESPACE = Pattern.compile("[\\s]");
/**
 * Produces a lower-case slug for the given text, e.g. for use in URLs: whitespace is replaced
 * with the separator and characters other than word characters and hyphens are dropped.
 *
 * @param input
 *            text to generate the slug from
 * @param separator
 *            separator for whitespace replacement
 * @return the slug of the given text that contains alphanumeric symbols and separator only
 * @since 1.0
 * @see <a href="http://www.codecodex.com/wiki/Generate_a_url_slug">Generate a URL slug</a>
 */
public static String slug(String input, String separator) {
    // build the slug first, then fold it to lower case
    return adaptSlug(input, separator).toLowerCase(Locale.ENGLISH);
}
/**
 * Produces a lower-case slug for the given text, e.g. for use in URLs, with "-" standing in
 * for whitespace.
 *
 * @param input
 *            text to generate the slug from
 * @return the slug of the given text that contains alphanumeric symbols and "-" only
 * @since 1.0
 */
public static String slug(String input) {
    final String defaultSeparator = "-";
    return slug(input, defaultSeparator);
}
/**
 * Creates a slug but does not change capitalization.
 *
 * <p>
 * Whitespace characters are replaced with the separator, the text is normalized to NFD form
 * and then every character that is not a word character or a hyphen is removed. This also
 * means that separator characters outside of that set do not survive in the result.
 * </p>
 *
 * @param input
 *            text to generate the slug from
 * @param separator
 *            separator for whitespace replacement, used literally
 * @return the slug of the given text, with the original capitalization
 */
private static String adaptSlug(String input, String separator) {
    // quote the separator: '$' and '\' are special in regex replacement strings and would
    // otherwise be interpreted as group references / escapes
    String literalSeparator = java.util.regex.Matcher.quoteReplacement(separator);
    String nowhitespace = WHITESPACE.matcher(input).replaceAll(literalSeparator);
    String normalized = Normalizer.normalize(nowhitespace, Form.NFD);
    return NONLATIN.matcher(normalized).replaceAll("");
}
/**
* Reads all headings in the given HTML content as a hierarchy. Subsequent smaller headings are
* nested within bigger ones, e.g. {@code } is nested under preceding {@code }.
*
* Only headings with IDs are included in the hierarchy. The result elements contain ID and
* heading text for each heading. The hierarchy is useful to generate a Table of Contents for a
* page.
*
*
* @param content
* HTML content to extract heading hierarchy from
* @return a list of top-level heading items (with id and text). The remaining headings are
* nested within these top-level items. Empty list if no headings are in the content.
* @since 1.0
*/
public List extends IdElement> headingTree(String content) {
Element body = parseContent(content);
List headIds = concat(HEADINGS, "[id]", true);
// select all headings that have an ID
List headings = body.select(StringUtil.join(headIds, ", "));
List headingItems = new ArrayList();
for (Element heading : headings) {
headingItems.add(new HeadingItem(heading.id(), heading.text(), headingIndex(heading)));
}
List topHeadings = new ArrayList();
Stack parentHeadings = new Stack();
for (HeadingItem heading : headingItems) {
while (!parentHeadings.isEmpty()
&& parentHeadings.peek().headingIndex >= heading.headingIndex) {
parentHeadings.pop();
}
if (parentHeadings.isEmpty()) {
// top level heading - no parents
topHeadings.add(heading);
} else {
// add to the children of topmost stack parent
parentHeadings.peek().children.add(heading);
}
// push the heading onto stack
parentHeadings.push(heading);
}
return topHeadings;
}
/**
 * Retrieves numeric index of a heading, i.e. the number that follows the "h" in its tag name.
 *
 * @param element
 *            the heading element
 * @return the number parsed from the tag name, e.g. 2 for {@code h2}
 * @throws IllegalArgumentException
 *             if the tag name does not start with "h" or is not followed by a number
 */
private static int headingIndex(Element element) {
    String tagName = element.tagName();
    if (!tagName.startsWith("h")) {
        throw new IllegalArgumentException("Must be a header tag: " + tagName);
    }

    try {
        return Integer.parseInt(tagName.substring(1));
    } catch (NumberFormatException ex) {
        // starts with "h" but is not a heading, e.g. "hr" or "header"
        throw new IllegalArgumentException("Must be a header tag: " + tagName, ex);
    }
}
/** A heading in the heading hierarchy: its ID, text, rank and the headings nested under it. */
private static class HeadingItem implements IdElement {

    private final String id;
    private final String text;

    /** Numeric rank taken from the heading tag name. */
    private final int headingIndex;

    /** Headings nested directly under this one; filled while the hierarchy is built. */
    private final List<HeadingItem> children = new ArrayList<HeadingItem>();

    public HeadingItem(String id, String text, int headingIndex) {
        this.id = id;
        this.text = text;
        this.headingIndex = headingIndex;
    }

    @Override
    public String getId() {
        return id;
    }

    @Override
    public String getText() {
        return text;
    }

    /** {@inheritDoc} The returned list is an unmodifiable view. */
    @Override
    public List<HeadingItem> getItems() {
        return Collections.unmodifiableList(children);
    }
}
/**
 * Representation of a HTML element with ID and a text content. Other such elements can be
 * nested within.
 *
 * @author Andrius Velykis
 * @since 1.0
 */
public interface IdElement {

    /**
     * Retrieves the ID of the HTML element (attribute {@code id})
     *
     * @return element {@code id} value
     */
    String getId();

    /**
     * Retrieves the text contents of the HTML element (rendered for display)
     *
     * @return text contents of the element
     */
    String getText();

    /**
     * Retrieves the children of the HTML element (nested within the element)
     *
     * @return nested items within the element
     */
    List<? extends IdElement> getItems();
}
/**
 * Runs the jsoup parser on an arbitrary HTML body fragment and exposes the parsed body
 * element. This lets templates perform their own HTML manipulations without further Java code
 * in this class.
 *
 * @param content
 *            HTML content to parse
 * @return the wrapper element for the parsed content (i.e. the body element as if the content
 *         was body contents).
 * @since 1.0
 */
public static Element parseBodyFragment(String content) {
    // parse as body contents and return the enclosing body element
    return Jsoup.parseBodyFragment(content).body();
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy