lt.velykis.maven.skins.reflow.HtmlTool Maven / Gradle / Ivy

Go to download
/* 
 * Copyright 2012 Andrius Velykis
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package lt.velykis.maven.skins.reflow;

import java.text.Normalizer;
import java.text.Normalizer.Form;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.Stack;
import java.util.regex.Pattern;

import org.apache.velocity.tools.ToolContext;
import org.apache.velocity.tools.config.DefaultKey;
import org.apache.velocity.tools.generic.SafeConfig;
import org.apache.velocity.tools.generic.ValueParser;
import org.jsoup.Jsoup;
import org.jsoup.helper.StringUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.parser.Tag;

/**
 * An Apache Velocity tool that provides utility methods to manipulate HTML code using
 * jsoup HTML5 parser.
 * 
 * The methods utilise CSS
 * selectors to refer to specific elements for manipulation.
 * 
 * 
 * @author Andrius Velykis
 * @since 1.0
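 * @see <a href="http://jsoup.org/">jsoup HTML parser</a>
 * @see <a href="http://jsoup.org/cookbook/extracting-data/selector-syntax">jsoup CSS selectors</a>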
 */
@DefaultKey("htmlTool")
public class HtmlTool extends SafeConfig {
	
	/** Tag names of all HTML heading elements ({@code h1}-{@code h6}), as an unmodifiable list. */
	private static final List<String> HEADINGS = Collections.unmodifiableList(
			Arrays.asList("h1", "h2", "h3", "h4", "h5", "h6"));
	
	
	
	/**
	 * Strategy for what happens to separator elements when HTML content is partitioned: they are
	 * either joined to one of the neighbouring partitions or dropped.
	 */
	public enum JoinSeparator {
		/**
		 * Keep each separator as the first element of the partition that follows it. The first
		 * partition will not have a separator.
		 */
		AFTER,
		/**
		 * Keep each separator as the last element of the partition that precedes it. The last
		 * partition will not have a separator.
		 */
		BEFORE,
		/** Drop separators altogether; they appear in no partition. */
		NO
	}
	
	/**
	 * Charset name applied to the output settings of every parsed document. Defaults to UTF-8 and
	 * is replaced in {@link #configure(ValueParser)} when the tool context provides a value.
	 */
	private String outputEncoding = "UTF-8";
	
	/**
	 * Reads the {@code outputEncoding} value from the Velocity tool context (looked up under the
	 * {@code velocityContext} key) and, when it is a string, uses it as the output encoding of
	 * this tool. The current encoding is kept if the context or the value is unavailable.
	 * 
	 * @param values
	 *            configuration values supplied to the tool
	 * @see SafeConfig#configure(ValueParser)
	 */
	@Override
	protected void configure(ValueParser values) {

		Object contextCandidate = values.get("velocityContext");

		if (contextCandidate instanceof ToolContext) {
			// the tool context carries the output encoding
			Object encoding = ((ToolContext) contextCandidate).get("outputEncoding");
			if (encoding instanceof String) {
				this.outputEncoding = (String) encoding;
			}
		}
	}

	/**
	 * Splits the given HTML content into partitions based on the given separator selector. The
	 * separators themselves are dropped from the results.
	 * 
	 * @param content
	 *            HTML content to split
	 * @param separatorCssSelector
	 *            CSS selector for separators
	 * @return a list of HTML partitions split on separator locations, but without the separators
	 * @since 1.0
	 * @see #split(String, String, JoinSeparator)
	 */
	public List<String> split(String content, String separatorCssSelector) {
		return split(content, separatorCssSelector, JoinSeparator.NO);
	}

	/**
	 * Splits the given HTML content into partitions based on the given separator selector. The
	 * separators are kept as first elements of the partitions.
	 * <p>
	 * Note that the first part is removed if the split was successful. This is because the first
	 * part does not include the separator.
	 * </p>
	 * 
	 * @param content
	 *            HTML content to split
	 * @param separatorCssSelector
	 *            CSS selector for separators
	 * @return a list of HTML partitions split on separator locations (except the first one), with
	 *         separators at the beginning of each partition
	 * @since 1.0
	 * @see #split(String, String, JoinSeparator)
	 */
	public List<String> splitOnStarts(String content, String separatorCssSelector) {

		List<String> result = split(content, separatorCssSelector, JoinSeparator.AFTER);

		if (result == null || result.size() <= 1) {
			// no result or just one part - return what we have
			return result;
		}

		// otherwise, drop the first part - the first split will be the first 'start'
		// e.g. if we split on headings, the first part will contain everything
		// before the first heading.
		return result.subList(1, result.size());
	}

	/**
	 * Splits the given HTML content into partitions based on the given separator selector. The
	 * separators are either dropped or joined with before/after depending on the indicated
	 * separator strategy.
	 * 
	 * @param content
	 *            HTML content to split
	 * @param separatorCssSelector
	 *            CSS selector for separators
	 * @param separatorStrategy
	 *            strategy to drop or keep separators, one of "after", "before" or "no". Any other
	 *            value (including {@code null}) is treated as "no".
	 * @return a list of HTML partitions split on separator locations.
	 * @since 1.0
	 * @see #split(String, String, JoinSeparator)
	 */
	public List<String> split(String content, String separatorCssSelector,
			String separatorStrategy) {

		JoinSeparator sepStrategy;
		if ("before".equals(separatorStrategy)) {
			sepStrategy = JoinSeparator.BEFORE;
		} else if ("after".equals(separatorStrategy)) {
			sepStrategy = JoinSeparator.AFTER;
		} else {
			// unknown strategies fall back to dropping the separators
			sepStrategy = JoinSeparator.NO;
		}

		return split(content, separatorCssSelector, sepStrategy);
	}

	/**
	 * Splits the given HTML content into partitions based on the given separator selector. The
	 * separators are either dropped or joined with before/after depending on the indicated
	 * separator strategy.
	 * <p>
	 * Note that splitting algorithm tries to resolve nested elements so that returned partitions
	 * are self-contained HTML elements. The nesting is normally contained within the first
	 * applicable partition.
	 * </p>
	 * 
	 * @param content
	 *            HTML content to split
	 * @param separatorCssSelector
	 *            CSS selector for separators
	 * @param separatorStrategy
	 *            strategy to drop or keep separators
	 * @return a list of HTML partitions split on separator locations. If no splitting occurs,
	 *         returns the original content as the single element of the list
	 * @since 1.0
	 */
	public List<String> split(String content, String separatorCssSelector,
			JoinSeparator separatorStrategy) {

		Element body = parseContent(content);

		List<Element> separators = body.select(separatorCssSelector);
		if (separators.isEmpty()) {
			// nothing to split
			return Collections.singletonList(content);
		}

		List<List<Element>> partitions = split(separators, separatorStrategy, body);

		// render each partition (a list of root elements) to its HTML text
		List<String> sectionHtml = new ArrayList<String>(partitions.size());
		for (List<Element> partition : partitions) {
			sectionHtml.add(outerHtml(partition));
		}

		return sectionHtml;
	}

	/**
	 * Recursively splits the {@code parent} element based on the given {@code separators}. If a
	 * separator is encountered in the parent, it is split on that position. The outstanding nested
	 * elements go with the first of the partitions in each case.
	 * 
	 * @param separators
	 * @param separatorStrategy
	 * @param parent
	 * @return list of partitions (as lists of root elements for each partition). Partition can be
	 *         an empty list, e.g. if the separator is at the start of the content.
	 */
	private static List> split(Collection separators,
			JoinSeparator separatorStrategy, Element parent) {

		List> partitions = new LinkedList>();

		for (Element child : parent.children()) {

			if (separators.contains(child)) {
				// split here and do not go deeper

				// first ensure there was a partition before
				// otherwise the split is not recognised on an outer level
				getLastPartition(partitions);

				if (separatorStrategy == JoinSeparator.BEFORE) {
					// add to the last partition
					getLastPartition(partitions).add(child);
				}

				// add an empty new partition
				List newPartition = new LinkedList();
				partitions.add(newPartition);

				if (separatorStrategy == JoinSeparator.AFTER) {
					// add to the new partition
					newPartition.add(child);
				}

			} else {
				// go deeper
				List> childPartitions = split(separators, separatorStrategy, child);

				// add the child to the last partition
				getLastPartition(partitions).add(child);

				if (childPartitions.size() > 1) {
					// more than one partition:
					// only keep the first partition elements in the child
					// so for all other partitions, remove them from their parents

					List allChildren = child.children();
					List firstPartition = childPartitions.get(0);

					allChildren.removeAll(firstPartition);
					for (Element removeChild : allChildren) {
						removeChild.remove();
					}

					// add the remaining partitions
					for (List nextPartition : childPartitions.subList(1, childPartitions.size())) {
						partitions.add(nextPartition);
					}
				}
			}
		}

		return partitions;
	}

	/**
	 * Retrieves the last partition (as list of elements) or creates a new one if there was none
	 * before.
	 * 
	 * @param partitions
	 * @return
	 */
	private static List getLastPartition(List> partitions) {
		if (partitions.isEmpty()) {
			List newPartition = new LinkedList();
			partitions.add(newPartition);
			return newPartition;
		} else {
			return partitions.get(partitions.size() - 1);
		}
	}

	/**
	 * Outputs the list of partition root elements to HTML.
	 * 
	 * @param elements
	 *            root elements of a single partition
	 * @return empty string for an empty list, the outer HTML of the element for a single-element
	 *         list, otherwise the HTML of all elements in order
	 */
	private static String outerHtml(List<Element> elements) {

		switch (elements.size()) {
		case 0:
			return "";
		case 1:
			return elements.get(0).outerHtml();
		default: {
			// more than one element
			// wrap into a temporary <div>, which is left out of the output
			// because only its inner HTML is rendered
			Element root = new Element(Tag.valueOf("div"), "");
			for (Element elem : elements) {
				root.appendChild(elem);
			}

			return root.html();
		}
		}
	}
	
	
	
	/**
	 * Moves the elements matching the selector to the top of the HTML content, leaving the
	 * remaining content unwrapped. The number of moved elements can be limited, e.g. to bring just
	 * the first of the selected elements to the top.
	 * 
	 * @param content
	 *            HTML content to reorder
	 * @param selector
	 *            CSS selector for elements to bring to top of the content
	 * @param amount
	 *            Maximum number of elements to reorder
	 * @return HTML content with reordered elements, or the original content if no such elements
	 *         found.
	 * @since 1.0
	 */
	public String reorderToTop(String content, String selector, int amount) {
		// no wrapper HTML for the remaining part
		final String noWrapper = null;
		return reorderToTop(content, selector, amount, noWrapper);
	}
	
	/**
	 * Reorders elements in HTML content so that selected elements are found at the top of the
	 * content. Can be limited to a certain amount, e.g. to bring just the first of selected
	 * elements to the top.
	 * 
	 * @param content
	 *            HTML content to reorder
	 * @param selector
	 *            CSS selector for elements to bring to top of the content
	 * @param amount
	 *            Maximum number of elements to reorder
	 * @param wrapRemaining
	 *            HTML to wrap the remaining (non-reordered) part, or {@code null} for no wrapping
	 * @return HTML content with reordered elements, or the original content if no such elements
	 *         found.
	 * @since 1.0
	 */
	public String reorderToTop(String content, String selector, int amount,
			String wrapRemaining) {

		// extract the elements and then prepend them to the remaining body
		List<Element> extracted = extractElements(content, selector, amount);

		if (extracted.size() <= 1) {
			// only the body came back - nothing to reorder
			return content;
		}

		Element body = extracted.get(0);

		if (wrapRemaining != null) {
			wrapInner(body, wrapRemaining);
		}

		List<Element> elements = extracted.subList(1, extracted.size());

		// now prepend extracted elements to the body (in backwards to preserve original order)
		for (int index = elements.size() - 1; index >= 0; index--) {
			body.prependChild(elements.get(index));
		}

		return body.html();
	}
	
	/**
	 * Wraps the child elements of the given element with the given HTML.
	 * 
	 * @param element
	 *            element whose child elements are wrapped
	 * @param html
	 *            HTML used as the wrapper
	 * @return the given element
	 */
	private static Element wrapInner(Element element, String html) {

		// collect the child elements in a temporary <div> so that there is
		// a single element to apply the wrapping to
		Element holder = new Element(Tag.valueOf("div"), "");
		for (Element current : element.children()) {
			current.remove();
			holder.appendChild(current);
		}

		// put the holder back in place of the children
		element.appendChild(holder);

		// wrap the holder, then unwrap it to drop the temporary <div> itself
		holder.wrap(html);
		holder.unwrap();

		return element;
	}
	
	/**
	 * Extracts elements from the HTML content.
	 * 
	 * @param content
	 *            HTML content to extract elements from
	 * @param selector
	 *            CSS selector for elements to extract
	 * @param amount
	 *            maximum number of elements to extract; a negative value means no limit
	 * @return the remainder and a list of extracted elements. The main body (remainder after
	 *         extraction) is always returned as the first element of the list.
	 */
	private List<Element> extractElements(String content, String selector, int amount) {

		Element body = parseContent(content);

		List<Element> elements = body.select(selector);
		if (elements.size() > 0) {

			// avoid extracting both a parent and its descendant
			elements = filterParents(elements);

			if (amount >= 0) {
				// limit to the indicated amount
				elements = elements.subList(0, Math.min(amount, elements.size()));
			}

			// remove all from their parents
			for (Element element : elements) {
				element.remove();
			}
		}

		List<Element> results = new ArrayList<Element>();
		// first element is the body
		results.add(body);
		results.addAll(elements);
		return results;
	}
	
	/**
	 * Filters the list of elements to only contain parent elements. This is to avoid both parent
	 * and child being in the list of elements.
	 * 
	 * @param elements
	 *            selected elements, possibly nested within one another
	 * @return the elements that have no ancestor in the given list, in their original order
	 */
	private static List<Element> filterParents(List<Element> elements) {
		List<Element> filtered = new ArrayList<Element>();
		for (Element element : elements) {
			// get the intersection of parents and selected elements
			List<Element> parentsInter = element.parents();
			parentsInter.retainAll(elements);
			if (parentsInter.isEmpty()) {
				// no intersection - element's parents are not in the selected list
				filtered.add(element);
			}
		}

		return filtered;
	}

	/**
	 * Extracts HTML elements from the main HTML content. The result consists of the extracted HTML
	 * elements and the remainder of HTML content, with these elements removed. Can be limited to a
	 * certain amount, e.g. to extract just the first of selected elements.
	 * 
	 * @param content
	 *            HTML content to extract elements from
	 * @param selector
	 *            CSS selector for elements to extract
	 * @param amount
	 *            Maximum number of elements to extract
	 * @return HTML content of the extracted elements together with the remainder of the original
	 *         content. If no elements are found, the remainder contains the original content.
	 * @since 1.0
	 */
	public ExtractResult extract(String content, String selector, int amount) {

		List<Element> extracted = extractElements(content, selector, amount);

		if (extracted.size() <= 1) {
			// only the body came back - nothing to extract
			return new DefaultExtractResult(Collections.<String> emptyList(), content);
		}

		// first element is the remaining body, the rest are extracted
		Element body = extracted.get(0);
		List<Element> elements = extracted.subList(1, extracted.size());

		// convert to HTML
		List<String> elementStr = new ArrayList<String>(elements.size());
		for (Element el : elements) {
			elementStr.add(el.outerHtml());
		}

		return new DefaultExtractResult(elementStr, body.html());
	}
	
	/**
	 * A container to carry element extraction results. Contains the extracted element HTML
	 * code and the remainder of the body content with elements removed.
	 * 
	 * @author Andrius Velykis
	 * @since 1.0
	 */
	public static interface ExtractResult {
		
		/**
		 * Retrieves the extracted HTML elements.
		 * 
		 * @return List of HTML of extracted elements. Can be empty if no elements found.
		 */
		public List<String> getExtracted();

		/**
		 * Retrieves the content from which elements were extracted.
		 * 
		 * @return The HTML content with extracted elements removed.
		 */
		public String getRemainder();
	}
	
	/**
	 * Default {@link ExtractResult} implementation: a plain holder for the extracted element HTML
	 * and the remaining content.
	 */
	private static class DefaultExtractResult implements ExtractResult {
		private final List<String> extracted;
		private final String remainder;
		
		/**
		 * @param extracted
		 *            HTML of the extracted elements
		 * @param remainder
		 *            HTML content left after the extraction
		 */
		public DefaultExtractResult(List<String> extracted, String remainder) {
			this.extracted = extracted;
			this.remainder = remainder;
		}
		
		/** Returns a read-only view of the extracted element HTML. */
		@Override
		public List<String> getExtracted() {
			return Collections.unmodifiableList(extracted);
		}
		
		@Override
		public String getRemainder() {
			return remainder;
		}
	}
	
	
	/**
	 * Sets attribute to the given value on elements in HTML.
	 * 
	 * @param content
	 *            HTML content to set attributes on
	 * @param selector
	 *            CSS selector for elements to modify
	 * @param attributeKey
	 *            Attribute name
	 * @param value
	 *            Attribute value
	 * @return HTML content with modified elements. If no elements are found, the original content
	 *         is returned.
	 * @since 1.0
	 */
	public String setAttr(String content, String selector, String attributeKey, String value) {

		Element body = parseContent(content);
		
		List<Element> elements = body.select(selector);
		if (elements.isEmpty()) {
			// nothing to update
			return content;
		}

		for (Element element : elements) {
			element.attr(attributeKey, value);
		}

		return body.html();
	}

	/**
	 * Parses the given HTML as a body fragment and applies the configured output encoding to the
	 * resulting document.
	 * 
	 * @param content
	 *            HTML fragment to parse
	 * @return the {@code <body>} element of the parsed content
	 */
	private Element parseContent(String content) {
		Document parsed = Jsoup.parseBodyFragment(content);
		// render with the encoding configured for this tool
		parsed.outputSettings().charset(outputEncoding);
		return parsed.body();
	}
	
	/**
	 * Retrieves attribute value on elements in HTML. Will return all attribute values for the
	 * selector, since there can be more than one element.
	 * 
	 * @param content
	 *            HTML content to read attributes from
	 * @param selector
	 *            CSS selector for elements to find
	 * @param attributeKey
	 *            Attribute name
	 * @return Attribute values for all matching elements. If no elements are found, empty list is
	 *         returned.
	 * @since 1.0
	 */
	public List<String> getAttr(String content, String selector, String attributeKey) {

		Element body = parseContent(content);
		
		List<Element> elements = body.select(selector);
		List<String> attrs = new ArrayList<String>(elements.size());
		
		for (Element element : elements) {
			attrs.add(element.attr(attributeKey));
		}
		
		return attrs;
	}
	
	/**
	 * Adds given class names to the elements in HTML.
	 * 
	 * @param content
	 *            HTML content to modify
	 * @param selector
	 *            CSS selector for elements to add classes to
	 * @param classNames
	 *            Names of classes to add to the selected elements
	 * @param amount
	 *            Maximum number of elements to modify; a negative value means no limit
	 * @return HTML content with modified elements. If no elements are found, the original content
	 *         is returned.
	 * @since 1.0
	 */
	public String addClass(String content, String selector, List<String> classNames, int amount) {

		Element body = parseContent(content);
		
		List<Element> elements = body.select(selector);
		if (amount >= 0) {
			// limit to the indicated amount
			elements = elements.subList(0, Math.min(amount, elements.size()));
		}
		
		if (elements.isEmpty()) {
			// nothing to update
			return content;
		}

		for (Element element : elements) {
			for (String className : classNames) {
				element.addClass(className);
			}
		}

		return body.html();
	}
	
	/**
	 * Adds given class names to all matching elements in HTML.
	 * 
	 * @param content
	 *            HTML content to modify
	 * @param selector
	 *            CSS selector for elements to add classes to
	 * @param classNames
	 *            Names of classes to add to the selected elements
	 * @return HTML content with modified elements. If no elements are found, the original content
	 *         is returned.
	 * @since 1.0
	 */
	public String addClass(String content, String selector, List<String> classNames) {
		// negative amount: no limit on the number of modified elements
		return addClass(content, selector, classNames, -1);
	}
	
	/**
	 * Adds a single class to the matching elements in HTML.
	 * 
	 * @param content
	 *            HTML content to modify
	 * @param selector
	 *            CSS selector for elements to add the class to
	 * @param className
	 *            Name of class to add to the selected elements
	 * @return HTML content with modified elements. If no elements are found, the original content
	 *         is returned.
	 * @since 1.0
	 */
	public String addClass(String content, String selector, String className) {
		List<String> singleClass = Collections.singletonList(className);
		return addClass(content, selector, singleClass);
	}
	
	/**
	 * Wraps elements in HTML with the given HTML.
	 * 
	 * @param content
	 *            HTML content to modify
	 * @param selector
	 *            CSS selector for elements to wrap
	 * @param wrapHtml
	 *            HTML to use for wrapping the selected elements
	 * @param amount
	 *            Maximum number of elements to modify; a negative value means no limit
	 * @return HTML content with modified elements. If no elements are found, the original content
	 *         is returned.
	 * @since 1.0
	 */
	public String wrap(String content, String selector, String wrapHtml, int amount) {

		Element body = parseContent(content);
		
		List<Element> elements = body.select(selector);
		if (amount >= 0) {
			// limit to the indicated amount
			elements = elements.subList(0, Math.min(amount, elements.size()));
		}
		
		if (elements.isEmpty()) {
			// nothing to update
			return content;
		}

		for (Element element : elements) {
			element.wrap(wrapHtml);
		}

		return body.html();
	}
	
	/**
	 * Removes elements from HTML.
	 * 
	 * @param content
	 *            HTML content to modify
	 * @param selector
	 *            CSS selector for elements to remove
	 * @return HTML content with removed elements. If no elements are found, the original content is
	 *         returned.
	 * @since 1.0
	 */
	public String remove(String content, String selector) {

		Element body = parseContent(content);
		
		List<Element> elements = body.select(selector);
		if (elements.isEmpty()) {
			// nothing changed
			return content;
		}

		for (Element element : elements) {
			element.remove();
		}

		return body.html();
	}
	
	/**
	 * Replaces the elements matching a single selector in HTML.
	 * 
	 * @param content
	 *            HTML content to modify
	 * @param selector
	 *            CSS selector for elements to replace
	 * @param replacement
	 *            HTML replacement (must parse to a single element)
	 * @return HTML content with replaced elements. If no elements are found, the original content is
	 *         returned.
	 * @since 1.0
	 */
	public String replace(String content, String selector, String replacement) {
		Map<String, String> singleReplacement = Collections.singletonMap(selector, replacement);
		return replaceAll(content, singleReplacement);
	}
	
	/**
	 * Replaces elements in HTML.
	 * 
	 * @param content
	 *            HTML content to modify
	 * @param replacements
	 *            Map of CSS selectors to their replacement HTML texts. CSS selectors find elements
	 *            to be replaced with the HTML in the mapping. The HTML must parse to a single
	 *            element; only its first element is used. A replacement that contains no element
	 *            (e.g. empty or plain text) is skipped.
	 * @return HTML content with replaced elements. If no elements are found, the original content
	 *         is returned.
	 * @since 1.0
	 */
	public String replaceAll(String content, Map<String, String> replacements) {

		Element body = parseContent(content);
		
		boolean modified = false;
		for (Entry<String, String> replacementEntry : replacements.entrySet()) {
			String selector = replacementEntry.getKey();
			String replacement = replacementEntry.getValue();
			
			List<Element> elements = body.select(selector);
			if (elements.isEmpty()) {
				continue;
			}

			// take the first child; check the children list instead of calling child(0),
			// which fails with an exception when the replacement has no element at all
			List<Element> replacementRoots = parseContent(replacement).children();
			if (replacementRoots.isEmpty()) {
				continue;
			}
			Element replacementElem = replacementRoots.get(0);

			for (Element element : elements) {
				// each replaced element gets its own copy
				element.replaceWith(replacementElem.clone());
			}

			modified = true;
		}
		
		if (modified) {
			return body.html();
		} else {
			// nothing changed
			return content;
		}
	}
	
	/**
	 * Retrieves text content of the selected elements in HTML. Renders the element's text as it
	 * would be displayed on the web page (including its children).
	 * 
	 * @param content
	 *            HTML content with the elements
	 * @param selector
	 *            CSS selector for elements to extract contents
	 * @return A list of element texts as rendered to display. Empty list if no elements are found.
	 * @since 1.0
	 */
	public List<String> text(String content, String selector) {

		Element body = parseContent(content);
		
		List<Element> elements = body.select(selector);
		List<String> texts = new ArrayList<String>(elements.size());
		
		for (Element element : elements) {
			texts.add(element.text());
		}
		
		return texts;
	}
	
	/**
	 * Transforms the given HTML content by moving anchor ({@code <a name="myheading">}) names to
	 * IDs for heading elements.
	 * <p>
	 * The anchors are used to indicate positions within a HTML page. In HTML5, however, the
	 * {@code name} attribute is no longer supported on the {@code <a>} tag. The positions within
	 * pages are indicated using the {@code id} attribute instead, e.g.
	 * {@code <h1 id="myheading">}.
	 * </p>
	 * <p>
	 * The method finds anchors inside, immediately before or after the heading tags and uses their
	 * name as heading {@code id} instead. The anchors themselves are removed.
	 * </p>
	 * 
	 * @param content
	 *            HTML content to modify
	 * @return HTML content with modified elements. Anchor names are used for adjacent headings, and
	 *         anchor tags are removed. If no elements are found, the original content is returned.
	 * @since 1.0
	 */
	public String headingAnchorToId(String content) {

		Element body = parseContent(content);
		
		// selectors for headings without IDs
		List<String> headNoIds = concat(HEADINGS, ":not([id])", true);
		
		// selector for anchor with name attribute only
		String nameA = "a[name]:not([href])";
		
		// select all headings that have inner named anchor
		List<Element> headingsInnerA = body.select(StringUtil.join(
				concat(headNoIds, ":has(" + nameA + ")", true), ", "));
		
		boolean modified = false;
		for (Element heading : headingsInnerA) {
			List<Element> anchors = heading.select(nameA);
			// take first
			if (!anchors.isEmpty()) {
				anchorToId(heading, anchors.get(0));
				modified = true;
			}
		}
		
		// select all headings that have a preceding named anchor
		List<Element> headingsPreA = body.select(StringUtil.join(
				concat(headNoIds, nameA + " + ", false), ", "));
		
		for (Element heading : headingsPreA) {
			Element anchor = heading.previousElementSibling();
			if (anchor != null) {
				anchorToId(heading, anchor);
				modified = true;
			}
		}
		
		// select all headings that are followed by a named anchor
		// no selector available for that, so first select the anchors
		// then retrieve the headings
		List<Element> anchorsPreH = body.select(StringUtil.join(
				concat(headNoIds, " + " + nameA, true), ", "));
		
		for (Element anchor : anchorsPreH) {
			Element heading = anchor.previousElementSibling();
			if (heading != null) {
				anchorToId(heading, anchor);
				modified = true;
			}
		}
		
		if (modified) {
			return body.html();
		} else {
			// nothing to update
			return content;
		}
	}
	
	/**
	 * Uses the anchor's name as the heading's ID and removes the anchor. Nothing happens when the
	 * given anchor is not an {@code a} element, the heading already has an ID, or the anchor name
	 * is empty.
	 * 
	 * @param heading
	 *            heading element to receive the ID
	 * @param anchor
	 *            anchor element providing the name
	 */
	private static void anchorToId(Element heading, Element anchor) {
		
		if (!"a".equals(anchor.tagName()) || !heading.id().isEmpty()) {
			// not an anchor, or the heading is already identified
			return;
		}

		String anchorName = anchor.attr("name");
		if (anchorName.isEmpty()) {
			return;
		}

		// the anchor name becomes the heading ID, the anchor itself is dropped
		heading.attr("id", anchorName);
		anchor.remove();
	}
	
	
	/**
	 * Utility method to concatenate a String to a list of Strings. The text can be either appended
	 * or prepended.
	 * 
	 * @param elements
	 *            list of elements to append/prepend the text to
	 * @param text
	 *            the given text to append/prepend
	 * @param append
	 *            if {@code true}, text will be appended to the elements. If {@code false}, it will
	 *            be prepended
	 * @return new list of elements with the text appended/prepended, in the original order. The
	 *         given list is not modified.
	 * @since 1.0
	 */
	public static List<String> concat(List<String> elements, String text, boolean append) {
		List<String> concats = new ArrayList<String>(elements.size());
		
		for (String element : elements) {
			concats.add(append ? element + text : text + element);
		}
		
		return concats;
	}
	
	
	/**
	 * Transforms the given HTML content by adding IDs to all heading elements ({@code h1-6}) that
	 * do not have one.
	 * <p>
	 * IDs on heading elements are used to indicate positions within a HTML page in HTML5. If a
	 * heading tag without an {@code id} is found, its "slug" is generated automatically based on
	 * the heading contents and used as the ID.
	 * </p>
	 * <p>
	 * Note that the algorithm also modifies existing IDs that have symbols not allowed in CSS
	 * selectors, e.g. ":", ".", etc. The symbols are removed.
	 * </p>
	 * 
	 * @param content
	 *            HTML content to modify
	 * @param idSeparator
	 *            separator that replaces whitespace in generated and adapted IDs
	 * @return HTML content with all heading elements having {@code id} attributes. If all headings
	 *         had IDs already and no existing ID needed fixing, the original content is returned.
	 * @since 1.0
	 */
	public String ensureHeadingIds(String content, String idSeparator) {

		Element body = parseContent(content);
		
		// first find all existing IDs (to avoid generating duplicates)
		List<Element> idElems = body.select("*[id]");
		Set<String> ids = new HashSet<String>();
		boolean modified = false;
		for (Element idElem : idElems) {
			
			// fix all existing IDs - remove colon and other symbols which mess up jQuery
			String id = idElem.id();
			String adaptedId = adaptSlug(id, idSeparator);
			if (!adaptedId.equals(id)) {
				// only an actual change counts as a modification, so that untouched
				// content can be returned as-is
				idElem.attr("id", adaptedId);
				modified = true;
			}
			
			ids.add(adaptedId);
		}
		
		List<String> headNoIds = concat(HEADINGS, ":not([id])", true);
		
		// select all headings that do not have an ID
		List<Element> headingsNoId = body.select(StringUtil.join(headNoIds, ", "));
		
		if (headingsNoId.isEmpty() && !modified) {
			// nothing to update
			return content;
		}

		for (Element heading : headingsNoId) {
			
			String headingText = heading.text();
			String headingSlug = slug(headingText, idSeparator);
			// also limit slug to 50 symbols
			if (headingSlug.length() > 50) {
				headingSlug = headingSlug.substring(0, 50);
			}
			String headingId = generateUniqueId(ids, headingSlug);
			
			heading.attr("id", headingId);
		}
		
		return body.html();
	}
	
	/**
	 * Generates a unique ID within the given set of IDs. Appends an incrementing number (starting
	 * at 1) to the base for duplicates.
	 * 
	 * @param ids
	 *            IDs already in use; the generated ID is added to this set
	 * @param idBase
	 *            preferred ID, used unchanged if it is not taken yet
	 * @return an ID that was not contained in {@code ids} before the call
	 */
	private static String generateUniqueId(Set<String> ids, String idBase) {
		String id = idBase;
		int counter = 1;
		while (ids.contains(id)) {
			id = idBase + counter++;
		}
		
		// put the newly generated one into the set
		ids.add(id);
		return id;
	}
	
	/**
	 * Fixes table heads: wraps rows with {@code <th>} (table heading) elements into {@code <thead>}
	 * element if they are currently in {@code <tbody>}.
	 * 
	 * @param content
	 *            HTML content to modify
	 * @return HTML content with all table heads fixed. If all heads were correct, the original
	 *         content is returned.
	 * @since 1.0
	 */
	public String fixTableHeads(String content) {

		Element body = parseContent(content);
		
		// select rows with <th> tags within <tbody>
		List<Element> tableHeadRows = body.select("table > tbody > tr:has(th)");
		if (tableHeadRows.isEmpty()) {
			// nothing changed
			return content;
		}

		for (Element row : tableHeadRows) {
			
			// get the row's table (row -> tbody -> table, as guaranteed by the selector)
			Element table = row.parent().parent();
			
			// remove row from its original position
			row.remove();
			
			// create table header element with the row
			Element thead = new Element(Tag.valueOf("thead"), "");
			thead.appendChild(row);
			// add at the beginning of the table
			table.prependChild(thead);
		}
		
		return body.html();
	}
	
	
	/** Matches any single character that is neither a word character ({@code \w}) nor a hyphen. */
	private static final Pattern NONLATIN = Pattern.compile("[^\\w-]");
	/** Matches a single whitespace character ({@code \s}). */
	private static final Pattern WHITESPACE = Pattern.compile("[\\s]");
	
	/**
	 * Creates a lower-case slug (latin text with no whitespace or other symbols) for a longer
	 * text, e.g. to use in URLs or element IDs.
	 * 
	 * @param input
	 *            text to generate the slug from
	 * @param separator
	 *            separator used in place of whitespace
	 * @return the slug of the given text, converted to lower case using English locale rules
	 * @since 1.0
	 * @see <a href="http://www.codecodex.com/wiki/Generate_a_url_slug">Generate a URL slug</a>
	 */
	public static String slug(String input, String separator) {
		// the slug itself is built case-preserving; only the final result is lower-cased
		return adaptSlug(input, separator).toLowerCase(Locale.ENGLISH);
	}
	
	/**
	 * Creates a lower-case slug (latin text with no whitespace or other symbols) for a longer
	 * text, e.g. to use in URLs. Whitespace is replaced with "-".
	 * 
	 * @param input
	 *            text to generate the slug from
	 * @return the slug of the given text, as produced by {@link #slug(String, String)} with "-"
	 *         as the separator
	 * @since 1.0
	 */
	public static String slug(String input) {
		final String defaultSeparator = "-";
		return slug(input, defaultSeparator);
	}
	
	/**
	 * Creates a slug but does not change capitalization.
	 * <p>
	 * The input is split at every whitespace character. Each fragment is decomposed (Unicode
	 * NFD) and stripped of everything that is not a word character or a hyphen, which turns
	 * accented letters into their base letters. The fragments are then joined with the
	 * separator.
	 * </p>
	 * <p>
	 * The separator is inserted literally and only after the fragments have been cleaned.
	 * Therefore a separator such as "." or "+" is not stripped from the result, and characters
	 * with a special meaning in regular expression replacements ("$", "\") are safe to use.
	 * </p>
	 * 
	 * @param input
	 *            text to generate the slug from
	 * @param separator
	 *            text that replaces each whitespace character of the input
	 * @return the slug of the given text, with the original capitalization
	 */
	private static String adaptSlug(String input, String separator) {
		// limit -1 keeps leading/trailing empty fragments, so that every whitespace character
		// (including leading, trailing and consecutive ones) yields exactly one separator
		String[] fragments = WHITESPACE.split(input, -1);
		
		StringBuilder slug = new StringBuilder(input.length());
		for (int i = 0; i < fragments.length; i++) {
			if (i > 0) {
				slug.append(separator);
			}
			String normalized = Normalizer.normalize(fragments[i], Form.NFD);
			slug.append(NONLATIN.matcher(normalized).replaceAll(""));
		}
		return slug.toString();
	}
	
	
	/**
	 * Reads all headings in the given HTML content as a hierarchy. Subsequent smaller headings are
	 * nested within bigger ones, e.g. {@code <h2>} is nested under preceding {@code <h1>}.
	 * <p>
	 * Only headings with IDs are included in the hierarchy. The result elements contain ID and
	 * heading text for each heading. The hierarchy is useful to generate a Table of Contents for a
	 * page.
	 * </p>
	 * 
	 * @param content
	 *            HTML content to extract heading hierarchy from
	 * @return a list of top-level heading items (with id and text). The remaining headings are
	 *         nested within these top-level items. Empty list if no headings are in the content.
	 * @since 1.0
	 */
	public List<? extends IdElement> headingTree(String content) {

		Element body = parseContent(content);

		List<String> headIds = concat(HEADINGS, "[id]", true);

		// select all headings that have an ID
		List<Element> headings = body.select(StringUtil.join(headIds, ", "));

		List<HeadingItem> headingItems = new ArrayList<HeadingItem>();
		for (Element heading : headings) {
			headingItems.add(new HeadingItem(heading.id(), heading.text(), headingIndex(heading)));
		}

		List<HeadingItem> topHeadings = new ArrayList<HeadingItem>();
		// chain of currently "open" headings, outermost at the bottom of the stack
		Stack<HeadingItem> parentHeadings = new Stack<HeadingItem>();

		for (HeadingItem heading : headingItems) {

			// close all headings of the same or a deeper level: they cannot be parents of the
			// current heading
			while (!parentHeadings.isEmpty()
					&& parentHeadings.peek().headingIndex >= heading.headingIndex) {
				parentHeadings.pop();
			}

			if (parentHeadings.isEmpty()) {
				// top level heading - no parents
				topHeadings.add(heading);
			} else {
				// add to the children of topmost stack parent
				parentHeadings.peek().children.add(heading);
			}

			// push the heading onto stack - it is a potential parent of the following headings
			parentHeadings.push(heading);
		}

		return topHeadings;
	}

	/**
	 * Retrieves numeric index of a heading, i.e. the number that follows "h" in its tag name
	 * (e.g. 2 for {@code h2}).
	 * 
	 * @param element
	 *            heading element to get the index of
	 * @return the number parsed from the tag name after the leading "h"
	 * @throws IllegalArgumentException
	 *             if the tag name does not start with "h" or is not followed by a number
	 */
	private static int headingIndex(Element element) {
		String tagName = element.tagName();
		if (!tagName.startsWith("h")) {
			throw new IllegalArgumentException("Must be a header tag: " + tagName);
		}
		
		try {
			return Integer.parseInt(tagName.substring(1));
		} catch (NumberFormatException ex) {
			// tags such as "hr" or "header" start with "h" but are not headings
			throw new IllegalArgumentException("Must be a header tag: " + tagName, ex);
		}
	}

	/**
	 * A heading within the heading hierarchy: holds the ID, text and numeric index of the
	 * heading together with the headings nested under it.
	 */
	private static class HeadingItem implements IdElement {
		private final String id;
		private final String text;
		/** Numeric index of the heading, used to compare heading levels when nesting. */
		private final int headingIndex;

		/** Nested headings; filled by the enclosing class while the hierarchy is built. */
		private final List<HeadingItem> children = new ArrayList<HeadingItem>();

		/**
		 * Creates a heading item without children.
		 * 
		 * @param id
		 *            ID of the heading
		 * @param text
		 *            text of the heading
		 * @param headingIndex
		 *            numeric index of the heading
		 */
		public HeadingItem(String id, String text, int headingIndex) {
			this.id = id;
			this.text = text;
			this.headingIndex = headingIndex;
		}

		@Override
		public String getId() {
			return id;
		}

		@Override
		public String getText() {
			return text;
		}

		/**
		 * @return a read-only view of the nested headings
		 */
		@Override
		public List<? extends IdElement> getItems() {
			return Collections.unmodifiableList(children);
		}
	}

	/**
	 * Representation of a HTML element with ID and a text content. Other such elements can be
	 * nested within.
	 * 
	 * @author Andrius Velykis
	 * @since 1.0
	 */
	public interface IdElement {

		/**
		 * Retrieves the ID of the HTML element (attribute {@code id})
		 * 
		 * @return element {@code id} value
		 */
		String getId();

		/**
		 * Retrieves the text contents of the HTML element (rendered for display)
		 * 
		 * @return text contents of the element
		 */
		String getText();

		/**
		 * Retrieves the children of the HTML element (nested within the element)
		 * 
		 * @return nested items within the element
		 */
		List<? extends IdElement> getItems();
	}
	
	
	/**
	 * Parses an arbitrary HTML body fragment with the jsoup parser. This generic entry point
	 * allows writing HTML manipulations directly in the template without adding Java code to
	 * the class.
	 * 
	 * @param content
	 *            HTML content to parse
	 * @return the wrapper element for the parsed content (i.e. the body element as if the content
	 *         was body contents).
	 * @since 1.0
	 */
	public static Element parseBodyFragment(String content) {
		return Jsoup.parseBodyFragment(content).body();
	}
	
}