All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.sangupta.jerry.util.HtmlUtils Maven / Gradle / Ivy

The newest version!
/**
 *
 * jerry - Common Java Functionality
 * Copyright (c) 2012-2017, Sandeep Gupta
 *
 * http://sangupta.com/projects/jerry-core
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * 		http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */


package com.sangupta.jerry.util;

import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Stack;
import java.util.Map.Entry;

import net.htmlparser.jericho.Attribute;
import net.htmlparser.jericho.Attributes;
import net.htmlparser.jericho.Source;
import net.htmlparser.jericho.SourceFormatter;
import net.htmlparser.jericho.StartTag;

/**
 * Utility functions around HTML code.
 *
 * @author sangupta
 *
 */
public abstract class HtmlUtils {

	/**
	 * Tidy the HTML source by reformatting the entire HTML. This is
	 * particularly useful when the application needs to emit HTML.
	 *
	 * @param htmlSource
	 *            the unformatted HTML source
	 *
	 * @return the formatted HTML source.
	 *
	 */
	public static String tidyHtml(String htmlSource) {
		if(htmlSource == null) {
			return htmlSource;
		}

		try {
			Source source = new Source(htmlSource) ;
			StringWriter writer = new StringWriter();
			new SourceFormatter(source).setIndentString("  ").setTidyTags(true).writeTo(writer);
			writer.close();

			return writer.toString();
		} catch(Exception e) {
			e.printStackTrace();
		}

		return htmlSource;
	}

	/**
	 * Return all tags that start with the given name.
	 *
	 * @param htmlBody
	 *            the HTML of the page
	 *
	 * @param tagName
	 *            the name of the tag being looked for
	 *
	 * @return an {@link ArrayList} of all starting tags
	 */
	public static List getAllTags(String htmlBody, String tagName) {
		if(AssertUtils.isEmpty(htmlBody)) {
			return null;
		}

		Source source = null;
		try {
			source = new Source(htmlBody);

			List tags = source.getAllStartTags(tagName);
			return tags;
		} catch(Exception e) {
			// we are unable to parse the page body
			// and extract the links
		}

		return null;
	}

	/**
	 * Extract the values of the specified attribute 'attributeToExtract' of all
	 * tags that start with the given 'tagName' in the HTML body. Also, if
	 * matching attributes are specified then they must match the given values.
	 * Also, 'ignoreMissingAttributes' this defines what to do if the matching
	 * attribute is not present in the given attributes of the tag.
	 *
	 * @param htmlBody
	 *            the html body represented as string
	 *
	 * @param tagName
	 *            the tag name to look for
	 *
	 * @param attributeToExtract
	 *            the attribute to extract
	 *
	 * @param matchingAttributes
	 *            the map of matching attributes and values, that need to be
	 *            matched if present
	 *
	 * @param ignoreMissingAttributes
	 *            should we ignore any missing attribute when matching list of
	 *            given attributes
	 *
	 * @return the list of all values, of all such matching tags
	 *
	 */
	public static List getAttributeForAllTags(final String htmlBody, final String tagName, final String attributeToExtract, final Map matchingAttributes, final boolean ignoreMissingAttributes) {
		List tags = getAllTags(htmlBody, tagName);
		if(AssertUtils.isEmpty(tags)) {
			return null;
		}

		final List values = new ArrayList();
		final boolean checkMatching = AssertUtils.isNotEmpty(matchingAttributes);

		for(StartTag tag : tags) {
			Attributes attributes = tag.getAttributes();
			if(attributes == null || attributes.isEmpty()) {
				continue;
			}

			// check if we have the matching attributes
			boolean readyToExtract = true;
			if(checkMatching) {
				for(Entry entry : matchingAttributes.entrySet()) {
					final String attributeName = entry.getKey();
					final String attributeValue = entry.getValue();

					Attribute attribute = attributes.get(attributeName);
					if(attribute != null) {
						// not a matching value
						if(!(attribute.getValue().equalsIgnoreCase(attributeValue))) {
							// attribute value does not match
							readyToExtract = false;
							break;
						}
					} else {
						if(!ignoreMissingAttributes) {
							// we cannot ignore missing attributes
							// check for next tag
							readyToExtract = false;
							break;
						}
					}
				}
			}

			// extract the value of the tag that we need
			if(readyToExtract) {
				Attribute attribute = attributes.get(attributeToExtract);
				if(attribute != null) {
					values.add(attribute.getValue());
				}
			}
		}

		// if the set is still empty - return null
		if(values.isEmpty()) {
			return null;
		}

		return values;
	}

	/**
	 * Strip the given HTML content to specified text length. All opening
	 * tags are then closed to make sure that the HTML is perfectly safe.
	 *
	 * Tags such as br are skipped for closing.
	 *
	 * @param content the HTML content that you want to trim down
	 * @param length the desired length of the text field
	 * @return the HTML code that contains text trimmed down to said length
	 */
	public static String trimHTML(final String content, final int length) {
		int currentIndex = 0;
		int chosenTextLength = 0;
		String tag;
		Stack tags = new Stack();
		do {
			int index = content.indexOf('<', currentIndex);
			if(index > currentIndex) {
				chosenTextLength += (index - currentIndex - 1);
				currentIndex = index;
			}

			if(chosenTextLength >= length) {
				break;
			}

			if(index != -1) {
				index = content.indexOf('>', index);
				tag = content.substring(currentIndex + 1, index);
				if(!tag.startsWith("/")) {
					if(tag.endsWith("/")) {
						tag = tag.substring(0, tag.length() - 1);
					}

					tags.push(tag.trim());
				} else {
					tag = tag.substring(1);
					do {
						if(tags.size() == 0) {
							break;
						}

						String pop = tags.pop();
						if(pop.equalsIgnoreCase(tag)) {
							break;
						}
					} while(true);
				}

				currentIndex = index;
			}

			if(index == -1) {
				break;
			}
		} while(true);

		// this implies that the content did not have
		// any HTML tag in it
		if(chosenTextLength == 0) {
			// trim the text to last available word within the length
			String text;
			if(length < content.length()) {
				text = content.substring(0, length);

				int index = text.lastIndexOf(' ');
				if(index > -1) {
					text = text.substring(0, index);
				}
			} else {
				text = content;
			}

			return text;
		}

		if(chosenTextLength > length) {
			int subtract = chosenTextLength - length;
			currentIndex = currentIndex - subtract;
		}

		if(tags.size() == 0) {
			return content.substring(0, currentIndex);
		}

		StringBuilder builder = new StringBuilder(content.substring(0, currentIndex));
		int size = tags.size();
		for(int index = 0; index < size; index++) {
			tag = tags.pop();

			if(!"br".equalsIgnoreCase(tag)) {
				builder.append("');
			}
		}

		return builder.toString();
	}

	/**
	 * Convert the entries in the map to a string object separated by a
	 * <br /> tag.
	 *
	 * @param map
	 *            the values to convert
	 *
	 * @return the string representation
	 */
	public static String mapAsHtmlString(Map map) {
		if(AssertUtils.isEmpty(map)) {
			return StringUtils.BLANK_STRING;
		}

		StringBuilder builder = new StringBuilder();
		for(Entry entry : map.entrySet()) {
			builder.append(entry.getKey());
			builder.append(": ");
			builder.append(entry.getValue());
			builder.append("
"); } return builder.toString(); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy