net.htmlparser.jericho.TextExtractor Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of jericho-html Show documentation
Jericho HTML Parser is a Java library allowing analysis and manipulation of parts of an HTML document, including server-side tags, while reproducing verbatim any unrecognised or invalid HTML.
There is a newer version: 3.4
Show newest version
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 3.2
// Copyright (C) 2004-2009 Martin Jericho
// http://jericho.htmlparser.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of either one of the following licences:
//
// 1. The Eclipse Public License (EPL) version 1.0,
// included in this distribution in the file licence-epl-1.0.html
// or available at http://www.eclipse.org/legal/epl-v10.html
//
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
// included in this distribution in the file licence-lgpl-2.1.txt
// or available at http://www.gnu.org/licenses/lgpl.txt
//
// This library is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the individual licence texts for more details.

package net.htmlparser.jericho;

import java.util.*;
import java.io.*;
import java.net.*;

/**
 * Extracts the textual content from HTML markup.
 * 
 * The output is ideal for feeding into a text search engine such as Apache Lucene,
 * especially when the {@link #setIncludeAttributes(boolean) IncludeAttributes} property has been set to true.
 * 

 * Use one of the following methods to obtain the output:
 * 

 *  {@link #writeTo(Writer)}
 *  {@link #appendTo(Appendable)}
 *  {@link #toString()}
 *  {@link CharStreamSourceUtil#getReader(CharStreamSource) CharStreamSourceUtil.getReader(this)}
 * 
 * 
 * The process removes all of the tags and
 * {@linkplain CharacterReference#decodeCollapseWhiteSpace(CharSequence) decodes the result, collapsing all white space}.
 * A space character is included in the output where a normal tag is present in the source,
 * unless the tag belongs to an {@linkplain HTMLElements#getInlineLevelElementNames() inline-level} element.
 * An exception to this is the {@link HTMLElementName#BR BR} element, which is also converted to a space despite being an inline-level element.
 * 

 * Text inside {@link HTMLElementName#SCRIPT SCRIPT} and {@link HTMLElementName#STYLE STYLE} elements contained within this segment
 * is ignored.
 * 

 * Setting the {@link #setExcludeNonHTMLElements(boolean) ExcludeNonHTMLElements} property results in the exclusion of any content within a
 * non-HTML element.
 * 

 * See the {@link #excludeElement(StartTag)} method for details on how to implement a more complex mechanism to determine whether the
 * {@linkplain Element#getContent() content} of each {@link Element} is to be excluded from the output.
 * 

 * All tags that are not normal tags, such as {@linkplain TagType#isServerTag() server tags},
 * {@linkplain StartTagType#COMMENT comments} etc., are removed from the output without adding white space to the output.
 * 

 * Note that segments on which the {@link Segment#ignoreWhenParsing()} method has been called are treated as text rather than markup,
 * resulting in their inclusion in the output.
 * To remove specific segments before extracting the text, create an {@link OutputDocument} and call its {@link OutputDocument#remove(Segment) remove(Segment)} or
 * {@link OutputDocument#replaceWithSpaces(int,int) replaceWithSpaces(int begin, int end)} method for each segment to be removed.
 * Then create a new source document using {@link Source#Source(CharSequence) new Source(outputDocument.toString())}
 * and perform the text extraction on this new source object.
 * 

 * Extracting the text from an entire {@link Source} object performs a {@linkplain Source#fullSequentialParse() full sequential parse} automatically.
 * 

 * To perform a simple rendering of HTML markup into text, which is more readable than the output of this class, use the {@link Renderer} class instead.
 * 

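 * <dl>
 *  <dt>Example:</dt>
 *  <dd>With the {@link #setIncludeAttributes(boolean) IncludeAttributes} property set to <code>true</code>, the source segment:<br />
 *   "<code>&lt;div&gt;&lt;b&gt;O&lt;/b&gt;ne&lt;/div&gt;&lt;div title="Two"&gt;&lt;b&gt;Th&lt;/b&gt;&lt;script&gt;//a&nbsp;script&nbsp;&lt;/script&gt;ree&lt;/div&gt;</code>"<br />
 *   produces the text "<code>One Two Three</code>".
 *   With the default settings attribute values are not included, so the same segment produces "<code>One Three</code>".</dd>
 * </dl>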
 * 
 */
public class TextExtractor implements CharStreamSource {
	private final Segment segment;
	private boolean convertNonBreakingSpaces=Config.ConvertNonBreakingSpaces;
	private boolean includeAttributes=false;
	private boolean excludeNonHTMLElements=false;

	// Maps each possibly included attribute name to the AttributeIncludeChecker that decides whether its value is included.
	// Populated once in the static initialiser below and wrapped as unmodifiable so the table cannot change afterwards.
	private static final Map<String,AttributeIncludeChecker> map;

	/**
	 * Constructs a new <code>TextExtractor</code> based on the specified {@link Segment}.
	 * @param segment  the segment from which the text will be extracted.
	 * @see Segment#getTextExtractor()
	 */
	public TextExtractor(final Segment segment) {
		this.segment=segment;
	}

	// Documentation inherited from CharStreamSource
	// Writes the complete extracted text to the writer and then flushes it.
	public void writeTo(final Writer writer) throws IOException {
		appendTo(writer);
		writer.flush();
	}

	// Documentation inherited from CharStreamSource
	// The whole output is built in memory by toString() before being appended.
	public void appendTo(final Appendable appendable) throws IOException {
		appendable.append(toString());
	}

	// Documentation inherited from CharStreamSource
	// Uses the length of the source segment as the estimate.
	public long getEstimatedMaximumOutputLength() {
		return segment.length();
	}

	// Documentation inherited from CharStreamSource
	// Snapshots the current property values into a Processor, which performs the extraction.
	@Override
	public String toString() {
		return new Processor(segment,getConvertNonBreakingSpaces(),getIncludeAttributes(),getExcludeNonHTMLElements()).toString();
	}

	/**
	 * Sets whether non-breaking space ({@link CharacterEntityReference#_nbsp &amp;nbsp;}) character entity references are converted to spaces.
	 * <p>
	 * The default value is that of the static {@link Config#ConvertNonBreakingSpaces} property at the time the <code>TextExtractor</code> is instantiated.
	 *
	 * @param convertNonBreakingSpaces  specifies whether non-breaking space ({@link CharacterEntityReference#_nbsp &amp;nbsp;}) character entity references are converted to spaces.
	 * @return this <code>TextExtractor</code> instance, allowing multiple property setting methods to be chained in a single statement.
	 * @see #getConvertNonBreakingSpaces()
	 */
	public TextExtractor setConvertNonBreakingSpaces(boolean convertNonBreakingSpaces) {
		this.convertNonBreakingSpaces=convertNonBreakingSpaces;
		return this;
	}

	/**
	 * Indicates whether non-breaking space ({@link CharacterEntityReference#_nbsp &amp;nbsp;}) character entity references are converted to spaces.
	 * <p>
	 * See the {@link #setConvertNonBreakingSpaces(boolean)} method for a full description of this property.
	 *
	 * @return <code>true</code> if non-breaking space ({@link CharacterEntityReference#_nbsp &amp;nbsp;}) character entity references are converted to spaces, otherwise <code>false</code>.
	 */
	public boolean getConvertNonBreakingSpaces() {
		return convertNonBreakingSpaces;
	}

	/**
	 * Sets whether any attribute values are included in the output.
	 * <p>
	 * If the value of this property is <code>true</code>, then each attribute still has to match the conditions implemented in the
	 * {@link #includeAttribute(StartTag,Attribute)} method in order for its value to be included in the output.
	 * <p>
	 * The default value is <code>false</code>.
	 *
	 * @param includeAttributes  specifies whether any attribute values are included in the output.
	 * @return this <code>TextExtractor</code> instance, allowing multiple property setting methods to be chained in a single statement.
	 * @see #getIncludeAttributes()
	 */
	public TextExtractor setIncludeAttributes(boolean includeAttributes) {
		this.includeAttributes=includeAttributes;
		return this;
	}

	/**
	 * Indicates whether any attribute values are included in the output.
	 * <p>
	 * See the {@link #setIncludeAttributes(boolean)} method for a full description of this property.
	 *
	 * @return <code>true</code> if any attribute values are included in the output, otherwise <code>false</code>.
	 */
	public boolean getIncludeAttributes() {
		return includeAttributes;
	}

	/**
	 * Indicates whether the value of the specified {@linkplain Attribute attribute} in the specified {@linkplain StartTag start tag} is included in the output.
	 * <p>
	 * This method is ignored if the {@link #setIncludeAttributes(boolean) IncludeAttributes} property is set to <code>false</code>, in which case
	 * no attribute values are included in the output.
	 * <p>
	 * If the {@link #setIncludeAttributes(boolean) IncludeAttributes} property is set to <code>true</code>, every attribute of every
	 * start tag encountered in the segment is checked using this method to determine whether the value of the attribute should be included in the output.
	 * <p>
	 * The default implementation of this method returns <code>true</code> if the {@linkplain Attribute#getKey() key} of the specified {@linkplain Attribute attribute}
	 * is one of
	 * <code>title</code>,
	 * <code>alt</code>,
	 * <code>label</code>,
	 * <code>summary</code>,
	 * <code>content</code>*, or
	 * <code>href</code>,
	 * but the method can be overridden in a subclass to perform a check of arbitrary complexity on each attribute.
	 * <p>
	 * * The value of a <code>content</code> attribute is only included if a
	 * <code>name</code> attribute is also present in the specified start tag,
	 * as the <code>content</code> attribute of a {@link HTMLElementName#META META} tag only contains human readable text if the <code>name</code> attribute is used as opposed to an
	 * <code>http-equiv</code> attribute.
	 * <p>
	 * <dl>
	 *  <dt>Example:</dt>
	 *  <dd>
	 *   To include only the value of <code>title</code> and
	 *   <code>alt</code> attributes:
	 *   <pre>
	 *    final Set&lt;String&gt; includeAttributeNames=new HashSet&lt;String&gt;(Arrays.asList(new String[] {"title","alt"}));
	 *    TextExtractor textExtractor=new TextExtractor(segment) {
	 *        public boolean includeAttribute(StartTag startTag, Attribute attribute) {
	 *            return includeAttributeNames.contains(attribute.getKey());
	 *        }
	 *    };
	 *    textExtractor.setIncludeAttributes(true);
	 *    String extractedText=textExtractor.toString();
	 *   </pre>
	 *  </dd>
	 * </dl>
	 * @param startTag  the start tag containing the attribute.
	 * @param attribute  the attribute to check for inclusion.
	 * @return <code>true</code> if the value of the specified attribute should be included in the output, otherwise <code>false</code>.
	 */
	public boolean includeAttribute(final StartTag startTag, final Attribute attribute) {
		final AttributeIncludeChecker attributeIncludeChecker=map.get(attribute.getKey());
		// Attribute names without a registered checker are never included by default.
		if (attributeIncludeChecker==null) return false;
		return attributeIncludeChecker.includeAttribute(startTag,attribute);
	}

	/**
	 * Sets whether the content of non-HTML elements is excluded from the output.
	 * <p>
	 * The default value is <code>false</code>, meaning that content from all elements meeting the other criteria is included.
	 *
	 * @param excludeNonHTMLElements  specifies whether the content of non-HTML elements is excluded from the output.
	 * @return this <code>TextExtractor</code> instance, allowing multiple property setting methods to be chained in a single statement.
	 * @see #getExcludeNonHTMLElements()
	 */
	public TextExtractor setExcludeNonHTMLElements(boolean excludeNonHTMLElements) {
		this.excludeNonHTMLElements=excludeNonHTMLElements;
		return this;
	}

	/**
	 * Indicates whether the content of non-HTML elements is excluded from the output.
	 * <p>
	 * See the {@link #setExcludeNonHTMLElements(boolean)} method for a full description of this property.
	 *
	 * @return <code>true</code> if the content of non-HTML elements is excluded from the output, otherwise <code>false</code>.
	 */
	public boolean getExcludeNonHTMLElements() {
		return excludeNonHTMLElements;
	}

	/**
	 * Indicates whether the text inside the {@link Element} of the specified start tag should be excluded from the output.
	 * <p>
	 * During the text extraction process, every start tag encountered in the segment is checked using this method to determine whether the text inside its
	 * {@linkplain StartTag#getElement() associated element} should be excluded from the output.
	 * <p>
	 * The default implementation of this method is to always return <code>false</code>, so that every element is included,
	 * but the method can be overridden in a subclass to perform a check of arbitrary complexity on each start tag.
	 * <p>
	 * All elements nested inside an excluded element are also implicitly excluded, as are all
	 * {@link HTMLElementName#SCRIPT SCRIPT} and {@link HTMLElementName#STYLE STYLE} elements.
	 * Such elements are skipped over without calling this method, so there is no way to include them by overriding the method.
	 * <p>
	 * <dl>
	 *  <dt>Example:</dt>
	 *  <dd>
	 *   To extract the text from a <code>segment</code>, excluding any text inside elements with the attribute <code>class="NotIndexed"</code>:
	 *   <pre>
	 *    TextExtractor textExtractor=new TextExtractor(segment) {
	 *        public boolean excludeElement(StartTag startTag) {
	 *            return "NotIndexed".equalsIgnoreCase(startTag.getAttributeValue("class"));
	 *        }
	 *    };
	 *    String extractedText=textExtractor.toString();
	 *   </pre>
	 *  </dd>
	 * </dl>
	 * @param startTag  the start tag of the element to check for exclusion.
	 * @return <code>true</code> if the text inside the {@link Element} of the specified start tag should be excluded from the output, otherwise <code>false</code>.
	 */
	public boolean excludeElement(final StartTag startTag) {
		return false;
	}

	/** Strategy deciding whether the value of one particular attribute is included in the output. */
	private static interface AttributeIncludeChecker {
		boolean includeAttribute(final StartTag startTag, final Attribute attribute);
	}

	/** Checker that includes the attribute value unconditionally. */
	private static final AttributeIncludeChecker ALWAYS_INCLUDE=new AttributeIncludeChecker() {
		public boolean includeAttribute(final StartTag startTag, final Attribute attribute) {
			return true;
		}
	};

	/** Checker that includes the attribute value only if the same start tag also carries a <code>name</code> attribute. */
	private static final AttributeIncludeChecker INCLUDE_IF_NAME_ATTRIBUTE_PRESENT=new AttributeIncludeChecker() {
		public boolean includeAttribute(final StartTag startTag, final Attribute attribute) {
			return startTag.getAttributes().get("name")!=null;
		}
	};

	static {
		// This block must stay below the checker constants: static initialisers run in textual order.
		final Map<String,AttributeIncludeChecker> checkers=new HashMap<String,AttributeIncludeChecker>();
		checkers.put("title",ALWAYS_INCLUDE); // add title attribute
		checkers.put("alt",ALWAYS_INCLUDE); // add alt attribute (APPLET, AREA, IMG and INPUT elements)
		checkers.put("label",ALWAYS_INCLUDE); // add label attribute (OPTION and OPTGROUP elements)
		checkers.put("summary",ALWAYS_INCLUDE); // add summary attribute (TABLE element)
		checkers.put("content",INCLUDE_IF_NAME_ATTRIBUTE_PRESENT); // add content attribute (META element)
		checkers.put("href",ALWAYS_INCLUDE); // add href attribute (A, AREA and LINK elements)
		// don't bother with the prompt attribute from the ISINDEX element as the element is deprecated and very rarely used.
		map=Collections.unmodifiableMap(checkers);
	}

	/**
	 * This class does the actual work, but is first passed final copies of all the parameters for efficiency.
	 * It is an inner (non-static) class because it calls back into the overridable
	 * {@link TextExtractor#excludeElement(StartTag)} and {@link TextExtractor#includeAttribute(StartTag,Attribute)} methods of the enclosing instance.
	 * <p>
	 * Note at present this is not implemented in a memory-efficient manner.
	 * Once the CharacterReference.decodeCollapseWhiteSpace functionality is available as a FilterWriter (possible with java 5 support),
	 * the main algorithm can be implemented in the writeTo(Writer) method to allow for more memory-efficient processing.
	 */
	private final class Processor {
		private final Segment segment;
		private final boolean convertNonBreakingSpaces;
		private final boolean includeAttributes;
		private final boolean excludeNonHTMLElements;

		public Processor(final Segment segment, final boolean convertNonBreakingSpaces, final boolean includeAttributes, final boolean excludeNonHTMLElements) {
			this.segment=segment;
			this.convertNonBreakingSpaces=convertNonBreakingSpaces;
			this.includeAttributes=includeAttributes;
			this.excludeNonHTMLElements=excludeNonHTMLElements;
		}

		/**
		 * Walks every node of the segment, accumulating text and tag-induced spaces in a buffer,
		 * then decodes the buffer and collapses its white space.
		 *
		 * @return the extracted text.
		 */
		@Override
		public String toString() {
			final StringBuilder sb=new StringBuilder(segment.length());
			for (NodeIterator nodeIterator=new NodeIterator(segment); nodeIterator.hasNext();) {
				final Segment node=nodeIterator.next();
				if (!(node instanceof Tag)) {
					// Plain text between tags is copied as is; it is decoded in one pass at the end.
					sb.append(node);
					continue;
				}
				final Tag tag=(Tag)node;
				if (tag.getTagType().isServerTag()) {
					// elementContainsMarkup should be made into a TagType property one day.
					// for the time being assume all server element content is code, although this is not true for some Mason elements.
					final boolean elementContainsMarkup=false;
					if (!elementContainsMarkup) {
						final Element element=tag.getElement();
						// Only skip forward when the element really extends beyond this tag.
						if (element!=null && element.getEnd()>tag.getEnd()) nodeIterator.skipToPos(element.getEnd());
					}
					// Server tags contribute neither text nor white space.
					continue;
				}
				if (tag.getTagType()==StartTagType.NORMAL) {
					final StartTag startTag=(StartTag)tag;
					// Reference comparison of names kept from the original code; presumably tag names are
					// canonicalised to the HTMLElementName constants - TODO confirm before changing to equals().
					// SCRIPT and STYLE are tested first so that excludeElement is never consulted for them.
					if (tag.name==HTMLElementName.SCRIPT
							|| tag.name==HTMLElementName.STYLE
							|| excludeElement(startTag)
							|| (excludeNonHTMLElements && !HTMLElements.getElementNames().contains(tag.name))) {
						// Drop the whole element, including everything nested inside it.
						nodeIterator.skipToPos(startTag.getElement().getEnd());
						continue;
					}
					if (includeAttributes) appendIncludedAttributeValues(sb,startTag);
				}
				// Treat both start and end tags not belonging to inline-level elements as whitespace:
				if (tag.getName()==HTMLElementName.BR || !HTMLElements.getInlineLevelElementNames().contains(tag.getName())) sb.append(' ');
			}
			return CharacterReference.decodeCollapseWhiteSpace(sb,convertNonBreakingSpaces);
		}

		/** Appends, padded by spaces, the value of every attribute of the start tag accepted by includeAttribute. */
		private void appendIncludedAttributeValues(final StringBuilder sb, final StartTag startTag) {
			for (Attribute attribute : startTag.getAttributes()) {
				if (!includeAttribute(startTag,attribute)) continue;
				final Segment valueSegment=attribute.getValueSegment();
				// NOTE(review): getValueSegment() presumably returns null for an attribute written without a value
				// (e.g. <img alt>) - confirm against the Attribute API. Appending null would emit the literal text "null".
				if (valueSegment==null) continue;
				sb.append(' ').append(valueSegment).append(' ');
			}
		}
	}
}