src.au.id.jericho.lib.html.OutputDocument Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of jericho-html
Jericho HTML Parser is a simple but powerful java library allowing analysis and manipulation of parts of an HTML document, including some common server-side tags, while reproducing verbatim any unrecognised or invalid HTML. It also provides high-level HTML form manipulation functions.
There is a newer version: 2.3
Show newest version
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 1.5
// Copyright (C) 2004 Martin Jericho
// http://jerichohtml.sourceforge.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
// http://www.gnu.org/copyleft/lesser.html
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package au.id.jericho.lib.html;

import java.io.*;
import java.util.*;

/**
 * Represents a modified version of an original source text.
 * 
 * An OutputDocument represents an original source text that
 * has been modified by substituting segments of it with other text.
 * Each of these substitutions is registered by adding an {@link IOutputSegment} to the OutputDocument.
 * After all of the substitutions have been added, the modified text can be retrieved using the
 * {@link #output(Writer)} or {@link #toString()} methods.
 * 

 * The registered OutputSegments must not overlap each other, but may be adjacent.
 * 

 * The following example converts all externally referenced style sheets to internal style sheets:
 * 
 *  OutputDocument outputDocument=new OutputDocument(htmlText);
 *  Source source=new Source(htmlText);
 *  StringBuffer sb=new StringBuffer();
 *  List linkStartTags=source.findAllStartTags(Tag.LINK);
 *  for (Iterator i=linkStartTags.iterator(); i.hasNext();) {
 *    StartTag startTag=(StartTag)i.next();
 *    Attributes attributes=startTag.getAttributes();
 *    String rel=attributes.getValue("rel");
 *    if (!"stylesheet".equalsIgnoreCase(rel)) continue;
 *    String href=attributes.getValue("href");
 *    if (href==null) continue;
 *    String styleSheetContent;
 *    try {
 *      styleSheetContent=CommonTools.getString(new URL(href).openStream()); // note CommonTools.getString method is not defined here
 *    } catch (Exception ex) {
 *      continue; // don't convert if URL is invalid
 *    }
 *    sb.setLength(0);
 *    sb.append("<style");
 *    Attribute typeAttribute=attributes.get("type");
 *    if (typeAttribute!=null) sb.append(' ').append(typeAttribute);
 *    sb.append(">\n").append(styleSheetContent).append("\n</style>");
 *    outputDocument.add(new StringOutputSegment(startTag,sb.toString()));
 *  }
 *  String convertedHtmlText=outputDocument.toString();
 * 
 *
 * @see IOutputSegment
 * @see StringOutputSegment
 */
public final class OutputDocument {
	private CharSequence sourceText;
	private ArrayList outputSegments=new ArrayList();

	/**
	 * Constructs a new OutputDocument based on the specified source text.
	 * 
	 * Note that a {@link Source} object can be passed directly as an argument to this constructor
	 * as it implements the CharSequence interface.
	 *
	 * @param sourceText  the source text.
	 */
	public OutputDocument(CharSequence sourceText) {
	  if (sourceText==null) throw new IllegalArgumentException();
		this.sourceText=sourceText;
	}

	/**
	 * Returns the original source text upon which this OutputDocument is based.
	 * @return the original source text upon which this OutputDocument is based.
	 */
	public CharSequence getSourceText() {
		return sourceText;
	}

	/**
	 * Adds the specified {@linkplain IOutputSegment output segment} to this OutputDocument.
	 * 

	 * Note that for efficiency reasons no exception is thrown if the added output segment overlaps another,
	 * however in this case an {@link OverlappingOutputSegmentsException} will be thrown when the output is generated.
	 */
	public void add(IOutputSegment outputSegment) {
		outputSegments.add(outputSegment);
	}

	/**
	 * ***************************
	 */
	public void add(FormControl formControl) {
		formControl.addToOutputDocument(this);
	}

	/**
	 * ***************************
	 */
	public void add(FormFields formFields) {
		formFields.addToOutputDocument(this);
	}

	/**
	 * Outputs the final content of this OutputDocument to the specified Writer.
	 * 
	 * An {@link OverlappingOutputSegmentsException} is thrown if any of the output segments overlap.
	 * For efficiency reasons this condition is not caught when the offending output segment is added.
	 *
	 * @throws IOException  if an I/O exception occurs.
	 * @throws OverlappingOutputSegmentsException  if any of the output segments overlap.
	 */
	public void output(Writer writer) throws IOException {
		if (outputSegments.isEmpty()) {
			Util.appendTo(writer,sourceText);
			return;
		}
		int pos=0;
		Collections.sort(outputSegments,IOutputSegment.COMPARATOR);
		IOutputSegment lastOutputSegment=null;
		for (Iterator i=outputSegments.iterator(); i.hasNext();) {
			IOutputSegment outputSegment=(IOutputSegment)i.next();
			if (outputSegment==lastOutputSegment) continue; // silently ignore duplicate output segment
			if (outputSegment.getBegin()pos) Util.appendTo(writer,sourceText,pos,outputSegment.getBegin());
			outputSegment.output(writer);
			lastOutputSegment=outputSegment;
			pos=outputSegment.getEnd();
		}
		if (posOutputDocument as a String.
	 * @return the final content of this OutputDocument as a String.
	 * @throws OverlappingOutputSegmentsException  if any of the output segments overlap.
	 */
	public String toString() {
		StringWriter writer=new StringWriter((int)(sourceText.length()*1.5));
		try {
			output(writer);
		} catch (IOException ex) {throw new RuntimeException(ex);} // should never happen with StringWriter
		return writer.toString();
	}
}