All downloads are free. The search and download functionality uses the official Maven repository.

au.id.jericho.lib.html.OutputDocument Maven / Gradle / Ivy

Go to download

Jericho HTML Parser is a Java library allowing analysis and manipulation of parts of an HTML document, including server-side tags, while reproducing verbatim any unrecognised or invalid HTML.

There is a newer version: 3.4
Show newest version
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 2.4
// Copyright (C) 2007 Martin Jericho
// http://jerichohtml.sourceforge.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of either one of the following licences:
//
// 1. The Eclipse Public License (EPL) version 1.0,
// included in this distribution in the file licence-epl-1.0.html
// or available at http://www.eclipse.org/legal/epl-v10.html
//
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
// included in this distribution in the file licence-lgpl-2.1.txt
// or available at http://www.gnu.org/licenses/lgpl.txt
//
// This library is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the individual licence texts for more details.

package au.id.jericho.lib.html;

import java.io.*;
import java.util.*;

/**
 * Represents a modified version of an original {@link Source} document.
 * 

* An OutputDocument represents an original source document that * has been modified by substituting segments of it with other text. * Each of these substitutions must be registered in the output document, * which is most commonly done using the various replace, remove or insert methods in this class. * These methods internally {@linkplain #register(OutputSegment) register} one or more {@link OutputSegment} objects to define each substitution. * * After all of the substitutions have been registered, the modified text can be retrieved using the * {@link #writeTo(Writer)} or {@link #toString()} methods. *

* The registered {@linkplain OutputSegment output segments} must not overlap each other, but may be adjacent. * Multiple output segments may be added at the same {@linkplain OutputSegment#getBegin() begin} position provided that they are all * zero-length, with the exception of one segment which may {@linkplain OutputSegment#getEnd() end} at a different position. *

* For efficiency reasons, violations of the above rules on overlapping segments do not throw an exception when the segment is registered, * but an {@link OverlappingOutputSegmentsException} is thrown when the {@linkplain #writeTo(Writer) output is generated}. *

* The following example converts all externally referenced style sheets to internal style sheets: *

*

 *  URL sourceUrl=new URL(sourceUrlString);
 *  String htmlText=Util.getString(new InputStreamReader(sourceUrl.openStream()));
 *  Source source=new Source(htmlText);
 *  OutputDocument outputDocument=new OutputDocument(source);
 *  StringBuffer sb=new StringBuffer();
 *  List linkStartTags=source.findAllStartTags(Tag.LINK);
 *  for (Iterator i=linkStartTags.iterator(); i.hasNext();) {
 *    StartTag startTag=(StartTag)i.next();
 *    Attributes attributes=startTag.getAttributes();
 *    String rel=attributes.getValue("rel");
 *    if (!"stylesheet".equalsIgnoreCase(rel)) continue;
 *    String href=attributes.getValue("href");
 *    if (href==null) continue;
 *    String styleSheetContent;
 *    try {
 *      styleSheetContent=Util.getString(new InputStreamReader(new URL(sourceUrl,href).openStream()));
 *    } catch (Exception ex) {
 *      continue; // don't convert if URL is invalid
 *    }
 *    sb.setLength(0);
 *    sb.append("<style");
 *    Attribute typeAttribute=attributes.get("type");
 *    if (typeAttribute!=null) sb.append(' ').append(typeAttribute);
 *    sb.append(">\n").append(styleSheetContent).append("\n</style>");
 *    outputDocument.replace(startTag,sb);
 *  }
 *  String convertedHtmlText=outputDocument.toString();
 * 
* * @see OutputSegment * @see StringOutputSegment */ public final class OutputDocument implements CharStreamSource { private CharSequence sourceText; private ArrayList outputSegments=new ArrayList(); /** * Constructs a new output document based on the specified source document. * @param source the source document. */ public OutputDocument(final Source source) { if (source==null) throw new IllegalArgumentException("source argument must not be null"); this.sourceText=source; } OutputDocument(final ParseText parseText) { this.sourceText=parseText; } /** * Returns the original source text upon which this output document is based. * @return the original source text upon which this output document is based. */ public CharSequence getSourceText() { return sourceText; } /** * Removes the specified {@linkplain Segment segment} from this output document. *

* This is equivalent to {@link #replace(Segment,CharSequence) replace}(segment,null). * * @param segment the segment to remove. */ public void remove(final Segment segment) { replace(segment,(CharSequence)null); } /** * Removes all the segments from this output document represented by the specified source {@linkplain Segment} objects. *

* This is equivalent to the following code:

	 *  for (Iterator i=segments.iterator(); i.hasNext();)
	 *    {@link #remove(Segment) remove}((Segment)i.next());
* * @param segments a collection of segments to remove, represented by source {@link Segment} objects. */ public void remove(final Collection segments) { for (Iterator i=segments.iterator(); i.hasNext();) remove((Segment)i.next()); } /** * Inserts the specified text at the specified character position in this output document. * @param pos the character position at which to insert the text. * @param text the replacement text. */ public void insert(final int pos, final CharSequence text) { register(new StringOutputSegment(pos,pos,text)); } /** * Replaces the specified {@linkplain Segment segment} in this output document with the specified text. *

* Specifying a null argument to the text parameter is exactly equivalent to specifying an empty string, * and results in the segment being completely removed from the output document. * * @param segment the segment to replace. * @param text the replacement text, or null to remove the segment. */ public void replace(final Segment segment, final CharSequence text) { replace(segment.getBegin(),segment.getEnd(),text); } /** * Replaces the specified segment of this output document with the specified text. *

* Specifying a null argument to the text parameter is exactly equivalent to specifying an empty string, * and results in the segment being completely removed from the output document. * * @param begin the character position at which to begin the replacement. * @param end the character position at which to end the replacement. * @param text the replacement text, or null to remove the segment. */ public void replace(final int begin, final int end, final CharSequence text) { register(new StringOutputSegment(begin,end,text)); } /** * Replaces the specified segment of this output document with the specified character. * * @param begin the character position at which to begin the replacement. * @param end the character position at which to end the replacement. * @param ch the replacement character. */ public void replace(final int begin, final int end, final char ch) { register(new CharOutputSegment(begin,end,ch)); } /** * Replaces the specified {@link FormControl} in this output document. *

* The effect of this method is to {@linkplain #register(OutputSegment) register} zero or more * {@linkplain OutputSegment output segments} in the output document as required to reflect * previous modifications to the control's state. * The state of a control includes its submission value, * {@linkplain FormControl#setOutputStyle(FormControlOutputStyle) output style}, and whether it has been * {@linkplain FormControl#setDisabled(boolean) disabled}. *

* The state of the form control should not be modified after this method is called, as there is no guarantee that * subsequent changes either will or will not be reflected in the final output. * A second call to this method with the same parameter is not allowed. * It is therefore recommended to call this method as the last action before the output is generated. *

* Although the specifics of the number and nature of the output segments added in any particular circumstance * is not defined in the specification, it can generally be assumed that only the minimum changes necessary * are made to the original document. If the state of the control has not been modified, calling this method * has no effect at all. * * @param formControl the form control to replace. * @see #replace(FormFields) */ public void replace(final FormControl formControl) { formControl.replaceInOutputDocument(this); } /** * {@linkplain #replace(FormControl) Replaces} all the constituent {@linkplain FormControl form controls} * from the specified {@link FormFields} in this output document. *

* This is equivalent to the following code: *

for (Iterator i=formFields.{@link FormFields#getFormControls() getFormControls()}.iterator(); i.hasNext();)
	 *   {@link #replace(FormControl) replace}((FormControl)i.next());
*

* The state of any of the form controls in the specified form fields should not be modified after this method is called, * as there is no guarantee that subsequent changes either will or will not be reflected in the final output. * A second call to this method with the same parameter is not allowed. * It is therefore recommended to call this method as the last action before the output is generated. * * @param formFields the form fields to replace. * @see #replace(FormControl) */ public void replace(final FormFields formFields) { formFields.replaceInOutputDocument(this); } /** * Replaces the specified {@link Attributes} segment in this output document with the name/value entries * in the returned Map. * The returned map initially contains entries representing the attributes from the source document, * which can be modified before output. *

* The documentation of the {@link #replace(Attributes,Map)} method contains more information about the requirements * of the map entries. *

* Specifying a value of true as an argument to the convertNamesToLowerCase parameter * causes all original attribute names to be converted to lower case in the map. * This simplifies the process of finding/updating specific attributes since map keys are case sensitive. *

* Attribute values are automatically {@linkplain CharacterReference#decode(CharSequence) decoded} before * being loaded into the map. *

* This method is logically equivalent to:
* {@link #replace(Attributes,Map) replace}(attributes, attributes.{@link Attributes#populateMap(Map,boolean) populateMap(new LinkedHashMap(),convertNamesToLowerCase)}) *

* The use of LinkedHashMap to implement the map ensures (probably unnecessarily) that * existing attributes are output in the same order as they appear in the source document, and new * attributes are output in the same order as they are added. *

*

*
Example:
*
	 *  Source source=new Source(htmlDocument);
	 *  Attributes bodyAttributes
	 *    =source.findNextStartTag(0,Tag.BODY).getAttributes();
	 *  OutputDocument outputDocument=new OutputDocument(source);
	 *  Map attributesMap=outputDocument.replace(bodyAttributes,true);
	 *  attributesMap.put("bgcolor","green");
	 *  String htmlDocumentWithGreenBackground=outputDocument.toString();
* * @param attributes the Attributes segment defining the span of the segment and initial name/value entries of the returned map. * @param convertNamesToLowerCase specifies whether all attribute names are converted to lower case in the map. * @return a Map containing the name/value entries to be output. * @see #replace(Attributes,Map) */ public Map replace(final Attributes attributes, boolean convertNamesToLowerCase) { AttributesOutputSegment attributesOutputSegment=new AttributesOutputSegment(attributes,convertNamesToLowerCase); register(attributesOutputSegment); return attributesOutputSegment.getMap(); } /** * Replaces the specified attributes segment in this source document with the name/value entries in the specified Map. *

* This method might be used if the Map containing the new attribute values * should not be preloaded with the same entries as the source attributes, or a map implementation * other than LinkedHashMap is required. * Otherwise, the {@link #replace(Attributes, boolean convertNamesToLowerCase)} method is generally more useful. *

* Keys in the map must be String objects, and values must implement the CharSequence interface. *

* An attribute with no value is represented by a map entry with a null value. *

* Attribute values are stored unencoded in the map, and are automatically * {@linkplain CharacterReference#encode(CharSequence) encoded} if necessary during output. *

* The use of invalid characters in attribute names results in unspecified behaviour. *

* Note that methods in the Attributes class treat attribute names as case insensitive, * whereas the Map treats them as case sensitive. * * @param attributes the Attributes object defining the span of the segment to replace. * @param map the Map containing the name/value entries. * @see #replace(Attributes, boolean convertNamesToLowerCase) */ public void replace(final Attributes attributes, final Map map) { register(new AttributesOutputSegment(attributes,map)); } /** * Replaces the specified segment of this output document with a string of spaces of the same length. *

* This method is used internally to implement the functionality available through the * {@link Segment#ignoreWhenParsing()} method. * It is included in the public API in the unlikely event it has other practical uses * for the developer. * To remove a segment from the output document completely, use the {@link #remove(Segment)} method instead. * * @param begin the character position at which to begin the replacement. * @param end the character position at which to end the replacement. */ public void replaceWithSpaces(final int begin, final int end) { register(new BlankOutputSegment(begin,end)); } /** * Registers the specified {@linkplain OutputSegment output segment} in this output document. *

* Use this method if you want to use a customised {@link OutputSegment} class. * * @param outputSegment the output segment to register. */ public void register(final OutputSegment outputSegment) { outputSegments.add(outputSegment); } /** * Writes the final content of this output document to the specified Writer. *

* An {@link OverlappingOutputSegmentsException} is thrown if any of the output segments overlap. * For efficiency reasons this condition is not caught when the offending output segment is {@linkplain #add(OutputSegment) added}. *

* If the output is required in the form of a Reader, use {@link CharStreamSourceUtil#getReader(CharStreamSource) CharStreamSourceUtil.getReader(this)} instead. * * @param writer the destination java.io.Writer for the output. * @throws IOException if an I/O exception occurs. * @throws OverlappingOutputSegmentsException if any of the output segments overlap. * @see #toString() */ public void writeTo(final Writer writer) throws IOException { try { if (outputSegments.isEmpty()) { Util.appendTo(writer,sourceText); return; } int pos=0; Collections.sort(outputSegments,OutputSegment.COMPARATOR); OutputSegment lastOutputSegment=null; for (final Iterator i=outputSegments.iterator(); i.hasNext();) { final OutputSegment outputSegment=(OutputSegment)i.next(); if (outputSegment==lastOutputSegment) continue; // silently ignore duplicate output segment if (outputSegment.getBegin()pos) Util.appendTo(writer,sourceText,pos,outputSegment.getBegin()); outputSegment.writeTo(writer); lastOutputSegment=outputSegment; pos=outputSegment.getEnd(); } if (posString. * @return the final content of this output document as a String. * @throws OverlappingOutputSegmentsException if any of the output segments overlap. * @see #writeTo(Writer) */ public String toString() { return CharStreamSourceUtil.toString(this); } /** * Constructs a new output document based on the specified source text. *

* This constructor has been deprecated as of version 2.2 in favour of the {@link #OutputDocument(Source)} method * as most of the methods in this class assume that the argument supplied to this constructor is the entire source document. * * @param sourceText the source text. * @deprecated Use the {@link #OutputDocument(Source)} constructor instead. */ public OutputDocument(final CharSequence sourceText) { if (sourceText==null) throw new IllegalArgumentException("sourceText argument must not be null"); this.sourceText=sourceText; } /** * Registers the specified {@linkplain OutputSegment output segment} in this output document. *

* This method has been deprecated as of version 2.2 in favour of the identical {@link #register(OutputSegment)} method * in an effort to make this class and its methods more intuitive. * * @param outputSegment the output segment to register. * @deprecated Use the {@link #register(OutputSegment)} method instead. */ public void add(final OutputSegment outputSegment) { register(outputSegment); } /** * Replaces the specified {@link FormControl} in this output document. *

* This method has been deprecated as of version 2.2 in favour of the identical {@link #replace(FormControl)} method * in an effort to make this class and its methods more intuitive. * * @param formControl the form control to replace. * @deprecated Use the {@link #replace(FormControl)} method instead. */ public void add(final FormControl formControl) { replace(formControl); } /** * {@linkplain #replace(FormControl) Replaces} all the constituent {@linkplain FormControl form controls} * from the specified {@link FormFields} in this output document. *

* This method has been deprecated as of version 2.2 in favour of the identical {@link #replace(FormFields)} method * in an effort to make this class and its methods more intuitive. * * @param formFields the form fields to replace. * @deprecated Use the {@link #replace(FormFields)} method instead. */ public void add(final FormFields formFields) { formFields.replaceInOutputDocument(this); } /** * Outputs the final content of this output document to the specified Writer. *

* This method has been deprecated as of version 2.2 in favour of the identical {@link #writeTo(Writer)} method in order for this class to implement {@link CharStreamSource}. * * @param writer the destination java.io.Writer for the output. * @throws IOException if an I/O exception occurs. * @throws OverlappingOutputSegmentsException if any of the output segments overlap. * @deprecated Use the {@link #writeTo(Writer)} method instead. */ public void output(final Writer writer) throws IOException { writeTo(writer); } /** * Returns a Reader that reads the final content of this output document. *

* This method has been deprecated as of version 2.2 in favour of calling the {@link CharStreamSourceUtil#getReader(CharStreamSource)} method, * passing this object as the argument. * * @return a Reader that reads the final content of this output document. * @throws OverlappingOutputSegmentsException if any of the output segments overlap. * @deprecated Use {@link CharStreamSourceUtil#getReader(CharStreamSource) CharStreamSourceUtil.getReader(this)} instead. */ public Reader getReader() { return CharStreamSourceUtil.getReader(this); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy