// au.id.jericho.lib.html.OutputDocument (Maven / Gradle / Ivy)
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 2.6
// Copyright (C) 2007 Martin Jericho
// http://jerichohtml.sourceforge.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of either one of the following licences:
//
// 1. The Eclipse Public License (EPL) version 1.0,
// included in this distribution in the file licence-epl-1.0.html
// or available at http://www.eclipse.org/legal/epl-v10.html
//
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
// included in this distribution in the file licence-lgpl-2.1.txt
// or available at http://www.gnu.org/licenses/lgpl.txt
//
// This library is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the individual licence texts for more details.
package au.id.jericho.lib.html;
import java.io.*;
import java.util.*;
/**
* Represents a modified version of an original {@link Source} document.
*
* An <code>OutputDocument</code> represents an original source document that
* has been modified by substituting segments of it with other text.
* Each of these substitutions must be registered in the output document,
* which is most commonly done using the various <code>replace</code>, <code>remove</code>
* or <code>insert</code> methods in this class.
* These methods internally {@linkplain #register(OutputSegment) register} one or more {@link OutputSegment} objects to define each substitution.
*
* After all of the substitutions have been registered, the modified text can be retrieved using the
* {@link #writeTo(Writer)} or {@link #toString()} methods.
*
* The registered {@linkplain OutputSegment output segments} may be adjacent and may also overlap.
* An output segment that is completely enclosed by another output segment is not included in the output.
*
* If unexpected results are being generated from an <code>OutputDocument</code>, the {@link #getDebugInfo()} method provides information on each
* {@linkplain #getRegisteredOutputSegments() registered output segment}, which should provide enough information to determine the cause of the problem.
* In most cases the problem will be caused by overlapping output segments.
*
* The following example converts all externally referenced style sheets to internal style sheets:
*
* <pre>
*  URL sourceUrl=new URL(sourceUrlString);
*  String htmlText=Util.getString(new InputStreamReader(sourceUrl.openStream()));
*  Source source=new Source(htmlText);
*  OutputDocument outputDocument=new OutputDocument(source);
*  StringBuffer sb=new StringBuffer();
*  List linkStartTags=source.findAllStartTags(Tag.LINK);
*  for (Iterator i=linkStartTags.iterator(); i.hasNext();) {
*    StartTag startTag=(StartTag)i.next();
*    Attributes attributes=startTag.getAttributes();
*    String rel=attributes.getValue("rel");
*    if (!"stylesheet".equalsIgnoreCase(rel)) continue;
*    String href=attributes.getValue("href");
*    if (href==null) continue;
*    String styleSheetContent;
*    try {
*      styleSheetContent=Util.getString(new InputStreamReader(new URL(sourceUrl,href).openStream()));
*    } catch (Exception ex) {
*      continue; // don't convert if URL is invalid
*    }
*    sb.setLength(0);
*    sb.append("&lt;style");
*    Attribute typeAttribute=attributes.get("type");
*    if (typeAttribute!=null) sb.append(' ').append(typeAttribute);
*    sb.append("&gt;\n").append(styleSheetContent).append("\n&lt;/style&gt;");
*    outputDocument.replace(startTag,sb);
*  }
*  String convertedHtmlText=outputDocument.toString();
* </pre>
*
* @see OutputSegment
*/
public final class OutputDocument implements CharStreamSource {
// The original, unmodified text this output document is based on.
// Assigned once by a constructor, from either a Source or a ParseText.
private CharSequence sourceText;
// The registered OutputSegment objects, appended by register(OutputSegment).
// The list is sorted in place with OutputSegment.COMPARATOR whenever output is written
// or the registered segments are requested.
private ArrayList outputSegments=new ArrayList();
/**
 * Creates an output document that is based on the given source document.
 *
 * @param source the source document; must not be <code>null</code>.
 * @throws IllegalArgumentException if <code>source</code> is <code>null</code>.
 */
public OutputDocument(final Source source) {
	if (source==null) {
		throw new IllegalArgumentException("source argument must not be null");
	}
	sourceText=source;
}
/**
 * Package-private constructor that bases the output document on a {@link ParseText}
 * rather than a {@link Source}.
 * Unlike the public constructor, no <code>null</code> check is performed on the argument.
 *
 * @param parseText the text upon which this output document is based.
 */
OutputDocument(final ParseText parseText) {
	sourceText=parseText;
}
/**
 * Returns the original source text upon which this output document is based.
 * <p>
 * This is the object supplied to the constructor, not the modified output.
 *
 * @return the original source text upon which this output document is based.
 */
public CharSequence getSourceText() {
	return this.sourceText;
}
/**
 * Removes the specified {@linkplain Segment segment} from this output document.
 * <p>
 * This is equivalent to {@link #replace(Segment,CharSequence) replace}<code>(segment,null)</code>.
 * Internally a <code>RemoveOutputSegment</code> covering the segment is {@linkplain #register(OutputSegment) registered}.
 *
 * @param segment the segment to remove.
 */
public void remove(final Segment segment) {
	final OutputSegment removal=new RemoveOutputSegment(segment);
	register(removal);
}
/**
 * Removes all the segments from this output document represented by the specified source {@linkplain Segment} objects.
 * <p>
 * This is equivalent to the following code:
 * <pre>
 *  for (Iterator i=segments.iterator(); i.hasNext();)
 *    {@link #remove(Segment) remove}((Segment)i.next());
 * </pre>
 *
 * @param segments a collection of segments to remove, represented by source {@link Segment} objects.
 */
public void remove(final Collection segments) {
	final Iterator i=segments.iterator();
	while (i.hasNext()) {
		// every element is expected to be a Segment; anything else fails the cast
		final Segment segment=(Segment)i.next();
		remove(segment);
	}
}
/**
 * Inserts the specified text at the specified character position in this output document.
 * <p>
 * The insertion is registered as a zero-length <code>StringOutputSegment</code>
 * that begins and ends at <code>pos</code>.
 *
 * @param pos  the character position at which to insert the text.
 * @param text the text to insert.
 */
public void insert(final int pos, final CharSequence text) {
	// begin==end==pos: no source characters are replaced, the text is only added
	final OutputSegment insertion=new StringOutputSegment(pos,pos,text);
	register(insertion);
}
/**
 * Replaces the specified {@linkplain Segment segment} in this output document with the specified text.
 * <p>
 * Specifying a <code>null</code> argument to the <code>text</code> parameter is exactly equivalent to specifying an empty string,
 * and results in the segment being completely removed from the output document.
 *
 * @param segment the segment to replace.
 * @param text    the replacement text, or <code>null</code> to remove the segment.
 */
public void replace(final Segment segment, final CharSequence text) {
	final int begin=segment.getBegin();
	final int end=segment.getEnd();
	replace(begin,end,text);
}
/**
 * Replaces the specified segment of this output document with the specified text.
 * <p>
 * Specifying a <code>null</code> argument to the <code>text</code> parameter is exactly equivalent to specifying an empty string,
 * and results in the segment being completely removed from the output document.
 *
 * @param begin the character position at which to begin the replacement.
 * @param end   the character position at which to end the replacement.
 * @param text  the replacement text, or <code>null</code> to remove the segment.
 */
public void replace(final int begin, final int end, final CharSequence text) {
	final OutputSegment replacement=new StringOutputSegment(begin,end,text);
	register(replacement);
}
/**
 * Replaces the specified segment of this output document with the specified character.
 * <p>
 * The replacement is registered as a <code>CharOutputSegment</code>.
 *
 * @param begin the character position at which to begin the replacement.
 * @param end   the character position at which to end the replacement.
 * @param ch    the replacement character.
 */
public void replace(final int begin, final int end, final char ch) {
	final OutputSegment replacement=new CharOutputSegment(begin,end,ch);
	register(replacement);
}
/**
 * Replaces the specified {@link FormControl} in this output document.
 * <p>
 * The effect of this method is to {@linkplain #register(OutputSegment) register} zero or more
 * {@linkplain OutputSegment output segments} in the output document as required to reflect
 * previous modifications to the control's state.
 * The state of a control includes its submission value,
 * {@linkplain FormControl#setOutputStyle(FormControlOutputStyle) output style}, and whether it has been
 * {@linkplain FormControl#setDisabled(boolean) disabled}.
 * <p>
 * The state of the form control should not be modified after this method is called, as there is no guarantee that
 * subsequent changes either will or will not be reflected in the final output.
 * A second call to this method with the same parameter is not allowed.
 * It is therefore recommended to call this method as the last action before the output is generated.
 * <p>
 * Although the specifics of the number and nature of the output segments added in any particular circumstance
 * are not defined in the specification, it can generally be assumed that only the minimum changes necessary
 * are made to the original document. If the state of the control has not been modified, calling this method
 * has no effect at all.
 *
 * @param formControl the form control to replace.
 * @see #replace(FormFields)
 */
public void replace(final FormControl formControl) {
	// the control itself decides which output segments to register on this document
	final OutputDocument target=this;
	formControl.replaceInOutputDocument(target);
}
/**
 * {@linkplain #replace(FormControl) Replaces} all the constituent {@linkplain FormControl form controls}
 * from the specified {@link FormFields} in this output document.
 * <p>
 * This is equivalent to the following code:
 * <pre>
 *  for (Iterator i=formFields.{@link FormFields#getFormControls() getFormControls()}.iterator(); i.hasNext();)
 *    {@link #replace(FormControl) replace}((FormControl)i.next());
 * </pre>
 * <p>
 * The state of any of the form controls in the specified form fields should not be modified after this method is called,
 * as there is no guarantee that subsequent changes either will or will not be reflected in the final output.
 * A second call to this method with the same parameter is not allowed.
 * It is therefore recommended to call this method as the last action before the output is generated.
 *
 * @param formFields the form fields to replace.
 * @see #replace(FormControl)
 */
public void replace(final FormFields formFields) {
	// the form fields object decides which output segments to register on this document
	final OutputDocument target=this;
	formFields.replaceInOutputDocument(target);
}
/**
 * Replaces the specified {@link Attributes} segment in this output document with the name/value entries
 * in the returned <code>Map</code>.
 * The returned map initially contains entries representing the attributes from the source document,
 * which can be modified before output.
 * <p>
 * The documentation of the {@link #replace(Attributes,Map)} method contains more information about the requirements
 * of the map entries.
 * <p>
 * Specifying a value of <code>true</code> as an argument to the <code>convertNamesToLowerCase</code> parameter
 * causes all original attribute names to be converted to lower case in the map.
 * This simplifies the process of finding/updating specific attributes since map keys are case sensitive.
 * <p>
 * Attribute values are automatically {@linkplain CharacterReference#decode(CharSequence) decoded} before
 * being loaded into the map.
 * <p>
 * This method is logically equivalent to:<br />
 * <code>{@link #replace(Attributes,Map) replace}(attributes, attributes.{@link Attributes#populateMap(Map,boolean) populateMap(new LinkedHashMap(),convertNamesToLowerCase)})</code>
 * <p>
 * The use of <code>LinkedHashMap</code> to implement the map ensures (probably unnecessarily) that
 * existing attributes are output in the same order as they appear in the source document, and new
 * attributes are output in the same order as they are added.
 * <p>
 * Example:
 * <pre>
 *  Source source=new Source(htmlDocument);
 *  Attributes bodyAttributes
 *    =source.findNextStartTag(0,Tag.BODY).getAttributes();
 *  OutputDocument outputDocument=new OutputDocument(source);
 *  Map attributesMap=outputDocument.replace(bodyAttributes,true);
 *  attributesMap.put("bgcolor","green");
 *  String htmlDocumentWithGreenBackground=outputDocument.toString();
 * </pre>
 *
 * @param attributes the <code>Attributes</code> segment defining the span of the segment and initial name/value entries of the returned map.
 * @param convertNamesToLowerCase specifies whether all attribute names are converted to lower case in the map.
 * @return a <code>Map</code> containing the name/value entries to be output.
 * @see #replace(Attributes,Map)
 */
public Map replace(final Attributes attributes, boolean convertNamesToLowerCase) {
	final AttributesOutputSegment segment=new AttributesOutputSegment(attributes,convertNamesToLowerCase);
	register(segment);
	// the caller edits this map; it belongs to the segment that was just registered
	final Map entries=segment.getMap();
	return entries;
}
/**
 * Replaces the specified attributes segment in this source document with the name/value entries in the specified <code>Map</code>.
 * <p>
 * This method might be used if the <code>Map</code> containing the new attribute values
 * should not be preloaded with the same entries as the source attributes, or a map implementation
 * other than <code>LinkedHashMap</code> is required.
 * Otherwise, the {@link #replace(Attributes, boolean convertNamesToLowerCase)} method is generally more useful.
 * <p>
 * Keys in the map must be <code>String</code> objects, and values must implement the <code>CharSequence</code> interface.
 * <p>
 * An attribute with no value is represented by a map entry with a <code>null</code> value.
 * <p>
 * Attribute values are stored unencoded in the map, and are automatically
 * {@linkplain CharacterReference#encode(CharSequence) encoded} if necessary during output.
 * <p>
 * The use of invalid characters in attribute names results in unspecified behaviour.
 * <p>
 * Note that methods in the <code>Attributes</code> class treat attribute names as case insensitive,
 * whereas the <code>Map</code> treats them as case sensitive.
 *
 * @param attributes the <code>Attributes</code> object defining the span of the segment to replace.
 * @param map the <code>Map</code> containing the name/value entries.
 * @see #replace(Attributes, boolean convertNamesToLowerCase)
 */
public void replace(final Attributes attributes, final Map map) {
	final OutputSegment replacement=new AttributesOutputSegment(attributes,map);
	register(replacement);
}
/**
 * Replaces the specified segment of this output document with a string of spaces of the same length.
 * <p>
 * This method is most commonly used to remove segments of the document without affecting the character positions of the remaining elements.
 * <p>
 * It is used internally to implement the functionality available through the {@link Segment#ignoreWhenParsing()} method.
 * <p>
 * To remove a segment from the output document completely, use the {@link #remove(Segment)} method instead.
 *
 * @param begin the character position at which to begin the replacement.
 * @param end   the character position at which to end the replacement.
 */
public void replaceWithSpaces(final int begin, final int end) {
	final OutputSegment blanks=new BlankOutputSegment(begin,end);
	register(blanks);
}
/**
 * Registers the specified {@linkplain OutputSegment output segment} in this output document.
 * <p>
 * Use this method if you want to use a customised {@link OutputSegment} class.
 * <p>
 * The segment is simply appended to the internal list; no validation or sorting happens at registration time.
 *
 * @param outputSegment the output segment to register.
 */
public void register(final OutputSegment outputSegment) {
	this.outputSegments.add(outputSegment);
}
/**
 * Writes the final content of this output document to the specified <code>Writer</code>.
 * <p>
 * The {@link #writeTo(Writer, int begin, int end)} method allows the output of a portion of the output document.
 * <p>
 * If the output is required in the form of a <code>Reader</code>, use
 * {@link CharStreamSourceUtil#getReader(CharStreamSource) CharStreamSourceUtil.getReader(this)} instead.
 *
 * @param writer the destination <code>java.io.Writer</code> for the output.
 * @throws IOException if an I/O exception occurs.
 * @see #toString()
 */
public void writeTo(final Writer writer) throws IOException {
	// the whole document is the range from position 0 to the length of the source text
	final int sourceLength=sourceText.length();
	writeTo(writer,0,sourceLength);
}
/**
* Writes the specified portion of the final content of this output document to the specified Writer
.
*
* Any zero-length output segments located at begin
or end
are included in the output.
*
* @param writer the destination java.io.Writer
for the output.
* @param begin the character position at which to start the output, inclusive.
* @param end the character position at which to end the output, exclusive.
* @throws IOException if an I/O exception occurs.
* @see #writeTo(Writer)
*/
public void writeTo(final Writer writer, final int begin, final int end) throws IOException {
try {
if (outputSegments.isEmpty()) {
Util.appendTo(writer,sourceText,begin,end);
return;
}
int pos=begin;
Collections.sort(outputSegments,OutputSegment.COMPARATOR);
for (final Iterator i=outputSegments.iterator(); i.hasNext();) {
final OutputSegment outputSegment=(OutputSegment)i.next();
if (outputSegment.getEnd()end) break; // stop processing output segments if they are not longer in the desired output range
if (outputSegment.getBegin()==end && outputSegment.getEnd()>end) break; // stop processing output segments if they start at end unless they are zero length
if (outputSegment.getBegin()>pos) {
Util.appendTo(writer,sourceText,pos,outputSegment.getBegin());
}
if (outputSegment.getBegin()=0L ? estimatedMaximumOutputLength : -1L;
}
/**
 * Returns the final content of this output document as a <code>String</code>.
 * <p>
 * The conversion is delegated to <code>CharStreamSourceUtil.toString(CharStreamSource)</code>.
 *
 * @return the final content of this output document as a <code>String</code>.
 * @see #writeTo(Writer)
 */
public String toString() {
	final String output=CharStreamSourceUtil.toString(this);
	return output;
}
/**
 * Returns a string representation of this object useful for debugging purposes.
 * <p>
 * The output includes details of all the {@link #getRegisteredOutputSegments() registered output segments},
 * one per line. Each line consists of the kind of operation, the span of the segment in parentheses
 * (as row/column positions when the document is based on a {@link Source}, otherwise as
 * <code>p<i>begin</i>-p<i>end</i></code> character positions), and the first 20 characters of the
 * segment's string representation.
 * <p>
 * Calling this method sorts the registered output segments as a side effect.
 *
 * @return a string representation of this object useful for debugging purposes.
 */
public String getDebugInfo() {
	final StringBuffer debugInfo=new StringBuffer();
	final Iterator i=getRegisteredOutputSegments().iterator();
	while (i.hasNext()) {
		final OutputSegment segment=(OutputSegment)i.next();

		// kind of operation
		final String label;
		if (segment instanceof BlankOutputSegment) {
			label="Replace with Spaces: ";
		} else if (segment instanceof RemoveOutputSegment) {
			label="Remove: ";
		} else {
			label="Replace: ";
		}
		debugInfo.append(label);

		// span of the segment
		debugInfo.append('(');
		if (sourceText instanceof Source) {
			final Source source=(Source)sourceText;
			source.getRowColumnVector(segment.getBegin()).appendTo(debugInfo);
			debugInfo.append('-');
			source.getRowColumnVector(segment.getEnd()).appendTo(debugInfo);
		} else {
			debugInfo.append('p').append(segment.getBegin()).append("-p").append(segment.getEnd());
		}
		debugInfo.append(')').append(' ');

		// preview of the segment's output, truncated to keep each entry on one short line
		final String preview=segment.toString();
		final boolean truncated=preview.length()>20;
		debugInfo.append(truncated ? preview.substring(0,20) : preview);
		if (truncated) debugInfo.append("...");

		debugInfo.append(Config.NewLine);
	}
	return debugInfo.toString();
}
/**
 * Returns a list of all the {@linkplain #register(OutputSegment) registered} {@link OutputSegment} objects in this output document.
 * <p>
 * The output segments are sorted in order of their {@linkplain OutputSegment#getBegin() starting position} in the document.
 * <p>
 * The returned list is modifiable and any changes will affect the output generated by this <code>OutputDocument</code>.
 * It is the internal list itself, not a copy, and it is sorted in place by this call.
 *
 * @return a list of all the {@linkplain #register(OutputSegment) registered} {@link OutputSegment} objects in this output document.
 */
public List getRegisteredOutputSegments() {
	final List segments=outputSegments;
	Collections.sort(segments,OutputSegment.COMPARATOR);
	return segments;
}
}