
vuegwt.shaded.net.htmlparser.jericho.Renderer Maven / Gradle / Ivy
Show all versions of vue-gwt-processors Show documentation
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 3.4
// Copyright (C) 2004-2013 Martin Jericho
// http://jericho.htmlparser.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of either one of the following licences:
//
// 1. The Eclipse Public License (EPL) version 1.0,
// included in this distribution in the file licence-epl-1.0.html
// or available at http://www.eclipse.org/legal/epl-v10.html
//
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
// included in this distribution in the file licence-lgpl-2.1.txt
// or available at http://www.gnu.org/licenses/lgpl.txt
//
// 3. The Apache License version 2.0,
// included in this distribution in the file licence-apache-2.0.html
// or available at http://www.apache.org/licenses/LICENSE-2.0.html
//
// This library is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the individual licence texts for more details.
package net.htmlparser.jericho;
import java.util.*;
import java.io.*;
import java.net.URI;
import java.net.URISyntaxException;
/**
* Performs a simple rendering of HTML markup into text.
*
* This provides a human readable version of the segment content that is modelled on the way
* Mozilla Thunderbird and other email clients provide an automatic conversion of
* HTML content to text in their alternative MIME encoding of emails.
*
* The output using default settings complies with the "text/plain; format=flowed" (DelSp=No) protocol described in
* RFC3676.
*
* Many properties are available to customise the output, possibly the most significant of which being {@link #setMaxLineLength(int) MaxLineLength}.
* See the individual property descriptions for details.
*
* Use one of the following methods to obtain the output:
*
* - {@link #writeTo(Writer)}
* - {@link #appendTo(Appendable)}
* - {@link #toString()}
* - {@link CharStreamSourceUtil#getReader(CharStreamSource) CharStreamSourceUtil.getReader(this)}
*
*
* The rendering of some constructs, especially tables, is very rudimentary.
* No attempt is made to render nested tables properly, except to ensure that all of the text content is included in the output.
*
* Rendering an entire {@link Source} object performs a {@linkplain Source#fullSequentialParse() full sequential parse} automatically.
*
* Any aspect of the algorithm not specifically mentioned here is subject to change without notice in future versions.
*
* To extract pure text without any rendering of the markup, use the {@link TextExtractor} class instead.
*/
public class Renderer implements CharStreamSource {
private final Segment rootSegment;
private int maxLineLength=DEFAULT_LINE_LENGTH;
private int hrLineLength=DEFAULT_LINE_LENGTH-4;
private String newLine="\r\n";
private boolean includeHyperlinkURLs=true;
private boolean includeAlternateText=true;
private boolean decorateFontStyles=false;
private boolean convertNonBreakingSpaces=Config.ConvertNonBreakingSpaces;
private int blockIndentSize=4;
private int listIndentSize=6;
private char[] listBullets=new char[] {'*','o','+','#'};
private boolean includeFirstElementTopMargin=false;
private String tableCellSeparator=" \t";
private static final int DEFAULT_LINE_LENGTH=76;
private static final int UNORDERED_LIST=-1;
private static Map ELEMENT_HANDLERS=new HashMap();
static {
ELEMENT_HANDLERS.put(HTMLElementName.A,A_ElementHandler.INSTANCE);
ELEMENT_HANDLERS.put(HTMLElementName.ADDRESS,StandardBlockElementHandler.INSTANCE_0_0);
ELEMENT_HANDLERS.put(HTMLElementName.APPLET,AlternateTextElementHandler.INSTANCE);
ELEMENT_HANDLERS.put(HTMLElementName.B,FontStyleElementHandler.INSTANCE_B);
ELEMENT_HANDLERS.put(HTMLElementName.BLOCKQUOTE,StandardBlockElementHandler.INSTANCE_1_1_INDENT);
ELEMENT_HANDLERS.put(HTMLElementName.BR,BR_ElementHandler.INSTANCE);
ELEMENT_HANDLERS.put(HTMLElementName.BUTTON,RemoveElementHandler.INSTANCE);
ELEMENT_HANDLERS.put(HTMLElementName.CAPTION,StandardBlockElementHandler.INSTANCE_0_0);
ELEMENT_HANDLERS.put(HTMLElementName.CENTER,StandardBlockElementHandler.INSTANCE_1_1);
ELEMENT_HANDLERS.put(HTMLElementName.CODE,FontStyleElementHandler.INSTANCE_CODE);
ELEMENT_HANDLERS.put(HTMLElementName.DD,StandardBlockElementHandler.INSTANCE_0_0_INDENT);
ELEMENT_HANDLERS.put(HTMLElementName.DIR,ListElementHandler.INSTANCE_UL);
ELEMENT_HANDLERS.put(HTMLElementName.DIV,StandardBlockElementHandler.INSTANCE_0_0);
ELEMENT_HANDLERS.put(HTMLElementName.DT,StandardBlockElementHandler.INSTANCE_0_0);
ELEMENT_HANDLERS.put(HTMLElementName.EM,FontStyleElementHandler.INSTANCE_I);
ELEMENT_HANDLERS.put(HTMLElementName.FIELDSET,StandardBlockElementHandler.INSTANCE_1_1);
ELEMENT_HANDLERS.put(HTMLElementName.FORM,StandardBlockElementHandler.INSTANCE_1_1);
ELEMENT_HANDLERS.put(HTMLElementName.H1,StandardBlockElementHandler.INSTANCE_2_1);
ELEMENT_HANDLERS.put(HTMLElementName.H2,StandardBlockElementHandler.INSTANCE_2_1);
ELEMENT_HANDLERS.put(HTMLElementName.H3,StandardBlockElementHandler.INSTANCE_2_1);
ELEMENT_HANDLERS.put(HTMLElementName.H4,StandardBlockElementHandler.INSTANCE_2_1);
ELEMENT_HANDLERS.put(HTMLElementName.H5,StandardBlockElementHandler.INSTANCE_2_1);
ELEMENT_HANDLERS.put(HTMLElementName.H6,StandardBlockElementHandler.INSTANCE_2_1);
ELEMENT_HANDLERS.put(HTMLElementName.HEAD,RemoveElementHandler.INSTANCE);
ELEMENT_HANDLERS.put(HTMLElementName.HR,HR_ElementHandler.INSTANCE);
ELEMENT_HANDLERS.put(HTMLElementName.I,FontStyleElementHandler.INSTANCE_I);
ELEMENT_HANDLERS.put(HTMLElementName.IMG,AlternateTextElementHandler.INSTANCE);
ELEMENT_HANDLERS.put(HTMLElementName.INPUT,AlternateTextElementHandler.INSTANCE);
ELEMENT_HANDLERS.put(HTMLElementName.LEGEND,StandardBlockElementHandler.INSTANCE_0_0);
ELEMENT_HANDLERS.put(HTMLElementName.LI,LI_ElementHandler.INSTANCE);
ELEMENT_HANDLERS.put(HTMLElementName.MENU,ListElementHandler.INSTANCE_UL);
ELEMENT_HANDLERS.put(HTMLElementName.MAP,RemoveElementHandler.INSTANCE);
ELEMENT_HANDLERS.put(HTMLElementName.NOFRAMES,RemoveElementHandler.INSTANCE);
ELEMENT_HANDLERS.put(HTMLElementName.NOSCRIPT,RemoveElementHandler.INSTANCE);
ELEMENT_HANDLERS.put(HTMLElementName.OL,ListElementHandler.INSTANCE_OL);
ELEMENT_HANDLERS.put(HTMLElementName.P,StandardBlockElementHandler.INSTANCE_1_1);
ELEMENT_HANDLERS.put(HTMLElementName.PRE,PRE_ElementHandler.INSTANCE);
ELEMENT_HANDLERS.put(HTMLElementName.SCRIPT,RemoveElementHandler.INSTANCE);
ELEMENT_HANDLERS.put(HTMLElementName.SELECT,RemoveElementHandler.INSTANCE);
ELEMENT_HANDLERS.put(HTMLElementName.STRONG,FontStyleElementHandler.INSTANCE_B);
ELEMENT_HANDLERS.put(HTMLElementName.STYLE,RemoveElementHandler.INSTANCE);
ELEMENT_HANDLERS.put(HTMLElementName.TEXTAREA,RemoveElementHandler.INSTANCE);
ELEMENT_HANDLERS.put(HTMLElementName.TD,TD_ElementHandler.INSTANCE);
ELEMENT_HANDLERS.put(HTMLElementName.TH,TD_ElementHandler.INSTANCE);
ELEMENT_HANDLERS.put(HTMLElementName.TR,StandardBlockElementHandler.INSTANCE_0_0);
ELEMENT_HANDLERS.put(HTMLElementName.U,FontStyleElementHandler.INSTANCE_U);
ELEMENT_HANDLERS.put(HTMLElementName.UL,ListElementHandler.INSTANCE_UL);
}
/**
* Constructs a new Renderer
based on the specified {@link Segment}.
* @param segment the segment containing the HTML to be rendered.
* @see Segment#getRenderer()
*/
public Renderer(final Segment segment) {
rootSegment=segment;
}
// Documentation inherited from CharStreamSource
public void writeTo(final Writer writer) throws IOException {
appendTo(writer);
writer.flush();
}
// Documentation inherited from CharStreamSource
public void appendTo(final Appendable appendable) throws IOException {
new Processor(this,rootSegment,getMaxLineLength(),getHRLineLength(),getNewLine(),getIncludeHyperlinkURLs(),getIncludeAlternateText(),getDecorateFontStyles(),getConvertNonBreakingSpaces(),getBlockIndentSize(),getListIndentSize(),getListBullets(),getTableCellSeparator()).appendTo(appendable);
}
// Documentation inherited from CharStreamSource
public long getEstimatedMaximumOutputLength() {
return rootSegment.length();
}
// Documentation inherited from CharStreamSource
public String toString() {
return CharStreamSourceUtil.toString(this);
}
/**
* Sets the column at which lines are to be wrapped.
*
* Lines that would otherwise exceed this length are wrapped onto a new line at a word boundary.
*
* Setting this property automatically sets the {@link #setHRLineLength(int) HRLineLength} property to MaxLineLength - 4
.
*
* Setting this property to zero disables line wrapping completely, and leaves the value of {@link #setHRLineLength(int) HRLineLength} unchanged.
*
* A Line may still exceed this length if it consists of a single word, where the length of the word plus the line indent exceeds the maximum length.
* In this case the line is wrapped immediately after the end of the word.
*
* The default value is 76
, which reflects the maximum line length for sending
* email data specified in RFC2049 section 3.5.
*
* @param maxLineLength the column at which lines are to be wrapped.
* @return this Renderer
instance, allowing multiple property setting methods to be chained in a single statement.
* @see #getMaxLineLength()
*/
public Renderer setMaxLineLength(final int maxLineLength) {
this.maxLineLength=maxLineLength;
if (maxLineLength>0) hrLineLength=Math.max(2,maxLineLength-4);
return this;
}
/**
* Returns the column at which lines are to be wrapped.
*
* See the {@link #setMaxLineLength(int)} method for a full description of this property.
*
* @return the column at which lines are to be wrapped, or zero if line wrapping is disabled.
*/
public int getMaxLineLength() {
return maxLineLength;
}
/**
* Sets the length of a horizontal line.
*
* The length determines the number of hyphen characters used to render {@link HTMLElementName#HR HR} elements.
*
* Setting this property to 0
disables line rendering, although it is still treated as a block boundary.
*
* This property is set automatically to MaxLineLength - 4
when the {@link #setMaxLineLength(int) MaxLineLength} property is set.
* The default value is 72
.
*
* @param hrLineLength the length of a horizontal line.
* @return this Renderer
instance, allowing multiple property setting methods to be chained in a single statement.
* @see #getHRLineLength()
*/
public Renderer setHRLineLength(final int hrLineLength) {
this.hrLineLength=hrLineLength;
return this;
}
/**
* Returns the length of a horizontal line.
*
* See the {@link #setHRLineLength(int)} method for a full description of this property.
*
* @return the length of a horizontal line.
*/
public int getHRLineLength() {
return hrLineLength;
}
/**
* Sets the string to be used to represent a newline in the output.
*
* The default value is "\r\n"
(CR+LF) regardless of the platform on which the library is running.
* This is so that the default configuration produces valid
* MIME plain/text output, which mandates the use of CR+LF for line breaks.
*
* Specifying a null
argument causes the output to use same new line string as is used in the source document, which is
* determined via the {@link Source#getNewLine()} method.
* If the source document does not contain any new lines, a "best guess" is made by either taking the new line string of a previously parsed document,
* or using the value from the static {@link Config#NewLine} property.
*
* @param newLine the string to be used to represent a newline in the output, may be null
.
* @return this Renderer
instance, allowing multiple property setting methods to be chained in a single statement.
* @see #getNewLine()
*/
public Renderer setNewLine(final String newLine) {
this.newLine=newLine;
return this;
}
/**
* Returns the string to be used to represent a newline in the output.
*
* See the {@link #setNewLine(String)} method for a full description of this property.
*
* @return the string to be used to represent a newline in the output.
*/
public String getNewLine() {
if (newLine==null) newLine=rootSegment.source.getBestGuessNewLine();
return newLine;
}
/**
* Sets whether hyperlink URLs are included in the output.
*
* The default value is true
.
*
* When this property is true
, the URL of each hyperlink is included in the output as determined by the implementation of the
* {@link #renderHyperlinkURL(StartTag)} method.
*
*
* - Example:
* -
*
* Assuming the default implementation of {@link #renderHyperlinkURL(StartTag)}, when this property is true
, the following HTML:
*
* <a href="http://jericho.htmlparser.net/">Jericho HTML Parser</a>
*
* produces the following output:
*
* Jericho HTML Parser <http://jericho.htmlparser.net/>
*
*
*
*
* @param includeHyperlinkURLs specifies whether hyperlink URLs are included in the output.
* @return this Renderer
instance, allowing multiple property setting methods to be chained in a single statement.
* @see #getIncludeHyperlinkURLs()
*/
public Renderer setIncludeHyperlinkURLs(final boolean includeHyperlinkURLs) {
this.includeHyperlinkURLs=includeHyperlinkURLs;
return this;
}
/**
* Indicates whether hyperlink URLs are included in the output.
*
* See the {@link #setIncludeHyperlinkURLs(boolean)} method for a full description of this property.
*
* @return true
if hyperlink URLs are included in the output, otherwise false
.
*/
public boolean getIncludeHyperlinkURLs() {
return includeHyperlinkURLs;
}
/**
* Renders the hyperlink URL from the specified {@link StartTag}.
*
* A return value of null
indicates that the hyperlink URL should not be rendered at all.
*
* The default implementation of this method returns null
if the href
attribute of the specified start tag
* starts with "javascript:
", is a relative or invalid URI, or is missing completely.
* In all other cases it returns the value of the href
attribute enclosed in angle brackets.
*
* See the documentation of the {@link #setIncludeHyperlinkURLs(boolean)} method for an example of how a hyperlink is rendered by the default implementation.
*
* This method can be overridden in a subclass to customise the rendering of hyperlink URLs.
*
* Rendering of hyperlink URLs can be disabled completely without overriding this method by setting the
* {@link #setIncludeHyperlinkURLs(boolean) IncludeHyperlinkURLs} property to false
.
*
*
* - Example:
* -
* To render hyperlink URLs without the enclosing angle brackets:
*
* Renderer renderer=new Renderer(segment) {
* public String renderHyperlinkURL(StartTag startTag) {
* String href=startTag.getAttributeValue("href");
* if (href==null || href.startsWith("javascript:")) return null;
* try {
* URI uri=new URI(href);
* if (!uri.isAbsolute()) return null;
* } catch (URISyntaxException ex) {
* return null;
* }
* return href;
* }
* };
* String renderedSegment=renderer.toString();
*
*
*
* @param startTag the start tag of the hyperlink element, must not be null
.
* @return The rendered hyperlink URL from the specified {@link StartTag}, or null
if the hyperlink URL should not be rendered.
*/
public String renderHyperlinkURL(final StartTag startTag) {
final String href=startTag.getAttributeValue("href");
if (href==null || href.startsWith("javascript:")) return null;
try {
URI uri=new URI(href);
if (!uri.isAbsolute()) return null;
} catch (URISyntaxException ex) {
return null;
}
return '<'+href+'>';
}
/**
* Sets whether the alternate text of a tag that has an alt
attribute is included in the output.
*
* The default value is true
.
* Note that this is not conistent with common email clients such as Mozilla Thunderbird which do not render alternate text at all,
* even when a tag specifies alternate text.
*
* When this property is true
, the alternate text is included in the output as determined by the implementation of the
* {@link #renderAlternateText(StartTag)} method.
*
*
* - Example:
* -
*
* Assuming the default implementation of {@link #renderAlternateText(StartTag)}, when this property is true
, the following HTML:
*
* <img src="smiley.png" alt="smiley face" />
*
* produces the following output:
*
* [smiley face]
*
*
*
*
* @param includeAlternateText specifies whether the alternate text of a tag that has an alt
attribute is included in the output.
* @return this Renderer
instance, allowing multiple property setting methods to be chained in a single statement.
* @see #getIncludeAlternateText()
*/
public Renderer setIncludeAlternateText(final boolean includeAlternateText) {
this.includeAlternateText=includeAlternateText;
return this;
}
/**
* Indicates whether the alternate text of a tag that has an alt
attribute is included in the output.
*
* See the {@link #setIncludeAlternateText(boolean)} method for a full description of this property.
*
* @return true
if the alternate text of a tag that has an alt
attribute is included in the output, otherwise false
.
*/
public boolean getIncludeAlternateText() {
return includeAlternateText;
}
/**
* Renders the alternate text of the specified start tag.
*
* A return value of null
indicates that the alternate text is not to be rendered at all.
*
* The default implementation of this method returns null
if the alt
attribute of the specified start tag is missing or empty, or if the
* specified start tag is from an {@link HTMLElementName#AREA AREA} element.
* In all other cases it returns the value of the alt
attribute enclosed in square brackets […]
.
*
* See the documentation of the {@link #setIncludeAlternateText(boolean)} method for an example of how alternate text is rendered by the default implementation.
*
* This method can be overridden in a subclass to customise the rendering of alternate text.
*
* Rendering of alternate text can be disabled completely without overriding this method by setting the
* {@link #setIncludeAlternateText(boolean) IncludeAlternateText} property to false
.
*
*
* - Example:
* -
* To render alternate text with double angle quotation marks instead of square brackets:
*
* Renderer renderer=new Renderer(segment) {
* public String renderAlternateText(StartTag startTag) {
* if (startTag.getName()==HTMLElementName.AREA) return null;
* String alt=startTag.getAttributeValue("alt");
* if (alt==null || alt.length()==0) return null;
* return '?'+alt+'?';
* }
* };
* String renderedSegment=renderer.toString();
*
*
*
* @param startTag the start tag containing an alt
attribute, must not be null
.
* @return The rendered alternate text, or null
if the alternate text should not be rendered.
*/
public String renderAlternateText(final StartTag startTag) {
if (startTag.getName()==HTMLElementName.AREA) return null;
final String alt=startTag.getAttributeValue("alt");
if (alt==null || alt.length()==0) return null;
return '['+alt+']';
}
/**
* Sets whether decoration characters are to be included around the content of some
* font style elements and
* phrase elements.
*
* The default value is false
.
*
* Below is a table summarising the decorated elements.
*
*
*
* Elements Character Example Output
* {@link HTMLElementName#B B} and {@link HTMLElementName#STRONG STRONG} *
*bold text*
* {@link HTMLElementName#I I} and {@link HTMLElementName#EM EM} /
/italic text/
* {@link HTMLElementName#U U} _
_underlined text_
* {@link HTMLElementName#CODE CODE} |
|code|
*
*
* @param decorateFontStyles specifies whether decoration characters are to be included around the content of some font style elements.
* @return this Renderer
instance, allowing multiple property setting methods to be chained in a single statement.
* @see #getDecorateFontStyles()
*/
public Renderer setDecorateFontStyles(final boolean decorateFontStyles) {
this.decorateFontStyles=decorateFontStyles;
return this;
}
/**
* Indicates whether decoration characters are to be included around the content of some
* font style elements and
* phrase elements.
*
* See the {@link #setDecorateFontStyles(boolean)} method for a full description of this property.
*
* @return true
if decoration characters are to be included around the content of some font style elements, otherwise false
.
*/
public boolean getDecorateFontStyles() {
return decorateFontStyles;
}
/**
* Sets whether non-breaking space ({@link CharacterEntityReference#_nbsp }) character entity references are converted to spaces.
*
* The default value is that of the static {@link Config#ConvertNonBreakingSpaces} property at the time the Renderer
is instantiated.
*
* @param convertNonBreakingSpaces specifies whether non-breaking space ({@link CharacterEntityReference#_nbsp }) character entity references are converted to spaces.
* @return this Renderer
instance, allowing multiple property setting methods to be chained in a single statement.
* @see #getConvertNonBreakingSpaces()
*/
public Renderer setConvertNonBreakingSpaces(boolean convertNonBreakingSpaces) {
this.convertNonBreakingSpaces=convertNonBreakingSpaces;
return this;
}
/**
* Indicates whether non-breaking space ({@link CharacterEntityReference#_nbsp }) character entity references are converted to spaces.
*
* See the {@link #setConvertNonBreakingSpaces(boolean)} method for a full description of this property.
*
* @return true
if non-breaking space ({@link CharacterEntityReference#_nbsp }) character entity references are converted to spaces, otherwise false
.
*/
public boolean getConvertNonBreakingSpaces() {
return convertNonBreakingSpaces;
}
/**
* Sets the size of the indent to be used for anything other than {@link HTMLElementName#LI LI} elements.
*
* At present this applies to {@link HTMLElementName#BLOCKQUOTE BLOCKQUOTE} and {@link HTMLElementName#DD DD} elements.
*
* The default value is 4
.
*
* @param blockIndentSize the size of the indent.
* @return this Renderer
instance, allowing multiple property setting methods to be chained in a single statement.
* @see #getBlockIndentSize()
*/
public Renderer setBlockIndentSize(final int blockIndentSize) {
this.blockIndentSize=blockIndentSize;
return this;
}
/**
* Returns the size of the indent to be used for anything other than {@link HTMLElementName#LI LI} elements.
*
* See the {@link #setBlockIndentSize(int)} method for a full description of this property.
*
* @return the size of the indent to be used for anything other than {@link HTMLElementName#LI LI} elements.
*/
public int getBlockIndentSize() {
return blockIndentSize;
}
/**
* Sets the size of the indent to be used for {@link HTMLElementName#LI LI} elements.
*
* The default value is 6
.
*
* This applies to {@link HTMLElementName#LI LI} elements inside both {@link HTMLElementName#UL UL} and {@link HTMLElementName#OL OL} elements.
*
* The bullet or number of the list item is included as part of the indent.
*
* @param listIndentSize the size of the indent.
* @return this Renderer
instance, allowing multiple property setting methods to be chained in a single statement.
* @see #getListIndentSize()
*/
public Renderer setListIndentSize(final int listIndentSize) {
this.listIndentSize=listIndentSize;
return this;
}
/**
* Returns the size of the indent to be used for {@link HTMLElementName#LI LI} elements.
*
* See the {@link #setListIndentSize(int)} method for a full description of this property.
*
* @return the size of the indent to be used for {@link HTMLElementName#LI LI} elements.
*/
public int getListIndentSize() {
return listIndentSize;
}
/**
* Sets the bullet characters to use for list items inside {@link HTMLElementName#UL UL} elements.
*
* The values in the default array are *
, o
, +
and #
.
*
* If the nesting of rendered lists goes deeper than the length of this array, the bullet characters start repeating from the first in the array.
*
* WARNING: If any of the characters in the default array are modified, this will affect all other instances of this class using the default array.
*
* @param listBullets an array of characters to be used as bullets, must have at least one entry.
* @return this Renderer
instance, allowing multiple property setting methods to be chained in a single statement.
* @see #getListBullets()
*/
public Renderer setListBullets(final char[] listBullets) {
if (listBullets==null || listBullets.length==0) throw new IllegalArgumentException("listBullets argument must be an array of at least one character");
this.listBullets=listBullets;
return this;
}
/**
* Returns the bullet characters to use for list items inside {@link HTMLElementName#UL UL} elements.
*
* See the {@link #setListBullets(char[])} method for a full description of this property.
*
* @return the bullet characters to use for list items inside {@link HTMLElementName#UL UL} elements.
*/
public char[] getListBullets() {
return listBullets;
}
/**
* Sets whether the top margin of the first element is rendered.
*
* The default value is false
.
*
* If this property is set to true
, then the source "<h1>Heading</h1>
" would be rendered as "\r\n\r\nHeading
",
* assuming all other default settings.
* If this property is false
, then the same source would be rendered as "Heading
".
*
* Note that the bottom margin of the last element is never rendered.
*
* @param includeFirstElementTopMargin specifies whether the top margin of the first element is rendered.
* @return this Renderer
instance, allowing multiple property setting methods to be chained in a single statement.
* @see #getIncludeFirstElementTopMargin()
*/
public Renderer setIncludeFirstElementTopMargin(final boolean includeFirstElementTopMargin) {
this.includeFirstElementTopMargin=includeFirstElementTopMargin;
return this;
}
/**
* Indicates whether the top margin of the first element is rendered.
*
* See the {@link #setIncludeFirstElementTopMargin(boolean)} method for a full description of this property.
*
* @return true
if the top margin of the first element is rendered, otherwise false
.
*/
public boolean getIncludeFirstElementTopMargin() {
return includeFirstElementTopMargin;
}
/**
* Sets the string that is to separate table cells.
*
* The default value is " \t"
(a space followed by a tab).
*
* @param tableCellSeparator the string that is to separate table cells.
* @return this Renderer
instance, allowing multiple property setting methods to be chained in a single statement.
* @see #getTableCellSeparator()
*/
public Renderer setTableCellSeparator(final String tableCellSeparator) {
this.tableCellSeparator=tableCellSeparator;
return this;
}
/**
* Returns the string that is to separate table cells.
*
* See the {@link #setTableCellSeparator(String)} method for a full description of this property.
*
* @return the string that is to separate table cells.
*/
public String getTableCellSeparator() {
return tableCellSeparator;
}
/**
* Sets the default top margin of an HTML block element with the specified name.
*
* The top margin is the number of blank lines that are to be inserted above the rendered block.
*
* As this is a static method, the setting affects all instances of the Renderer
class.
*
* The htmlElementName
argument must be one of the following:
* {@link HTMLElementName#ADDRESS ADDRESS},
* {@link HTMLElementName#BLOCKQUOTE BLOCKQUOTE},
* {@link HTMLElementName#CAPTION CAPTION},
* {@link HTMLElementName#CENTER CENTER},
* {@link HTMLElementName#DD DD},
* {@link HTMLElementName#DIR DIR},
* {@link HTMLElementName#DIV DIV},
* {@link HTMLElementName#DT DT},
* {@link HTMLElementName#FIELDSET FIELDSET},
* {@link HTMLElementName#FORM FORM},
* {@link HTMLElementName#H1 H1},
* {@link HTMLElementName#H2 H2},
* {@link HTMLElementName#H3 H3},
* {@link HTMLElementName#H4 H4},
* {@link HTMLElementName#H5 H5},
* {@link HTMLElementName#H6 H6},
* {@link HTMLElementName#HR HR},
* {@link HTMLElementName#LEGEND LEGEND},
* {@link HTMLElementName#LI LI},
* {@link HTMLElementName#MENU MENU},
* {@link HTMLElementName#OL OL},
* {@link HTMLElementName#P P},
* {@link HTMLElementName#PRE PRE},
* {@link HTMLElementName#TR TR},
* {@link HTMLElementName#UL UL}
*
* @param htmlElementName (required) the case insensitive name of a supported HTML block element.
* @param topMargin the new top margin of the specified element.
* @throws UnsupportedOperationException if an unsupported element name is specified.
*/
public static void setDefaultTopMargin(String htmlElementName, final int topMargin) {
htmlElementName=HTMLElements.getConstantElementName(htmlElementName.toLowerCase());
ELEMENT_HANDLERS.put(htmlElementName,getAbstractBlockElementHandler(htmlElementName).newTopMargin(topMargin));
}
/**
* Returns the default top margin of an HTML block element with the specified name.
*
* See the {@link #setDefaultTopMargin(String htmlElementName, int topMargin)} method for a full description of this property.
*
* @param htmlElementName (required) the case insensitive name of a supported HTML block element.
* @return the default top margin of an HTML block element with the specified name.
* @throws UnsupportedOperationException if an unsupported element name is specified.
*/
public static int getDefaultTopMargin(final String htmlElementName) {
return getAbstractBlockElementHandler(htmlElementName.toLowerCase()).getTopMargin();
}
/**
* Sets the default bottom margin of an HTML block element with the specified name.
*
* The bottom margin is the number of blank lines that are to be inserted below the rendered block.
*
* As this is a static method, the setting affects all instances of the Renderer
class.
*
* The htmlElementName
argument must be one of the following:
* {@link HTMLElementName#ADDRESS ADDRESS},
* {@link HTMLElementName#BLOCKQUOTE BLOCKQUOTE},
* {@link HTMLElementName#CAPTION CAPTION},
* {@link HTMLElementName#CENTER CENTER},
* {@link HTMLElementName#DD DD},
* {@link HTMLElementName#DIR DIR},
* {@link HTMLElementName#DIV DIV},
* {@link HTMLElementName#DT DT},
* {@link HTMLElementName#FIELDSET FIELDSET},
* {@link HTMLElementName#FORM FORM},
* {@link HTMLElementName#H1 H1},
* {@link HTMLElementName#H2 H2},
* {@link HTMLElementName#H3 H3},
* {@link HTMLElementName#H4 H4},
* {@link HTMLElementName#H5 H5},
* {@link HTMLElementName#H6 H6},
* {@link HTMLElementName#HR HR},
* {@link HTMLElementName#LEGEND LEGEND},
* {@link HTMLElementName#LI LI},
* {@link HTMLElementName#MENU MENU},
* {@link HTMLElementName#OL OL},
* {@link HTMLElementName#P P},
* {@link HTMLElementName#PRE PRE},
* {@link HTMLElementName#TR TR},
* {@link HTMLElementName#UL UL}
*
* @param htmlElementName (required) the case insensitive name of a supported HTML block element.
* @param bottomMargin the new bottom margin of the specified element.
* @throws UnsupportedOperationException if an unsupported element name is specified.
*/
public static void setDefaultBottomMargin(String htmlElementName, final int bottomMargin) {
htmlElementName=HTMLElements.getConstantElementName(htmlElementName.toLowerCase());
ELEMENT_HANDLERS.put(htmlElementName,getAbstractBlockElementHandler(htmlElementName).newBottomMargin(bottomMargin));
}
/**
* Returns the default bottom margin of an HTML block element with the specified name.
*
* See the {@link #setDefaultBottomMargin(String htmlElementName, int bottomMargin)} method for a full description of this property.
*
* @param htmlElementName (required) the case insensitive name of a supported HTML block element.
* @return the default bottom margin of an HTML block element with the specified name.
* @throws UnsupportedOperationException if an unsupported element name is specified.
*/
public static int getDefaultBottomMargin(final String htmlElementName) {
return getAbstractBlockElementHandler(htmlElementName.toLowerCase()).getBottomMargin();
}
/**
* Sets the default value of whether an HTML block element of the specified name is indented.
*
* As this is a static method, the setting affects all instances of the Renderer
class.
*
* The htmlElementName
argument must be one of the following:
* {@link HTMLElementName#ADDRESS ADDRESS},
* {@link HTMLElementName#BLOCKQUOTE BLOCKQUOTE},
* {@link HTMLElementName#CAPTION CAPTION},
* {@link HTMLElementName#CENTER CENTER},
* {@link HTMLElementName#DD DD},
* {@link HTMLElementName#DIR DIR},
* {@link HTMLElementName#DIV DIV},
* {@link HTMLElementName#DT DT},
* {@link HTMLElementName#FIELDSET FIELDSET},
* {@link HTMLElementName#FORM FORM},
* {@link HTMLElementName#H1 H1},
* {@link HTMLElementName#H2 H2},
* {@link HTMLElementName#H3 H3},
* {@link HTMLElementName#H4 H4},
* {@link HTMLElementName#H5 H5},
* {@link HTMLElementName#H6 H6},
* {@link HTMLElementName#HR HR},
* {@link HTMLElementName#LEGEND LEGEND},
* {@link HTMLElementName#MENU MENU},
* {@link HTMLElementName#OL OL},
* {@link HTMLElementName#P P},
* {@link HTMLElementName#PRE PRE},
* {@link HTMLElementName#TR TR},
* {@link HTMLElementName#UL UL}
*
* @param htmlElementName (required) the case insensitive name of a supported HTML block element.
* @param indent whether the the specified element is indented.
* @throws UnsupportedOperationException if an unsupported element name is specified.
*/
public static void setDefaultIndent(String htmlElementName, final boolean indent) {
htmlElementName=HTMLElements.getConstantElementName(htmlElementName.toLowerCase());
if (htmlElementName==HTMLElementName.LI) throw new UnsupportedOperationException();
ELEMENT_HANDLERS.put(htmlElementName,getAbstractBlockElementHandler(htmlElementName).newIndent(indent));
}
/**
* Returns the default value of whether an HTML block element of the specified name is indented.
*
* See the {@link #setDefaultIndent(String htmlElementName, boolean indent)} method for a full description of this property.
*
* @param htmlElementName (required) the case insensitive name of a supported HTML block element.
* @return the default value of whether an HTML block element of the specified name is indented.
* @throws UnsupportedOperationException if an unsupported element name is specified.
*/
public static boolean isDefaultIndent(String htmlElementName) {
htmlElementName=HTMLElements.getConstantElementName(htmlElementName.toLowerCase());
if (htmlElementName==HTMLElementName.LI) throw new UnsupportedOperationException();
return getAbstractBlockElementHandler(htmlElementName.toLowerCase()).isIndent();
}
private static AbstractBlockElementHandler getAbstractBlockElementHandler(String htmlElementName) {
ElementHandler elementHandler=ELEMENT_HANDLERS.get(htmlElementName);
if (elementHandler==null || !(elementHandler instanceof AbstractBlockElementHandler)) throw new UnsupportedOperationException("Cannot set block properties on element "+htmlElementName);
return (AbstractBlockElementHandler)elementHandler;
}
/** This class does the actual work, but is first passed final copies of all the parameters for efficiency. */
private static final class Processor {
private final Renderer renderer;
private final Segment rootSegment;
private final Source source;
private final int maxLineLength;
private final int hrLineLength;
private final String newLine;
private final boolean includeHyperlinkURLs;
private final boolean includeAlternateText;
private final boolean decorateFontStyles;
private final boolean convertNonBreakingSpaces;
private final int blockIndentSize;
private final int listIndentSize;
private final char[] listBullets;
private final String tableCellSeparator;
private Appendable appendable;
private int renderedIndex; // keeps track of where rendering is up to in case of overlapping elements
private boolean atStartOfLine;
private boolean skipInitialNewLines;
private int col;
private int listIndentLevel;
private int indentSize;
private int blockVerticalMargin; // minimum number of blank lines to output at the current block boundary, or NO_MARGIN (-1) if we are not currently at a block boundary.
private boolean preformatted;
private boolean lastCharWhiteSpace;
private final boolean ignoreInitialWhiteSpace=false; // can remove this at some stage once we're sure it won't be used.
private boolean bullet;
private int listBulletNumber;
private static final int NO_MARGIN=-1;
public Processor(final Renderer renderer, final Segment rootSegment, final int maxLineLength, final int hrLineLength, final String newLine, final boolean includeHyperlinkURLs, final boolean includeAlternateText, final boolean decorateFontStyles, final boolean convertNonBreakingSpaces, final int blockIndentSize, final int listIndentSize, final char[] listBullets, final String tableCellSeparator) {
this.renderer=renderer;
this.rootSegment=rootSegment;
source=rootSegment.source;
this.maxLineLength=maxLineLength;
this.hrLineLength=hrLineLength;
this.newLine=newLine;
this.includeHyperlinkURLs=includeHyperlinkURLs;
this.includeAlternateText=includeAlternateText;
this.decorateFontStyles=decorateFontStyles;
this.convertNonBreakingSpaces=convertNonBreakingSpaces;
this.blockIndentSize=blockIndentSize;
this.listIndentSize=listIndentSize;
this.listBullets=listBullets;
this.tableCellSeparator=tableCellSeparator;
}
public void appendTo(final Appendable appendable) throws IOException {
reset();
this.appendable=appendable;
List elements=rootSegment instanceof Element ? Collections.singletonList((Element)rootSegment) : rootSegment.getChildElements();
appendSegmentProcessingChildElements(rootSegment.begin,rootSegment.end,elements);
}
private void reset() {
renderedIndex=0;
atStartOfLine=true;
skipInitialNewLines=!renderer.includeFirstElementTopMargin;
col=0;
listIndentLevel=0;
indentSize=0;
blockVerticalMargin=NO_MARGIN;
preformatted=false;
lastCharWhiteSpace=false;
//ignoreInitialWhiteSpace=false;
bullet=false;
}
private void appendElementContent(final Element element) throws IOException {
final int contentEnd=element.getContentEnd();
if (element.isEmpty() || renderedIndex>=contentEnd) return;
final int contentBegin=element.getStartTag().end;
appendSegmentProcessingChildElements(Math.max(renderedIndex,contentBegin),contentEnd,element.getChildElements());
}
private void appendSegmentProcessingChildElements(final int begin, final int end, final List childElements) throws IOException {
int index=begin;
for (Element childElement : childElements) {
if (index>=childElement.end) continue;
if (index=end) break;
appendSegment(index,tag.begin);
index=tag.end;
}
appendSegment(index,end);
}
private void appendSegment(int begin, final int end) throws IOException {
assert begin<=end;
if (begin=end) return;
try {
if (preformatted)
appendPreformattedSegment(begin,end);
else
appendNonPreformattedSegment(begin,end);
} finally {
if (renderedIndex=renderedIndex;
if (isBlockBoundary()) appendBlockVerticalMargin();
final String text=CharacterReference.decode(source.subSequence(begin,end),false,convertNonBreakingSpaces);
for (int i=0; i=renderedIndex;
final String text=CharacterReference.decodeCollapseWhiteSpace(source.subSequence(begin,end),convertNonBreakingSpaces);
if (text.length()==0) {
// collapsed text is zero length but original segment wasn't, meaning it consists purely of white space.
if (!ignoreInitialWhiteSpace) lastCharWhiteSpace=true;
return;
}
appendNonPreformattedText(text,Segment.isWhiteSpace(source.charAt(begin)),Segment.isWhiteSpace(source.charAt(end-1)));
}
private void appendText(final String text) throws IOException {
assert text.length()>0;
appendNonPreformattedText(text,Segment.isWhiteSpace(text.charAt(0)),Segment.isWhiteSpace(text.charAt(text.length()-1)));
}
private void appendNonPreformattedText(final String text, final boolean isWhiteSpaceAtStart, final boolean isWhiteSpaceAtEnd) throws IOException {
if (isBlockBoundary()) {
appendBlockVerticalMargin();
} else if (lastCharWhiteSpace || (isWhiteSpaceAtStart && !ignoreInitialWhiteSpace)) {
// output white space only if not on a block boundary
append(' ');
}
int textIndex=0;
int i=0;
lastCharWhiteSpace=false;
//ignoreInitialWhiteSpace=false;
while (true) {
for (; i" or "From ".
if (i+1') continue;
if (i+60 && col+i-textIndex+1>=maxLineLength) {
if (lastCharWhiteSpace && (listIndentLevel|indentSize)==0) append(' ');
startNewLine(0);
} else if (lastCharWhiteSpace) {
append(' ');
}
append(text,textIndex,i);
if (i==text.length()) break;
lastCharWhiteSpace=true;
textIndex=++i;
}
lastCharWhiteSpace=isWhiteSpaceAtEnd;
}
private boolean isBlockBoundary() {
return blockVerticalMargin!=NO_MARGIN;
}
private void appendBlockVerticalMargin() throws IOException {
assert blockVerticalMargin!=NO_MARGIN;
if (skipInitialNewLines) {
// at first text after element or start of document
skipInitialNewLines=false;
final int indentCol=indentSize+listIndentLevel*listIndentSize;
if (col==indentCol) {
atStartOfLine=false; // no need to call appendIndent() from appendTextInit().
} else {
// there was an indenting block since the or start of document
if (bullet || col>indentCol) {
// just start new line as normal if the last indenting block is another , or if the current column is already past the required indent
startNewLine(0);
} else {
// just append spaces to get the column up to the required indent
while (indentCol>col) {
appendable.append(' ');
col++;
}
atStartOfLine=false; // make sure appendIndent() isn't called again from appendTextInit()
}
}
} else {
startNewLine(blockVerticalMargin);
}
blockVerticalMargin=NO_MARGIN;
}
private void blockBoundary(final int verticalMargin) throws IOException {
// Set a block boundary with the given vertical margin. The vertical margin is the minimum number of blank lines to output between the blocks.
// This method can be called multiple times at a block boundary, and the next textual output will output the number of blank lines determined by the
// maximum vertical margin of all the method calls.
if (blockVerticalMargin0; i--) appendable.append(' ');
if (bullet) {
for (int i=(listIndentLevel-1)*listIndentSize; i>0; i--) appendable.append(' ');
if (listBulletNumber==UNORDERED_LIST) {
for (int i=listIndentSize-2; i>0; i--) appendable.append(' ');
appendable.append(listBullets[(listIndentLevel-1)%listBullets.length]).append(' ');
} else {
String bulletNumberString=Integer.toString(listBulletNumber);
for (int i=listIndentSize-bulletNumberString.length()-2; i>0; i--) appendable.append(' ');
appendable.append(bulletNumberString).append(". ");
}
bullet=false;
} else {
for (int i=listIndentLevel*listIndentSize; i>0; i--) appendable.append(' ');
}
col=indentSize+listIndentLevel*listIndentSize;
atStartOfLine=false;
}
private Processor append(final char ch) throws IOException {
appendTextInit();
appendable.append(ch);
col++;
return this;
}
private Processor append(final String text) throws IOException {
appendTextInit();
appendable.append(text);
col+=text.length();
return this;
}
private void append(final CharSequence text, final int begin, final int end) throws IOException {
appendTextInit();
for (int i=begin; ix.maxLineLength-x.indentSize) leftMargin=x.maxLineLength-x.indentSize;
x.indentSize+=leftMargin;
processBlockContent(x,element);
x.indentSize-=leftMargin;
x.blockBoundary(RendererCSS.getBottomMargin(element,bottomMargin));
}
public AbstractBlockElementHandler newTopMargin(int topMargin) {
return newInstance(topMargin,this.bottomMargin,this.indent);
}
public int getTopMargin() {
return topMargin;
}
public AbstractBlockElementHandler newBottomMargin(int bottomMargin) {
return newInstance(this.topMargin,bottomMargin,this.indent);
}
public int getBottomMargin() {
return bottomMargin;
}
public AbstractBlockElementHandler newIndent(boolean indent) {
return newInstance(this.topMargin,this.bottomMargin,indent);
}
public boolean isIndent() {
return indent;
}
abstract protected void processBlockContent(Processor x, Element element) throws IOException;
abstract protected AbstractBlockElementHandler newInstance(int topMargin, int bottomMargin, boolean indent);
}
private static final class StandardBlockElementHandler extends AbstractBlockElementHandler {
public static final ElementHandler INSTANCE_0_0=new StandardBlockElementHandler(0,0,false);
public static final ElementHandler INSTANCE_1_1=new StandardBlockElementHandler(1,1,false);
public static final ElementHandler INSTANCE_2_1=new StandardBlockElementHandler(2,1,false);
public static final ElementHandler INSTANCE_0_0_INDENT=new StandardBlockElementHandler(0,0,true);
public static final ElementHandler INSTANCE_1_1_INDENT=new StandardBlockElementHandler(1,1,true);
private StandardBlockElementHandler(int topMargin, int bottomMargin, boolean indent) {
super(topMargin,bottomMargin,indent);
}
protected void processBlockContent(Processor x, Element element) throws IOException {
x.appendElementContent(element);
}
protected AbstractBlockElementHandler newInstance(int topMargin, int bottomMargin, boolean indent) {
return new StandardBlockElementHandler(topMargin,bottomMargin,indent);
}
}
private static final class A_ElementHandler implements ElementHandler {
public static final ElementHandler INSTANCE=new A_ElementHandler();
public void process(Processor x, Element element) throws IOException {
if (!x.includeHyperlinkURLs) {
x.appendElementContent(element);
return;
}
String renderedHyperlinkURL=x.renderer.renderHyperlinkURL(element.getStartTag());
if (renderedHyperlinkURL==null) {
x.appendElementContent(element);
return;
}
String href=element.getAttributeValue("href");
final boolean displayContent=href==null || !getInformalURL(href).equals(getInformalURL(element.getContent().toString())); // only display the content if it is not the same as the URL
int linkLength=renderedHyperlinkURL.length();
if (displayContent) {
x.appendElementContent(element);
linkLength++; // allow for space after content
}
if (x.maxLineLength>0 && x.col+linkLength>=x.maxLineLength) {
x.startNewLine(0);
} else if (displayContent) {
x.append(' ');
}
x.append(renderedHyperlinkURL);
x.lastCharWhiteSpace=true;
}
}
private static final String getInformalURL(String url) {
if (url.startsWith("http://")) url=url.substring(7);
if (url.endsWith("/")) url=url.substring(0,url.length()-1);
return url;
}
private static final class BR_ElementHandler implements ElementHandler {
public static final ElementHandler INSTANCE=new BR_ElementHandler();
public void process(Processor x, Element element) throws IOException {
if (x.isBlockBoundary() && !x.atStartOfLine && !x.skipInitialNewLines) x.newLine(); // add an extra new line if we're at a block boundary and aren't already at the start of the next line and it's not the first element after
x.newLine();
x.blockBoundary(0);
}
}
private static final class HR_ElementHandler extends AbstractBlockElementHandler {
public static final ElementHandler INSTANCE=new HR_ElementHandler();
private HR_ElementHandler() {
this(0,0,false);
}
private HR_ElementHandler(int topMargin, int bottomMargin, boolean indent) {
super(topMargin,bottomMargin,indent);
}
protected void processBlockContent(Processor x, Element element) throws IOException {
x.appendBlockVerticalMargin();
if (x.hrLineLength>0) {
x.append('-');
for (int i=x.col; i