// net.htmlparser.jericho.SourceFormatter Maven / Gradle / Ivy
// Show all versions of jericho-html Show documentation
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 3.4
// Copyright (C) 2004-2013 Martin Jericho
// http://jericho.htmlparser.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of either one of the following licences:
//
// 1. The Eclipse Public License (EPL) version 1.0,
// included in this distribution in the file licence-epl-1.0.html
// or available at http://www.eclipse.org/legal/epl-v10.html
//
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
// included in this distribution in the file licence-lgpl-2.1.txt
// or available at http://www.gnu.org/licenses/lgpl.txt
//
// 3. The Apache License version 2.0,
// included in this distribution in the file licence-apache-2.0.html
// or available at http://www.apache.org/licenses/LICENSE-2.0.html
//
// This library is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the individual licence texts for more details.
package net.htmlparser.jericho;
import java.util.*;
import java.io.*;
import java.net.*;
/**
 * Formats HTML source by laying out each non-inline-level element on a new line with an appropriate indent.
 * <p>
 * Any indentation present in the original source text is removed.
 * <p>
 * Use one of the following methods to obtain the output:
 * <ul>
 *  <li>{@link #writeTo(Writer)}
 *  <li>{@link #appendTo(Appendable)}
 *  <li>{@link #toString()}
 *  <li>{@link CharStreamSourceUtil#getReader(CharStreamSource) CharStreamSourceUtil.getReader(this)}
 * </ul>
 * <p>
 * The output text is functionally equivalent to the original source and should be rendered identically unless specified below.
 * <p>
 * The following points describe the process in general terms.
 * Any aspect of the algorithm not specifically mentioned here is subject to change without notice in future versions.
 * <ul>
 *  <li>Every element that is not an {@linkplain HTMLElements#getInlineLevelElementNames() inline-level element} appears on a new line
 *   with an indent corresponding to its {@linkplain Element#getDepth() depth} in the document element hierarchy.
 *  <li>The indent is formed by writing <i>n</i> repetitions of the string specified in the {@link #setIndentString(String) IndentString} property,
 *   where <i>n</i> is the depth of the indentation.
 *  <li>The {@linkplain Element#getContent() content} of an indented element starts on a new line and is indented at a depth one greater than that of the element,
 *   with the end tag appearing on a new line at the same depth as the start tag.
 *   If the content contains only text and {@linkplain HTMLElements#getInlineLevelElementNames() inline-level elements},
 *   it may continue on the same line as the start tag. Additionally, if the output content contains no new lines, the end tag may also continue on the same line.
 *  <li>The content of preformatted elements such as {@link HTMLElementName#PRE PRE} and {@link HTMLElementName#TEXTAREA TEXTAREA} are not indented,
 *   nor is the white space modified in any way.
 *  <li>Only {@linkplain StartTagType#NORMAL normal} and {@linkplain StartTagType#DOCTYPE_DECLARATION document type declaration} elements are indented.
 *   All others are treated as {@linkplain HTMLElements#getInlineLevelElementNames() inline-level elements}.
 *  <li>White space and indentation inside HTML {@linkplain StartTagType#COMMENT comments}, {@linkplain StartTagType#CDATA_SECTION CDATA sections}, or any
 *   {@linkplain TagType#isServerTag() server tag} is preserved,
 *   but with the indentation of new lines starting at a depth one greater than that of the surrounding text.
 *  <li>White space and indentation inside {@link HTMLElementName#SCRIPT SCRIPT} elements is preserved,
 *   but with the indentation of new lines starting at a depth one greater than that of the <code>SCRIPT</code> element.
 *  <li>If the {@link #setTidyTags(boolean) TidyTags} property is set to <code>true</code>,
 *   every tag in the document is replaced with the output from its {@link Tag#tidy()} method.
 *   If this property is set to <code>false</code>, the tag from the original text is used, including all white space,
 *   but with any new lines indented at a depth one greater than that of the element.
 *  <li>If the {@link #setCollapseWhiteSpace(boolean) CollapseWhiteSpace} property
 *   is set to <code>true</code>, every string of one or more {@linkplain Segment#isWhiteSpace(char) white space} characters
 *   located outside of a tag is replaced with a single space in the output.
 *   White space located adjacent to a non-inline-level element tag (except {@linkplain TagType#isServerTag() server tags}) may be removed.
 *  <li>If the {@link #setIndentAllElements(boolean) IndentAllElements} property
 *   is set to <code>true</code>, every element appears indented on a new line, including {@linkplain HTMLElements#getInlineLevelElementNames() inline-level elements}.
 *   This generates output that is a good representation of the actual document element hierarchy,
 *   but is very likely to introduce white space that compromises the functional equivalency of the document.
 *  <li>The {@link #setNewLine(String) NewLine} property specifies the character sequence
 *   to use for each newline in the output document.
 *  <li>If the source document contains {@linkplain TagType#isServerTag() server tags}, the functional equivalency of the output document may be compromised.
 * </ul>
 * <p>
 * Formatting an entire {@link Source} object performs a {@linkplain Source#fullSequentialParse() full sequential parse} automatically.
 */
public final class SourceFormatter implements CharStreamSource {
/** The segment whose HTML is to be formatted. */
private final Segment segment;

/** String repeated once per depth level to build an indent; a single tab unless changed. */
private String indentString = "\t";

// The boolean options below all start out as false (the Java default for boolean fields).
private boolean tidyTags;
private boolean collapseWhiteSpace;
private boolean removeLineBreaks;
private boolean indentAllElements;

/** Newline string for the output; null until explicitly set or lazily resolved by getNewLine(). */
private String newLine;
/**
 * Constructs a new <code>SourceFormatter</code> based on the specified {@link Segment}.
 *
 * @param segment  the segment containing the HTML to be formatted.
 * @see Source#getSourceFormatter()
 */
public SourceFormatter(final Segment segment) {
    // The segment is only stored here; no parsing or formatting happens until output is requested.
    this.segment = segment;
}
// Documentation inherited from CharStreamSource
// Writes the formatted output to the writer and then flushes it.
public void writeTo(final Writer writer) throws IOException {
    this.appendTo(writer);
    writer.flush();
}
// Documentation inherited from CharStreamSource
// Snapshots the current property values into a Processor, which does the actual formatting.
public void appendTo(final Appendable appendable) throws IOException {
    final Processor processor = new Processor(
        segment,
        getIndentString(),
        getTidyTags(),
        getCollapseWhiteSpace(),
        getRemoveLineBreaks(),
        getIndentAllElements(),
        getIndentAllElements(), // indentScriptElements: currently tied to the IndentAllElements property
        getNewLine()
    );
    processor.appendTo(appendable);
}
// Documentation inherited from CharStreamSource
// The estimate is twice the length of the segment being formatted.
public long getEstimatedMaximumOutputLength() {
    // Multiply as long: an int multiplication would overflow (and go negative) for
    // segments longer than Integer.MAX_VALUE/2 characters before being widened.
    return 2L*segment.length();
}
// Documentation inherited from CharStreamSource
// Delegates to CharStreamSourceUtil to render this formatter's output as a string.
public String toString() {
    final String formatted = CharStreamSourceUtil.toString(this);
    return formatted;
}
/**
 * Sets the string to be used for indentation.
 * <p>
 * The default value is a string containing a single tab character (U+0009).
 * <p>
 * The most commonly used indent strings are {@code "\t"} (single tab), {@code " "} (single space),
 * a string of 2 spaces, and a string of 4 spaces.
 *
 * @param indentString  the string to be used for indentation, must not be <code>null</code>.
 * @return this <code>SourceFormatter</code> instance, allowing multiple property setting methods to be chained in a single statement.
 * @throws IllegalArgumentException if <code>indentString</code> is <code>null</code>.
 * @see #getIndentString()
 */
public SourceFormatter setIndentString(final String indentString) {
    if (indentString == null) {
        throw new IllegalArgumentException("indentString property must not be null");
    }
    this.indentString = indentString;
    return this;
}
/**
 * Returns the string to be used for indentation.
 * <p>
 * See the {@link #setIndentString(String)} method for a full description of this property.
 *
 * @return the string to be used for indentation.
 */
public String getIndentString() {
    return this.indentString;
}
/**
 * Sets whether the original text of each tag is to be replaced with the output from its {@link Tag#tidy()} method.
 * <p>
 * The default value is <code>false</code>.
 * <p>
 * If this property is set to <code>false</code>, the tag from the original text is used, including all white space,
 * but with any new lines indented at a depth one greater than that of the element.
 *
 * @param tidyTags  specifies whether the original text of each tag is to be replaced with the output from its {@link Tag#tidy()} method.
 * @return this <code>SourceFormatter</code> instance, allowing multiple property setting methods to be chained in a single statement.
 * @see #getTidyTags()
 */
public SourceFormatter setTidyTags(final boolean tidyTags) {
    this.tidyTags = tidyTags;
    return this;
}
/**
 * Indicates whether the original text of each tag is to be replaced with the output from its {@link Tag#tidy()} method.
 * <p>
 * See the {@link #setTidyTags(boolean)} method for a full description of this property.
 *
 * @return <code>true</code> if the original text of each tag is to be replaced with the output from its {@link Tag#tidy()} method, otherwise <code>false</code>.
 */
public boolean getTidyTags() {
    return this.tidyTags;
}
/**
 * Sets whether {@linkplain Segment#isWhiteSpace(char) white space} in the text between the tags is to be collapsed.
 * <p>
 * The default value is <code>false</code>.
 * <p>
 * If this property is set to <code>true</code>, every string of one or more {@linkplain Segment#isWhiteSpace(char) white space} characters
 * located outside of a tag is replaced with a single space in the output.
 * White space located adjacent to a non-inline-level element tag (except {@linkplain TagType#isServerTag() server tags}) may be removed.
 *
 * @param collapseWhiteSpace  specifies whether {@linkplain Segment#isWhiteSpace(char) white space} in the text between the tags is to be collapsed.
 * @return this <code>SourceFormatter</code> instance, allowing multiple property setting methods to be chained in a single statement.
 * @see #getCollapseWhiteSpace()
 */
public SourceFormatter setCollapseWhiteSpace(final boolean collapseWhiteSpace) {
    this.collapseWhiteSpace = collapseWhiteSpace;
    return this;
}
/**
 * Indicates whether {@linkplain Segment#isWhiteSpace(char) white space} in the text between the tags is to be collapsed.
 * <p>
 * See the {@link #setCollapseWhiteSpace(boolean)} method for a full description of this property.
 *
 * @return <code>true</code> if {@linkplain Segment#isWhiteSpace(char) white space} in the text between the tags is to be collapsed, otherwise <code>false</code>.
 */
public boolean getCollapseWhiteSpace() {
    return this.collapseWhiteSpace;
}
/**
 * Sets whether all non-essential line breaks are removed.
 * <p>
 * The default value is <code>false</code>.
 * <p>
 * If this property is set to <code>true</code>, only essential line breaks are retained in the output.
 * <p>
 * Setting this property automatically engages the {@link #setCollapseWhiteSpace(boolean) CollapseWhiteSpace} option, regardless of its property setting.
 * <p>
 * It is recommended to set the {@link #setTidyTags(boolean) TidyTags} property when this option is used so that non-essential line breaks are also removed from tags.
 *
 * @param removeLineBreaks  specifies whether all non-essential line breaks are removed.
 * @return this <code>SourceFormatter</code> instance, allowing multiple property setting methods to be chained in a single statement.
 * @see #getRemoveLineBreaks()
 */
SourceFormatter setRemoveLineBreaks(final boolean removeLineBreaks) {
    this.removeLineBreaks = removeLineBreaks;
    return this;
}
/**
 * Indicates whether all non-essential line breaks are removed.
 * <p>
 * See the {@link #setRemoveLineBreaks(boolean)} method for a full description of this property.
 *
 * @return <code>true</code> if all non-essential line breaks are removed, otherwise <code>false</code>.
 */
boolean getRemoveLineBreaks() {
    return this.removeLineBreaks;
}
/**
 * Sets whether all elements are to be indented, including {@linkplain HTMLElements#getInlineLevelElementNames() inline-level elements} and those with preformatted contents.
 * <p>
 * The default value is <code>false</code>.
 * <p>
 * If this property is set to <code>true</code>, every element appears indented on a new line, including
 * {@linkplain HTMLElements#getInlineLevelElementNames() inline-level elements}.
 * <p>
 * This generates output that is a good representation of the actual document element hierarchy,
 * but is very likely to introduce white space that compromises the functional equivalency of the document.
 *
 * @param indentAllElements  specifies whether all elements are to be indented.
 * @return this <code>SourceFormatter</code> instance, allowing multiple property setting methods to be chained in a single statement.
 * @see #getIndentAllElements()
 */
public SourceFormatter setIndentAllElements(final boolean indentAllElements) {
    this.indentAllElements = indentAllElements;
    return this;
}
/**
 * Indicates whether all elements are to be indented, including {@linkplain HTMLElements#getInlineLevelElementNames() inline-level elements} and those with preformatted contents.
 * <p>
 * See the {@link #setIndentAllElements(boolean)} method for a full description of this property.
 *
 * @return <code>true</code> if all elements are to be indented, otherwise <code>false</code>.
 */
public boolean getIndentAllElements() {
    return this.indentAllElements;
}
/**
 * Sets the string to be used to represent a newline in the output.
 * <p>
 * The default is to use the same new line string as is used in the source document, which is determined via the {@link Source#getNewLine()} method.
 * If the source document does not contain any new lines, a "best guess" is made by either taking the new line string of a previously parsed document,
 * or using the value from the static {@link Config#NewLine} property.
 * <p>
 * Specifying a <code>null</code> argument resets the property to its default value, which is to use the same new line string as is used in the source document.
 *
 * @param newLine  the string to be used to represent a newline in the output, may be <code>null</code>.
 * @return this <code>SourceFormatter</code> instance, allowing multiple property setting methods to be chained in a single statement.
 * @see #getNewLine()
 */
public SourceFormatter setNewLine(final String newLine) {
    this.newLine = newLine;
    return this;
}
/**
 * Returns the string to be used to represent a newline in the output.
 * <p>
 * See the {@link #setNewLine(String)} method for a full description of this property.
 *
 * @return the string to be used to represent a newline in the output.
 */
public String getNewLine() {
    if (newLine == null) {
        // No explicit value set: resolve the source's best guess and remember it in the field.
        newLine = segment.source.getBestGuessNewLine();
    }
    return newLine;
}
/** This class does the actual work, but is first passed final copies of all the parameters for efficiency. */
private static final class Processor {
// --- immutable configuration, copied from the enclosing formatter's properties at construction ---
private final Segment segment;
private final CharSequence sourceText; // text of the segment's source, read by character index
private final String indentString;
private final boolean tidyTags;
private final boolean collapseWhiteSpace; // also true whenever removeLineBreaks is true (see constructor)
private final boolean removeLineBreaks; // Indicates whether all non-essential line breaks are removed. Must be used with collapseWhiteSpace=true.
private final boolean indentAllElements;
private final boolean indentScriptElements; // at present this parameter is tied to indentAllElements. SCRIPT elements need to be inline to keep functional equivalency of output
private final String newLine;
// --- mutable state used while producing output ---
private Appendable appendable; // output destination, set by appendTo
private Tag nextTag; // tag cursor; updateNextTag advances it so that it does not lie before index
private int index; // current position in sourceText
/**
 * Captures the formatting options for a single formatting run.
 * White space collapsing is forced on whenever removeLineBreaks is requested.
 */
public Processor(final Segment segment, final String indentString, final boolean tidyTags, final boolean collapseWhiteSpace, final boolean removeLineBreaks, final boolean indentAllElements, final boolean indentScriptElements, final String newLine) {
    this.segment = segment;
    this.sourceText = segment.source.toString();
    this.indentString = indentString;
    this.newLine = newLine;
    this.tidyTags = tidyTags;
    this.removeLineBreaks = removeLineBreaks;
    // removing line breaks only works on collapsed white space, so it implies collapsing
    this.collapseWhiteSpace = removeLineBreaks || collapseWhiteSpace;
    this.indentAllElements = indentAllElements;
    this.indentScriptElements = indentScriptElements;
}
/**
 * Formats the segment and writes the result to the given destination.
 * When the segment is an entire Source, a full sequential parse is performed first.
 */
public void appendTo(final Appendable appendable) throws IOException {
    this.appendable = appendable;
    if (segment instanceof Source) {
        final Source wholeSource = (Source)segment;
        wholeSource.fullSequentialParse();
    }
    // start both cursors at the beginning of the segment
    index = segment.begin;
    nextTag = segment.source.getNextTag(segment.begin);
    appendContent(segment.end, segment.getChildElements(), 0);
}
/**
 * Appends the content between the current index and <code>end</code>, laying out each
 * child element that is not inlined on its own line and emitting the text between them.
 *
 * @param end  position at which the content stops; child elements beginning at or after it are ignored.
 * @param childElements  the child elements of the content being written (typed as List&lt;Element&gt; so the loop below compiles).
 * @param depth  indentation depth for the content.
 */
private void appendContent(final int end, final List<Element> childElements, final int depth) throws IOException {
    assert index<=end;
    for (Element element : childElements) {
        final int elementBegin=element.begin;
        if (elementBegin>=end) break;
        if (indentAllElements) {
            // every element gets its own line, nothing is treated as preformatted or inline
            appendText(elementBegin,depth);
            appendElement(element,depth,end,false,false);
        } else {
            if (inlinable(element)) continue; // skip over elements that can be inlined.
            appendText(elementBegin,depth);
            final String elementName=element.getName();
            // identity comparison against the HTMLElementName constants, as elsewhere in this class
            if (elementName==HTMLElementName.PRE || elementName==HTMLElementName.TEXTAREA) {
                appendElement(element,depth,end,true,true);
            } else if (elementName==HTMLElementName.SCRIPT) {
                appendElement(element,depth,end,true,false);
            } else {
                appendElement(element,depth,end,false,!removeLineBreaks && containsOnlyInlineLevelChildElements(element));
            }
        }
    }
    // trailing text after the last handled child element
    appendText(end,depth);
    assert index==end;
}
/**
 * Decides whether the specified element should be inlined rather than placed on its own line.
 *
 * @param element  the element to test.
 * @return true if the element should be inlined, otherwise false.
 */
private boolean inlinable(final Element element) {
    // Anything that is not a normal element is inlined. DOCTYPE declarations used to be excluded here,
    // but that caused an extra line break if the DOCTYPE is preceded by a server tag.
    if (element.getStartTag().getStartTagType() != StartTagType.NORMAL) return true;

    final String name = element.getName();
    if (name == HTMLElementName.SCRIPT) return !indentScriptElements;

    // inline non-HTML elements if removing line breaks
    if (removeLineBreaks && !HTMLElements.getElementNames().contains(name)) return true;

    final boolean inlineLevel = HTMLElements.getInlineLevelElementNames().contains(name);
    if (!inlineLevel) return false;

    // TEXTAREA is theoretically inlinable but we want to format its content in the same way as PRE,
    // and this is easiest when the entire element is treated like a block PRE element.
    if (name == HTMLElementName.TEXTAREA) return false;

    // otherwise only inline if it doesn't illegally contain non-inline elements
    return removeLineBreaks || containsOnlyInlineLevelChildElements(element);
}
/**
 * Appends the text between the current index and <code>end</code> as an indented line,
 * after skipping leading white space. Nothing is written if only white space remains.
 */
private void appendText(final int end, int depth) throws IOException {
    assert index <= end;
    // trim leading whitespace
    while (index != end && Segment.isWhiteSpace(sourceText.charAt(index))) {
        index++;
    }
    if (index == end) {
        return;
    }
    appendIndent(depth);
    if (!collapseWhiteSpace) {
        appendTextInline(end, depth, false);
    } else {
        appendTextCollapseWhiteSpace(end, depth);
    }
    appendFormattingNewLine();
    assert index == end;
}
// NOTE(review): the text of this method is damaged in this copy. The second statement
// ("assert indexendTag.begin) {") is not valid Java, and the statements that should sit
// between the opening assertions and the end-tag handling are missing. Restore the body
// from the upstream source before compiling; only the surviving tail is annotated below.
private void appendElement(final Element element, final int depth, final int end, final boolean preformatted, boolean renderContentInline) throws IOException {
assert index==element.begin;
assert indexendTag.begin) {
if (!renderContentInline) appendIndent(depth); // indent is written before the end tag only when the content was not rendered inline
assert index==endTag.begin;
appendTag(endTag,depth,end);
appendFormattingNewLine();
} else if (renderContentInline) {
appendFormattingNewLine(); // end tag not written in this branch; still finish the line for inline-rendered content
}
assert index==Math.min(element.end,end) : index;
}
/**
 * Ensures that nextTag is up to date by advancing it past every tag that begins before
 * the current index; nextTag ends up null when no such tag remains.
 */
private void updateNextTag() {
    while (nextTag != null && nextTag.begin < index) {
        nextTag = nextTag.getNextTag();
    }
}
private void appendIndentedScriptContent(final int end, final int depth) throws IOException {
assert index=textLength) return; // trim whitespace.
appendEssentialNewLine();
appendIndent(subsequentLineDepth);
i=appendSpecifiedLine(text,i);
} while (i=textLength) return i;
}
}
private boolean appendTextInline(final int end, int depth, final boolean increaseIndentAfterFirstLineBreak) throws IOException {
// returns true if all text was on one line, otherwise false
assert index=end) {
assert index<=end;
return;
}
if (!singleLineContent) {
appendEssentialNewLine(); // some server or client side scripting languages might need the final new line
appendIndent(depth);
}
assert index==endTag.begin;
appendTag(endTag,depth,end);
}
assert index<=end;
}
private void appendIndent(final int depth) throws IOException {
if (!removeLineBreaks) for (int x=0; x childElements=element.getChildElements();
if (childElements.isEmpty()) return true;
for (Element childElement : childElements) {
final String elementName=childElement.getName();
if (elementName==HTMLElementName.SCRIPT || !HTMLElements.getInlineLevelElementNames().contains(elementName)) return false;
if (!containsOnlyInlineLevelChildElements(childElement)) return false;
}
return true;
}
}
}