// net.sf.saxon.serialize.HTMLIndenter Maven / Gradle / Ivy
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of Saxon-HE Show documentation
// Show all versions of Saxon-HE Show documentation
// The XSLT and XQuery Processor
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2018-2023 Saxonica Limited
// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
package net.sf.saxon.serialize;
import net.sf.saxon.event.ProxyReceiver;
import net.sf.saxon.event.Receiver;
import net.sf.saxon.event.ReceiverOption;
import net.sf.saxon.expr.parser.Loc;
import net.sf.saxon.lib.SaxonOutputKeys;
import net.sf.saxon.om.AttributeMap;
import net.sf.saxon.om.FingerprintedQName;
import net.sf.saxon.om.NamespaceMap;
import net.sf.saxon.om.NodeName;
import net.sf.saxon.s9api.Location;
import net.sf.saxon.str.IndentWhitespace;
import net.sf.saxon.str.UnicodeString;
import net.sf.saxon.trans.XPathException;
import net.sf.saxon.type.SchemaType;
import net.sf.saxon.z.IntIterator;
import java.util.*;
/**
 * HTMLIndenter: This ProxyReceiver indents HTML elements, by adding whitespace
 * character data where appropriate.
 * The character data is never added when within an inline element.
 * The indentation defaults to three spaces per nesting level.
 */
public class HTMLIndenter extends ProxyReceiver {

    // TODO: some of the logic in this class is probably redundant, e.g. the "sameLine" flag. However,
    // indentation is under-tested in the W3C test suites, so it's safest to avoid unnecessary changes.

    // The specification has complex rules for deciding whether something is an inline element and whether
    // indentation is suppressed: the rules for matching names depend on whether it's HTML or XHTML, and which
    // version. But since the rules are of the form: "if the name matches X, then indentation is not allowed,
    // otherwise indentation is allowed but not required", there is no harm in having spurious matches. So we
    // simply do a case-blind match on the local part of the name, which catches all cases where indentation
    // is not allowed, and is very unlikely to upset anyone by not indenting things that we could have indented.

    // We make one exception (see bug 3877): we don't treat "link" as an inline element under any circumstances,
    // though HTML5 has some complex rules that treat it as a phrasal element under some conditions, based on the
    // value of the "rel" attribute.

    // Elements inside which no whitespace is added and text is not re-wrapped.
    // "xmp" is obsolete but still encountered!
    private static final String[] formattedTags = {"pre", "script", "style", "textarea", "title", "xmp"};

    // When elements are classified as inline, indenting whitespace is not added adjacent to the element.
    // See Saxon bug 3839 and W3C bug 30276. We use a list of inline elements that is the union of
    // the HTML4 and HTML5 lists, on the basis that no harm is done treating an element as inline
    // even if the spec doesn't require us to do so. This also means we include elements such as
    // "ins", "del", and "area" that are sometimes inline and sometimes not.
    private static final String[] inlineTags = {
            "a", "abbr", "acronym", "applet", "area",
            "audio", "b", "basefont", "bdi", "bdo", "big", "br", "button", "canvas", "cite", "code", "data",
            "datalist", "del", "dfn", "em", "embed", "font", "i", "iframe", "img", "input", "ins",
            "kbd", "label", /*"link" -- excluded, see bug 3877,*/ "map",
            "mark", "math", "meter", "noscript", "object", "output", "picture",
            "progress", "q", "ruby", "s", "samp", "script", "select", "small", "span",
            "strike", "strong", "sub", "sup", "svg", "template", "textarea",
            "time", "tt", "u", "var", "video", "wbr"};

    // Lookup tables built once from the arrays above; keys are lower-case local names.
    private static final Set<String> inlineTable = new HashSet<>(70);
    private static final Set<String> formattedTable = new HashSet<>(10);

    static {
        Collections.addAll(inlineTable, inlineTags);
        Collections.addAll(formattedTable, formattedTags);
    }

    // Not referenced by this class's indent() any more; kept because it is protected and
    // code outside this class may still refer to it.
    protected char[] indentChars = {'\n', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '};

    // Bit flags returned by classifyTag() and stored per open element in propertyStack.
    private static final int IS_INLINE = 1;
    private static final int IS_FORMATTED = 2;
    private static final int IS_SUPPRESSED = 4;

    // The serialization method name passed to the constructor (recorded only; not read by this class).
    private final String method;
    // Number of currently open elements.
    private int level = 0;
    // True from a start tag until the next end tag, line break in text, or indent() call.
    private boolean sameLine = false;
    // True while inside an element classified as IS_FORMATTED.
    private boolean inFormattedTag = false;
    // True when the immediately preceding event was the end tag of an inline element.
    private boolean afterInline = false;
    // True when the immediately preceding event was an end tag.
    private boolean afterEndElement = false;
    // propertyStack[n] holds the flags of the open element at depth n; grown on demand.
    private int[] propertyStack = new int[20];
    // Lower-case local names from the suppress-indentation property, or null if none were set.
    private Set<String> suppressed = null;

    /**
     * Create an HTML indenter.
     *
     * @param next   the receiver to which events (plus added indentation) are passed
     * @param method the serialization method name supplied by the caller
     */
    public HTMLIndenter(Receiver next, String method) {
        super(next);
        // Previously the argument was silently discarded, leaving the field permanently null.
        this.method = method;
    }

    /**
     * Set the properties for this indenter
     *
     * @param props the serialization properties
     */
    public void setOutputProperties(Properties props) {
        String s = props.getProperty(SaxonOutputKeys.SUPPRESS_INDENTATION);
        if (s != null) {
            suppressed = new HashSet<>(8);
            StringTokenizer st = new StringTokenizer(s, " \t\r\n");
            while (st.hasMoreTokens()) {
                String eqName = st.nextToken();
                // Locale.ROOT keeps the case-blind match independent of the JVM default locale
                // (e.g. Turkish, where "I" would otherwise lower-case to a dotless i).
                suppressed.add(FingerprintedQName.fromEQName(eqName).getLocalPart().toLowerCase(Locale.ROOT));
            }
        }
    }

    /**
     * Classify an element name as inline, formatted, suppressed, or any combination of these.
     * Matching is a case-blind comparison on the local part of the name only.
     * This method is overridden in the XHTML indenter
     *
     * @param name the element name
     * @return a bit-significant integer containing flags IS_INLINE, IS_FORMATTED and/or IS_SUPPRESSED
     */
    public int classifyTag(NodeName name) {
        // Lower-case once, locale-independently, so the lookup always agrees with the ASCII tables.
        String local = name.getLocalPart().toLowerCase(Locale.ROOT);
        int r = 0;
        if (inlineTable.contains(local)) {
            r |= IS_INLINE;
        }
        if (formattedTable.contains(local)) {
            r |= IS_FORMATTED;
        }
        if (suppressed != null && suppressed.contains(local)) {
            r |= IS_SUPPRESSED;
        }
        return r;
    }

    /**
     * Output element start tag
     */
    @Override
    public void startElement(NodeName elemName, SchemaType type,
                             AttributeMap attributes, NamespaceMap namespaces,
                             Location location, int properties) throws XPathException {
        // Suppression is inherited: a child of a suppressed element is itself suppressed.
        int withinSuppressed = level == 0 ? 0 : (propertyStack[level - 1] & IS_SUPPRESSED);
        int tagProps = classifyTag(elemName) | withinSuppressed;
        if (level >= propertyStack.length) {
            propertyStack = Arrays.copyOf(propertyStack, level * 2);
        }
        propertyStack[level] = tagProps;
        boolean inlineTag = (tagProps & IS_INLINE) != 0;
        // Never indent before the outermost element, before an inline element, directly after an
        // inline element, or anywhere inside formatted or suppressed content.
        if (!inlineTag && !inFormattedTag && !afterInline && withinSuppressed == 0 && level != 0) {
            indent();
        }
        nextReceiver.startElement(elemName, type, attributes, namespaces, location, properties);
        inFormattedTag = inFormattedTag || ((tagProps & IS_FORMATTED) != 0);
        level++;
        sameLine = true;
        afterInline = false;
        afterEndElement = false;
    }

    /**
     * Output element end tag
     */
    @Override
    public void endElement() throws XPathException {
        level--;
        boolean thisInline = (propertyStack[level] & IS_INLINE) != 0;
        boolean thisFormatted = (propertyStack[level] & IS_FORMATTED) != 0;
        boolean thisSuppressed = (propertyStack[level] & IS_SUPPRESSED) != 0;
        // Only put the end tag on its own line when it directly follows a child's end tag
        // and none of the inline/suppressed/formatted conditions forbid added whitespace.
        if (afterEndElement && !thisInline && !thisSuppressed && !afterInline &&
                !sameLine && !inFormattedTag) {
            indent();
            afterInline = false;
        } else {
            afterInline = thisInline;
        }
        nextReceiver.endElement();
        inFormattedTag = inFormattedTag && !thisFormatted;
        sameLine = false;
        afterEndElement = true;
    }

    /**
     * Output character data
     */
    @Override
    public void characters(UnicodeString chars, Location locationId, int properties) throws XPathException {
        int withinSuppressed = level == 0 ? 0 : (propertyStack[level - 1] & IS_SUPPRESSED);
        if (inFormattedTag ||
                withinSuppressed != 0 ||
                ReceiverOption.contains(properties, ReceiverOption.USE_NULL_MARKERS) ||
                ReceiverOption.contains(properties, ReceiverOption.DISABLE_ESCAPING)) {
            // don't split the text if in a tag such as <pre>, or if the text contains the result of
            // expanding a character map or was produced using disable-output-escaping
            nextReceiver.characters(chars, locationId, properties);
        } else {
            // otherwise try to split long lines into multiple lines
            UnicodeString t = chars.tidy();
            int lastNL = 0;   // start of the segment not yet written
            IntIterator iter = t.codePoints();
            int i = 0;        // position of the code point currently being examined
            while (iter.hasNext()) {
                int ch = iter.next();
                // Break at every newline, and at the first space once the current
                // segment has grown beyond the maximum line length.
                if (ch == '\n' || (i - lastNL > getLineLength() && ch == ' ')) {
                    sameLine = false;
                    nextReceiver.characters(t.substring(lastNL, i), locationId, properties);
                    indent();
                    lastNL = i + 1;
                    // Drop spaces that would otherwise start the continuation line.
                    while (lastNL < t.length() && t.codePointAt(lastNL) == ' ') {
                        lastNL++;
                    }
                }
                i++;
            }
            if (lastNL < t.length()) {
                nextReceiver.characters(t.substring(lastNL, t.length()), locationId, properties);
            }
        }
        afterInline = false;
        afterEndElement = false;
    }

    /**
     * Output a processing instruction
     */
    @Override
    public void processingInstruction(String target, UnicodeString data, Location locationId, int properties) throws XPathException {
        // Indent only when following an end tag, inside a non-inline parent element.
        if (afterEndElement && level != 0 && (propertyStack[level - 1] & IS_INLINE) == 0) {
            indent();
        }
        nextReceiver.processingInstruction(target, data, locationId, properties);
        afterEndElement = false;
    }

    /**
     * Output a comment
     */
    @Override
    public void comment(UnicodeString chars, Location locationId, int properties) throws XPathException {
        // Indent only when following an end tag, inside a non-inline parent element.
        if (afterEndElement && level != 0 && (propertyStack[level - 1] & IS_INLINE) == 0) {
            indent();
        }
        nextReceiver.comment(chars, locationId, properties);
        afterEndElement = false;
    }

    /**
     * Get the maximum length of lines, after which long lines will be word-wrapped
     *
     * @return the maximum line length
     */
    protected int getLineLength() {
        return 80;
    }

    /**
     * Output white space to reflect the current indentation level
     *
     * @throws net.sf.saxon.trans.XPathException if an error occurs downstream in the pipeline
     */
    private void indent() throws XPathException {
        int spaces = level * getIndentation();
        // NOTE(review): IndentWhitespace.of(1, spaces) is presumed to represent one newline
        // followed by the given number of spaces -- confirm against IndentWhitespace.
        nextReceiver.characters(IndentWhitespace.of(1, spaces),
                                Loc.NONE, ReceiverOption.NONE);
        sameLine = false;
    }

    /**
     * Get the number of spaces to be used for indentation
     *
     * @return the number of spaces to be added to the indentation for each level
     */
    protected int getIndentation() {
        return 3;
    }
}