All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.sf.saxon.serialize.HTMLEmitter Maven / Gradle / Ivy

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2015 Saxonica Limited.
// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

package net.sf.saxon.serialize;

import net.sf.saxon.event.ReceiverOptions;
import net.sf.saxon.expr.parser.Location;
import net.sf.saxon.lib.SaxonOutputKeys;
import net.sf.saxon.om.NodeName;
import net.sf.saxon.trans.XPathException;
import net.sf.saxon.tree.tiny.CompressedWhitespace;
import net.sf.saxon.type.SchemaType;

import javax.xml.transform.OutputKeys;
import java.util.Stack;

/**
 * This class generates HTML output
 *
 * @author Michael H. Kay
 */

public abstract class HTMLEmitter extends XMLEmitter {

    /**
     * Preferred character representations
     */

    private static final int REP_NATIVE = 0;
    private static final int REP_ENTITY = 1;
    private static final int REP_DECIMAL = 2;
    private static final int REP_HEX = 3;

    private int nonASCIIRepresentation = REP_NATIVE;
    private int excludedRepresentation = REP_ENTITY;

    private int inScript;
    protected int version = 4;
    private String parentElement;
    private String uri;
    private boolean escapeNonAscii = false;
    private Stack nodeNameStack = new Stack();

    /**
     * Decode preferred representation
     *
     * @param rep string containing preferred representation (native, entity, decimal, or hex)
     * @return integer code for the preferred representation
     */

    private static int representationCode(String rep) {
        if (rep.equalsIgnoreCase("native")) return REP_NATIVE;
        if (rep.equalsIgnoreCase("entity")) return REP_ENTITY;
        if (rep.equalsIgnoreCase("decimal")) return REP_DECIMAL;
        if (rep.equalsIgnoreCase("hex")) return REP_HEX;
        return REP_ENTITY;
    }

    /**
     * Table of HTML tags that have no closing tag
     */

    static HTMLTagHashSet emptyTags = new HTMLTagHashSet(31);


    protected static void setEmptyTag(String tag) {
        emptyTags.add(tag);
    }

    protected static boolean isEmptyTag(String tag) {
        return emptyTags.contains(tag);
    }

    /**
     * Table of boolean attributes
     */

    // we use two HashMaps to avoid unnecessary string concatenations

    // Sizes must be large enough: this hash set cannot grow beyond the initial size
    private static HTMLTagHashSet booleanAttributes = new HTMLTagHashSet(43);
    private static HTMLTagHashSet booleanCombinations = new HTMLTagHashSet(57);

    // See http://www.w3.org/TR/html5/index.html#attributes-1 (checked 2014-01-07)

    static {
        setBooleanAttribute("*", "hidden"); // HTML5
        setBooleanAttribute("area", "nohref");
        setBooleanAttribute("audio", "autoplay"); // HTML5
        setBooleanAttribute("audio", "controls"); // HTML5
        setBooleanAttribute("audio", "loop"); // HTML5
        setBooleanAttribute("audio", "muted"); // HTML5
        setBooleanAttribute("button", "disabled");
        setBooleanAttribute("button", "autofocus"); // HTML5
        setBooleanAttribute("button", "formnovalidate"); //HTML5
        setBooleanAttribute("details", "open"); // HTML5
        setBooleanAttribute("dialog", "open"); // HTML5
        setBooleanAttribute("dir", "compact");
        setBooleanAttribute("dl", "compact");
        setBooleanAttribute("fieldset", "disabled"); //HTML5
        setBooleanAttribute("form", "novalidate"); // HTML5
        setBooleanAttribute("frame", "noresize");
        setBooleanAttribute("hr", "noshade");
        setBooleanAttribute("img", "ismap");
        setBooleanAttribute("input", "checked");
        setBooleanAttribute("input", "disabled");
        setBooleanAttribute("input", "multiple"); //HTML5
        setBooleanAttribute("input", "readonly");
        setBooleanAttribute("input", "required"); //HTML5
        setBooleanAttribute("input", "autofocus"); // HTML5
        setBooleanAttribute("input", "formnovalidate"); //HTML5
        setBooleanAttribute("iframe", "seamless"); // HTML5
        setBooleanAttribute("keygen", "autofocus"); // HTML5
        setBooleanAttribute("keygen", "disabled"); //HTML5
        setBooleanAttribute("menu", "compact");
        setBooleanAttribute("object", "declare");
        setBooleanAttribute("object", "typemustmatch"); // HTML5
        setBooleanAttribute("ol", "compact");
        setBooleanAttribute("ol", "reversed"); // HTML5
        setBooleanAttribute("optgroup", "disabled");
        setBooleanAttribute("option", "selected");
        setBooleanAttribute("option", "disabled");
        setBooleanAttribute("script", "defer");
        setBooleanAttribute("script", "async");   // HTML5
        setBooleanAttribute("select", "multiple");
        setBooleanAttribute("select", "disabled");
        setBooleanAttribute("select", "autofocus"); // HTML5
        setBooleanAttribute("select", "required"); // HTML5
        setBooleanAttribute("style", "scoped"); // HTML5
        setBooleanAttribute("td", "nowrap");
        setBooleanAttribute("textarea", "disabled");
        setBooleanAttribute("textarea", "readonly");
        setBooleanAttribute("textarea", "autofocus"); // HTML5
        setBooleanAttribute("textarea", "required"); // HTML5
        setBooleanAttribute("th", "nowrap");
        setBooleanAttribute("track", "default"); // HTML5
        setBooleanAttribute("ul", "compact");
        setBooleanAttribute("video", "autoplay"); // HTML5
        setBooleanAttribute("video", "controls"); // HTML5
        setBooleanAttribute("video", "loop"); // HTML5
        setBooleanAttribute("video", "muted"); // HTML5
    }

    private static void setBooleanAttribute(String element, String attribute) {
        booleanAttributes.add(attribute);
        booleanCombinations.add(element + '+' + attribute);
    }

    private static boolean isBooleanAttribute(String element, String attribute, String value) {
        return attribute.equalsIgnoreCase(value) &&
                booleanAttributes.contains(attribute) &&
                ( booleanCombinations.contains(element + '+' + attribute) ||
                  booleanCombinations.contains("*+" + attribute));
    }

    /**
     * Constructor
     */

    public HTMLEmitter() {

    }

    /**
     * Say that all non-ASCII characters should be escaped, regardless of the character encoding
     *
     * @param escape true if all non ASCII characters should be escaped
     */

    public void setEscapeNonAscii(Boolean escape) {
        escapeNonAscii = escape;
    }

    /**
     * Decide whether an element is "serialized as an HTML element" in the language of the 3.0 specification
     *
     * @return true if the element
     */

    protected abstract boolean isHTMLElement(NodeName name);

    /**
     * Output start of document
     */

    public void open() throws XPathException {
    }

    protected void openDocument() throws XPathException {
        if (writer == null) {
            makeWriter();
        }
        if (started) {
            return;
        }
        started = true;
        // This method is sometimes called twice, especially during an identity transform
        // This check stops two DOCTYPE declarations being output.


        String byteOrderMark = outputProperties.getProperty(SaxonOutputKeys.BYTE_ORDER_MARK);

        if ("yes".equals(byteOrderMark) &&
                "UTF-8".equalsIgnoreCase(outputProperties.getProperty(OutputKeys.ENCODING))) {
            try {
                writer.write('\uFEFF');
            } catch (java.io.IOException err) {
                // Might be an encoding exception; just ignore it
            }
        }
        inScript = -1000000;
    }

    /**
     * Output the document type declaration
     *
     * @param displayName The element name
     * @param systemId    The DOCTYPE system identifier
     * @param publicId    The DOCTYPE public identifier
     */

    protected void writeDocType(NodeName name, String displayName, String systemId, String publicId) throws XPathException {
        super.writeDocType(name, displayName, systemId, publicId);
    }

    /**
     * Output element start tag
     */

    public void startElement(NodeName elemName, SchemaType typeCode, Location location, int properties) throws XPathException {

        super.startElement(elemName, typeCode, location, properties);
        uri = elemName.getURI();
        parentElement = elementStack.peek();
        if (elemName.hasURI("") &&
                (parentElement.equalsIgnoreCase("script") ||
                        parentElement.equalsIgnoreCase("style"))) {
            inScript = 0;
        }
        inScript++;
        nodeNameStack.push(elemName);
    }

    public void startContent() throws XPathException {
        closeStartTag();                   // prevent  syntax
    }

    /**
     * Write attribute name=value pair. Overrides the XML behaviour if the name and value
     * are the same (we assume this is a boolean attribute to be minimised), or if the value is
     * a URL.
     */

    protected void writeAttribute(NodeName elCode, String attname, CharSequence value, int properties) throws XPathException {
        try {
            if (uri.isEmpty()) {
                if (isBooleanAttribute(parentElement, attname, value.toString())) {
                    writer.write(attname);
                    return;
                }
            }
            super.writeAttribute(elCode, attname, value, properties);
        } catch (java.io.IOException err) {
            throw new XPathException(err);
        }
    }


    /**
     * Escape characters. Overrides the XML behaviour
     */

    protected void writeEscape(final CharSequence chars, final boolean inAttribute)
            throws java.io.IOException, XPathException {

        int segstart = 0;
        final boolean[] specialChars = (inAttribute ? specialInAtt : specialInText);

        if (chars instanceof CompressedWhitespace) {
            ((CompressedWhitespace) chars).writeEscape(specialChars, writer);
            return;
        }
        boolean disabled = false;

        while (segstart < chars.length()) {
            int i = segstart;

            // find a maximal sequence of "ordinary" characters

            if (escapeNonAscii) {
                char c;
                while (i < chars.length() && (c = chars.charAt(i)) < 127 && !specialChars[c]) {
                    i++;
                }
            } else {
                char c;
                while (i < chars.length() &&
                        ((c = chars.charAt(i)) < 127 ? !specialChars[c] : (characterSet.inCharset(c) && c > 160)
                        )
                        ) {
                    i++;
                }
            }

            // if this was the whole string, output the string and quit

            if (i == chars.length()) {
                if (segstart == 0) {
                    writeCharSequence(chars);
                } else {
                    writeCharSequence(chars.subSequence(segstart, i));
                }
                return;
            }

            // otherwise, output this sequence and continue
            if (i > segstart) {
                writeCharSequence(chars.subSequence(segstart, i));
            }

            final char c = chars.charAt(i);
            if (c == 0) {
                // used to switch escaping on and off
                disabled = !disabled;
            } else if (disabled) {
                writer.write(c);
            } else if (c <= 127) {
                // handle a special ASCII character
                if (inAttribute) {
                    if (c == '<') {
                        writer.write('<');      // not escaped
                    } else if (c == '>') {
                        writer.write(">");   // recommended for older browsers
                    } else if (c == '&') {
                        if (i + 1 < chars.length() && chars.charAt(i + 1) == '{') {
                            writer.write('&');                   // not escaped if followed by '{'
                        } else {
                            writer.write("&");
                        }
                    } else if (c == '\"') {
                        writer.write(""");
                    } else if (c == '\n') {
                        writer.write("
");
                    } else if (c == '\t') {
                        writer.write("	");
                    } else if (c == '\r') {
                        writer.write("
");
                    }
                } else {
                    if (c == '<') {
                        writer.write("<");
                    } else if (c == '>') {
                        writer.write(">");  // changed to allow for "]]>"
                    } else if (c == '&') {
                        writer.write("&");
                    } else if (c == '\r') {
                        writer.write("
");
                    }
                }

            } else if (c < 160) {
                // these control characters are illegal in HTML
                XPathException err = new XPathException("Illegal HTML character: decimal " + (int) c);
                err.setErrorCode("SERE0014");
                throw err;

            } else if (c == 160) {
                // always output NBSP as an entity reference
                writer.write(" ");

            } else if (c >= 55296 && c <= 56319) {  //handle surrogate pair

                //A surrogate pair is two consecutive Unicode characters.  The first
                //is in the range D800 to DBFF, the second is in the range DC00 to DFFF.
                //To compute the numeric value of the character corresponding to a surrogate
                //pair, use this formula (all numbers are hex):
                //(FirstChar - D800) * 400 + (SecondChar - DC00) + 10000

                // we'll trust the data to be sound
                int charval = (((int) c - 55296) * 1024) + ((int) chars.charAt(i + 1) - 56320) + 65536;
                characterReferenceGenerator.outputCharacterReference(charval, writer);
                i++;

            } else if (escapeNonAscii || !characterSet.inCharset(c)) {
                characterReferenceGenerator.outputCharacterReference(c, writer);
            } else {
                writer.write(c);
            }
            segstart = ++i;
        }

    }

    /**
     * Output an element end tag.
     */

    public void endElement() throws XPathException {
        NodeName nodeName = nodeNameStack.pop();
        String name = elementStack.peek();
        inScript--;
        if (inScript == 0) {
            inScript = -1000000;
        }

        if (isEmptyTag(name) && isHTMLElement(nodeName)) {
            // no end tag required
            elementStack.pop();
        } else {
            super.endElement();
        }

    }

    /**
     * Character data.
     */

    public void characters(CharSequence chars, Location locationId, int properties)
            throws XPathException {
        int options = properties;
        if (inScript > 0) {
            options |= ReceiverOptions.DISABLE_ESCAPING;
        }
        super.characters(chars, locationId, options);
    }

    /**
     * Handle a processing instruction.
     */

    public void processingInstruction(String target, CharSequence data, Location locationId, int properties)
            throws XPathException {
        if (!started) {
            openDocument();
        }
        for (int i = 0; i < data.length(); i++) {
            if (data.charAt(i) == '>') {
                XPathException err = new XPathException("A processing instruction in HTML must not contain a > character");
                err.setErrorCode("SERE0015");
                throw err;
            }
        }
        try {
            writer.write("');
        } catch (java.io.IOException err) {
            throw new XPathException(err);
        }
    }


}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy