org.w3c.tidy.Lexer Maven / Gradle / Ivy

Go to download
/*
 *  Java HTML Tidy - JTidy
 *  HTML parser and pretty printer
 *
 *  Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
 *  Institute of Technology, Institut National de Recherche en
 *  Informatique et en Automatique, Keio University). All Rights
 *  Reserved.
 *
 *  Contributing Author(s):
 *
 *     Dave Raggett 
 *     Andy Quick  (translation to Java)
 *     Gary L Peskin  (Java development)
 *     Sami Lempinen  (release management)
 *     Fabrizio Giustina 
 *
 *  The contributing author(s) would like to thank all those who
 *  helped with testing, bug fixes, and patience.  This wouldn't
 *  have been possible without all of you.
 *
 *  COPYRIGHT NOTICE:
 *
 *  This software and documentation is provided "as is," and
 *  the copyright holders and contributing author(s) make no
 *  representations or warranties, express or implied, including
 *  but not limited to, warranties of merchantability or fitness
 *  for any particular purpose or that the use of the software or
 *  documentation will not infringe any third party patents,
 *  copyrights, trademarks or other rights.
 *
 *  The copyright holders and contributing author(s) will not be
 *  liable for any direct, indirect, special or consequential damages
 *  arising out of any use of the software or documentation, even if
 *  advised of the possibility of such damage.
 *
 *  Permission is hereby granted to use, copy, modify, and distribute
 *  this source code, or portions hereof, documentation and executables,
 *  for any purpose, without fee, subject to the following restrictions:
 *
 *  1. The origin of this source code must not be misrepresented.
 *  2. Altered versions must be plainly marked as such and must
 *     not be misrepresented as being the original source.
 *  3. This Copyright notice may not be removed or altered from any
 *     source or altered source distribution.
 *
 *  The copyright holders and contributing author(s) specifically
 *  permit, without fee, and encourage the use of this source code
 *  as a component for supporting the Hypertext Markup Language in
 *  commercial products. If you use this source code in a product,
 *  acknowledgment is not required but would be appreciated.
 *
 */
package org.w3c.tidy;

import java.io.PrintWriter;
import java.util.EnumSet;
import java.util.List;
import java.util.Stack;
import java.util.Vector;


/**
 * Lexer for html parser.
 * 
 * Given a file stream fp it returns a sequence of tokens: GetToken(fp) gets the next token and UngetToken(fp) provides
 * one level of undo.
 * <p>
 * The tags include an attribute list:
 * <ul>
 * <li>a linked list of attribute/value nodes;</li>
 * <li>each node has 2 null-terminated strings;</li>
 * <li>entities are replaced in attribute values.</li>
 * </ul>
 * White space is compacted if not in preformatted mode: leading white space is discarded and subsequent white space
 * sequences are compacted to single space chars. If XmlTags is no then tag names are folded to lower case (see
 * {@link #parseTagName()}) and attribute names to lower case.
 * <p>
 * Not yet done: Doctype subset and marked sections.
 * 
 * @author Dave Raggett [email protected] 
 * @author Andy Quick [email protected]  (translation to Java)
 * @author Fabrizio Giustina
 * @version $Revision$ ($Author$)
 */
public class Lexer
{

    /**
     * Whitespace-handling mode: ignore whitespace.
     */
    public static final short IGNORE_WHITESPACE = 0;

    /**
     * Whitespace-handling mode: mixed content.
     */
    public static final short MIXED_CONTENT = 1;

    /**
     * Whitespace-handling mode: preformatted. {@link #parseEntity(short)} tests its mode argument against this value
     * with a bitwise AND and, when set, turns a non-breaking space (160) into a plain space.
     */
    public static final short PREFORMATTED = 2;

    /**
     * Whitespace-handling mode: ignore markup.
     */
    public static final short IGNORE_MARKUP = 3;

    /**
     * System identifier (URI) of the XHTML 1.0 Transitional DTD.
     */
    private static final String VOYAGER_LOOSE = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";

    /**
     * System identifier (URI) of the XHTML 1.0 Strict DTD.
     */
    private static final String VOYAGER_STRICT = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";

    /**
     * System identifier (URI) of the XHTML 1.0 Frameset DTD.
     */
    private static final String VOYAGER_FRAMESET = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd";

    /**
     * System identifier (URI) of the XHTML 1.1 DTD.
     */
    private static final String VOYAGER_11 = "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd";

    // URI for XHTML Basic 1.0 (constant currently disabled):
    // private static final String VOYAGER_BASIC = "http://www.w3.org/TR/xhtml-basic/xhtml-basic10.dtd";

    /**
     * The XHTML namespace URI.
     */
    private static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml";
    
    /**
     * State kept by {@link #addCharToLexer(int)} for a UTF-16 high surrogate (0xD800-0xDBFF) that has been seen but not
     * yet combined with its low surrogate. Starts at 0 and is reset to 0 once a pair has been combined.
     */
    private int highSurrogate = 0;

    /**
     * Table of all the known versions. Each entry is built from: the HTML version name, the corresponding XHTML
     * ("voyager") name, the XHTML DTD URI, and the {@link HtmlVersion} code. {@link #htmlVersionName()} and
     * {@link #findGivenVersion(Node)} scan this table in order, so entry order matters where names or codes repeat
     * (e.g. "HTML 4.01" appears for both HTML40_STRICT and XHTML11; HTML32 has three names).
     */
    private static final Lexer.W3CVersionInfo[] W3CVERSION = {
        new W3CVersionInfo("HTML 4.01", "XHTML 1.0 Strict", VOYAGER_STRICT, HtmlVersion.HTML40_STRICT),
        new W3CVersionInfo("HTML 4.01 Transitional", "XHTML 1.0 Transitional", VOYAGER_LOOSE, HtmlVersion.HTML40_LOOSE),
        new W3CVersionInfo("HTML 4.01 Frameset", "XHTML 1.0 Frameset", VOYAGER_FRAMESET, HtmlVersion.FRAMESET),
        new W3CVersionInfo("HTML 4.0", "XHTML 1.0 Strict", VOYAGER_STRICT, HtmlVersion.HTML40_STRICT),
        new W3CVersionInfo("HTML 4.0 Transitional", "XHTML 1.0 Transitional", VOYAGER_LOOSE, HtmlVersion.HTML40_LOOSE),
        new W3CVersionInfo("HTML 4.0 Frameset", "XHTML 1.0 Frameset", VOYAGER_FRAMESET, HtmlVersion.FRAMESET),
        new W3CVersionInfo("HTML 3.2", "XHTML 1.0 Transitional", VOYAGER_LOOSE, HtmlVersion.HTML32),
        new W3CVersionInfo("HTML 3.2 Final", "XHTML 1.0 Transitional", VOYAGER_LOOSE, HtmlVersion.HTML32),
        new W3CVersionInfo("HTML 3.2 Draft", "XHTML 1.0 Transitional", VOYAGER_LOOSE, HtmlVersion.HTML32),
        new W3CVersionInfo("HTML 2.0", "XHTML 1.0 Strict", VOYAGER_STRICT, HtmlVersion.HTML20),
        new W3CVersionInfo("HTML 4.01", "XHTML 1.1", VOYAGER_STRICT, HtmlVersion.XHTML11),
		new W3CVersionInfo("HTML5", null, null, HtmlVersion.HTML5)
	};

    /**
     * getToken state: content. This is the state a new Lexer starts in.
     */
    private static final short LEX_CONTENT = 0;

    /**
     * getToken state: gt.
     */
    private static final short LEX_GT = 1;

    /**
     * getToken state: end tag.
     */
    private static final short LEX_ENDTAG = 2;

    /**
     * getToken state: start tag.
     */
    private static final short LEX_STARTTAG = 3;

    /**
     * getToken state: comment.
     */
    private static final short LEX_COMMENT = 4;

    /**
     * getToken state: doctype.
     */
    private static final short LEX_DOCTYPE = 5;

    /**
     * getToken state: processing instruction.
     */
    private static final short LEX_PROCINSTR = 6;

    // note: value 7 is not assigned to any state in this list

    /**
     * getToken state: cdata.
     */
    private static final short LEX_CDATA = 8;

    /**
     * getToken state: section.
     */
    private static final short LEX_SECTION = 9;

    /**
     * getToken state: asp.
     */
    private static final short LEX_ASP = 10;

    /**
     * getToken state: jste.
     */
    private static final short LEX_JSTE = 11;

    /**
     * getToken state: php.
     */
    private static final short LEX_PHP = 12;

    /**
     * getToken state: xml declaration.
     */
    private static final short LEX_XMLDECL = 13;

    /**
     * Input stream the lexer reads characters from (set by the constructor).
     */
    protected StreamIn in;

    /**
     * Error output stream.
     */
    protected PrintWriter errout;

    /**
     * For accessibility errors.
     */
    protected short badAccess;

    /**
     * For bad style errors.
     */
    protected short badLayout;

    /**
     * For bad character encodings.
     */
    protected short badChars;

    /**
     * For mismatched/mispositioned form tags.
     */
    protected short badForm;

    /**
     * Count of warnings in this document.
     */
    protected short warnings;

    /**
     * Count of errors.
     */
    protected short errors;

    /**
     * Lines seen. Initialised to 1 by the constructor; also used as the line of the current error position.
     */
    protected int lines;

    /**
     * Column at start of current token. Initialised to 1 by the constructor; also used as the column of the current
     * error position.
     */
    protected int columns;

    /**
     * Used to collapse contiguous white space.
     */
    protected boolean waswhite;

    /**
     * True after a token has been pushed back.
     */
    protected boolean pushed;

    /**
     * Set when space is moved after an end tag.
     */
    protected boolean insertspace;

    /**
     * Netscape compatibility.
     */
    protected boolean excludeBlocks;

    /**
     * True if moved out of table.
     */
    protected boolean exiled;

    /**
     * True if xmlns attribute on html element.
     */
    protected boolean isvoyager;

    /**
     * Set of HTML versions still considered possible for the document. The constructor initialises it to the
     * combination of Dict.VERS_ALL and Dict.VERS_PROPRIETARY. NOTE(review): declared as a raw EnumSet; elements are
     * queried as HtmlVersion values.
     */
    private EnumSet versions;

    /**
     * Version as given by doctype (if any); HtmlVersion.UNKNOWN until determined.
     */
    protected HtmlVersion doctype;

    /**
     * Set if html or PUBLIC is missing in the doctype.
     */
    protected boolean badDoctype;

    /**
     * Offset in {@link #lexbuf} of the start of the current node.
     */
    protected int txtstart;

    /**
     * Offset in {@link #lexbuf} of the end of the current node.
     */
    protected int txtend;

    /**
     * State of the lexer's finite state machine (one of the LEX_* constants).
     */
    protected short state;

    /**
     * Current node.
     */
    protected Node token;

    /**
     * Lexer character buffer. Parse tree nodes span onto this buffer, which contains the concatenated text contents of
     * all of the elements. Lexsize must be reset for each file. Byte buffer of UTF-8 chars. Allocated lazily and
     * replaced by a larger array in {@link #addByte(int)} when full.
     */
    protected byte[] lexbuf;

    /**
     * Number of bytes allocated for {@link #lexbuf}.
     */
    protected int lexlength;

    /**
     * Number of bytes of {@link #lexbuf} in use.
     */
    protected int lexsize;

    /**
     * Inline stack for compatibility with Mosaic. For deferring text node.
     */
    protected Node inode;

    /**
     * For inferring inline tags. Initialised to -1 by the constructor.
     */
    protected int insert;

    /**
     * Inline stack. NOTE(review): declared as a raw Stack; element type is not visible here.
     */
    protected Stack istack;

    /**
     * Start of frame in {@link #istack}.
     */
    protected int istackbase;

    /**
     * Used for cleaning up presentation markup.
     */
    protected Style styles;

    /**
     * Configuration in effect for this lexer.
     */
    protected Configuration configuration;

    /**
     * Already seen end body tag?
     */
    protected boolean seenEndBody;

    /**
     * Already seen end html tag?
     */
    protected boolean seenEndHtml;

    /**
     * Report used for emitting warnings and errors.
     */
    protected Report report;

    /**
     * Root node is saved here.
     */
    protected Node root;

    /**
     * Every node created or cloned through this lexer (plus asp/php nodes of cloned attributes). Used by
     * {@link #updateNodeTextArrays(byte[], byte[])} to re-point nodes when the buffer is reallocated.
     */
    private List nodeList;

    /**
     * Instantiates a new Lexer positioned at line 1, column 1 in the content state.
     * @param in StreamIn to read characters from
     * @param configuration configuration instance
     * @param report report instance, for reporting errors
     */
    public Lexer(StreamIn in, Configuration configuration, Report report)
    {
        // collaborators handed in by the caller
        this.in = in;
        this.configuration = configuration;
        this.report = report;

        // bookkeeping containers
        this.nodeList = new Vector<>();
        this.istack = new Stack<>();
        this.insert = -1;

        // initial scanning position and state machine state
        this.state = LEX_CONTENT;
        this.columns = 1;
        this.lines = 1;

        // nothing is known about the document version yet
        this.doctype = HtmlVersion.UNKNOWN;
        this.versions = Dict.combine(Dict.VERS_ALL, Dict.VERS_PROPRIETARY);
    }

    /**
     * Creates an empty node and registers it in the node list.
     * @return the new Node
     */
    public Node newNode()
    {
        final Node created = new Node();
        nodeList.add(created);
        return created;
    }

    /**
     * Creates a node spanning part of a text array and registers it in the node list.
     * @param type node type: Node.ROOT_NODE | Node.DOCTYPE_TAG | Node.COMMENT_TAG | Node.PROC_INS_TAG | Node.TEXT_NODE |
     * Node.START_TAG | Node.END_TAG | Node.START_END_TAG | Node.CDATA_TAG | Node.SECTION_TAG | Node.ASP_TAG |
     * Node.JSTE_TAG | Node.PHP_TAG | Node.XML_DECL
     * @param textarray array of bytes contained in the Node
     * @param start start position
     * @param end end position
     * @return the new Node
     */
    public Node newNode(short type, byte[] textarray, int start, int end)
    {
        final Node created = new Node(type, textarray, start, end);
        nodeList.add(created);
        return created;
    }

    /**
     * Creates a named node spanning part of a text array and registers it in the node list. The tag table of the
     * current configuration is passed on to the Node constructor.
     * @param type node type: Node.ROOT_NODE | Node.DOCTYPE_TAG | Node.COMMENT_TAG | Node.PROC_INS_TAG | Node.TEXT_NODE |
     * Node.START_TAG | Node.END_TAG | Node.START_END_TAG | Node.CDATA_TAG | Node.SECTION_TAG | Node.ASP_TAG |
     * Node.JSTE_TAG | Node.PHP_TAG | Node.XML_DECL
     * @param textarray array of bytes contained in the Node
     * @param start start position
     * @param end end position
     * @param element tag name
     * @return the new Node
     */
    public Node newNode(short type, byte[] textarray, int start, int end, String element)
    {
        final Node created = new Node(type, textarray, start, end, element, configuration.tt);
        nodeList.add(created);
        return created;
    }

    /**
     * Makes a non-recursive clone of a node and registers the clone, together with any asp/php nodes hanging off its
     * attributes, in the node list.
     * @param node Node to clone
     * @return cloned Node
     */
    public Node cloneNode(Node node)
    {
        final Node copy = node.cloneNode(false);
        nodeList.add(copy);

        AttVal current = copy.attributes;
        while (current != null)
        {
            if (current.asp != null)
            {
                nodeList.add(current.asp);
            }
            if (current.php != null)
            {
                nodeList.add(current.php);
            }
            current = current.next;
        }

        return copy;
    }

    /**
     * Clones an attribute list and registers any asp or php node found on the cloned attributes in the node list.
     * @param attrs original AttVal (head of the list)
     * @return cloned AttVal
     */
    public AttVal cloneAttributes(AttVal attrs)
    {
        final AttVal copy = (AttVal) attrs.clone();

        AttVal current = copy;
        while (current != null)
        {
            if (current.asp != null)
            {
                nodeList.add(current.asp);
            }
            if (current.php != null)
            {
                nodeList.add(current.php);
            }
            current = current.next;
        }

        return copy;
    }

    /**
     * Re-points every registered node that still refers to the old text array at the new one. Arrays are compared by
     * identity, not by content.
     * @param oldtextarray previous text array
     * @param newtextarray new text array
     */
    protected void updateNodeTextArrays(byte[] oldtextarray, byte[] newtextarray)
    {
        for (Object entry : nodeList)
        {
            final Node candidate = (Node) entry;
            if (candidate.textarray != oldtextarray)
            {
                // backed by some other buffer: leave untouched
                continue;
            }
            candidate.textarray = newtextarray;
        }
    }

    /**
     * Adds a new line node. Used for creating preformatted text from Word2000.
     * <p>
     * The node's text array is bound only after the newline has been appended. Appending may allocate the lexer buffer
     * for the first time, and in that case {@link #addByte(int)} does not call
     * {@link #updateNodeTextArrays(byte[], byte[])} (there is no previous array), so a reference captured beforehand
     * would stay <code>null</code>.
     * @return new line node spanning the single '\n' just appended to the lexer buffer
     */
    public Node newLineNode()
    {
        Node node = newNode();

        node.start = this.lexsize;
        addCharToLexer('\n');
        // bind after appending so the node always sees the current (possibly just allocated) buffer
        node.textarray = this.lexbuf;
        node.end = this.lexsize;
        return node;
    }

    /**
     * Tells whether the underlying input stream is exhausted.
     * @return true if end of input stream has been reached
     */
    public boolean endOfInput()
    {
        final boolean exhausted = in.isEndOfStream();
        return exhausted;
    }

    /**
     * Appends one byte to the lexer buffer, growing the buffer when needed. The buffer starts at 8192 bytes and is
     * doubled until there is room for the byte plus a trailing zero byte; nodes referring to the old buffer are
     * re-pointed at the new one.
     * @param c byte to add (only the low 8 bits are stored)
     */
    public void addByte(int c)
    {
        final int required = lexsize + 1;

        if (required >= lexlength)
        {
            // at least one growth step is needed here, hence do/while
            do
            {
                lexlength = (lexlength == 0) ? 8192 : lexlength * 2;
            }
            while (required >= lexlength);

            final byte[] previous = lexbuf;
            lexbuf = new byte[lexlength];
            if (previous != null)
            {
                System.arraycopy(previous, 0, lexbuf, 0, previous.length);
                updateNodeTextArrays(previous, lexbuf);
            }
        }

        lexbuf[lexsize++] = (byte) c;
        // keep the used region zero-terminated (debug aid)
        lexbuf[lexsize] = (byte) '\0';
    }

    /**
     * Overwrites the last byte in the buffer; does nothing when the buffer is empty.
     * @param c new byte value
     */
    public void changeChar(byte c)
    {
        if (lexsize <= 0)
        {
            return;
        }
        final int last = lexsize - 1;
        lexbuf[last] = c;
    }

    /**
     * Store char c as UTF-8 encoded byte stream.
     * <p>
     * For XML/XHTML output only valid XML characters are let through (see
     * http://www.w3.org/TR/2004/REC-xml-20040204/#NT-Char):
     * <ul>
     * <li>a UTF-16 high surrogate is remembered and combined with the low surrogate that follows it into a single
     * supplementary code point;</li>
     * <li>an unpaired low surrogate, or a high surrogate that is not followed by a low surrogate, is replaced by
     * U+FFFD instead of being combined into an unrelated code point;</li>
     * <li>NUL is silently dropped;</li>
     * <li>any other invalid character is replaced by U+FFFD.</li>
     * </ul>
     * A character the UTF-8 encoder rejects is also replaced by U+FFFD.
     * @param c char (code point or UTF-16 code unit) to store
     */
    public void addCharToLexer(int c)
    {
        // Allow only valid XML characters. See: http://www.w3.org/TR/2004/REC-xml-20040204/#NT-Char
        // Fix by Pablo Mayrgundter 17-08-2004

        final boolean xmlOutput = this.configuration.xmlOut || this.configuration.xHTML;
        final boolean lowSurrogate = 0xDC00 <= c && c <= 0xDFFF;

        if (xmlOutput && highSurrogate != 0 && !lowSurrogate)
        {
            // the pending high surrogate was not followed by its low half: it can never be completed
            highSurrogate = 0;
            addErrorReplacement();
        }

        if (xmlOutput // only for xml output
            && !((c >= 0x20 && c <= 0xD7FF) // Check the common-case first.
                || c == 0x9
                || c == 0xA
                || c == 0xD // Then white-space.
                || (c >= 0xE000 && c <= 0xFFFD) // Then high-range unicode.
                || (c >= 0x10000 && c <= 0x10FFFF)))
        {
            if (0xD800 <= c && c <= 0xDBFF)
            {
                // A high surrogate char: keep the raw code unit (never 0) so that 0 can mean "none pending".
                highSurrogate = c;
                return;
            }
            else if (lowSurrogate)
            {
                if (highSurrogate == 0)
                {
                    // A low surrogate with no preceding high surrogate is not a character.
                    addErrorReplacement();
                    return;
                }

                // Combine the pair into the supplementary code point it encodes.
                c = 0x10000 + 1024 * (highSurrogate - 0xD800) + (c - 0xDC00);
                highSurrogate = 0;
            }
            else if (c == 0)
            {
                // Silently ignore.
                return;
            }
            else
            {
                // Invalid char.
                addErrorReplacement();
                return;
            }
        }

        int[] count = new int[]{0};
        byte[] buf = new byte[10]; // unsigned char

        boolean err = EncodingUtils.encodeCharToUTF8Bytes(c, buf, null, count);
        if (err)
        {
            addErrorReplacement();
            return;
        }

        for (int i = 0; i < count[0]; i++)
        {
            addByte(buf[i]); // uint
        }

    }

	/**
	 * Appends the replacement character U+FFFD, used for invalid characters, to the buffer as its three UTF-8 bytes.
	 */
	private void addErrorReplacement() {
		final byte[] replacement = {(byte) 0xEF, (byte) 0xBF, (byte) 0xBD};
		for (byte b : replacement) {
			addByte(b);
		}
	}

    /**
     * Appends a string to the lexer buffer, one char at a time through {@link #addCharToLexer(int)}.
     * @param str String to add
     */
    public void addStringToLexer(String str)
    {
        for (char ch : str.toCharArray())
        {
            addCharToLexer(ch);
        }
    }

    /**
     * Parse an html entity or numeric character reference. On entry the "&amp;" has already been appended to the lexer
     * buffer; the name is read from the input stream and appended after it, then either replaced in the buffer by the
     * character it stands for, or left in place (with a report) when unknown or invalid.
     * @param mode whitespace mode; only tested against PREFORMATTED, in which case a non-breaking space (160) is
     * stored as a plain space
     */
    public void parseEntity(short mode)
    {
        // No longer attempts to insert missing ';' for unknown
        // entities unless one was present already, since this
        // gives unexpected results.
        //
        // For example: <a href="something.htm?foo&bar&fred">
        // was tidied to: <a href="something.htm?foo&amp;bar;&amp;fred;">
        // rather than: <a href="something.htm?foo&amp;bar&amp;fred">
        //
        // My thanks for Maurice Buxton for spotting this.
        //
        // Also Randy Waki pointed out the following case for the
        // 04 Aug 00 version (bug #433012):
        //
        // For example: <a href="something.htm?id=1&lang=en">
        // was tidied to: <a href="something.htm?id=1&lang;=en">
        // rather than: <a href="something.htm?id=1&amp;lang=en">
        //
        // where "lang" is a known entity (#9001), but browsers would
        // misinterpret "&lang" because it had a value > 256.
        //
        // So the case of an apparently known entity with a value > 256 and
        // missing a semicolon is handled specially.
        //
        // "ParseEntity" is also a bit of a misnomer - it handles entities and
        // numeric character references. Invalid NCR's are now reported.

        int start;
        boolean first = true;
        boolean semicolon = false;
        int c, ch, startcol;
        String str;

        start = this.lexsize - 1; // to start at "&"
        startcol = this.in.getCurcol() - 1;

        // collect the entity name into the buffer; c keeps the last char read (or END_OF_STREAM) after the loop
        while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
        {
            if (c == ';')
            {
                // terminator is consumed but not stored in the buffer
                semicolon = true;
                break;
            }

            if (first && c == '#')
            {
                // #431953 - start RJ
                // numeric references are left untouched when ncr is off or the input encoding is BIG5/SHIFTJIS
                if (!this.configuration.ncr
                    || "BIG5".equals(this.configuration.getInCharEncodingName())
                    || "SHIFTJIS".equals(this.configuration.getInCharEncodingName()))
                {
                    this.in.ungetChar(c);
                    return;
                }
                // #431953 - end RJ

                addCharToLexer(c);
                first = false;
                continue;
            }

            first = false;

            if (TidyUtils.isNamechar((char) c))
            {
                addCharToLexer(c);
                continue;
            }

            // otherwise put it back
            this.in.ungetChar(c);
            break;
        }

        // the text from "&" up to (not including) any ';'
        str = TidyUtils.getString(this.lexbuf, start, this.lexsize - start);

        // "&apos" is only reported when the document is neither XML nor XHTML
        if ("&apos".equals(str) && !configuration.xmlOut && !this.isvoyager && !configuration.xHTML)
        {
            report.entityError(this, Report.APOS_UNDEFINED, str, 39);
        }

        ch = EntityTable.getDefaultEntityTable().entityCode(str);

        // drops invalid numeric entities from XML mode. Fix by Pablo Mayrgundter 17-08-2004
        // if ((this.configuration.xmlOut || this.configuration.xHTML) // only for xml output
        // && !((ch >= 0x20 && ch <= 0xD7FF) // Check the common-case first.
        // || ch == 0x9 || ch == 0xA || ch == 0xD // Then white-space.
        // || (ch >= 0xE000 && ch <= 0xFFFD)))
        // {
        // this.lexsize = start;
        // return;
        // }

        // deal with unrecognized or invalid entities
        // #433012 - fix by Randy Waki 17 Feb 01
        // report invalid NCR's - Terry Teague 01 Sep 01
        if (ch <= 0 || (ch >= 256 && c != ';'))
        {
            // set error position just before offending character
            this.lines = this.in.getCurline();
            this.columns = startcol;

            // more than just the "&" was collected
            if (this.lexsize > start + 1)
            {
                if (ch >= 128 && ch <= 159)
                {
                    // invalid numeric character reference
                    int c1 = 0;

                    if ("WIN1252".equals(configuration.replacementCharEncoding))
                    {
                        c1 = EncodingUtils.decodeWin1252(ch);
                    }
                    else if ("MACROMAN".equals(configuration.replacementCharEncoding))
                    {
                        c1 = EncodingUtils.decodeMacRoman(ch);
                    }

                    // "or" DISCARDED_CHAR with the other errors if discarding char; otherwise default is replacing

                    int replaceMode = c1 != 0 ? Report.REPLACED_CHAR : Report.DISCARDED_CHAR;

                    if (c != ';') /* issue warning if not terminated by ';' */
                    {
                        report.entityError(this, Report.MISSING_SEMICOLON_NCR, str, c);
                    }

                    report.encodingError(this, (short) (Report.INVALID_NCR | replaceMode), ch);

                    if (c1 != 0)
                    {
                        // make the replacement
                        this.lexsize = start;
                        addCharToLexer(c1);
                        semicolon = false;
                    }
                    else
                    {
                        /* discard */
                        this.lexsize = start;
                        semicolon = false;
                    }

                }
                else
                {
                    report.entityError(this, Report.UNKNOWN_ENTITY, str, ch);
                }

                // an unknown entity keeps its original ';' if it had one
                if (semicolon)
                {
                    addCharToLexer(';');
                }
            }
            else
            {
                // naked &
                report.entityError(this, Report.UNESCAPED_AMPERSAND, str, ch);
            }
        }
        else
        {
            // issue warning if not terminated by ';'
            if (c != ';')
            {
                // set error position just before offending character
                this.lines = this.in.getCurline();
                this.columns = startcol;
                report.entityError(this, Report.MISSING_SEMICOLON, str, c);
            }

            // drop the entity text from the buffer and store the character it denotes instead
            this.lexsize = start;

            if (ch == 160 && TidyUtils.toBoolean(mode & PREFORMATTED))
            {
                ch = ' ';
            }

            addCharToLexer(ch);

            // when quoteAmpersand is off, an "&" entity is stored back in its escaped form "&amp;"
            if (ch == '&' && !this.configuration.quoteAmpersand)
            {
                addCharToLexer('a');
                addCharToLexer('m');
                addCharToLexer('p');
                addCharToLexer(';');
            }
        }
    }

    /**
     * Reads the remainder of a tag name from the input into the lexer buffer. The first char of the name is expected
     * to be in the buffer already, at txtstart. Unless xmlTags is set, upper case chars are folded to lower case.
     * Sets txtend to the end of the name.
     * @return first char after the tag name (the END_OF_STREAM marker cast to char if the input ran out)
     */
    public char parseTagName()
    {
        // fold case of first char in buffer
        final int head = this.lexbuf[this.txtstart];
        if (!this.configuration.xmlTags && TidyUtils.isUpper((char) head))
        {
            this.lexbuf[this.txtstart] = (byte) TidyUtils.toLower((char) head);
        }

        int c;
        for (;;)
        {
            c = this.in.readChar();
            if (c == StreamIn.END_OF_STREAM || !TidyUtils.isNamechar((char) c))
            {
                break;
            }

            // fold case of subsequent chars
            if (!this.configuration.xmlTags && TidyUtils.isUpper((char) c))
            {
                c = TidyUtils.toLower((char) c);
            }

            addCharToLexer(c);
        }

        this.txtend = this.lexsize;
        return (char) c;
    }

    /**
     * Feeds every char of the string to {@link #addCharToLexer(int)}, in order.
     * @param str input String
     */
    public void addStringLiteral(String str)
    {
        final int total = str.length();
        int position = 0;
        while (position < total)
        {
            addCharToLexer(str.charAt(position));
            position++;
        }
    }

    /**
     * Feeds the leading chars of the string to {@link #addCharToLexer(int)}: at most len of them, and never more than
     * the string contains.
     * @param str input String
     * @param len length of the substring to be added
     */
    void addStringLiteralLen(String str, int len)
    {
        final int limit = Math.min(str.length(), len);
        int position = 0;
        while (position < limit)
        {
            addCharToLexer(str.charAt(position));
            position++;
        }
    }

    /**
     * Choose what version to use for new doctype. The first version still present in the set of possible versions
     * wins, in this order: HTML 2.0, HTML 3.2 (only when neither XML output, XML tags nor an XHTML document is
     * involved), XHTML 1.1, HTML 4 strict, HTML 4 loose, frameset.
     * @return html version constant, or HtmlVersion.UNKNOWN if none applies
     */
    public HtmlVersion htmlVersion()
    {
        if (versions.contains(HtmlVersion.HTML20))
        {
            return HtmlVersion.HTML20;
        }

        final boolean xmlInvolved = this.configuration.xmlOut || this.configuration.xmlTags || this.isvoyager;
        if (!xmlInvolved && versions.contains(HtmlVersion.HTML32))
        {
            return HtmlVersion.HTML32;
        }

        // remaining candidates, most preferred first
        final HtmlVersion[] candidates = {
            HtmlVersion.XHTML11,
            HtmlVersion.HTML40_STRICT,
            HtmlVersion.HTML40_LOOSE,
            HtmlVersion.FRAMESET};

        for (HtmlVersion candidate : candidates)
        {
            if (versions.contains(candidate))
            {
                return candidate;
            }
        }

        return HtmlVersion.UNKNOWN;
    }

    /**
     * Gives the name of the apparent version of the document, looked up in the table of known versions. The XHTML
     * name is used for XHTML documents, the HTML name otherwise.
     * @return html version name, or null if the apparent version is not in the table
     */
    public String htmlVersionName()
    {
        final HtmlVersion guessed = apparentVersion();

        for (W3CVersionInfo info : W3CVERSION)
        {
            if (info.code != guessed)
            {
                continue;
            }
            // first table entry with a matching code decides
            return this.isvoyager ? info.voyagerName : info.name;
        }

        return null;
    }

    /**
     * Add meta element for Tidy. If a generator meta tag whose content starts with "HTML Tidy" is already present in
     * the head, its content is updated instead.
     * @param root root node
     * @return true if the tag has been added; false if an existing tag was updated or there is no head element
     */
    public boolean addGenerator(Node root)
    {
        final Node head = root.findHEAD(this.configuration.tt);
        if (head == null)
        {
            return false;
        }

        final String meta = "HTML Tidy for Java (vers. " + Report.RELEASE_DATE_STRING + "), see jtidy.sourceforge.net";

        for (Node child = head.content; child != null; child = child.next)
        {
            if (child.tag != this.configuration.tt.tagMeta)
            {
                continue;
            }

            final AttVal name = child.getAttrByName("name");
            if (name == null || !"generator".equalsIgnoreCase(name.value))
            {
                continue;
            }

            final AttVal content = child.getAttrByName("content");
            final boolean writtenByTidy = content != null
                && content.value != null
                && content.value.length() >= 9
                && "HTML Tidy".equalsIgnoreCase(content.value.substring(0, 9));

            if (writtenByTidy)
            {
                // refresh the existing tag rather than adding a second one
                content.value = meta;
                return false;
            }
        }

        final Node generator = this.inferredTag("meta");
        generator.addAttribute("content", meta);
        generator.addAttribute("name", "generator");
        head.insertNodeAtStart(generator);
        return true;
    }

    /**
     * Check the doctype keywords (keywords should be uppercase). The keywords SYSTEM, PUBLIC, //DTD, //W3C and //EN
     * are checked, in that order, with TidyUtils.findBadSubString against the doctype text.
     * @param doctype doctype node
     * @return true if none of the keywords is reported as bad
     */
    public boolean checkDocTypeKeyWords(Node doctype)
    {
        final String text = TidyUtils.getString(this.lexbuf, doctype.start, doctype.end - doctype.start);
        final String[] keywords = {"SYSTEM", "PUBLIC", "//DTD", "//W3C", "//EN"};

        for (String keyword : keywords)
        {
            if (TidyUtils.findBadSubString(keyword, text, text.length()))
            {
                return false;
            }
        }
        return true;
    }

    /**
     * Examine DOCTYPE to identify version.
     * <p>
     * The doctype must be for an "html" root. A SYSTEM-only doctype is not recognized (its keyword is merely
     * upper-cased in the buffer). A doctype with nothing but white space after the root name is taken to be HTML5.
     * Otherwise the first quoted string is inspected: after a "-//W3C//DTD " prefix the identifier is looked up by
     * name in the whole table of known versions (first match wins); after a "-//IETF//DTD " prefix only the HTML 2.0
     * entries of the table are considered.
     * @param doctype doctype node
     * @return version code, or HtmlVersion.UNKNOWN if the doctype is not recognized
     */
    public HtmlVersion findGivenVersion(Node doctype)
    {
        String p, s;
        int i, j;
        int len;
        String str1;
        String str2;

        // if root tag for doctype isn't html give up now
        str1 = TidyUtils.getString(this.lexbuf, doctype.start, 4);
        if (!("html".equalsIgnoreCase(str1)))
        {
            return HtmlVersion.UNKNOWN;
        }

        if (!checkDocTypeKeyWords(doctype))
        {
            report.warning(this, doctype, null, Report.DTYPE_NOT_UPPER_CASE);
        }

        // give up if all we are given is the system id for the doctype
        str1 = TidyUtils.getString(this.lexbuf, doctype.start + 5, 7);
        if ("SYSTEM ".equalsIgnoreCase(str1))
        {
            // but at least ensure the case is correct
            if (!str1.substring(0, 6).equals("SYSTEM"))
            {
                System.arraycopy(TidyUtils.getBytes("SYSTEM"), 0, this.lexbuf, doctype.start + 5, 6);
            }
            return HtmlVersion.UNKNOWN; // unrecognized
        }

        if ("PUBLIC ".equalsIgnoreCase(str1))
        {
            if (!str1.substring(0, 6).equals("PUBLIC"))
            {
                System.arraycopy(TidyUtils.getBytes("PUBLIC "), 0, this.lexbuf, doctype.start + 5, 6);
            }
        }
        else if (str1.trim().isEmpty())
        {
            return HtmlVersion.HTML5;
        }
        else
        {
            this.badDoctype = true;
        }

        for (i = doctype.start; i < doctype.end; ++i)
        {
            if (this.lexbuf[i] == (byte) '"')
            {
                str1 = TidyUtils.getString(this.lexbuf, i + 1, 12);
                str2 = TidyUtils.getString(this.lexbuf, i + 1, 13);
                if (str1.equals("-//W3C//DTD "))
                {
                    // compute length of identifier e.g. "HTML 4.0 Transitional"
                    for (j = i + 13; j < doctype.end && this.lexbuf[j] != (byte) '/'; ++j)
                    {
                        //
                    }
                    len = j - i - 13;
                    p = TidyUtils.getString(this.lexbuf, i + 13, len);

                    // Search the whole table, starting at entry 0: the first entry is "HTML 4.01" (strict), and
                    // skipping it would let the later duplicate name registered for XHTML 1.1 match instead.
                    for (j = 0; j < W3CVERSION.length; ++j)
                    {
                        s = W3CVERSION[j].name;
                        if (len == s.length() && s.equals(p))
                        {
                            return W3CVERSION[j].code;
                        }
                    }

                    // else unrecognized version
                }
                else if (str2.equals("-//IETF//DTD "))
                {
                    // compute length of identifier e.g. "HTML 2.0"
                    for (j = i + 14; j < doctype.end && this.lexbuf[j] != (byte) '/'; ++j)
                    {
                        //
                    }
                    len = j - i - 14;

                    p = TidyUtils.getString(this.lexbuf, i + 14, len);

                    // Only HTML 2.0 is expected under the IETF prefix; find it by code rather than by table
                    // position, since it is not the first entry of the table.
                    for (j = 0; j < W3CVERSION.length; ++j)
                    {
                        if (W3CVERSION[j].code != HtmlVersion.HTML20)
                        {
                            continue;
                        }
                        s = W3CVERSION[j].name;
                        if (len == s.length() && s.equals(p))
                        {
                            return W3CVERSION[j].code;
                        }
                    }

                    // else unrecognized version
                }
                // only the first quoted string is examined
                break;
            }
        }

        return HtmlVersion.UNKNOWN;
    }

    /**
     * Makes sure the html element carries an <code>xmlns</code> attribute whose value is the given profile.
     * The children of <code>root</code> are searched for the html element; if none is found nothing happens. An
     * existing <code>xmlns</code> attribute with a different (or missing) value is overwritten after an
     * INCONSISTENT_NAMESPACE warning; if the attribute is absent, a new one is prepended to the attribute list.
     * @param root root Node
     * @param profile namespace value the html element must declare
     */
    public void fixHTMLNameSpace(Node root, String profile)
    {
        Node node = root.content;
        while (node != null && node.tag != this.configuration.tt.tagHtml)
        {
            node = node.next;
        }

        if (node == null)
        {
            return;
        }

        // Constant-first comparison so that an attribute entry without a name cannot cause a
        // NullPointerException (NOTE(review): presumably possible for server-side pseudo attributes - confirm).
        AttVal attr = node.attributes;
        while (attr != null && !"xmlns".equals(attr.attribute))
        {
            attr = attr.next;
        }

        if (attr == null)
        {
            attr = new AttVal(node.attributes, null, '"', "xmlns", profile);
            attr.dict = AttributeTable.getDefaultAttributeTable().findAttribute(attr);
            node.attributes = attr;
            return;
        }

        // null-safe comparison: an xmlns attribute without a value counts as a mismatch instead of crashing
        final boolean sameNamespace = (profile == null) ? (attr.value == null) : profile.equals(attr.value);
        if (!sameNamespace)
        {
            report.warning(this, node, null, Report.INCONSISTENT_NAMESPACE);
            attr.value = profile;
        }
    }

    /**
     * Creates an empty DOCTYPE node and links it into the tree immediately before the html element, i.e. after
     * the &lt;?xml version="1.0" ... ?&gt; declaration, comments or anything else that precedes the html tag.
     * @param root root node
     * @return the newly linked doctype node, or <code>null</code> when the tree has no html element
     */
    Node newXhtmlDocTypeNode(Node root)
    {
        final Node htmlNode = root.findHTML(this.configuration.tt);
        if (htmlNode == null)
        {
            return null;
        }

        final Node inserted = newNode();
        inserted.setType(Node.DOCTYPE_TAG);
        inserted.parent = root;
        inserted.next = htmlNode;

        if (root.content == htmlNode)
        {
            // html is the first child (no xml declaration before it): the doctype becomes the new first child
            inserted.prev = null;
            root.content = inserted;
        }
        else
        {
            // something (e.g. an xml declaration) precedes html: splice the doctype in after it
            final Node before = htmlNode.prev;
            inserted.prev = before;
            before.next = inserted;
        }

        htmlNode.prev = inserted;
        return inserted;
    }

    /**
     * Rewrites the document's doctype as an XHTML doctype, creating the doctype node if the document has none.
     * The html element's namespace is fixed first. The public and system identifiers are chosen from the
     * configured doctype mode (auto, strict, loose or user supplied); an internal DTD subset found in an existing
     * doctype is carried over when XHTML or XML output is configured.
     * @param root root node
     * @return <code>true</code> only when the doctype mode is "omit" (an existing doctype is then discarded);
     * <code>false</code> on every other path, including after the doctype text has been rebuilt.
     * NOTE(review): the final <code>return false</code> looks like it was meant to be <code>true</code>; it is
     * kept as is because callers may rely on it - confirm before changing.
     */
    public boolean setXHTMLDocType(Node root)
    {
        String fpi = " ";
        String sysid = "";
        String dtdsub = null;
        Node doctype;
        int dtdlen = 0;

        doctype = root.findDocType();

        fixHTMLNameSpace(root, XHTML_NAMESPACE); // #427839 - fix by Evan Lenz 05 Sep 00

        if (this.configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
        {
            if (doctype != null)
            {
                Node.discardElement(doctype);
            }
            return true;
        }

        if (this.configuration.docTypeMode == Configuration.DOCTYPE_AUTO)
        {
            // see what flavor of XHTML this document matches
            if (versions.contains(HtmlVersion.HTML40_STRICT))
            {
                // use XHTML strict
                fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
                sysid = VOYAGER_STRICT;
            }
            else if (versions.contains(HtmlVersion.FRAMESET))
            {
                // use XHTML frames
                fpi = "-//W3C//DTD XHTML 1.0 Frameset//EN";
                sysid = VOYAGER_FRAMESET;
            }
            else if (TidyUtils.containsAny(versions, Dict.VERS_LOOSE))
            {
                fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
                sysid = VOYAGER_LOOSE;
            }
            else if (versions.contains(HtmlVersion.XHTML11))
            {
                // use XHTML 1.1
                fpi = "-//W3C//DTD XHTML 1.1//EN";
                sysid = VOYAGER_11;
            }
            else
            {
                // proprietary: no doctype can be chosen, so drop any existing one and bail out below
                fpi = null;
                sysid = "";
                if (doctype != null) // #473490 - fix by Bjoern Hoehrmann 10 Oct 01
                {
                    Node.discardElement(doctype);
                }
            }
        }
        else if (this.configuration.docTypeMode == Configuration.DOCTYPE_STRICT)
        {
            fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
            sysid = VOYAGER_STRICT;
        }
        else if (this.configuration.docTypeMode == Configuration.DOCTYPE_LOOSE)
        {
            fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
            sysid = VOYAGER_LOOSE;
        }

        if (this.configuration.docTypeMode == Configuration.DOCTYPE_USER && this.configuration.docTypeStr != null)
        {
            fpi = this.configuration.docTypeStr;
            sysid = "";
        }

        if (fpi == null)
        {
            return false;
        }

        if (doctype != null)
        {
            // Look for internal DTD subset
            if (configuration.xHTML || configuration.xmlOut)
            {

                int len = doctype.end - doctype.start + 1;
                String start = TidyUtils.getString(this.lexbuf, doctype.start, len);

                int dtdbeg = start.indexOf('[');
                if (dtdbeg >= 0)
                {
                    int dtdend = start.substring(dtdbeg).indexOf(']');
                    if (dtdend >= 0)
                    {
                        // keep "[ ... ]" inclusive; dtdsub starts at '[' and dtdlen bounds how much is emitted
                        dtdlen = dtdend + 1;
                        dtdsub = start.substring(dtdbeg);
                    }
                }
            }
        }
        else
        {
            if ((doctype = newXhtmlDocTypeNode(root)) == null)
            {
                return false;
            }
        }

        // the new doctype text is assembled at the end of the lexer buffer and copied out afterwards
        this.txtstart = this.lexsize;
        this.txtend = this.lexsize;

        // add public identifier
        addStringLiteral("html PUBLIC ");

        // check if the fpi is quoted or not; startsWith (unlike charAt(0)) is also safe when the
        // user-supplied doctype string is empty
        if (fpi.startsWith("\""))
        {
            addStringLiteral(fpi);
        }
        else
        {
            addStringLiteral("\"");
            addStringLiteral(fpi);
            addStringLiteral("\"");
        }

        if (this.configuration.wraplen != 0 && sysid.length() + 6 >= this.configuration.wraplen)
        {
            addStringLiteral("\n\"");
        }
        else
        {
            // FG: don't wrap
            addStringLiteral(" \"");
        }

        // add system identifier
        addStringLiteral(sysid);
        addStringLiteral("\"");

        if (dtdlen > 0 && dtdsub != null)
        {
            addCharToLexer(' ');
            addStringLiteralLen(dtdsub, dtdlen);
        }

        this.txtend = this.lexsize;

        // give the doctype node its own copy of the freshly written text
        int length = this.txtend - this.txtstart;
        doctype.textarray = new byte[length];

        System.arraycopy(this.lexbuf, this.txtstart, doctype.textarray, 0, length);
        doctype.start = 0;
        doctype.end = length;

        return false;
    }

    /**
     * Return the html version used in document. When the declared doctype is unknown the result of
     * <code>htmlVersion()</code> is returned directly. When the declared version is among the versions still
     * considered possible for the document, the declared version is returned. Otherwise an INCONSISTENT_VERSION
     * warning is reported and <code>htmlVersion()</code> is returned.
     * @return version code
     */
    public HtmlVersion apparentVersion()
    {
        final HtmlVersion declared = this.doctype;

        switch (declared)
        {
            case UNKNOWN :
                return htmlVersion();

            case HTML20 :
            case HTML32 :
            case HTML40_STRICT :
            case HTML40_LOOSE :
            case FRAMESET :
            case XHTML11 :
            case HTML5 :
                // the declaration is trusted only while the content is still compatible with it
                if (versions.contains(declared))
                {
                    return declared;
                }
                break; // to replace old version by new

            default :
                // should never reach here
                break;
        }

        // kludge to avoid error appearing at end of file
        // it would be better to note the actual position
        // when first encountering the doctype declaration
        this.lines = 1;
        this.columns = 1;

        report.warning(this, null, null, Report.INCONSISTENT_VERSION);
        return this.htmlVersion();
    }

    /**
     * Fixup doctype if missing or inconsistent with the document. Depending on the configured doctype mode the
     * existing doctype is removed (omit), replaced by a strict/loose HTML 4 doctype, replaced by the user
     * supplied identifier, or - in auto mode - kept when it agrees with the detected versions and otherwise
     * replaced by the version returned from <code>htmlVersion()</code>.
     * @param root root node
     * @return <code>true</code> when the doctype was omitted, left untouched (XML output, or an auto-mode doctype
     * consistent with the content) or rewritten; <code>false</code> when an existing auto-mode doctype is of
     * unknown version, when no version could be chosen, or when there is no html element to attach a new doctype
     * to
     */
    public boolean fixDocType(Node root)
    {
        Node doctype;
        HtmlVersion guessed = HtmlVersion.HTML40_STRICT;

        if (this.badDoctype)
        {
            report.warning(this, null, null, Report.MALFORMED_DOCTYPE);
        }

        doctype = root.findDocType();

        if (this.configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
        {
            if (doctype != null)
            {
                Node.discardElement(doctype);
            }
            return true;
        }

        if (this.configuration.xmlOut)
        {
            return true;
        }

        if (this.configuration.docTypeMode == Configuration.DOCTYPE_STRICT)
        {
            Node.discardElement(doctype);
            doctype = null;
            guessed = HtmlVersion.HTML40_STRICT;
        }
        else if (this.configuration.docTypeMode == Configuration.DOCTYPE_LOOSE)
        {
            Node.discardElement(doctype);
            doctype = null;
            guessed = HtmlVersion.HTML40_LOOSE;
        }
        else if (this.configuration.docTypeMode == Configuration.DOCTYPE_AUTO)
        {
            if (doctype != null)
            {
                switch (this.doctype)
                {
                    case UNKNOWN :
                        return false;

                    case HTML20 :
                    case HTML32 :
                    case HTML40_STRICT :
                    case HTML40_LOOSE :
                    case FRAMESET :
                    case XHTML11 :
                    case HTML5 : // same treatment as in apparentVersion(): keep a consistent HTML5 doctype
                        if (versions.contains(this.doctype))
                        {
                            // the declared version is consistent with the content: keep the doctype
                            return true;
                        }

                        break; // to replace old version by new

                    default :
                        // should never reach here
                        break;
                }

                // INCONSISTENT_VERSION warning is now issued by ApparentVersion()
            }

            // choose new doctype
            guessed = htmlVersion();
        }

        if (guessed == HtmlVersion.UNKNOWN)
        {
            return false;
        }

        // for XML use the Voyager system identifier
        if (this.configuration.xmlOut || this.configuration.xmlTags || this.isvoyager)
        {
            // NOTE(review): the local variable still refers to the discarded node afterwards, so no new
            // doctype node is created below and the text is written to the discarded one - confirm intent.
            if (doctype != null)
            {
                Node.discardElement(doctype);
            }

            // Namespace is the same for all XHTML variants.
            // Also, don't return yet. Still need to add DOCTYPE declaration.
            fixHTMLNameSpace(root, XHTML_NAMESPACE);
        }

        if (doctype == null)
        {
            if ((doctype = newXhtmlDocTypeNode(root)) == null)
            {
                return false;
            }
        }

        // the new doctype text is assembled at the end of the lexer buffer and copied out afterwards
        this.txtstart = this.lexsize;
        this.txtend = this.lexsize;

        // use the appropriate public identifier
        addStringLiteral("html PUBLIC ");

        if (this.configuration.docTypeMode == Configuration.DOCTYPE_USER
            && this.configuration.docTypeStr != null
            && this.configuration.docTypeStr.length() > 0)
        {
            // check if the fpi is quoted or not
            if (this.configuration.docTypeStr.charAt(0) == '"')
            {
                addStringLiteral(this.configuration.docTypeStr);
            }
            else
            {
                addStringLiteral("\""); // #431889 - fix by Dave Bryan 04 Jan 2001
                addStringLiteral(this.configuration.docTypeStr);
                addStringLiteral("\""); // #431889 - fix by Dave Bryan 04 Jan 2001
            }
        }
        else if (guessed == HtmlVersion.HTML20)
        {
            addStringLiteral("\"-//IETF//DTD HTML 2.0//EN\"");
        }
        else
        {
            addStringLiteral("\"-//W3C//DTD ");

            for (int i = 0; i < W3CVERSION.length; ++i)
            {
                if (guessed == W3CVERSION[i].code)
                {
                    addStringLiteral(W3CVERSION[i].name);
                    break;
                }
            }

            addStringLiteral("//EN\"");
        }

        this.txtend = this.lexsize;

        // give the doctype node its own copy of the freshly written text
        int length = this.txtend - this.txtstart;
        doctype.textarray = new byte[length];

        System.arraycopy(this.lexbuf, this.txtstart, doctype.textarray, 0, length);
        doctype.start = 0;
        doctype.end = length;

        return true;
    }

    /**
     * Ensure the document starts with an XML declaration carrying a <code>version</code> attribute. A declaration
     * node is created and inserted at the start of <code>root</code> when the first child is not one already. If
     * the declaration has no <code>encoding</code> attribute and the output encoding is ISO8859_1 or ISO2022, the
     * matching <code>encoding</code> attribute is added; a missing <code>version</code> is set to "1.0".
     * @param root root node
     * @return always true
     */
    public boolean fixXmlDecl(Node root)
    {
        Node decl = root.content;
        if (decl == null || decl.type != Node.XML_DECL)
        {
            decl = newNode(Node.XML_DECL, this.lexbuf, 0, 0);
            root.insertNodeAtStart(decl);
        }

        final AttVal versionAttr = decl.getAttrByName("version");
        final AttVal encodingAttr = decl.getAttrByName("encoding");

        // We need to insert a check if declared encoding and output encoding mismatch
        // and fix the Xml declaration accordingly!!!
        if (encodingAttr == null)
        {
            final String outEncoding = this.configuration.getOutCharEncodingName();
            if (!"UTF8".equals(outEncoding))
            {
                if ("ISO8859_1".equals(outEncoding))
                {
                    decl.addAttribute("encoding", "iso-8859-1");
                }
                else if ("ISO2022".equals(outEncoding))
                {
                    decl.addAttribute("encoding", "iso-2022");
                }
            }
        }

        if (versionAttr == null)
        {
            decl.addAttribute("version", "1.0");
        }

        return true;
    }

    /**
     * Creates a start-tag node for an element that was not present in the input. The node is built over the
     * lexer's current text range and flagged as implicit; linking it into the tree is left to the caller.
     * @param name tag name
     * @return generated node
     */
    public Node inferredTag(String name)
    {
        final Node inferred = newNode(Node.START_TAG, this.lexbuf, this.txtstart, this.txtend, name);
        inferred.implicit = true;
        return inferred;
    }

    // Scanner states used by getCDATA(Node). Scanning starts in CDATA_INTERMEDIATE, the state in which
    // ordinary content is consumed until a '<' is read.
    private static final int CDATA_INTERMEDIATE = 0;
    // NOTE(review): presumably "inside a candidate start tag"; the code using it is not visible here - confirm.
    private static final int CDATA_STARTTAG = 1;
    // NOTE(review): presumably "inside a candidate end tag"; the code using it is not visible here - confirm.
    private static final int CDATA_ENDTAG = 2;

    /**
     * Create a text node for the contents of a CDATA element like style or script which
     * ends with </foo> for some foo.
     * @param container container node
     * @return cdata node
     */
    public Node getCDATA(Node container)
    {
        int start = 0;
        int nested = 0;
        int state = CDATA_INTERMEDIATE;
        int c;
        boolean isEmpty = true;
        boolean matches = false;
        boolean hasSrc = container.getAttrByName("src") != null;

        this.lines = this.in.getCurline();
        this.columns = this.in.getCurcol();
        this.waswhite = false;
        this.txtstart = this.lexsize;
        this.txtend = this.lexsize;

        /* seen start tag, look for matching end tag */
        while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM) {
        	addCharToLexer(c);
        	txtend = lexsize;

            if (state == CDATA_INTERMEDIATE) {
            	if (c != '<') {
                    if (isEmpty && !TidyUtils.isWhite((char) c)) {
                        isEmpty = false;
                    }
                    continue;
                }
            	c = in.readChar();
            	if (TidyUtils.isLetter((char) c)) {
            		/*