All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.w3c.tidy.Lexer Maven / Gradle / Ivy

Go to download

JTidy is a Java port of HTML Tidy, an HTML syntax checker and pretty printer. Like its non-Java cousin, JTidy can be used as a tool for cleaning up malformed and faulty HTML. In addition, JTidy provides a DOM interface to the document being processed, which lets you use JTidy as a DOM parser for real-world HTML.

There is a newer version: 1.0.5
Show newest version
/*
 *  Java HTML Tidy - JTidy
 *  HTML parser and pretty printer
 *
 *  Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
 *  Institute of Technology, Institut National de Recherche en
 *  Informatique et en Automatique, Keio University). All Rights
 *  Reserved.
 *
 *  Contributing Author(s):
 *
 *     Dave Raggett 
 *     Andy Quick  (translation to Java)
 *     Gary L Peskin  (Java development)
 *     Sami Lempinen  (release management)
 *     Fabrizio Giustina 
 *
 *  The contributing author(s) would like to thank all those who
 *  helped with testing, bug fixes, and patience.  This wouldn't
 *  have been possible without all of you.
 *
 *  COPYRIGHT NOTICE:
 *
 *  This software and documentation is provided "as is," and
 *  the copyright holders and contributing author(s) make no
 *  representations or warranties, express or implied, including
 *  but not limited to, warranties of merchantability or fitness
 *  for any particular purpose or that the use of the software or
 *  documentation will not infringe any third party patents,
 *  copyrights, trademarks or other rights.
 *
 *  The copyright holders and contributing author(s) will not be
 *  liable for any direct, indirect, special or consequential damages
 *  arising out of any use of the software or documentation, even if
 *  advised of the possibility of such damage.
 *
 *  Permission is hereby granted to use, copy, modify, and distribute
 *  this source code, or portions hereof, documentation and executables,
 *  for any purpose, without fee, subject to the following restrictions:
 *
 *  1. The origin of this source code must not be misrepresented.
 *  2. Altered versions must be plainly marked as such and must
 *     not be misrepresented as being the original source.
 *  3. This Copyright notice may not be removed or altered from any
 *     source or altered source distribution.
 *
 *  The copyright holders and contributing author(s) specifically
 *  permit, without fee, and encourage the use of this source code
 *  as a component for supporting the Hypertext Markup Language in
 *  commercial products. If you use this source code in a product,
 *  acknowledgment is not required but would be appreciated.
 *
 */
package org.w3c.tidy;

import java.io.PrintWriter;
import java.util.List;
import java.util.Stack;
import java.util.Vector;


/**
 * Lexer for html parser.
 * 

 * <p>
 * Given a file stream fp it returns a sequence of tokens. GetToken(fp) gets the next token; UngetToken(fp) provides
 * one level of undo.
 * </p>
 * <p>
 * The tags include an attribute list:
 * </p>
 * <ul>
 * <li>a linked list of attribute/value nodes;</li>
 * <li>each node has 2 null-terminated strings;</li>
 * <li>entities are replaced in attribute values.</li>
 * </ul>
 * <p>
 * White space is compacted if not in preformatted mode: leading white space is discarded and subsequent white space
 * sequences are compacted to single space chars.
 * </p>
 * <p>
 * If XmlTags is no then tag names are folded to lower case (see parseTagName) and attribute names to lower case.
 * </p>
 * <p>
 * Not yet done: Doctype subset and marked sections.
 * </p>

* @author Dave Raggett
 * @author Andy Quick (translation to Java)
 * @author Fabrizio Giustina
 * @version $Revision$ ($Author$)
 */
public class Lexer {

    /**
     * state: ignore whitespace.
     */
    public static final short IGNORE_WHITESPACE = 0;

    /**
     * state: mixed content.
     */
    public static final short MIXED_CONTENT = 1;

    /**
     * state: preformatted.
     */
    public static final short PREFORMATTED = 2;

    /**
     * state: ignore markup.
     */
    public static final short IGNORE_MARKUP = 3;

    /**
     * URI for XHTML 1.0 transitional DTD.
     */
    private static final String VOYAGER_LOOSE = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";

    /**
     * URI for XHTML 1.0 strict DTD.
     */
    private static final String VOYAGER_STRICT = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";

    /**
     * URI for XHTML 1.0 frameset DTD.
     */
    private static final String VOYAGER_FRAMESET = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd";

    /**
     * URI for XHTML 1.1.
     */
    private static final String VOYAGER_11 = "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd";

    // URI for XHTML Basic 1.0 (declaration is disabled).
    // private static final String VOYAGER_BASIC = "http://www.w3.org/TR/xhtml-basic/xhtml-basic10.dtd";

    /**
     * xhtml namespace.
     */
    private static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml";

    /**
     * lists all the known versions.
     */
    private static final Lexer.W3CVersionInfo[] W3CVERSION = {
        new W3CVersionInfo("HTML 4.01", "XHTML 1.0 Strict", VOYAGER_STRICT, Dict.VERS_HTML40_STRICT),
        new W3CVersionInfo("HTML 4.01 Transitional", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML40_LOOSE),
        new W3CVersionInfo("HTML 4.01 Frameset", "XHTML 1.0 Frameset", VOYAGER_FRAMESET, Dict.VERS_FRAMESET),
        new W3CVersionInfo("HTML 4.0", "XHTML 1.0 Strict", VOYAGER_STRICT, Dict.VERS_HTML40_STRICT),
        new W3CVersionInfo("HTML 4.0 Transitional", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML40_LOOSE),
        new W3CVersionInfo("HTML 4.0 Frameset", "XHTML 1.0 Frameset", VOYAGER_FRAMESET, Dict.VERS_FRAMESET),
        new W3CVersionInfo("HTML 3.2", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML32),
        new W3CVersionInfo("HTML 3.2 Final", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML32),
        new W3CVersionInfo("HTML 3.2 Draft", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML32),
        new W3CVersionInfo("HTML 2.0", "XHTML 1.0 Strict", VOYAGER_STRICT, Dict.VERS_HTML20),
        // NOTE(review): this entry is labelled "XHTML 1.1" but passes VOYAGER_STRICT rather than VOYAGER_11;
        // confirm whether that is intended.
        new W3CVersionInfo("HTML 4.01", "XHTML 1.1", VOYAGER_STRICT, Dict.VERS_XHTML11)};

    // getToken state constants. Note that the value 7 is not assigned to any state.

    /**
     * getToken state: content.
     */
    private static final short LEX_CONTENT = 0;

    /**
     * getToken state: gt.
     */
    private static final short LEX_GT = 1;

    /**
     * getToken state: endtag.
     */
    private static final short LEX_ENDTAG = 2;

    /**
     * getToken state: start tag.
     */
    private static final short LEX_STARTTAG = 3;

    /**
     * getToken state: comment.
     */
    private static final short LEX_COMMENT = 4;

    /**
     * getToken state: doctype.
     */
    private static final short LEX_DOCTYPE = 5;

    /**
     * getToken state: procinstr.
     */
    private static final short LEX_PROCINSTR = 6;

    /**
     * getToken state: cdata.
     */
    private static final short LEX_CDATA = 8;

    /**
     * getToken state: section.
     */
    private static final short LEX_SECTION = 9;

    /**
     * getToken state: asp.
     */
    private static final short LEX_ASP = 10;

    /**
     * getToken state: jste.
     */
    private static final short LEX_JSTE = 11;

    /**
     * getToken state: php.
     */
    private static final short LEX_PHP = 12;

    /**
     * getToken state: xml declaration.
     */
    private static final short LEX_XMLDECL = 13;

    /**
     * file stream.
     */
    protected StreamIn in;

    /**
     * error output stream.
     */
    protected PrintWriter errout;

    /**
     * for accessibility errors.
     */
    protected short badAccess;

    /**
     * for bad style errors.
     */
    protected short badLayout;

    /**
     * for bad char encodings.
     */
    protected short badChars;

    /**
     * for mismatched/mispositioned form tags.
     */
    protected short badForm;

    /**
     * count of warnings in this document.
     */
    protected short warnings;

    /**
     * count of errors.
     */
    protected short errors;

    /**
     * lines seen.
     */
    protected int lines;

    /**
     * at start of current token.
     */
    protected int columns;

    /**
     * used to collapse contiguous white space.
     */
    protected boolean waswhite;

    /**
     * true after token has been pushed back.
     */
    protected boolean pushed;

    /**
     * when space is moved after end tag.
     */
    protected boolean insertspace;

    /**
     * Netscape compatibility.
     */
    protected boolean excludeBlocks;

    /**
     * true if moved out of table.
     */
    protected boolean exiled;

    /**
     * true if xmlns attribute on html element.
     */
    protected boolean isvoyager;

    /**
     * bit vector of HTML versions.
     */
    protected short versions;

    /**
     * version as given by doctype (if any).
     */
    protected int doctype;

    /**
     * set if html or PUBLIC is missing.
     */
    protected boolean badDoctype;

    /**
     * start of current node.
     */
    protected int txtstart;

    /**
     * end of current node.
     */
    protected int txtend;

    /**
     * state of lexer's finite state machine.
     */
    protected short state;

    /**
     * current node.
     */
    protected Node token;

    /**
     * Lexer character buffer parse tree nodes span onto this buffer which contains the concatenated text contents of
     * all of the elements. Lexsize must be reset for each file. Byte buffer of UTF-8 chars.
     */
    protected byte[] lexbuf;

    /**
     * allocated.
     */
    protected int lexlength;

    /**
     * used.
     */
    protected int lexsize;

    /**
     * Inline stack for compatibility with Mosaic. For deferring text node.
*/ protected Node inode; /** * for inferring inline tags. */ protected int insert; /** * stack. */ protected Stack istack; /** * start of frame. */ protected int istackbase; /** * used for cleaning up presentation markup. */ protected Style styles; /** * configuration. */ protected Configuration configuration; /** * already seen end body tag? */ protected boolean seenEndBody; /** * already seen end html tag? */ protected boolean seenEndHtml; /** * report. */ protected Report report; /** * Root node is saved here. */ protected Node root; /** * node list. */ private List nodeList; /** * Instantiates a new Lexer. * @param in StreamIn * @param configuration configuation instance * @param report report instance, for reporting errors */ public Lexer(StreamIn in, Configuration configuration, Report report) { this.report = report; this.in = in; this.lines = 1; this.columns = 1; this.state = LEX_CONTENT; this.versions = (Dict.VERS_ALL | Dict.VERS_PROPRIETARY); this.doctype = Dict.VERS_UNKNOWN; this.insert = -1; this.istack = new Stack<>(); this.configuration = configuration; this.nodeList = new Vector<>(); } /** * Creates a new node and add it to nodelist. * @return Node */ public Node newNode() { Node node = new Node(); this.nodeList.add(node); return node; } /** * Creates a new node and add it to nodelist. * @param type node type: Node.ROOT_NODE | Node.DOCTYPE_TAG | Node.COMMENT_TAG | Node.PROC_INS_TAG | Node.TEXT_NODE | * Node.START_TAG | Node.END_TAG | Node.START_END_TAG | Node.CDATA_TAG | Node.SECTION_TAG | Node. ASP_TAG | * Node.JSTE_TAG | Node.PHP_TAG | Node.XML_DECL * @param textarray array of bytes contained in the Node * @param start start position * @param end end position * @return Node */ public Node newNode(short type, byte[] textarray, int start, int end) { Node node = new Node(type, textarray, start, end); this.nodeList.add(node); return node; } /** * Creates a new node and add it to nodelist. 
* @param type node type: Node.ROOT_NODE | Node.DOCTYPE_TAG | Node.COMMENT_TAG | Node.PROC_INS_TAG | Node.TEXT_NODE | * Node.START_TAG | Node.END_TAG | Node.START_END_TAG | Node.CDATA_TAG | Node.SECTION_TAG | Node. ASP_TAG | * Node.JSTE_TAG | Node.PHP_TAG | Node.XML_DECL * @param textarray array of bytes contained in the Node * @param start start position * @param end end position * @param element tag name * @return Node */ public Node newNode(short type, byte[] textarray, int start, int end, String element) { Node node = new Node(type, textarray, start, end, element, this.configuration.tt); this.nodeList.add(node); return node; } /** * Clones a node and add it to node list. * @param node Node * @return cloned Node */ public Node cloneNode(Node node) { Node cnode = node.cloneNode(false); this.nodeList.add(cnode); for (AttVal att = cnode.attributes; att != null; att = att.next) { if (att.asp != null) { this.nodeList.add(att.asp); } if (att.php != null) { this.nodeList.add(att.php); } } return cnode; } /** * Clones an attribute value and add eventual asp or php node to node list. * @param attrs original AttVal * @return cloned AttVal */ public AttVal cloneAttributes(AttVal attrs) { AttVal cattrs = (AttVal) attrs.clone(); for (AttVal att = cattrs; att != null; att = att.next) { if (att.asp != null) { this.nodeList.add(att.asp); } if (att.php != null) { this.nodeList.add(att.php); } } return cattrs; } /** * Update oldtextarray in the current nodes. * @param oldtextarray previous text array * @param newtextarray new text array */ protected void updateNodeTextArrays(byte[] oldtextarray, byte[] newtextarray) { Node node; for (Object aNodeList : this.nodeList) { node = (Node) aNodeList; if (node.textarray == oldtextarray) { node.textarray = newtextarray; } } } /** * Adds a new line node. Used for creating preformatted text from Word2000. 
* @return new line node */ public Node newLineNode() { Node node = newNode(); node.textarray = this.lexbuf; node.start = this.lexsize; addCharToLexer('\n'); node.end = this.lexsize; return node; } /** * Has end of input stream been reached? * @return true if end of input stream been reached */ public boolean endOfInput() { return this.in.isEndOfStream(); } /** * Adds a byte to lexer buffer. * @param c byte to add */ public void addByte(int c) { if (this.lexsize + 1 >= this.lexlength) { while (this.lexsize + 1 >= this.lexlength) { if (this.lexlength == 0) { this.lexlength = 8192; } else { this.lexlength = this.lexlength * 2; } } byte[] temp = this.lexbuf; this.lexbuf = new byte[this.lexlength]; if (temp != null) { System.arraycopy(temp, 0, this.lexbuf, 0, temp.length); updateNodeTextArrays(temp, this.lexbuf); } } this.lexbuf[this.lexsize++] = (byte) c; this.lexbuf[this.lexsize] = (byte) '\0'; // debug } /** * Substitute the last char in buffer. * @param c new char */ public void changeChar(byte c) { if (this.lexsize > 0) { this.lexbuf[this.lexsize - 1] = c; } } /** * Store char c as UTF-8 encoded byte stream. * @param c char to store */ public void addCharToLexer(int c) { // Allow only valid XML characters. See: http://www.w3.org/TR/2004/REC-xml-20040204/#NT-Char // Fix by Pablo Mayrgundter 17-08-2004 if ((this.configuration.xmlOut || this.configuration.xHTML) // only for xml output && !((c >= 0x20 && c <= 0xD7FF) // Check the common-case first. || c == 0x9 || c == 0xA || c == 0xD // Then white-space. || (c >= 0xE000 && c <= 0xFFFD) // Then high-range unicode. 
|| (c >= 0x10000 && c <= 0x10FFFF)))
        {
            return;
        }

        int i = 0;
        int[] count = new int[]{0};
        byte[] buf = new byte[10]; // unsigned char

        boolean err = EncodingUtils.encodeCharToUTF8Bytes(c, buf, null, count);
        if (err)
        {
            // replacement char 0xFFFD encoded as UTF-8
            buf[0] = (byte) 0xEF;
            buf[1] = (byte) 0xBF;
            buf[2] = (byte) 0xBD;
            count[0] = 3;
        }

        // hand each of the count[0] encoded bytes to addByte
        for (i = 0; i < count[0]; i++)
        {
            addByte(buf[i]); // uint
        }
    }

    /**
     * Adds a string to lexer buffer, one character at a time via {@link #addCharToLexer(int)}.
     * @param str String to add
     */
    public void addStringToLexer(String str)
    {
        for (int i = 0; i < str.length(); i++)
        {
            addCharToLexer(str.charAt(i));
        }
    }

    /**
     * Parse an html entity or numeric character reference. The leading '&amp;' is expected to be the last character
     * already in the lexer buffer; the characters that follow are read from the input, looked up in the default
     * entity table and, when recognised, replaced in the buffer by the character they stand for.
     * @param mode lexer mode flags; only the PREFORMATTED bit is consulted here
     */
    public void parseEntity(short mode)
    {
        // No longer attempts to insert missing ';' for unknown
        // entities unless one was present already, since this
        // gives unexpected results.
        //
        // For example (illustrative):   <a href="something.htm?foo&bar&fred">
        // was tidied to:                <a href="something.htm?foo&amp;bar;&amp;fred;">
        // rather than:                  <a href="something.htm?foo&amp;bar&amp;fred">
        //
        // My thanks for Maurice Buxton for spotting this.
        //
        // Also Randy Waki pointed out the following case for the
        // 04 Aug 00 version (bug #433012):
        //
        // For example (illustrative):   <a href="something.htm?id=1&lang=en">
        // was tidied to:                <a href="something.htm?id=1&lang;=en">
        // rather than:                  <a href="something.htm?id=1&amp;lang=en">
        //
        // where "lang" is a known entity (#9001), but browsers would
        // misinterpret "&lang;" because it had a value > 256.
        //
        // So the case of an apparently known entity with a value > 256 and
        // missing a semicolon is handled specially.
        //
        // "ParseEntity" is also a bit of a misnomer - it handles entities and
        // numeric character references. Invalid NCR's are now reported.

        int start;
        boolean first = true;
        boolean semicolon = false;
        int c, ch, startcol;
        String str;

        start = this.lexsize - 1; // to start at "&"
        startcol = this.in.getCurcol() - 1;

        // collect the entity name (or "#" plus digits) into the lexer buffer
        while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
        {
            if (c == ';')
            {
                // terminator is consumed but not stored; remembered so it can be re-added for unknown entities
                semicolon = true;
                break;
            }

            // '#' is only meaningful directly after the '&'
            if (first && c == '#')
            {
                // #431953 - start RJ
                // numeric references are left untouched when ncr is disabled or for BIG5 / SHIFTJIS input
                if (!this.configuration.ncr
                    || "BIG5".equals(this.configuration.getInCharEncodingName())
                    || "SHIFTJIS".equals(this.configuration.getInCharEncodingName()))
                {
                    this.in.ungetChar(c);
                    return;
                }
                // #431953 - end RJ

                addCharToLexer(c);
                first = false;
                continue;
            }

            first = false;

            if (TidyUtils.isNamechar((char) c))
            {
                addCharToLexer(c);
                continue;
            }

            // otherwise put it back

            this.in.ungetChar(c);
            break;
        }

        // text of the reference as buffered so far, starting at the '&'
        str = TidyUtils.getString(this.lexbuf, start, this.lexsize - start);

        // 39 is the character code of the apostrophe
        if ("&apos".equals(str) && !configuration.xmlOut && !this.isvoyager && !configuration.xHTML)
        {
            report.entityError(this, Report.APOS_UNDEFINED, str, 39);
        }

        ch = EntityTable.getDefaultEntityTable().entityCode(str);

        // Disabled code, kept for reference:
        // drops invalid numeric entities from XML mode. Fix by Pablo Mayrgundter 17-08-2004
        // if ((this.configuration.xmlOut || this.configuration.xHTML) // only for xml output
        // && !((ch >= 0x20 && ch <= 0xD7FF) // Check the common-case first.
        // || ch == 0x9 || ch == 0xA || ch == 0xD // Then white-space.
        // || (ch >= 0xE000 && ch <= 0xFFFD)))
        // {
        // this.lexsize = start;
        // return;
        // }

        // deal with unrecognized or invalid entities
        // #433012 - fix by Randy Waki 17 Feb 01
        // report invalid NCR's - Terry Teague 01 Sep 01
        if (ch <= 0 || (ch >= 256 && c != ';'))
        {
            // set error position just before offending character
            this.lines = this.in.getCurline();
            this.columns = startcol;

            // more than just the '&' was buffered
            if (this.lexsize > start + 1)
            {
                // codes 128..159 are treated as invalid and optionally remapped
                if (ch >= 128 && ch <= 159)
                {
                    // invalid numeric character reference
                    int c1 = 0;

                    if ("WIN1252".equals(configuration.replacementCharEncoding))
                    {
                        c1 = EncodingUtils.decodeWin1252(ch);
                    }
                    else if ("MACROMAN".equals(configuration.replacementCharEncoding))
                    {
                        c1 = EncodingUtils.decodeMacRoman(ch);
                    }

                    // "or" DISCARDED_CHAR with the other errors if discarding char; otherwise default is replacing
                    int replaceMode = c1 != 0 ? Report.REPLACED_CHAR : Report.DISCARDED_CHAR;

                    // issue warning if not terminated by ';'
                    if (c != ';')
                    {
                        report.entityError(this, Report.MISSING_SEMICOLON_NCR, str, c);
                    }

                    report.encodingError(this, (short) (Report.INVALID_NCR | replaceMode), ch);

                    if (c1 != 0)
                    {
                        // make the replacement: rewind to the '&' and emit the decoded character instead
                        this.lexsize = start;
                        addCharToLexer(c1);
                        semicolon = false;
                    }
                    else
                    {
                        // discard: rewind to the '&' so the whole reference is dropped
                        this.lexsize = start;
                        semicolon = false;
                    }
                }
                else
                {
                    report.entityError(this, Report.UNKNOWN_ENTITY, str, ch);
                }

                // an unknown entity keeps its text; restore the ';' only if the input had one
                if (semicolon)
                {
                    addCharToLexer(';');
                }
            }
            else
            {
                // naked &
                report.entityError(this, Report.UNESCAPED_AMPERSAND, str, ch);
            }
        }
        else
        {
            // issue warning if not terminated by ';'
            if (c != ';')
            {
                // set error position just before offending character
                this.lines = this.in.getCurline();
                this.columns = startcol;
                report.entityError(this, Report.MISSING_SEMICOLON, str, c);
            }

            // rewind to the '&' and replace the reference with the character it denotes
            this.lexsize = start;

            // 160 is the no-break space; it becomes a plain space when the PREFORMATTED bit is set
            if (ch == 160 && TidyUtils.toBoolean(mode & PREFORMATTED))
            {
                ch = ' ';
            }

            addCharToLexer(ch);

            // a decoded '&' is re-escaped as "&amp;" unless quoteAmpersand is set
            if (ch == '&' && !this.configuration.quoteAmpersand)
            {
                addCharToLexer('a');
                addCharToLexer('m');
                addCharToLexer('p');
                addCharToLexer(';');
            }
        }
    }

    /**
     * Parses a tag name.
* @return first char after the tag name */ public char parseTagName() { int c; // fold case of first char in buffer c = this.lexbuf[this.txtstart]; if (!this.configuration.xmlTags && TidyUtils.isUpper((char) c)) { c = TidyUtils.toLower((char) c); this.lexbuf[this.txtstart] = (byte) c; } while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM) { if (!TidyUtils.isNamechar((char) c)) { break; } // fold case of subsequent chars if (!this.configuration.xmlTags && TidyUtils.isUpper((char) c)) { c = TidyUtils.toLower((char) c); } addCharToLexer(c); } this.txtend = this.lexsize; return (char) c; } /** * calls addCharToLexer for any char in the string. * @param str input String */ public void addStringLiteral(String str) { int len = str.length(); for (int i = 0; i < len; i++) { addCharToLexer(str.charAt(i)); } } /** * calls addCharToLexer for any char in the string till len is reached. * @param str input String * @param len length of the substring to be added */ void addStringLiteralLen(String str, int len) { int strlen = str.length(); if (strlen < len) { len = strlen; } for (int i = 0; i < len; i++) { addCharToLexer(str.charAt(i)); } } /** * Choose what version to use for new doctype. * @return html version constant */ public short htmlVersion() { if (TidyUtils.toBoolean(versions & Dict.VERS_HTML20)) { return Dict.VERS_HTML20; } if (!(this.configuration.xmlOut | this.configuration.xmlTags | this.isvoyager) && TidyUtils.toBoolean(versions & Dict.VERS_HTML32)) { return Dict.VERS_HTML32; } if (TidyUtils.toBoolean(versions & Dict.VERS_XHTML11)) { return Dict.VERS_XHTML11; } if (TidyUtils.toBoolean(versions & Dict.VERS_HTML40_STRICT)) { return Dict.VERS_HTML40_STRICT; } if (TidyUtils.toBoolean(versions & Dict.VERS_HTML40_LOOSE)) { return Dict.VERS_HTML40_LOOSE; } if (TidyUtils.toBoolean(versions & Dict.VERS_FRAMESET)) { return Dict.VERS_FRAMESET; } return Dict.VERS_UNKNOWN; } /** * Choose what version to use for new doctype. 
* @return html version name */ public String htmlVersionName() { short guessed; int j; guessed = apparentVersion(); for (j = 0; j < W3CVERSION.length; ++j) { if (guessed == W3CVERSION[j].code) { if (this.isvoyager) { return W3CVERSION[j].voyagerName; } return W3CVERSION[j].name; } } return null; } /** * Add meta element for Tidy. If the meta tag is already present, update release date. * @param root root node * @return true if the tag has been added */ public boolean addGenerator(Node root) { AttVal attval; Node node; Node head = root.findHEAD(this.configuration.tt); if (head != null) { String meta = "HTML Tidy for Java (vers. " + Report.RELEASE_DATE_STRING + "), see jtidy.sourceforge.net"; for (node = head.content; node != null; node = node.next) { if (node.tag == this.configuration.tt.tagMeta) { attval = node.getAttrByName("name"); if (attval != null && "generator".equalsIgnoreCase(attval.value)) { attval = node.getAttrByName("content"); if (attval != null && attval.value != null && attval.value.length() >= 9 && "HTML Tidy".equalsIgnoreCase(attval.value.substring(0, 9))) { attval.value = meta; return false; } } } } node = this.inferredTag("meta"); node.addAttribute("content", meta); node.addAttribute("name", "generator"); head.insertNodeAtStart(node); return true; } return false; } /** * Check system keywords (keywords should be uppercase). * @param doctype doctype node * @return true if doctype keywords are all uppercase */ public boolean checkDocTypeKeyWords(Node doctype) { int len = doctype.end - doctype.start; String s = TidyUtils.getString(this.lexbuf, doctype.start, len); return !(TidyUtils.findBadSubString("SYSTEM", s, s.length()) || TidyUtils.findBadSubString("PUBLIC", s, s.length()) || TidyUtils.findBadSubString("//DTD", s, s.length()) || TidyUtils.findBadSubString("//W3C", s, s.length()) || TidyUtils.findBadSubString("//EN", s, s.length())); } /** * Examine DOCTYPE to identify version. 
* @param doctype doctype node * @return version code */ public short findGivenVersion(Node doctype) { String p, s; int i, j; int len; String str1; String str2; // if root tag for doctype isn't html give up now str1 = TidyUtils.getString(this.lexbuf, doctype.start, 5); if (!"html ".equalsIgnoreCase(str1)) { return 0; } if (!checkDocTypeKeyWords(doctype)) { report.warning(this, doctype, null, Report.DTYPE_NOT_UPPER_CASE); } // give up if all we are given is the system id for the doctype str1 = TidyUtils.getString(this.lexbuf, doctype.start + 5, 7); if ("SYSTEM ".equalsIgnoreCase(str1)) { // but at least ensure the case is correct if (!str1.substring(0, 6).equals("SYSTEM")) { System.arraycopy(TidyUtils.getBytes("SYSTEM"), 0, this.lexbuf, doctype.start + 5, 6); } return 0; // unrecognized } if ("PUBLIC ".equalsIgnoreCase(str1)) { if (!str1.substring(0, 6).equals("PUBLIC")) { System.arraycopy(TidyUtils.getBytes("PUBLIC "), 0, this.lexbuf, doctype.start + 5, 6); } } else { this.badDoctype = true; } for (i = doctype.start; i < doctype.end; ++i) { if (this.lexbuf[i] == (byte) '"') { str1 = TidyUtils.getString(this.lexbuf, i + 1, 12); str2 = TidyUtils.getString(this.lexbuf, i + 1, 13); if (str1.equals("-//W3C//DTD ")) { // compute length of identifier e.g. "HTML 4.0 Transitional" for (j = i + 13; j < doctype.end && this.lexbuf[j] != (byte) '/'; ++j) { // } len = j - i - 13; p = TidyUtils.getString(this.lexbuf, i + 13, len); for (j = 1; j < W3CVERSION.length; ++j) { s = W3CVERSION[j].name; if (len == s.length() && s.equals(p)) { return W3CVERSION[j].code; } } // else unrecognized version } else if (str2.equals("-//IETF//DTD ")) { // compute length of identifier e.g. 
"HTML 2.0" for (j = i + 14; j < doctype.end && this.lexbuf[j] != (byte) '/'; ++j) { // } len = j - i - 14; p = TidyUtils.getString(this.lexbuf, i + 14, len); s = W3CVERSION[0].name; if (len == s.length() && s.equals(p)) { return W3CVERSION[0].code; } // else unrecognized version } break; } } return 0; } /** * Fix xhtml namespace. * @param root root Node * @param profile current profile */ public void fixHTMLNameSpace(Node root, String profile) { Node node; AttVal attr; node = root.content; while (node != null && node.tag != this.configuration.tt.tagHtml) { node = node.next; } if (node != null) { for (attr = node.attributes; attr != null; attr = attr.next) { if (attr.attribute.equals("xmlns")) { break; } } if (attr != null) { if (!attr.value.equals(profile)) { report.warning(this, node, null, Report.INCONSISTENT_NAMESPACE); attr.value = profile; } } else { attr = new AttVal(node.attributes, null, '"', "xmlns", profile); attr.dict = AttributeTable.getDefaultAttributeTable().findAttribute(attr); node.attributes = attr; } } } /** * Put DOCTYPE declaration between the <:?xml version "1.0" ... ?> declaration, if any, and the * html tag. Should also work for any comments, etc. that may precede the html tag. * @param root root node * @return new doctype node */ Node newXhtmlDocTypeNode(Node root) { Node html = root.findHTML(this.configuration.tt); if (html == null) { return null; } Node newdoctype = newNode(); newdoctype.setType(Node.DOCTYPE_TAG); newdoctype.next = html; newdoctype.parent = root; newdoctype.prev = null; if (html == root.content) { // No declaration. root.content.prev = newdoctype; root.content = newdoctype; newdoctype.prev = null; } else { // we have an declaration. newdoctype.prev = html.prev; newdoctype.prev.next = newdoctype; } html.prev = newdoctype; return newdoctype; } /** * Adds a new xhtml doctype to the document. 
* @param root root node * @return true if a doctype has been added */ public boolean setXHTMLDocType(Node root) { String fpi = " "; String sysid = ""; String dtdsub = null; Node doctype; int dtdlen = 0; doctype = root.findDocType(); fixHTMLNameSpace(root, XHTML_NAMESPACE); // #427839 - fix by Evan Lenz 05 Sep 00 if (this.configuration.docTypeMode == Configuration.DOCTYPE_OMIT) { if (doctype != null) { Node.discardElement(doctype); } return true; } if (this.configuration.docTypeMode == Configuration.DOCTYPE_AUTO) { // see what flavor of XHTML this document matches if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_STRICT)) { // use XHTML strict fpi = "-//W3C//DTD XHTML 1.0 Strict//EN"; sysid = VOYAGER_STRICT; } else if (TidyUtils.toBoolean(this.versions & Dict.VERS_FRAMESET)) { // use XHTML frames fpi = "-//W3C//DTD XHTML 1.0 Frameset//EN"; sysid = VOYAGER_FRAMESET; } else if (TidyUtils.toBoolean(this.versions & Dict.VERS_LOOSE)) { fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN"; sysid = VOYAGER_LOOSE; } else if (TidyUtils.toBoolean(this.versions & Dict.VERS_XHTML11)) { // use XHTML 1.1 fpi = "-//W3C//DTD XHTML 1.1//EN"; sysid = VOYAGER_11; } else { // proprietary fpi = null; sysid = ""; if (doctype != null)// #473490 - fix by Bjšrn Hšhrmann 10 Oct 01 { Node.discardElement(doctype); } } } else if (this.configuration.docTypeMode == Configuration.DOCTYPE_STRICT) { fpi = "-//W3C//DTD XHTML 1.0 Strict//EN"; sysid = VOYAGER_STRICT; } else if (this.configuration.docTypeMode == Configuration.DOCTYPE_LOOSE) { fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN"; sysid = VOYAGER_LOOSE; } if (this.configuration.docTypeMode == Configuration.DOCTYPE_USER && this.configuration.docTypeStr != null) { fpi = this.configuration.docTypeStr; sysid = ""; } if (fpi == null) { return false; } if (doctype != null) { // Look for internal DTD subset if (configuration.xHTML || configuration.xmlOut) { int len = doctype.end - doctype.start + 1; String start = TidyUtils.getString(this.lexbuf, 
doctype.start, len); int dtdbeg = start.indexOf('['); if (dtdbeg >= 0) { int dtdend = start.substring(dtdbeg).indexOf(']'); if (dtdend >= 0) { dtdlen = dtdend + 1; dtdsub = start.substring(dtdbeg); } } } } else { if ((doctype = newXhtmlDocTypeNode(root)) == null) { return false; } } this.txtstart = this.lexsize; this.txtend = this.lexsize; // add public identifier addStringLiteral("html PUBLIC "); // check if the fpi is quoted or not if (fpi.charAt(0) == '"') { addStringLiteral(fpi); } else { addStringLiteral("\""); addStringLiteral(fpi); addStringLiteral("\""); } if (this.configuration.wraplen != 0 && sysid.length() + 6 >= this.configuration.wraplen) { addStringLiteral("\n\""); } else { // FG: don't wrap addStringLiteral(" \""); } // add system identifier addStringLiteral(sysid); addStringLiteral("\""); if (dtdlen > 0 && dtdsub != null) { addCharToLexer(' '); addStringLiteralLen(dtdsub, dtdlen); } this.txtend = this.lexsize; int length = this.txtend - this.txtstart; doctype.textarray = new byte[length]; System.arraycopy(this.lexbuf, this.txtstart, doctype.textarray, 0, length); doctype.start = 0; doctype.end = length; return false; } /** * Return the html version used in document. 
* @return version code */ public short apparentVersion() { switch (this.doctype) { case Dict.VERS_UNKNOWN : return htmlVersion(); case Dict.VERS_HTML20 : if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML20)) { return Dict.VERS_HTML20; } break; case Dict.VERS_HTML32 : if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML32)) { return Dict.VERS_HTML32; } break; // to replace old version by new case Dict.VERS_HTML40_STRICT : if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_STRICT)) { return Dict.VERS_HTML40_STRICT; } break; case Dict.VERS_HTML40_LOOSE : if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_LOOSE)) { return Dict.VERS_HTML40_LOOSE; } break; // to replace old version by new case Dict.VERS_FRAMESET : if (TidyUtils.toBoolean(this.versions & Dict.VERS_FRAMESET)) { return Dict.VERS_FRAMESET; } break; case Dict.VERS_XHTML11 : if (TidyUtils.toBoolean(this.versions & Dict.VERS_XHTML11)) { return Dict.VERS_XHTML11; } break; default : // should never reach here break; } // kludge to avoid error appearing at end of file // it would be better to note the actual position // when first encountering the doctype declaration this.lines = 1; this.columns = 1; report.warning(this, null, null, Report.INCONSISTENT_VERSION); return this.htmlVersion(); } /** * Fixup doctype if missing. 
* @param root root node * @return false if current version has not been identified */ public boolean fixDocType(Node root) { Node doctype; int guessed = Dict.VERS_HTML40_STRICT, i; if (this.badDoctype) { report.warning(this, null, null, Report.MALFORMED_DOCTYPE); } doctype = root.findDocType(); if (this.configuration.docTypeMode == Configuration.DOCTYPE_OMIT) { if (doctype != null) { Node.discardElement(doctype); } return true; } if (this.configuration.xmlOut) { return true; } if (this.configuration.docTypeMode == Configuration.DOCTYPE_STRICT) { Node.discardElement(doctype); doctype = null; guessed = Dict.VERS_HTML40_STRICT; } else if (this.configuration.docTypeMode == Configuration.DOCTYPE_LOOSE) { Node.discardElement(doctype); doctype = null; guessed = Dict.VERS_HTML40_LOOSE; } else if (this.configuration.docTypeMode == Configuration.DOCTYPE_AUTO) { if (doctype != null) { if (this.doctype == Dict.VERS_UNKNOWN) { return false; } switch (this.doctype) { case Dict.VERS_UNKNOWN : return false; case Dict.VERS_HTML20 : if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML20)) { return true; } break; // to replace old version by new case Dict.VERS_HTML32 : if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML32)) { return true; } break; // to replace old version by new case Dict.VERS_HTML40_STRICT : if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_STRICT)) { return true; } break; // to replace old version by new case Dict.VERS_HTML40_LOOSE : if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_LOOSE)) { return true; } break; // to replace old version by new case Dict.VERS_FRAMESET : if (TidyUtils.toBoolean(this.versions & Dict.VERS_FRAMESET)) { return true; } break; // to replace old version by new case Dict.VERS_XHTML11 : if (TidyUtils.toBoolean(this.versions & Dict.VERS_XHTML11)) { return true; } break; // to replace old version by new default : // should never reach here break; } // INCONSISTENT_VERSION warning is now issued by ApparentVersion() } // 
choose new doctype guessed = htmlVersion(); } if (guessed == Dict.VERS_UNKNOWN) { return false; } // for XML use the Voyager system identifier if (this.configuration.xmlOut || this.configuration.xmlTags || this.isvoyager) { if (doctype != null) { Node.discardElement(doctype); } fixHTMLNameSpace(root, XHTML_NAMESPACE); // Namespace is the same for all XHTML variants // Also, don't return yet. Still need to add DOCTYPE declaration. // // for (i = 0; i < W3CVersion.length; ++i) // { // if (guessed == W3CVersion[i].code) // { // fixHTMLNameSpace(root, W3CVersion[i].profile); // break; // } // } // return true; } if (doctype == null) { if ((doctype = newXhtmlDocTypeNode(root)) == null) { return false; } } this.txtstart = this.lexsize; this.txtend = this.lexsize; // use the appropriate public identifier addStringLiteral("html PUBLIC "); if (this.configuration.docTypeMode == Configuration.DOCTYPE_USER && this.configuration.docTypeStr != null && this.configuration.docTypeStr.length() > 0) { // check if the fpi is quoted or not if (this.configuration.docTypeStr.charAt(0) == '"') { addStringLiteral(this.configuration.docTypeStr); } else { addStringLiteral("\""); // #431889 - fix by Dave Bryan 04 Jan 2001 addStringLiteral(this.configuration.docTypeStr); addStringLiteral("\""); // #431889 - fix by Dave Bryan 04 Jan 2001 } } else if (guessed == Dict.VERS_HTML20) { addStringLiteral("\"-//IETF//DTD HTML 2.0//EN\""); } else { addStringLiteral("\"-//W3C//DTD "); for (i = 0; i < W3CVERSION.length; ++i) { if (guessed == W3CVERSION[i].code) { addStringLiteral(W3CVERSION[i].name); break; } } addStringLiteral("//EN\""); } this.txtend = this.lexsize; int length = this.txtend - this.txtstart; doctype.textarray = new byte[length]; System.arraycopy(this.lexbuf, this.txtstart, doctype.textarray, 0, length); doctype.start = 0; doctype.end = length; return true; } /** * Ensure XML document starts with <?XML version="1.0"?>. Add encoding attribute if not using * ASCII or UTF-8 output. 
* @param root root node * @return always true */ public boolean fixXmlDecl(Node root) { Node xml; AttVal version; AttVal encoding; if (root.content != null && root.content.type == Node.XML_DECL) { xml = root.content; } else { xml = newNode(Node.XML_DECL, this.lexbuf, 0, 0); xml.next = root.content; if (root.content != null) { root.content.prev = xml; xml.next = root.content; } root.content = xml; } version = xml.getAttrByName("version"); encoding = xml.getAttrByName("encoding"); // We need to insert a check if declared encoding and output encoding mismatch // and fix the Xml declaration accordingly!!! if (encoding == null && !"UTF8".equals(this.configuration.getOutCharEncodingName())) { if ("ISO8859_1".equals(this.configuration.getOutCharEncodingName())) { xml.addAttribute("encoding", "iso-8859-1"); } if ("ISO2022".equals(this.configuration.getOutCharEncodingName())) { xml.addAttribute("encoding", "iso-2022"); } } if (version == null) { xml.addAttribute("version", "1.0"); } return true; } /** * Generates and inserts a new node. * @param name tag name * @return generated node */ public Node inferredTag(String name) { Node node; node = newNode(Node.START_TAG, this.lexbuf, this.txtstart, this.txtend, name); node.implicit = true; return node; } private static final int CDATA_INTERMEDIATE = 0; private static final int CDATA_STARTTAG = 1; private static final int CDATA_ENDTAG = 2; /** * Create a text node for the contents of a CDATA element like style or script which * ends with </foo> for some foo. 
* @param container container node * @return cdata node */ public Node getCDATA(Node container) { int start = 0; int nested = 0; int state = CDATA_INTERMEDIATE; int c; boolean isEmpty = true; boolean matches = false; boolean hasSrc = container.getAttrByName("src") != null; this.lines = this.in.getCurline(); this.columns = this.in.getCurcol(); this.waswhite = false; this.txtstart = this.lexsize; this.txtend = this.lexsize; /* seen start tag, look for matching end tag */ while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM) { addCharToLexer(c); txtend = lexsize; if (state == CDATA_INTERMEDIATE) { if (c != '<') { if (isEmpty && !TidyUtils.isWhite((char) c)) { isEmpty = false; } continue; } c = in.readChar(); if (TidyUtils.isLetter((char) c)) { /*