All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.ajax4jsf.org.w3c.tidy.Lexer Maven / Gradle / Ivy

Go to download

Ajax4jsf is an open source extension to the JavaServer Faces standard that adds AJAX capability to JSF applications without requiring the writing of any JavaScript.

The newest version!
/*
 *  Java HTML Tidy - JTidy
 *  HTML parser and pretty printer
 *
 *  Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
 *  Institute of Technology, Institut National de Recherche en
 *  Informatique et en Automatique, Keio University). All Rights
 *  Reserved.
 *
 *  Contributing Author(s):
 *
 *     Dave Raggett 
 *     Andy Quick  (translation to Java)
 *     Gary L Peskin  (Java development)
 *     Sami Lempinen  (release management)
 *     Fabrizio Giustina 
 *
 *  The contributing author(s) would like to thank all those who
 *  helped with testing, bug fixes, and patience.  This wouldn't
 *  have been possible without all of you.
 *
 *  COPYRIGHT NOTICE:
 *
 *  This software and documentation is provided "as is," and
 *  the copyright holders and contributing author(s) make no
 *  representations or warranties, express or implied, including
 *  but not limited to, warranties of merchantability or fitness
 *  for any particular purpose or that the use of the software or
 *  documentation will not infringe any third party patents,
 *  copyrights, trademarks or other rights.
 *
 *  The copyright holders and contributing author(s) will not be
 *  liable for any direct, indirect, special or consequential damages
 *  arising out of any use of the software or documentation, even if
 *  advised of the possibility of such damage.
 *
 *  Permission is hereby granted to use, copy, modify, and distribute
 *  this source code, or portions hereof, documentation and executables,
 *  for any purpose, without fee, subject to the following restrictions:
 *
 *  1. The origin of this source code must not be misrepresented.
 *  2. Altered versions must be plainly marked as such and must
 *     not be misrepresented as being the original source.
 *  3. This Copyright notice may not be removed or altered from any
 *     source or altered source distribution.
 *
 *  The copyright holders and contributing author(s) specifically
 *  permit, without fee, and encourage the use of this source code
 *  as a component for supporting the Hypertext Markup Language in
 *  commercial products. If you use this source code in a product,
 *  acknowledgment is not required but would be appreciated.
 *
 */
package org.ajax4jsf.org.w3c.tidy;

import java.io.PrintWriter;
import java.util.List;
import java.util.Stack;
import java.util.Vector;


/**
 * Lexer for html parser.
 * 

* Given a file stream fp it returns a sequence of tokens. GetToken(fp) gets the next token UngetToken(fp) provides one * level undo The tags include an attribute list: - linked list of attribute/value nodes - each node has 2 * null-terminated strings. - entities are replaced in attribute values white space is compacted if not in preformatted * mode If not in preformatted mode then leading white space is discarded and subsequent white space sequences compacted * to single space chars. If XmlTags is no then Tag names are folded to upper case and attribute names to lower case. * Not yet done: - Doctype subset and marked sections *

* @author Dave Raggett [email protected] * @author Andy Quick [email protected] (translation to Java) * @author Fabrizio Giustina * @version $Revision: 1.8 $ ($Author: alexsmirnov $) */ public class Lexer { /** * state: ignore whitespace. */ public static final short IGNORE_WHITESPACE = 0; /** * state: mixed content. */ public static final short MIXED_CONTENT = 1; /** * state: preformatted. */ public static final short PREFORMATTED = 2; /** * state: ignore markup. */ public static final short IGNORE_MARKUP = 3; /** * URI for XHTML 1.0 transitional DTD. */ private static final String VOYAGER_LOOSE = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"; /** * URI for XHTML 1.0 strict DTD. */ private static final String VOYAGER_STRICT = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"; /** * URI for XHTML 1.0 frameset DTD. */ private static final String VOYAGER_FRAMESET = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"; /** * URI for XHTML 1.1. */ private static final String VOYAGER_11 = "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"; /** * URI for XHTML Basic 1.0. */ // private static final String VOYAGER_BASIC = "http://www.w3.org/TR/xhtml-basic/xhtml-basic10.dtd"; /** * xhtml namespace. */ private static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"; /** * lists all the known versions. */ private static final Lexer.W3CVersionInfo[] W3CVERSION = { new W3CVersionInfo("HTML 4.01", "XHTML 1.0 Strict", VOYAGER_STRICT, Dict.VERS_HTML40_STRICT), new W3CVersionInfo("HTML 4.01 Transitional", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML40_LOOSE), new W3CVersionInfo("HTML 4.01 Frameset", "XHTML 1.0 Frameset", VOYAGER_FRAMESET, Dict.VERS_FRAMESET), new W3CVersionInfo("HTML 4.0", "XHTML 1.0 Strict", VOYAGER_STRICT, Dict.VERS_HTML40_STRICT), new W3CVersionInfo("HTML 4.0 Transitional", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML40_LOOSE), new W3CVersionInfo("HTML 4.0 Frameset", "XHTML 1.0 Frameset", VOYAGER_FRAMESET, Dict.VERS_FRAMESET), new W3CVersionInfo("HTML 3.2", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML32), new W3CVersionInfo("HTML 3.2 Final", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML32), new W3CVersionInfo("HTML 3.2 Draft", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML32), new W3CVersionInfo("HTML 2.0", "XHTML 1.0 Strict", VOYAGER_STRICT, Dict.VERS_HTML20), new W3CVersionInfo("HTML 4.01", "XHTML 1.1", VOYAGER_STRICT, Dict.VERS_XHTML11)}; /** * getToken state: content. */ private static final short LEX_CONTENT = 0; /** * getToken state: gt. */ private static final short LEX_GT = 1; /** * getToken state: endtag. */ private static final short LEX_ENDTAG = 2; /** * getToken state: start tag. */ private static final short LEX_STARTTAG = 3; /** * getToken state: comment. */ private static final short LEX_COMMENT = 4; /** * getToken state: doctype. */ private static final short LEX_DOCTYPE = 5; /** * getToken state: procinstr. */ private static final short LEX_PROCINSTR = 6; /** * getToken state: cdata. */ private static final short LEX_CDATA = 8; /** * getToken state: section. */ private static final short LEX_SECTION = 9; /** * getToken state: asp. */ private static final short LEX_ASP = 10; /** * getToken state: jste. */ private static final short LEX_JSTE = 11; /** * getToken state: php. */ private static final short LEX_PHP = 12; /** * getToken state: xml declaration. */ private static final short LEX_XMLDECL = 13; /** * file stream. */ protected StreamIn in; /** * error output stream. */ protected PrintWriter errout; /** * for accessibility errors. */ protected short badAccess; /** * for bad style errors. */ protected short badLayout; /** * for bad char encodings. */ protected short badChars; /** * for mismatched/mispositioned form tags. */ protected short badForm; /** * count of warnings in this document. */ protected short warnings; /** * count of errors. */ protected short errors; /** * lines seen. */ protected int lines; /** * at start of current token. */ protected int columns; /** * used to collapse contiguous white space. */ protected boolean waswhite; /** * true after token has been pushed back. */ protected boolean pushed; /** * when space is moved after end tag. */ protected boolean insertspace; /** * Netscape compatibility. */ protected boolean excludeBlocks; /** * true if moved out of table. */ protected boolean exiled; /** * true if xmlns attribute on html element. */ protected boolean isvoyager; /** * bit vector of HTML versions. */ protected short versions; /** * version as given by doctype (if any). */ protected int doctype; /** * set if html or PUBLIC is missing. */ protected boolean badDoctype; /** * start of current node. */ protected int txtstart; /** * end of current node. */ protected int txtend; /** * state of lexer's finite state machine. */ protected short state; /** * current node. */ protected Node token; /** * Lexer character buffer parse tree nodes span onto this buffer which contains the concatenated text contents of * all of the elements. Lexsize must be reset for each file. Byte buffer of UTF-8 chars. */ protected byte[] lexbuf; /** * allocated. */ protected int lexlength; /** * used. */ protected int lexsize; /** * Inline stack for compatibility with Mosaic. For deferring text node. */ protected Node inode; /** * for inferring inline tags. */ protected int insert; /** * stack. */ protected Stack istack; /** * start of frame. */ protected int istackbase; /** * used for cleaning up presentation markup. */ protected Style styles; /** * configuration. */ protected Configuration configuration; /** * already seen end body tag? */ protected boolean seenEndBody; /** * already seen end html tag? */ protected boolean seenEndHtml; /** * report. */ protected Report report; /** * Root node is saved here. */ protected Node root; /** * node list. */ private List nodeList; /** * Instantiates a new Lexer. * @param in StreamIn * @param configuration configuation instance * @param report report instance, for reporting errors */ public Lexer(StreamIn in, Configuration configuration, Report report) { this.report = report; this.in = in; this.lines = 1; this.columns = 1; this.state = LEX_CONTENT; this.versions = (Dict.VERS_ALL | Dict.VERS_PROPRIETARY); this.doctype = Dict.VERS_UNKNOWN; this.insert = -1; this.istack = new Stack(); this.configuration = configuration; this.nodeList = new Vector(); } /** * Creates a new node and add it to nodelist. * @return Node */ public Node newNode() { Node node = new Node(); this.nodeList.add(node); return node; } /** * Creates a new node and add it to nodelist. * @param type node type: Node.ROOT_NODE | Node.DOCTYPE_TAG | Node.COMMENT_TAG | Node.PROC_INS_TAG | Node.TEXT_NODE | * Node.START_TAG | Node.END_TAG | Node.START_END_TAG | Node.CDATA_TAG | Node.SECTION_TAG | Node. ASP_TAG | * Node.JSTE_TAG | Node.PHP_TAG | Node.XML_DECL * @param textarray array of bytes contained in the Node * @param start start position * @param end end position * @return Node */ public Node newNode(short type, byte[] textarray, int start, int end) { Node node = new Node(type, textarray, start, end); this.nodeList.add(node); return node; } /** * Creates a new node and add it to nodelist. * @param type node type: Node.ROOT_NODE | Node.DOCTYPE_TAG | Node.COMMENT_TAG | Node.PROC_INS_TAG | Node.TEXT_NODE | * Node.START_TAG | Node.END_TAG | Node.START_END_TAG | Node.CDATA_TAG | Node.SECTION_TAG | Node. ASP_TAG | * Node.JSTE_TAG | Node.PHP_TAG | Node.XML_DECL * @param textarray array of bytes contained in the Node * @param start start position * @param end end position * @param element tag name * @return Node */ public Node newNode(short type, byte[] textarray, int start, int end, String element) { Node node = new Node(type, textarray, start, end, element, this.configuration.tt); this.nodeList.add(node); return node; } /** * Clones a node and add it to node list. * @param node Node * @return cloned Node */ public Node cloneNode(Node node) { Node cnode = (Node) node.clone(); this.nodeList.add(cnode); for (AttVal att = cnode.attributes; att != null; att = att.next) { if (att.asp != null) { this.nodeList.add(att.asp); } if (att.php != null) { this.nodeList.add(att.php); } } return cnode; } /** * Clones an attribute value and add eventual asp or php node to node list. * @param attrs original AttVal * @return cloned AttVal */ public AttVal cloneAttributes(AttVal attrs) { AttVal cattrs = (AttVal) attrs.clone(); for (AttVal att = cattrs; att != null; att = att.next) { if (att.asp != null) { this.nodeList.add(att.asp); } if (att.php != null) { this.nodeList.add(att.php); } } return cattrs; } /** * Update oldtextarray in the current nodes. * @param oldtextarray previous text array * @param newtextarray new text array */ protected void updateNodeTextArrays(byte[] oldtextarray, byte[] newtextarray) { Node node; for (int i = 0; i < this.nodeList.size(); i++) { node = (Node) (this.nodeList.get(i)); if (node.textarray == oldtextarray) { node.textarray = newtextarray; } } } /** * Adds a new line node. Used for creating preformatted text from Word2000. * @return new line node */ public Node newLineNode() { Node node = newNode(); node.textarray = this.lexbuf; node.start = this.lexsize; addCharToLexer('\n'); node.end = this.lexsize; return node; } /** * Has end of input stream been reached? * @return true if end of input stream been reached */ public boolean endOfInput() { return this.in.isEndOfStream(); } /** * Adds a byte to lexer buffer. * @param c byte to add */ public void addByte(int c) { if (this.lexsize + 1 >= this.lexlength) { while (this.lexsize + 1 >= this.lexlength) { if (this.lexlength == 0) { this.lexlength = 8192; } else { this.lexlength = this.lexlength * 2; } } byte[] temp = this.lexbuf; this.lexbuf = new byte[this.lexlength]; if (temp != null) { System.arraycopy(temp, 0, this.lexbuf, 0, temp.length); updateNodeTextArrays(temp, this.lexbuf); } } this.lexbuf[this.lexsize++] = (byte) c; this.lexbuf[this.lexsize] = (byte) '\0'; // debug } /** * Substitute the last char in buffer. * @param c new char */ public void changeChar(byte c) { if (this.lexsize > 0) { this.lexbuf[this.lexsize - 1] = c; } } /** * Store char c as UTF-8 encoded byte stream. * @param c char to store */ public void addCharToLexer(int c) { // Allow only valid XML characters. See: http://www.w3.org/TR/2004/REC-xml-20040204/#NT-Char // Fix by Pablo Mayrgundter 17-08-2004 if ((this.configuration.xmlOut || this.configuration.xHTML) // only for xml output && !((c >= 0x20 && c <= 0xD7FF) // Check the common-case first. || c == 0x9 || c == 0xA || c == 0xD // Then white-space. || (c >= 0xE000 && c <= 0xFFFD) // Then high-range unicode. || (c >= 0x10000 && c <= 0x10FFFF))) { return; } int i = 0; int[] count = new int[]{0}; byte[] buf = new byte[10]; // unsigned char boolean err = EncodingUtils.encodeCharToUTF8Bytes(c, buf, null, count); if (err) { // replacement char 0xFFFD encoded as UTF-8 buf[0] = (byte) 0xEF; buf[1] = (byte) 0xBF; buf[2] = (byte) 0xBD; count[0] = 3; } for (i = 0; i < count[0]; i++) { addByte(buf[i]); // uint } } /** * Adds a string to lexer buffer. * @param str String to add */ public void addStringToLexer(String str) { for (int i = 0; i < str.length(); i++) { addCharToLexer(str.charAt(i)); } } /** * Parse an html entity. * @param mode mode */ public void parseEntity(short mode) { // No longer attempts to insert missing ';' for unknown // entities unless one was present already, since this // gives unexpected results. // // For example: // was tidied to: // rather than: // // My thanks for Maurice Buxton for spotting this. // // Also Randy Waki pointed out the following case for the // 04 Aug 00 version (bug #433012): // // For example: // was tidied to: // rather than: // // where "lang" is a known entity (#9001), but browsers would // misinterpret "⟨" because it had a value > 256. // // So the case of an apparently known entity with a value > 256 and // missing a semicolon is handled specially. // // "ParseEntity" is also a bit of a misnomer - it handles entities and // numeric character references. Invalid NCR's are now reported. int start; boolean first = true; boolean semicolon = false; int c, ch, startcol; String str; start = this.lexsize - 1; // to start at "&" startcol = this.in.getCurcol() - 1; while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM) { if (c == ';') { semicolon = true; break; } if (first && c == '#') { // #431953 - start RJ if (!this.configuration.ncr || this.configuration.getInCharEncoding() == Configuration.BIG5 || this.configuration.getInCharEncoding() == Configuration.SHIFTJIS) { this.in.ungetChar(c); return; } // #431953 - end RJ addCharToLexer(c); first = false; continue; } first = false; if (TidyUtils.isNamechar((char) c)) { addCharToLexer(c); continue; } // otherwise put it back this.in.ungetChar(c); break; } str = TidyUtils.getString(this.lexbuf, start, this.lexsize - start); if ("&apos".equals(str) && !configuration.xmlOut && !this.isvoyager && !configuration.xHTML) { report.entityError(this, Report.APOS_UNDEFINED, str, 39); } ch = EntityTable.getDefaultEntityTable().entityCode(str); // drops invalid numeric entities from XML mode. Fix by Pablo Mayrgundter 17-08-2004 // if ((this.configuration.xmlOut || this.configuration.xHTML) // only for xml output // && !((ch >= 0x20 && ch <= 0xD7FF) // Check the common-case first. // || ch == 0x9 || ch == 0xA || ch == 0xD // Then white-space. // || (ch >= 0xE000 && ch <= 0xFFFD))) // { // this.lexsize = start; // return; // } // deal with unrecognized or invalid entities // #433012 - fix by Randy Waki 17 Feb 01 // report invalid NCR's - Terry Teague 01 Sep 01 if (ch <= 0 || (ch >= 256 && c != ';')) { // set error position just before offending character this.lines = this.in.getCurline(); this.columns = startcol; if (this.lexsize > start + 1) { if (ch >= 128 && ch <= 159) { // invalid numeric character reference int c1 = 0; if (configuration.replacementCharEncoding == Configuration.WIN1252) { c1 = EncodingUtils.decodeWin1252(ch); } else if (configuration.replacementCharEncoding == Configuration.MACROMAN) { c1 = EncodingUtils.decodeMacRoman(ch); } // "or" DISCARDED_CHAR with the other errors if discarding char; otherwise default is replacing int replaceMode = c1 != 0 ? Report.REPLACED_CHAR : Report.DISCARDED_CHAR; if (c != ';') /* issue warning if not terminated by ';' */ { report.entityError(this, Report.MISSING_SEMICOLON_NCR, str, c); } report.encodingError(this, (short) (Report.INVALID_NCR | replaceMode), ch); if (c1 != 0) { // make the replacement this.lexsize = start; addCharToLexer(c1); semicolon = false; } else { /* discard */ this.lexsize = start; semicolon = false; } } else { report.entityError(this, Report.UNKNOWN_ENTITY, str, ch); } if (semicolon) { addCharToLexer(';'); } } else { // naked & report.entityError(this, Report.UNESCAPED_AMPERSAND, str, ch); } } else { // issue warning if not terminated by ';' if (c != ';') { // set error position just before offending character this.lines = this.in.getCurline(); this.columns = startcol; report.entityError(this, Report.MISSING_SEMICOLON, str, c); } this.lexsize = start; if (ch == 160 && TidyUtils.toBoolean(mode & PREFORMATTED)) { ch = ' '; } addCharToLexer(ch); if (ch == '&' && !this.configuration.quoteAmpersand) { addCharToLexer('a'); addCharToLexer('m'); addCharToLexer('p'); addCharToLexer(';'); } } } /** * Parses a tag name. * @return first char after the tag name */ public char parseTagName() { int c; // fold case of first char in buffer c = this.lexbuf[this.txtstart]; if (!this.configuration.xmlTags && TidyUtils.isUpper((char) c)) { c = TidyUtils.toLower((char) c); this.lexbuf[this.txtstart] = (byte) c; } while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM) { if (!TidyUtils.isNamechar((char) c)) { break; } // fold case of subsequent chars if (!this.configuration.xmlTags && TidyUtils.isUpper((char) c)) { c = TidyUtils.toLower((char) c); } addCharToLexer(c); } this.txtend = this.lexsize; return (char) c; } /** * calls addCharToLexer for any char in the string. * @param str input String */ public void addStringLiteral(String str) { int len = str.length(); for (int i = 0; i < len; i++) { addCharToLexer(str.charAt(i)); } } /** * calls addCharToLexer for any char in the string till len is reached. * @param str input String * @param len length of the substring to be added */ void addStringLiteralLen(String str, int len) { int strlen = str.length(); if (strlen < len) { len = strlen; } for (int i = 0; i < len; i++) { addCharToLexer(str.charAt(i)); } } /** * Choose what version to use for new doctype. * @return html version constant */ public short htmlVersion() { if (TidyUtils.toBoolean(versions & Dict.VERS_HTML20)) { return Dict.VERS_HTML20; } if (!(this.configuration.xmlOut | this.configuration.xmlTags | this.isvoyager) && TidyUtils.toBoolean(versions & Dict.VERS_HTML32)) { return Dict.VERS_HTML32; } if (TidyUtils.toBoolean(versions & Dict.VERS_XHTML11)) { return Dict.VERS_XHTML11; } if (TidyUtils.toBoolean(versions & Dict.VERS_HTML40_STRICT)) { return Dict.VERS_HTML40_STRICT; } if (TidyUtils.toBoolean(versions & Dict.VERS_HTML40_LOOSE)) { return Dict.VERS_HTML40_LOOSE; } if (TidyUtils.toBoolean(versions & Dict.VERS_FRAMESET)) { return Dict.VERS_FRAMESET; } return Dict.VERS_UNKNOWN; } /** * Choose what version to use for new doctype. * @return html version name */ public String htmlVersionName() { short guessed; int j; guessed = apparentVersion(); for (j = 0; j < W3CVERSION.length; ++j) { if (guessed == W3CVERSION[j].code) { if (this.isvoyager) { return W3CVERSION[j].voyagerName; } return W3CVERSION[j].name; } } return null; } /** * Add meta element for Tidy. If the meta tag is already present, update release date. * @param root root node * @return true if the tag has been added */ public boolean addGenerator(Node root) { AttVal attval; Node node; Node head = root.findHEAD(this.configuration.tt); if (head != null) { String meta = "HTML Tidy for Java (vers. " + Report.RELEASE_DATE_STRING + "), see www.w3.org"; for (node = head.content; node != null; node = node.next) { if (node.tag == this.configuration.tt.tagMeta) { attval = node.getAttrByName("name"); if (attval != null && attval.value != null && "generator".equalsIgnoreCase(attval.value)) { attval = node.getAttrByName("content"); if (attval != null && attval.value != null && attval.value.length() >= 9 && "HTML Tidy".equalsIgnoreCase(attval.value.substring(0, 9))) { attval.value = meta; return false; } } } } node = this.inferredTag("meta"); node.addAttribute("content", meta); node.addAttribute("name", "generator"); head.insertNodeAtStart(node); return true; } return false; } /** * Check system keywords (keywords should be uppercase). * @param doctype doctype node * @return true if doctype keywords are all uppercase */ public boolean checkDocTypeKeyWords(Node doctype) { int len = doctype.end - doctype.start; String s = TidyUtils.getString(this.lexbuf, doctype.start, len); return !(TidyUtils.findBadSubString("SYSTEM", s, len) || TidyUtils.findBadSubString("PUBLIC", s, len) || TidyUtils.findBadSubString("//DTD", s, len) || TidyUtils.findBadSubString("//W3C", s, len) || TidyUtils.findBadSubString("//EN", s, len)); } /** * Examine DOCTYPE to identify version. * @param doctype doctype node * @return version code */ public short findGivenVersion(Node doctype) { String p, s; int i, j; int len; String str1; String str2; // if root tag for doctype isn't html give up now str1 = TidyUtils.getString(this.lexbuf, doctype.start, 5); if (!"html ".equalsIgnoreCase(str1)) { return 0; } if (!checkDocTypeKeyWords(doctype)) { report.warning(this, doctype, null, Report.DTYPE_NOT_UPPER_CASE); } // give up if all we are given is the system id for the doctype str1 = TidyUtils.getString(this.lexbuf, doctype.start + 5, 7); if ("SYSTEM ".equalsIgnoreCase(str1)) { // but at least ensure the case is correct if (!str1.substring(0, 6).equals("SYSTEM")) { System.arraycopy(TidyUtils.getBytes("SYSTEM"), 0, this.lexbuf, doctype.start + 5, 6); } return 0; // unrecognized } if ("PUBLIC ".equalsIgnoreCase(str1)) { if (!str1.substring(0, 6).equals("PUBLIC")) { System.arraycopy(TidyUtils.getBytes("PUBLIC "), 0, this.lexbuf, doctype.start + 5, 6); } } else { this.badDoctype = true; } for (i = doctype.start; i < doctype.end; ++i) { if (this.lexbuf[i] == (byte) '"') { str1 = TidyUtils.getString(this.lexbuf, i + 1, 12); str2 = TidyUtils.getString(this.lexbuf, i + 1, 13); if (str1.equals("-//W3C//DTD ")) { // compute length of identifier e.g. "HTML 4.0 Transitional" for (j = i + 13; j < doctype.end && this.lexbuf[j] != (byte) '/'; ++j) { // } len = j - i - 13; p = TidyUtils.getString(this.lexbuf, i + 13, len); for (j = 1; j < W3CVERSION.length; ++j) { s = W3CVERSION[j].name; if (len == s.length() && s.equals(p)) { return W3CVERSION[j].code; } } // else unrecognized version } else if (str2.equals("-//IETF//DTD ")) { // compute length of identifier e.g. "HTML 2.0" for (j = i + 14; j < doctype.end && this.lexbuf[j] != (byte) '/'; ++j) { // } len = j - i - 14; p = TidyUtils.getString(this.lexbuf, i + 14, len); s = W3CVERSION[0].name; if (len == s.length() && s.equals(p)) { return W3CVERSION[0].code; } // else unrecognized version } break; } } return 0; } /** * Fix xhtml namespace. * @param root root Node * @param profile current profile */ public void fixHTMLNameSpace(Node root, String profile) { Node node; AttVal attr; node = root.content; while (node != null && node.tag != this.configuration.tt.tagHtml) { node = node.next; } if (node != null) { for (attr = node.attributes; attr != null; attr = attr.next) { if (attr.attribute.equals("xmlns")) { break; } } if (attr != null) { if (!attr.value.equals(profile)) { report.warning(this, node, null, Report.INCONSISTENT_NAMESPACE); attr.value = profile; } } else { attr = new AttVal(node.attributes, null, '"', "xmlns", profile); attr.dict = AttributeTable.getDefaultAttributeTable().findAttribute(attr); node.attributes = attr; } } } /** * Put DOCTYPE declaration between the <:?xml version "1.0" ... ?> declaration, if any, and the * html tag. Should also work for any comments, etc. that may precede the html tag. * @param root root node * @return new doctype node */ Node newXhtmlDocTypeNode(Node root) { Node html = root.findHTML(this.configuration.tt); if (html == null) { return null; } Node newdoctype = newNode(); newdoctype.setType(Node.DOCTYPE_TAG); newdoctype.next = html; newdoctype.parent = root; newdoctype.prev = null; if (html == root.content) { // No declaration. root.content.prev = newdoctype; root.content = newdoctype; newdoctype.prev = null; } else { // we have an declaration. newdoctype.prev = html.prev; newdoctype.prev.next = newdoctype; } html.prev = newdoctype; return newdoctype; } /** * Adds a new xhtml doctype to the document. * @param root root node * @return true if a doctype has been added */ public boolean setXHTMLDocType(Node root) { String fpi = " "; String sysid = ""; String namespace = XHTML_NAMESPACE; String dtdsub = null; Node doctype; int dtdlen = 0; if ( this.configuration.docTypeMode == Configuration.DOCTYPE_IGNORE) { return true; } doctype = root.findDocType(); fixHTMLNameSpace(root, namespace); // #427839 - fix by Evan Lenz 05 Sep 00 if (this.configuration.docTypeMode == Configuration.DOCTYPE_OMIT) { if (doctype != null) { Node.discardElement(doctype); } return true; } if (this.configuration.docTypeMode == Configuration.DOCTYPE_AUTO) { // see what flavor of XHTML this document matches if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_STRICT)) { // use XHTML strict fpi = "-//W3C//DTD XHTML 1.0 Strict//EN"; sysid = VOYAGER_STRICT; } else if (TidyUtils.toBoolean(this.versions & Dict.VERS_FRAMESET)) { // use XHTML frames fpi = "-//W3C//DTD XHTML 1.0 Frameset//EN"; sysid = VOYAGER_FRAMESET; } else if (TidyUtils.toBoolean(this.versions & Dict.VERS_LOOSE)) { fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN"; sysid = VOYAGER_LOOSE; } else if (TidyUtils.toBoolean(this.versions & Dict.VERS_XHTML11)) { // use XHTML 1.1 fpi = "-//W3C//DTD XHTML 1.1//EN"; sysid = VOYAGER_11; } else { // proprietary fpi = null; sysid = ""; if (doctype != null)// #473490 - fix by Bjoern Hoehrmann 10 Oct 01 { Node.discardElement(doctype); } } } else if (this.configuration.docTypeMode == Configuration.DOCTYPE_STRICT) { fpi = "-//W3C//DTD XHTML 1.0 Strict//EN"; sysid = VOYAGER_STRICT; } else if (this.configuration.docTypeMode == Configuration.DOCTYPE_LOOSE) { fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN"; sysid = VOYAGER_LOOSE; } if (this.configuration.docTypeMode == Configuration.DOCTYPE_USER && this.configuration.docTypeStr != null) { fpi = this.configuration.docTypeStr; sysid = ""; } if (fpi == null) { return false; } if (doctype != null) { // Look for internal DTD subset if (configuration.xHTML || configuration.xmlOut) { int len = doctype.end - doctype.start + 1; String start = TidyUtils.getString(this.lexbuf, doctype.start, len); int dtdbeg = start.indexOf('['); if (dtdbeg >= 0) { int dtdend = start.substring(dtdbeg).indexOf(']'); if (dtdend >= 0) { dtdlen = dtdend + 1; dtdsub = start.substring(dtdbeg); } } } } else { if ((doctype = newXhtmlDocTypeNode(root)) == null) { return false; } } this.txtstart = this.lexsize; this.txtend = this.lexsize; // add public identifier addStringLiteral("html PUBLIC "); // check if the fpi is quoted or not if (fpi.charAt(0) == '"') { addStringLiteral(fpi); } else { addStringLiteral("\""); addStringLiteral(fpi); addStringLiteral("\""); } if (this.configuration.wraplen != 0 && sysid.length() + 6 >= this.configuration.wraplen) { addStringLiteral("\n\""); } else { // FG: don't wrap addStringLiteral(" \""); } // add system identifier addStringLiteral(sysid); addStringLiteral("\""); if (dtdlen > 0 && dtdsub != null) { addCharToLexer(' '); addStringLiteralLen(dtdsub, dtdlen); } this.txtend = this.lexsize; int length = this.txtend - this.txtstart; doctype.textarray = new byte[length]; System.arraycopy(this.lexbuf, this.txtstart, doctype.textarray, 0, length); doctype.start = 0; doctype.end = length; return false; } /** * Return the html version used in document. * @return version code */ public short apparentVersion() { switch (this.doctype) { case Dict.VERS_UNKNOWN : return htmlVersion(); case Dict.VERS_HTML20 : if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML20)) { return Dict.VERS_HTML20; } break; case Dict.VERS_HTML32 : if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML32)) { return Dict.VERS_HTML32; } break; // to replace old version by new case Dict.VERS_HTML40_STRICT : if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_STRICT)) { return Dict.VERS_HTML40_STRICT; } break; case Dict.VERS_HTML40_LOOSE : if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_LOOSE)) { return Dict.VERS_HTML40_LOOSE; } break; // to replace old version by new case Dict.VERS_FRAMESET : if (TidyUtils.toBoolean(this.versions & Dict.VERS_FRAMESET)) { return Dict.VERS_FRAMESET; } break; case Dict.VERS_XHTML11 : if (TidyUtils.toBoolean(this.versions & Dict.VERS_XHTML11)) { return Dict.VERS_XHTML11; } break; default : // should never reach here break; } // kludge to avoid error appearing at end of file // it would be better to note the actual position // when first encountering the doctype declaration this.lines = 1; this.columns = 1; report.warning(this, null, null, Report.INCONSISTENT_VERSION); return this.htmlVersion(); } /** * Fixup doctype if missing. * @param root root node * @return false if current version has not been identified */ public boolean fixDocType(Node root) { Node doctype; int guessed = Dict.VERS_HTML40_STRICT, i; if (this.badDoctype) { report.warning(this, null, null, Report.MALFORMED_DOCTYPE); } doctype = root.findDocType(); if (this.configuration.docTypeMode == Configuration.DOCTYPE_OMIT) { if (doctype != null) { Node.discardElement(doctype); } return true; } if (this.configuration.xmlOut || this.configuration.docTypeMode == Configuration.DOCTYPE_IGNORE) { return true; } if (this.configuration.docTypeMode == Configuration.DOCTYPE_STRICT) { Node.discardElement(doctype); doctype = null; guessed = Dict.VERS_HTML40_STRICT; } else if (this.configuration.docTypeMode == Configuration.DOCTYPE_LOOSE) { Node.discardElement(doctype); doctype = null; guessed = Dict.VERS_HTML40_LOOSE; } else if (this.configuration.docTypeMode == Configuration.DOCTYPE_AUTO) { if (doctype != null) { if (this.doctype == Dict.VERS_UNKNOWN) { return false; } switch (this.doctype) { case Dict.VERS_UNKNOWN : return false; case Dict.VERS_HTML20 : if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML20)) { return true; } break; // to replace old version by new case Dict.VERS_HTML32 : if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML32)) { return true; } break; // to replace old version by new case Dict.VERS_HTML40_STRICT : if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_STRICT)) { return true; } break; // to replace old version by new case Dict.VERS_HTML40_LOOSE : if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_LOOSE)) { return true; } break; // to replace old version by new case Dict.VERS_FRAMESET : if (TidyUtils.toBoolean(this.versions & Dict.VERS_FRAMESET)) { return true; } break; // to replace old version by new case Dict.VERS_XHTML11 : if (TidyUtils.toBoolean(this.versions & Dict.VERS_XHTML11)) { return true; } break; // to replace old version by new default : // should never reach here break; } // INCONSISTENT_VERSION warning is now issued by ApparentVersion() } // choose new doctype guessed = htmlVersion(); } if (guessed == Dict.VERS_UNKNOWN) { return false; } // for XML use the Voyager system identifier if (this.configuration.xmlOut || this.configuration.xmlTags || this.isvoyager) { if (doctype != null) { Node.discardElement(doctype); } fixHTMLNameSpace(root, XHTML_NAMESPACE); // Namespace is the same for all XHTML variants // Also, don't return yet. Still need to add DOCTYPE declaration. // // for (i = 0; i < W3CVersion.length; ++i) // { // if (guessed == W3CVersion[i].code) // { // fixHTMLNameSpace(root, W3CVersion[i].profile); // break; // } // } // return true; } if (doctype == null) { if ((doctype = newXhtmlDocTypeNode(root)) == null) { return false; } } this.txtstart = this.lexsize; this.txtend = this.lexsize; // use the appropriate public identifier addStringLiteral("html PUBLIC "); if (this.configuration.docTypeMode == Configuration.DOCTYPE_USER && this.configuration.docTypeStr != null && this.configuration.docTypeStr.length() > 0) { // check if the fpi is quoted or not if (this.configuration.docTypeStr.charAt(0) == '"') { addStringLiteral(this.configuration.docTypeStr); } else { addStringLiteral("\""); // #431889 - fix by Dave Bryan 04 Jan 2001 addStringLiteral(this.configuration.docTypeStr); addStringLiteral("\""); // #431889 - fix by Dave Bryan 04 Jan 2001 } } else if (guessed == Dict.VERS_HTML20) { addStringLiteral("\"-//IETF//DTD HTML 2.0//EN\""); } else { addStringLiteral("\"-//W3C//DTD "); for (i = 0; i < W3CVERSION.length; ++i) { if (guessed == W3CVERSION[i].code) { addStringLiteral(W3CVERSION[i].name); break; } } addStringLiteral("//EN\""); } this.txtend = this.lexsize; int length = this.txtend - this.txtstart; doctype.textarray = new byte[length]; System.arraycopy(this.lexbuf, this.txtstart, doctype.textarray, 0, length); doctype.start = 0; doctype.end = length; return true; } /** * Ensure XML document starts with <?XML version="1.0"?>. Add encoding attribute if not using * ASCII or UTF-8 output. * @param root root node * @return always true */ public boolean fixXmlDecl(Node root) { Node xml; AttVal version; AttVal encoding; if (root.content != null && root.content.type == Node.XML_DECL) { xml = root.content; } else { xml = newNode(Node.XML_DECL, this.lexbuf, 0, 0); xml.next = root.content; if (root.content != null) { root.content.prev = xml; xml.next = root.content; } root.content = xml; } version = xml.getAttrByName("version"); encoding = xml.getAttrByName("encoding"); // We need to insert a check if declared encoding and output encoding mismatch // and fix the Xml declaration accordingly!!! if (encoding == null && this.configuration.getOutCharEncoding() != Configuration.UTF8) { if (this.configuration.getOutCharEncoding() == Configuration.LATIN1) { xml.addAttribute("encoding", "iso-8859-1"); } if (this.configuration.getOutCharEncoding() == Configuration.ISO2022) { xml.addAttribute("encoding", "iso-2022"); } } if (version == null) { xml.addAttribute("version", "1.0"); } return true; } /** * Generates and inserts a new node. * @param name tag name * @return generated node */ public Node inferredTag(String name) { Node node; node = newNode(Node.START_TAG, this.lexbuf, this.txtstart, this.txtend, name); node.implicit = true; return node; } /** * Create a text node for the contents of a CDATA element like style or script which ends with </foo> for some * foo. * @param container container node * @return cdata node */ public Node getCDATA(Node container) { int c, lastc, prelastc, start, len, i; int qt = 0; int esc = 0; String str=""; boolean endtag = false; boolean begtag = false; boolean cdata = false; boolean comment = false; if (container.isJavaScript()) { esc = '\\'; } this.lines = this.in.getCurline(); this.columns = this.in.getCurcol(); this.waswhite = false; this.txtstart = this.lexsize; this.txtend = this.lexsize; lastc = '\0'; prelastc = '\0'; start = -1; while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM) { // treat \r\n as \n and \r as \n if (qt > 0) { // #598860 script parsing fails with quote chars // A quoted string is ended by the quotation character, or end of line if ((c == '\r' || c == '\n' || c == qt) && (!TidyUtils.toBoolean(esc) || lastc != esc)) { qt = 0; } else if (c == '/' && lastc == '<') { start = this.lexsize + 1; // to first letter } else if (c == '>' && start >= 0) { len = this.lexsize - start; this.lines = this.in.getCurline(); this.columns = this.in.getCurcol() - 3; report.warning(this, null, null, Report.BAD_CDATA_CONTENT); // if javascript insert backslash before / if (TidyUtils.toBoolean(esc)) { for (i = this.lexsize; i > start - 1; --i) { this.lexbuf[i] = this.lexbuf[i - 1]; } this.lexbuf[start - 1] = (byte) esc; this.lexsize++; } start = -1; } } else if (TidyUtils.isQuote(c) && (!TidyUtils.toBoolean(esc) || lastc != esc)) { qt = c; } else if (c == '<' && !cdata && !comment) { start = this.lexsize + 1; // to first letter endtag = false; begtag = true; } // else if (c == '!' && lastc == '<') // Cancel start tag // { // start = -1; // endtag = false; // begtag = false; // } // Fix CDATA and comments. else if( c == '[' && this.lexsize >= 8 && TidyUtils.getString(this.lexbuf, this.lexsize-8, 8).equals("= 3 && TidyUtils.getString(this.lexbuf, this.lexsize-3, 3).equals("' && cdata && lastc == ']' && prelastc == ']'){ cdata = false; } else if( c == '>' && comment && lastc == '-' && prelastc == '-'){ comment = false; if(cdata || this.configuration.xHTML ){ this.lexsize -= 2; continue; } } else if (c == '>' && start >= 0) // End of begin or end tag { int decr = 2; if (endtag) { // str = TidyUtils.getString(this.lexbuf, start, len); if (container.element.equalsIgnoreCase(str)) { this.txtend = start - decr; this.lexsize = start - decr; // #433857 - fix by Huajun Zeng 26 Apr 01 break; } } // Unquoted markup will end SCRIPT or STYLE elements this.lines = this.in.getCurline(); this.columns = this.in.getCurcol() - 3; report.warning(this, null, null, Report.BAD_CDATA_CONTENT); if (begtag) { decr = 1; } this.txtend = start - decr; this.lexsize = start - decr; break; } // #427844 - fix by Markus Hoenicka 21 Oct 00 else if (c == '\r') { // if (begtag || endtag) // { // continue; // discard whitespace in endtag // } c = this.in.readChar(); if (c != '\n') { this.in.ungetChar(c); } c = '\n'; } // else if ((c == '\n' || c == '\t' || c == ' ') && (endtag)) // { // continue; // discard whitespace in endtag // } if (endtag && TidyUtils.isNamechar((char) c) ) { str = str + (char) c; } if(begtag && !TidyUtils.isNamechar((char) c)){ if(lastc == '<' || ( qt == 0 && c !='=' && c !=';' && !TidyUtils.isWhite((char) c)) ){ start = -1; endtag = false; begtag = false; } } addCharToLexer(c); this.txtend = this.lexsize; prelastc = lastc; lastc = c; } if (c == StreamIn.END_OF_STREAM) { report.warning(this, container, null, Report.MISSING_ENDTAG_FOR); } if (this.txtend > this.txtstart) { this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend); return this.token; } return null; } /** * * */ public void ungetToken() { this.pushed = true; } /** * Gets a token. * @param mode one of the following: *
    *
  • MixedContent-- for elements which don't accept PCDATA
  • *
  • Preformatted-- white spacepreserved as is
  • *
  • IgnoreMarkup-- for CDATA elements such as script, style
  • *
* @return next Node */ public Node getToken(short mode) { int c = 0; int badcomment = 0; // pass by reference boolean[] isempty = new boolean[1]; boolean inDTDSubset = false; AttVal attributes = null; short basemode = mode; if (this.pushed) { // duplicate inlines in preference to pushed text nodes when appropriate if (this.token.type != Node.TEXT_NODE || (this.insert == -1 && this.inode == null)) { this.pushed = false; return this.token; } } // at start of block elements, unclosed inline if (this.insert != -1 || this.inode != null) { return insertedToken(); } this.lines = this.in.getCurline(); this.columns = this.in.getCurcol(); this.waswhite = false; this.txtstart = this.lexsize; this.txtend = this.lexsize; while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM) { // FG fix for [427846] different from tidy // if (this.insertspace && (!TidyUtils.toBoolean(mode & IGNORE_WHITESPACE))) if (this.insertspace && mode != IGNORE_WHITESPACE) { addCharToLexer(' '); } if (this.insertspace && (!TidyUtils.toBoolean(mode & IGNORE_WHITESPACE))) { this.waswhite = true; this.insertspace = false; } // treat \r\n as \n and \r as \n if (c == '\r') { c = this.in.readChar(); if (c != '\n') { this.in.ungetChar(c); } c = '\n'; } addCharToLexer(c); switch (this.state) { case LEX_CONTENT : // element content // Discard white space if appropriate. // Its cheaper to do this here rather than in parser methods for elements that // don't have mixed content. if (TidyUtils.isWhite((char) c) && (mode == IGNORE_WHITESPACE) && this.lexsize == this.txtstart + 1) { --this.lexsize; this.waswhite = false; this.lines = this.in.getCurline(); this.columns = this.in.getCurcol(); continue; } if (c == '<') { this.state = LEX_GT; continue; } if (TidyUtils.isWhite((char) c)) { // was previous char white? if (this.waswhite) { if (mode != PREFORMATTED && mode != IGNORE_MARKUP) { --this.lexsize; this.lines = this.in.getCurline(); this.columns = this.in.getCurcol(); } } else { // prev char wasn't white this.waswhite = true; if (mode != PREFORMATTED && mode != IGNORE_MARKUP && c != ' ') { changeChar((byte) ' '); } } continue; } else if (c == '&' && mode != IGNORE_MARKUP) { parseEntity(mode); } // this is needed to avoid trimming trailing whitespace if (mode == IGNORE_WHITESPACE) { mode = MIXED_CONTENT; } this.waswhite = false; continue; case LEX_GT : // < // check for endtag if (c == '/') { c = this.in.readChar(); if (c == StreamIn.END_OF_STREAM) { this.in.ungetChar(c); continue; } addCharToLexer(c); if (TidyUtils.isLetter((char) c)) { this.lexsize -= 3; this.txtend = this.lexsize; this.in.ungetChar(c); this.state = LEX_ENDTAG; this.lexbuf[this.lexsize] = (byte) '\0'; // debug // changed from // this.in.curcol -= 2; this.columns -= 2; // if some text before the this.txtstart) { // trim space char before end tag if (mode == IGNORE_WHITESPACE && this.lexbuf[this.lexsize - 1] == (byte) ' ') { this.lexsize -= 1; this.txtend = this.lexsize; } this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend); return this.token; } continue; // no text so keep going } // otherwise treat as CDATA this.waswhite = false; this.state = LEX_CONTENT; continue; } if (mode == IGNORE_MARKUP) { // otherwise treat as CDATA this.waswhite = false; this.state = LEX_CONTENT; continue; } // look out for comments, doctype or marked sections this isn't quite right, but its getting there if (c == '!') { c = this.in.readChar(); if (c == '-') { c = this.in.readChar(); if (c == '-') { this.state = LEX_COMMENT; // comment this.lexsize -= 2; this.txtend = this.lexsize; // if some text before < return it now if (this.txtend > this.txtstart) { this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend); return this.token; } this.txtstart = this.lexsize; continue; } report.warning(this, null, null, Report.MALFORMED_COMMENT); } else if (c == 'd' || c == 'D') { this.state = LEX_DOCTYPE; // doctype this.lexsize -= 2; this.txtend = this.lexsize; mode = IGNORE_WHITESPACE; // skip until white space or '>' for (;;) { c = this.in.readChar(); if (c == StreamIn.END_OF_STREAM || c == '>') { this.in.ungetChar(c); break; } if (!TidyUtils.isWhite((char) c)) { continue; } // and skip to end of whitespace for (;;) { c = this.in.readChar(); if (c == StreamIn.END_OF_STREAM || c == '>') { this.in.ungetChar(c); break; } if (TidyUtils.isWhite((char) c)) { continue; } this.in.ungetChar(c); break; } break; } // if some text before < return it now if (this.txtend > this.txtstart) { this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend); return this.token; } this.txtstart = this.lexsize; continue; } else if (c == '[') { // Word 2000 embeds ... sequences this.lexsize -= 2; this.state = LEX_SECTION; this.txtend = this.lexsize; // if some text before < return it now if (this.txtend > this.txtstart) { this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend); return this.token; } this.txtstart = this.lexsize; continue; } // otherwise swallow chars up to and including next '>' while (true) { c = this.in.readChar(); if (c == '>') { break; } if (c == -1) { this.in.ungetChar(c); break; } } this.lexsize -= 2; this.lexbuf[this.lexsize] = (byte) '\0'; this.state = LEX_CONTENT; continue; } // processing instructions if (c == '?') { this.lexsize -= 2; this.state = LEX_PROCINSTR; this.txtend = this.lexsize; // if some text before < return it now if (this.txtend > this.txtstart) { this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend); return this.token; } this.txtstart = this.lexsize; continue; } // Microsoft ASP's e.g. <% ... server-code ... %> if (c == '%') { this.lexsize -= 2; this.state = LEX_ASP; this.txtend = this.lexsize; // if some text before < return it now if (this.txtend > this.txtstart) { this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend); return this.token; } this.txtstart = this.lexsize; continue; } // Netscapes JSTE e.g. <# ... server-code ... #> if (c == '#') { this.lexsize -= 2; this.state = LEX_JSTE; this.txtend = this.lexsize; // if some text before < return it now if (this.txtend > this.txtstart) { this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend); return this.token; } this.txtstart = this.lexsize; continue; } // check for start tag if (TidyUtils.isLetter((char) c)) { this.in.ungetChar(c); // push back letter this.lexsize -= 2; // discard " <" + letter this.txtend = this.lexsize; this.state = LEX_STARTTAG; // ready to read tag name // if some text before < return it now if (this.txtend > this.txtstart) { this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend); return this.token; } continue; // no text so keep going } // otherwise treat as CDATA this.state = LEX_CONTENT; this.waswhite = false; continue; case LEX_ENDTAG : // ' while (c != '>') { c = this.in.readChar(); if (c == StreamIn.END_OF_STREAM) { break; } } if (c == StreamIn.END_OF_STREAM) { this.in.ungetChar(c); continue; } this.state = LEX_CONTENT; this.waswhite = false; return this.token; // the endtag token case LEX_STARTTAG : // first letter of tagname this.txtstart = this.lexsize - 1; // set txtstart to first letter c = parseTagName(); isempty[0] = false; attributes = null; this.token = newNode( (isempty[0] ? Node.START_END_TAG : Node.START_TAG), this.lexbuf, this.txtstart, this.txtend, TidyUtils.getString(this.lexbuf, this.txtstart, this.txtend - this.txtstart)); // parse attributes, consuming closing ">" if (c != '>') { if (c == '/') { this.in.ungetChar(c); } attributes = parseAttrs(isempty); } if (isempty[0]) { this.token.type = Node.START_END_TAG; } this.token.attributes = attributes; this.lexsize = this.txtstart; this.txtend = this.txtstart; // swallow newline following start tag // special check needed for CRLF sequence // this doesn't apply to empty elements // nor to preformatted content that needs escaping if ( (mode != PREFORMATTED || preContent(this.token)) && (this.token.expectsContent() || this.token.tag == this.configuration.tt.tagBr)) { c = this.in.readChar(); if (c == '\r') { c = this.in.readChar(); if (c != '\n') { this.in.ungetChar(c); } } else if (c != '\n' && c != '\f') { this.in.ungetChar(c); } this.waswhite = true; // to swallow leading whitespace } else { this.waswhite = false; } this.state = LEX_CONTENT; if (this.token.tag == null) { report.error(this, null, this.token, Report.UNKNOWN_ELEMENT); } else if (!this.configuration.xmlTags) { constrainVersion(this.token.tag.versions); if (TidyUtils.toBoolean(this.token.tag.versions & Dict.VERS_PROPRIETARY)) { // #427810 - fix by Gary Deschaines 24 May 00 if (this.configuration.makeClean && (this.token.tag != this.configuration.tt.tagNobr && // this.token.tag != this.configuration.tt.tagWbr)) { report.warning(this, null, this.token, Report.PROPRIETARY_ELEMENT); } // #427810 - fix by Terry Teague 2 Jul 01 else if (!this.configuration.makeClean) { report.warning(this, null, this.token, Report.PROPRIETARY_ELEMENT); } } if (this.token.tag.getChkattrs() != null) { this.token.tag.getChkattrs().check(this, this.token); } else { this.token.checkAttributes(this); } // should this be called before attribute checks? this.token.repairDuplicateAttributes(this); } return this.token; // return start tag case LEX_COMMENT : // seen if (c != '-') { continue; } c = this.in.readChar(); addCharToLexer(c); if (c != '-') { continue; } end_comment : while (true) { c = this.in.readChar(); if (c == '>') { if (badcomment != 0) { report.warning(this, null, null, Report.MALFORMED_COMMENT); } this.txtend = this.lexsize - 2; // AQ 8Jul2000 this.lexbuf[this.lexsize] = (byte) '\0'; this.state = LEX_CONTENT; this.waswhite = false; this.token = newNode(Node.COMMENT_TAG, this.lexbuf, this.txtstart, this.txtend); // now look for a line break c = this.in.readChar(); if (c == '\r') { c = this.in.readChar(); if (c != '\n') { this.token.linebreak = true; } } if (c == '\n') { this.token.linebreak = true; } else { this.in.ungetChar(c); } return this.token; } // note position of first such error in the comment if (badcomment == 0) { this.lines = this.in.getCurline(); this.columns = this.in.getCurcol() - 3; } badcomment++; if (this.configuration.fixComments) { this.lexbuf[this.lexsize - 2] = (byte) '='; } addCharToLexer(c); // if '-' then look for '>' to end the comment if (c != '-') { break end_comment; } } // otherwise continue to look for --> this.lexbuf[this.lexsize - 2] = (byte) '='; continue; case LEX_DOCTYPE : // seen ' munging whitespace if (TidyUtils.isWhite((char) c)) { if (this.waswhite) { this.lexsize -= 1; } this.waswhite = true; } else { this.waswhite = false; } if (inDTDSubset) { if (c == ']') { inDTDSubset = false; } } else if (c == '[') { inDTDSubset = true; } if (inDTDSubset || c != '>') { continue; } this.lexsize -= 1; this.txtend = this.lexsize; this.lexbuf[this.lexsize] = (byte) '\0'; this.state = LEX_CONTENT; this.waswhite = false; this.token = newNode(Node.DOCTYPE_TAG, this.lexbuf, this.txtstart, this.txtend); // make a note of the version named by the doctype this.doctype = findGivenVersion(this.token); return this.token; case LEX_PROCINSTR : // seen ' // check for PHP preprocessor instructions if (this.lexsize - this.txtstart == 3) { if ((TidyUtils.getString(this.lexbuf, this.txtstart, 3)).equals("php")) { this.state = LEX_PHP; continue; } } if (this.lexsize - this.txtstart == 4) { if ((TidyUtils.getString(this.lexbuf, this.txtstart, 3)).equals("xml") && TidyUtils.isWhite((char) this.lexbuf[this.txtstart + 3])) { this.state = LEX_XMLDECL; attributes = null; continue; } } if (this.configuration.xmlPIs) // insist on ?> as terminator { if (c != '?') { continue; } // now look for '>' c = this.in.readChar(); if (c == StreamIn.END_OF_STREAM) { report.warning(this, null, null, Report.UNEXPECTED_END_OF_FILE); this.in.ungetChar(c); continue; } addCharToLexer(c); } if (c != '>') { continue; } this.lexsize -= 1; this.txtend = this.lexsize; this.lexbuf[this.lexsize] = (byte) '\0'; this.state = LEX_CONTENT; this.waswhite = false; this.token = newNode(Node.PROC_INS_TAG, this.lexbuf, this.txtstart, this.txtend); return this.token; case LEX_ASP : // seen <% so look for "%> " if (c != '%') { continue; } // now look for '>' c = this.in.readChar(); if (c != '>') { this.in.ungetChar(c); continue; } this.lexsize -= 1; this.txtend = this.lexsize; this.lexbuf[this.lexsize] = (byte) '\0'; this.state = LEX_CONTENT; this.waswhite = false; this.token = newNode(Node.ASP_TAG, this.lexbuf, this.txtstart, this.txtend); return this.token; case LEX_JSTE : // seen <# so look for "#> " if (c != '#') { continue; } // now look for '>' c = this.in.readChar(); if (c != '>') { this.in.ungetChar(c); continue; } this.lexsize -= 1; this.txtend = this.lexsize; this.lexbuf[this.lexsize] = (byte) '\0'; this.state = LEX_CONTENT; this.waswhite = false; this.token = newNode(Node.JSTE_TAG, this.lexbuf, this.txtstart, this.txtend); return this.token; case LEX_PHP : // seen " " if (c != '?') { continue; } // now look for '>' c = this.in.readChar(); if (c != '>') { this.in.ungetChar(c); continue; } this.lexsize -= 1; this.txtend = this.lexsize; this.lexbuf[this.lexsize] = (byte) '\0'; this.state = LEX_CONTENT; this.waswhite = false; this.token = newNode(Node.PHP_TAG, this.lexbuf, this.txtstart, this.txtend); return this.token; case LEX_XMLDECL : // seen "" if (TidyUtils.isWhite((char) c) && c != '?') { continue; } // get pseudo-attribute if (c != '?') { String name; Node[] asp = new Node[1]; Node[] php = new Node[1]; AttVal av = new AttVal(); int[] pdelim = new int[1]; isempty[0] = false; this.in.ungetChar(c); name = this.parseAttribute(isempty, asp, php); av.attribute = name; av.value = this.parseValue(name, true, isempty, pdelim); av.delim = pdelim[0]; av.next = attributes; attributes = av; // continue; } // now look for '>' c = this.in.readChar(); if (c != '>') { this.in.ungetChar(c); continue; } this.lexsize -= 1; this.txtend = this.txtstart; this.lexbuf[this.txtend] = '\0'; this.state = LEX_CONTENT; this.waswhite = false; this.token = newNode(Node.XML_DECL, this.lexbuf, this.txtstart, this.txtend); this.token.attributes = attributes; return this.token; case LEX_SECTION : // seen " " if (c == '[') { if (this.lexsize == (this.txtstart + 6) && (TidyUtils.getString(this.lexbuf, this.txtstart, 6)).equals("CDATA[")) { this.state = LEX_CDATA; this.lexsize -= 6; mode = IGNORE_MARKUP; continue; } } if (c != ']') { continue; } // now look for '>' c = this.in.readChar(); if (c != '>') { this.in.ungetChar(c); continue; } this.lexsize -= 1; this.txtend = this.lexsize; this.lexbuf[this.lexsize] = (byte) '\0'; this.state = LEX_CONTENT; this.waswhite = false; this.token = newNode(Node.SECTION_TAG, this.lexbuf, this.txtstart, this.txtend); return this.token; case LEX_CDATA : // seen " " if (c != ']') { continue; } // now look for ']' c = this.in.readChar(); if (c != ']') { this.in.ungetChar(c); continue; } // now look for '>' c = this.in.readChar(); if (c != '>') { this.in.ungetChar(c); continue; } this.lexsize -= 1; this.txtend = this.lexsize; this.lexbuf[this.lexsize] = (byte) '\0'; this.state = LEX_CONTENT; this.waswhite = false; this.token = newNode(Node.CDATA_TAG, this.lexbuf, this.txtstart, this.txtend); return this.token; default : // should never reach here break; } } if (this.state == LEX_CONTENT) // text string { this.txtend = this.lexsize; if (this.txtend > this.txtstart) { this.in.ungetChar(c); if (this.lexbuf[this.lexsize - 1] == (byte) ' ') { this.lexsize -= 1; this.txtend = this.lexsize; } this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend); return this.token; } } else if (this.state == LEX_COMMENT) // comment { if (c == StreamIn.END_OF_STREAM) { report.warning(this, null, null, Report.MALFORMED_COMMENT); } this.txtend = this.lexsize; this.lexbuf[this.lexsize] = (byte) '\0'; this.state = LEX_CONTENT; this.waswhite = false; this.token = newNode(Node.COMMENT_TAG, this.lexbuf, this.txtstart, this.txtend); return this.token; } return null; } /** * parser for ASP within start tags Some people use ASP for to customize attributes Tidy isn't really well suited to * dealing with ASP This is a workaround for attributes, but won't deal with the case where the ASP is used to * tailor the attribute value. Here is an example of a work around for using ASP in attribute values: * href='<%=rsSchool.Fields("ID").Value%>' where the ASP that generates the attribute value is * masked from Tidy by the quotemarks. * @return parsed Node */ public Node parseAsp() { int c; Node asp = null; this.txtstart = this.lexsize; while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM) { addCharToLexer(c); if (c != '%') { continue; } if ((c = this.in.readChar()) == StreamIn.END_OF_STREAM) { break; } addCharToLexer(c); if (c == '>') { break; } } this.lexsize -= 2; this.txtend = this.lexsize; if (this.txtend > this.txtstart) { asp = newNode(Node.ASP_TAG, this.lexbuf, this.txtstart, this.txtend); } this.txtstart = this.txtend; return asp; } /** * PHP is like ASP but is based upon XML processing instructions, e.g. <?php ... ?>. * @return parsed Node */ public Node parsePhp() { int c; Node php = null; this.txtstart = this.lexsize; while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM) { addCharToLexer(c); if (c != '?') { continue; } if ((c = this.in.readChar()) == StreamIn.END_OF_STREAM) { break; } addCharToLexer(c); if (c == '>') { break; } } this.lexsize -= 2; this.txtend = this.lexsize; if (this.txtend > this.txtstart) { php = newNode(Node.PHP_TAG, this.lexbuf, this.txtstart, this.txtend); } this.txtstart = this.txtend; return php; } /** * consumes the '>' terminating start tags. * @param isempty flag is passed as array so it can be modified * @param asp asp Node, passed as array so it can be modified * @param php php Node, passed as array so it can be modified * @return parsed attribute */ public String parseAttribute(boolean[] isempty, Node[] asp, Node[] php) { int start = 0; String attr; int c = 0; int lastc = 0; asp[0] = null; // clear asp pointer php[0] = null; // clear php pointer // skip white space before the attribute for (;;) { c = this.in.readChar(); if (c == '/') { c = this.in.readChar(); if (c == '>') { isempty[0] = true; return null; } this.in.ungetChar(c); c = '/'; break; } if (c == '>') { return null; } if (c == '<') { c = this.in.readChar(); if (c == '%') { asp[0] = parseAsp(); return null; } else if (c == '?') { php[0] = parsePhp(); return null; } this.in.ungetChar(c); if (this.state != LEX_XMLDECL) // FG fix for 532535 { this.in.ungetChar('<'); // fix for 433360 } report.attrError(this, this.token, null, Report.UNEXPECTED_GT); return null; } if (c == '=') { report.attrError(this, this.token, null, Report.UNEXPECTED_EQUALSIGN); continue; } if (c == '"' || c == '\'') { report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK); continue; } if (c == StreamIn.END_OF_STREAM) { report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE); this.in.ungetChar(c); return null; } if (!TidyUtils.isWhite((char) c)) { break; } } start = this.lexsize; lastc = c; for (;;) { // but push back '=' for parseValue() if (c == '=' || c == '>') { this.in.ungetChar(c); break; } if (c == '<' || c == StreamIn.END_OF_STREAM) { this.in.ungetChar(c); break; } if (lastc == '-' && (c == '"' || c == '\'')) { this.lexsize--; this.in.ungetChar(c); break; } if (TidyUtils.isWhite((char) c)) { break; } // what should be done about non-namechar characters? // currently these are incorporated into the attr name if (!this.configuration.xmlTags && TidyUtils.isUpper((char) c)) { c = TidyUtils.toLower((char) c); } // ++len; #427672 - handle attribute names with multibyte chars - fix by Randy Waki - 10 Aug 00 addCharToLexer(c); lastc = c; c = this.in.readChar(); } // #427672 - handle attribute names with multibyte chars - fix by Randy Waki - 10 Aug 00 int len = this.lexsize - start; attr = (len > 0 ? TidyUtils.getString(this.lexbuf, start, len) : null); this.lexsize = start; return attr; } /** * Invoked when < is seen in place of attribute value but terminates on whitespace if not ASP, PHP or Tango this * routine recognizes ' and " quoted strings. * @return delimiter */ public int parseServerInstruction() { int c, delim = '"'; boolean isrule = false; c = this.in.readChar(); addCharToLexer(c); // check for ASP, PHP or Tango if (c == '%' || c == '?' || c == '@') { isrule = true; } for (;;) { c = this.in.readChar(); if (c == StreamIn.END_OF_STREAM) { break; } if (c == '>') { if (isrule) { addCharToLexer(c); } else { this.in.ungetChar(c); } break; } // if not recognized as ASP, PHP or Tango // then also finish value on whitespace if (!isrule) { if (TidyUtils.isWhite((char) c)) { break; } } addCharToLexer(c); if (c == '"') { do { c = this.in.readChar(); if (endOfInput()) // #427840 - fix by Terry Teague 30 Jun 01 { report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE); this.in.ungetChar(c); return 0; } if (c == '>') // #427840 - fix by Terry Teague 30 Jun 01 { this.in.ungetChar(c); report.attrError(this, this.token, null, Report.UNEXPECTED_GT); return 0; } addCharToLexer(c); } while (c != '"'); delim = '\''; continue; } if (c == '\'') { do { c = this.in.readChar(); if (endOfInput()) // #427840 - fix by Terry Teague 30 Jun 01 { report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE); this.in.ungetChar(c); return 0; } if (c == '>') // #427840 - fix by Terry Teague 30 Jun 01 { this.in.ungetChar(c); report.attrError(this, this.token, null, Report.UNEXPECTED_GT); return 0; } addCharToLexer(c); } while (c != '\''); } } return delim; } /** * Parse an attribute value. * @param name attribute name * @param foldCase fold case? * @param isempty is attribute empty? Passed as an array reference to allow modification * @param pdelim delimiter, passed as an array reference to allow modification * @return parsed value */ public String parseValue(String name, boolean foldCase, boolean[] isempty, int[] pdelim) { // values start with "=" or " = " etc. // doesn't consume the ">" at end of start tag int len = 0; int start; boolean seenGt = false; boolean munge = true; int c = 0; int lastc, delim, quotewarning; String value; delim = 0; pdelim[0] = '"'; // Henry Zrepa reports that some folk are using the embed element with script attributes where newlines are // significant and must be preserved if (this.configuration.literalAttribs) { munge = false; } // skip white space before the '=' while (true) { c = this.in.readChar(); if (c == StreamIn.END_OF_STREAM) { this.in.ungetChar(c); break; } if (!TidyUtils.isWhite((char) c)) { break; } } // c should be '=' if there is a value other legal possibilities are white space, '/' and '>' if (c != '=' && c != '"' && c != '\'') { this.in.ungetChar(c); return null; } // skip white space after '=' while (true) { c = this.in.readChar(); if (c == StreamIn.END_OF_STREAM) { this.in.ungetChar(c); break; } if (!TidyUtils.isWhite((char) c)) { break; } } // check for quote marks if (c == '"' || c == '\'') { delim = c; } else if (c == '<') { start = this.lexsize; addCharToLexer(c); pdelim[0] = parseServerInstruction(); len = this.lexsize - start; this.lexsize = start; return (len > 0 ? TidyUtils.getString(this.lexbuf, start, len) : null); } else { this.in.ungetChar(c); } // and read the value string check for quote mark if needed quotewarning = 0; start = this.lexsize; c = '\0'; while (true) { lastc = c; // track last character c = this.in.readChar(); if (c == StreamIn.END_OF_STREAM) { report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE); this.in.ungetChar(c); break; } if (delim == (char) 0) { if (c == '>') { this.in.ungetChar(c); break; } if (c == '"' || c == '\'') { report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK); break; } if (c == '<') { this.in.ungetChar(c); // fix for 433360 c = '>'; this.in.ungetChar(c); report.attrError(this, this.token, null, Report.UNEXPECTED_GT); break; } // For cases like
need to avoid treating /> as part of the attribute value, however // care is needed to avoid so treating
in this way, which would map the // tag to if (c == '/') { // peek ahead in case of /> c = this.in.readChar(); if (c == '>' && !AttributeTable.getDefaultAttributeTable().isUrl(name)) { isempty[0] = true; this.in.ungetChar(c); break; } // unget peeked char this.in.ungetChar(c); c = '/'; } } else { // delim is '\'' or '"' if (c == delim) { break; } // treat CRLF, CR and LF as single line break if (c == '\r') { c = this.in.readChar(); if (c != '\n') { this.in.ungetChar(c); } c = '\n'; } if (c == '\n' || c == '<' || c == '>') { ++quotewarning; } if (c == '>') { seenGt = true; } } if (c == '&') { // no entities in ID attributes if ("id".equalsIgnoreCase(name)) { report.attrError(this, null, null, Report.ENTITY_IN_ID); continue; } addCharToLexer(c); parseEntity((short) 0); continue; } // kludge for JavaScript attribute values with line continuations in string literals if (c == '\\') { c = this.in.readChar(); if (c != '\n') { this.in.ungetChar(c); c = '\\'; } } if (TidyUtils.isWhite((char) c)) { if (delim == (char) 0) { break; } if (munge) { // discard line breaks in quoted URLs // #438650 - fix by Randy Waki if (c == '\n' && AttributeTable.getDefaultAttributeTable().isUrl(name)) { // warn that we discard this newline report.attrError(this, this.token, null, Report.NEWLINE_IN_URI); continue; } c = ' '; if (lastc == ' ') { continue; } } } else if (foldCase && TidyUtils.isUpper((char) c)) { c = TidyUtils.toLower((char) c); } addCharToLexer(c); } if (quotewarning > 10 && seenGt && munge) { // there is almost certainly a missing trailing quote mark as we have see too many newlines, < or > // characters. an exception is made for Javascript attributes and the javascript URL scheme which may // legitimately include < and >, and for attributes starting with " 0 || delim != 0) { // ignore leading and trailing white space for all but title, alt, value and prompts attributes unless // --literal-attributes is set to yes // #994841 - Whitespace is removed from value attributes if (munge && !TidyUtils.isInValuesIgnoreCase(new String[]{"alt", "title", "value", "prompt"}, name)) { while (TidyUtils.isWhite((char) this.lexbuf[start + len - 1])) { --len; } while (TidyUtils.isWhite((char) this.lexbuf[start]) && start < len) { ++start; --len; } } value = TidyUtils.getString(this.lexbuf, start, len); } else { value = null; } // note delimiter if given if (delim != 0) { pdelim[0] = delim; } else { pdelim[0] = '"'; } return value; } /** * Check if attr is a valid name. * @param attr String to check, must be non-null * @return true if attr is a valid name. */ public static boolean isValidAttrName(String attr) { char c; int i; // first character should be a letter c = attr.charAt(0); if (!TidyUtils.isLetter(c)) { return false; } // remaining characters should be namechars for (i = 1; i < attr.length(); i++) { c = attr.charAt(i); if (TidyUtils.isNamechar(c)) { continue; } return false; } return true; } /** * In CSS1, selectors can contain only the characters A-Z, 0-9, and Unicode characters 161-255, plus dash (-); they * cannot start with a dash or a digit; they can also contain escaped characters and any Unicode character as a * numeric code (see next item). The backslash followed by at most four hexadecimal digits (0..9A..F) stands for the * Unicode character with that number. Any character except a hexadecimal digit can be escaped to remove its special * meaning, by putting a backslash in front. * @param buf css selector name * @return true if the given string is a valid css1 selector name */ public static boolean isCSS1Selector(String buf) { if (buf == null) { return false; } // #508936 - CSS class naming for -clean option boolean valid = true; int esclen = 0; char c; int pos; for (pos = 0; valid && pos < buf.length(); ++pos) { c = buf.charAt(pos); if (c == '\\') { esclen = 1; // ab\555\444 is 4 chars {'a', 'b', \555, \444} } else if (Character.isDigit(c)) { // Digit not 1st, unless escaped (Max length "\112F") if (esclen > 0) { valid = (++esclen < 6); } if (valid) { valid = (pos > 0 || esclen > 0); } } else { valid = (esclen > 0 // Escaped? Anything goes. || (pos > 0 && c == '-') // Dash cannot be 1st char || Character.isLetter(c) // a-z, A-Z anywhere || (c >= 161 && c <= 255)); // Unicode 161-255 anywhere esclen = 0; } } return valid; } /** * Parse tag attributes. * @param isempty is tag empty? * @return parsed attribute/value list */ public AttVal parseAttrs(boolean[] isempty) { AttVal av, list; String attribute, value; int[] delim = new int[1]; Node[] asp = new Node[1]; Node[] php = new Node[1]; list = null; while (!endOfInput()) { attribute = parseAttribute(isempty, asp, php); if (attribute == null) { // check if attributes are created by ASP markup if (asp[0] != null) { av = new AttVal(list, null, asp[0], null, '\0', null, null); list = av; continue; } // check if attributes are created by PHP markup if (php[0] != null) { av = new AttVal(list, null, null, php[0], '\0', null, null); list = av; continue; } break; } value = parseValue(attribute, false, isempty, delim); if (attribute != null && isValidAttrName(attribute)) { av = new AttVal(list, null, null, null, delim[0], attribute, value); av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av); list = av; } else { av = new AttVal(null, null, null, null, 0, attribute, value); // #427664 - fix by Gary Peskin 04 Aug 00; other fixes by Dave Raggett if (value != null) { report.attrError(this, this.token, av, Report.BAD_ATTRIBUTE_VALUE); } else if (TidyUtils.lastChar(attribute) == '"') { report.attrError(this, this.token, av, Report.MISSING_QUOTEMARK); } else { report.attrError(this, this.token, av, Report.UNKNOWN_ATTRIBUTE); } } } return list; } /** * Push a copy of an inline node onto stack but don't push if implicit or OBJECT or APPLET (implicit tags are ones * generated from the istack) One issue arises with pushing inlines when the tag is already pushed. For instance: * <p><em> text <p><em> more text Shouldn't be mapped to * <p><em> text </em></p><p><em><em> more text </em></em> * @param node Node to be pushed */ public void pushInline(Node node) { IStack is; if (node.implicit) { return; } if (node.tag == null) { return; } if (!TidyUtils.toBoolean(node.tag.model & Dict.CM_INLINE)) { return; } if (TidyUtils.toBoolean(node.tag.model & Dict.CM_OBJECT)) { return; } if (node.tag != this.configuration.tt.tagFont && isPushed(node)) { return; } // make sure there is enough space for the stack is = new IStack(); is.tag = node.tag; is.element = node.element; if (node.attributes != null) { is.attributes = cloneAttributes(node.attributes); } this.istack.push(is); } /** * Pop a copy of an inline node from the stack. * @param node Node to be popped */ public void popInline(Node node) { IStack is; if (node != null) { if (node.tag == null) { return; } if (!TidyUtils.toBoolean(node.tag.model & Dict.CM_INLINE)) { return; } if (TidyUtils.toBoolean(node.tag.model & Dict.CM_OBJECT)) { return; } // if node is then pop until we find an if (node.tag == this.configuration.tt.tagA) { while (this.istack.size() > 0) { is = (IStack) this.istack.pop(); if (is.tag == this.configuration.tt.tagA) { break; } } if (this.insert >= this.istack.size()) { this.insert = -1; } return; } } if (this.istack.size() > 0) { is = (IStack) this.istack.pop(); if (this.insert >= this.istack.size()) { this.insert = -1; } } } /** * Is the node in the stack? * @param node Node * @return true is the node is found in the stack */ public boolean isPushed(Node node) { int i; IStack is; for (i = this.istack.size() - 1; i >= 0; --i) { is = (IStack) this.istack.elementAt(i); if (is.tag == node.tag) { return true; } } return false; } /** * This has the effect of inserting "missing" inline elements around the contents of blocklevel elements such as P, * TD, TH, DIV, PRE etc. This procedure is called at the start of ParseBlock. When the inline stack is not empty, as * will be the case in: <i><h1>italic heading</h1></i> which is then treated as * equivalent to <h1><i>italic heading</i></h1> This is implemented by setting the lexer * into a mode where it gets tokens from the inline stack rather than from the input stream. * @param node original node * @return stack size */ public int inlineDup(Node node) { int n; n = this.istack.size() - this.istackbase; if (n > 0) { this.insert = this.istackbase; this.inode = node; } return n; } /** * @return */ public Node insertedToken() { Node node; IStack is; int n; // this will only be null if inode != null if (this.insert == -1) { node = this.inode; this.inode = null; return node; } // is this is the "latest" node then update the position, otherwise use current values if (this.inode == null) { this.lines = this.in.getCurline(); this.columns = this.in.getCurcol(); } node = newNode(Node.START_TAG, this.lexbuf, this.txtstart, this.txtend); // GLP: Bugfix 126261. Remove when this change is fixed in istack.c in the original Tidy node.implicit = true; is = (IStack) this.istack.elementAt(this.insert); node.element = is.element; node.tag = is.tag; if (is.attributes != null) { node.attributes = cloneAttributes(is.attributes); } // advance lexer to next item on the stack n = this.insert; // and recover state if we have reached the end if (++n < this.istack.size()) { this.insert = n; } else { this.insert = -1; } return node; } /** * Can the given element be removed? * @param element node * @return true if he element can be removed */ public boolean canPrune(Node element) { if (element.type == Node.TEXT_NODE) { return true; } if (element.content != null) { return false; } if (element.tag == this.configuration.tt.tagA && element.attributes != null) { return false; } if (element.tag == this.configuration.tt.tagP && !this.configuration.dropEmptyParas) { return false; } if (element.tag == null) { return false; } if (TidyUtils.toBoolean(element.tag.model & Dict.CM_ROW)) { return false; } if (TidyUtils.toBoolean(element.tag.model & Dict.CM_EMPTY)) { return false; } if (element.tag == this.configuration.tt.tagApplet) { return false; } if (element.tag == this.configuration.tt.tagObject) { return false; } if (element.tag == this.configuration.tt.tagScript && element.getAttrByName("src") != null) { return false; } // #540555 Empty title tag is trimmed if (element.tag == this.configuration.tt.tagTitle) { return false; } // #433359 - fix by Randy Waki 12 Mar 01 - Empty iframe is trimmed if (element.tag == this.configuration.tt.tagIframe) { return false; } if (element.getAttrByName("id") != null || element.getAttrByName("name") != null) { return false; } return true; } /** * duplicate name attribute as an id and check if id and name match. * @param node Node to check for name/it attributes */ public void fixId(Node node) { AttVal name = node.getAttrByName("name"); AttVal id = node.getAttrByName("id"); if (name != null) { if (id != null) { if (id.value != null && !id.value.equals(name.value)) { report.attrError(this, node, name, Report.ID_NAME_MISMATCH); } } else if (this.configuration.xmlOut) { node.addAttribute("id", name.value); } } } /** * Defer duplicates when entering a table or other element where the inlines shouldn't be duplicated. */ public void deferDup() { this.insert = -1; this.inode = null; } /** * Find last inserted element for put properly place to error message. * @return */ public Node getLastNode() { Node last = root; while(last.last != null) { last=last.last; } return last; } /** * Constraint the html version in the document to the given one. Everything is allowed in proprietary version of * HTML this is handled here rather than in the tag/attr dicts. * @param vers html version code */ void constrainVersion(int vers) { this.versions &= (vers | Dict.VERS_PROPRIETARY); } /** * Is content acceptable for pre elements? * @param node content * @return true if node is acceptable in pre elements */ protected boolean preContent(Node node) { // p is coerced to br's if (node.tag == this.configuration.tt.tagP) { return true; } if (node.tag == null || node.tag == this.configuration.tt.tagP || !TidyUtils.toBoolean(node.tag.model & (Dict.CM_INLINE | Dict.CM_NEW))) { return false; } return true; } /** * document type. */ private static class W3CVersionInfo { /** * name. */ String name; /** * voyager name. */ String voyagerName; /** * profile. */ String profile; /** * code. */ short code; /** * Instantiates a new W3CVersionInfo. * @param name version name * @param voyagerName voyager (xhtml) name * @param profile VOYAGER_STRICT | VOYAGER_LOOSE | VOYAGER_FRAMESET * @param code unique code for this version info */ public W3CVersionInfo(String name, String voyagerName, String profile, short code) { this.name = name; this.voyagerName = voyagerName; this.profile = profile; this.code = code; } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy