org.w3c.tidy.Lexer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jtidy Show documentation
Show all versions of jtidy Show documentation
JTidy is a Java port of HTML Tidy, a HTML syntax checker and pretty printer. Like its non-Java cousin, JTidy can be
used as a tool for cleaning up malformed and faulty HTML. In addition, JTidy provides a DOM interface to the
document that is being processed, which effectively makes you able to use JTidy as a DOM parser for real-world HTML.
/*
* Java HTML Tidy - JTidy
* HTML parser and pretty printer
*
* Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
* Institute of Technology, Institut National de Recherche en
* Informatique et en Automatique, Keio University). All Rights
* Reserved.
*
* Contributing Author(s):
*
* Dave Raggett
* Andy Quick (translation to Java)
* Gary L Peskin (Java development)
* Sami Lempinen (release management)
* Fabrizio Giustina
*
* The contributing author(s) would like to thank all those who
* helped with testing, bug fixes, and patience. This wouldn't
* have been possible without all of you.
*
* COPYRIGHT NOTICE:
*
* This software and documentation is provided "as is," and
* the copyright holders and contributing author(s) make no
* representations or warranties, express or implied, including
* but not limited to, warranties of merchantability or fitness
* for any particular purpose or that the use of the software or
* documentation will not infringe any third party patents,
* copyrights, trademarks or other rights.
*
* The copyright holders and contributing author(s) will not be
* liable for any direct, indirect, special or consequential damages
* arising out of any use of the software or documentation, even if
* advised of the possibility of such damage.
*
* Permission is hereby granted to use, copy, modify, and distribute
* this source code, or portions hereof, documentation and executables,
* for any purpose, without fee, subject to the following restrictions:
*
* 1. The origin of this source code must not be misrepresented.
* 2. Altered versions must be plainly marked as such and must
* not be misrepresented as being the original source.
* 3. This Copyright notice may not be removed or altered from any
* source or altered source distribution.
*
* The copyright holders and contributing author(s) specifically
* permit, without fee, and encourage the use of this source code
* as a component for supporting the Hypertext Markup Language in
* commercial products. If you use this source code in a product,
* acknowledgment is not required but would be appreciated.
*
*/
package org.w3c.tidy;
import java.io.PrintWriter;
import java.util.List;
import java.util.Stack;
import java.util.Vector;
/**
* Lexer for html parser.
* <p>
* Given a file stream fp it returns a sequence of tokens.
* <ul>
* <li>GetToken(fp) gets the next token.</li>
* <li>UngetToken(fp) provides one level undo.</li>
* </ul>
* The tags include an attribute list:
* <ul>
* <li>linked list of attribute/value nodes</li>
* <li>each node has 2 null-terminated strings</li>
* <li>entities are replaced in attribute values</li>
* </ul>
* White space is compacted if not in preformatted mode: if not in preformatted mode then leading white space is
* discarded and subsequent white space sequences are compacted to single space chars.
* <p>
* If XmlTags is no then tag names and attribute names are folded to lower case.
* <p>
* Not yet done:
* <ul>
* <li>Doctype subset and marked sections</li>
* </ul>
*
* @author Dave Raggett [email protected]
* @author Andy Quick [email protected] (translation to Java)
* @author Fabrizio Giustina
* @version $Revision$ ($Author$)
*/
public class Lexer
{
/**
* Lexer mode: ignore whitespace.
*/
public static final short IGNORE_WHITESPACE = 0;
/**
* Lexer mode: mixed content.
*/
public static final short MIXED_CONTENT = 1;
/**
* Lexer mode: preformatted. <code>parseEntity</code> tests the mode it is given with
* <code>mode &amp; PREFORMATTED</code>, so any mode value with bit 1 set (including {@link #IGNORE_MARKUP}) passes
* that test.
*/
public static final short PREFORMATTED = 2;
/**
* Lexer mode: ignore markup.
*/
public static final short IGNORE_MARKUP = 3;
/**
* System identifier (URI) of the XHTML 1.0 transitional DTD.
*/
private static final String VOYAGER_LOOSE = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";
/**
* System identifier (URI) of the XHTML 1.0 strict DTD.
*/
private static final String VOYAGER_STRICT = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
/**
* System identifier (URI) of the XHTML 1.0 frameset DTD.
*/
private static final String VOYAGER_FRAMESET = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd";
/**
* System identifier (URI) of the XHTML 1.1 DTD.
*/
private static final String VOYAGER_11 = "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd";
// URI for XHTML Basic 1.0. The constant is disabled; it is kept here for reference only.
// private static final String VOYAGER_BASIC = "http://www.w3.org/TR/xhtml-basic/xhtml-basic10.dtd";
/**
* The XHTML namespace URI, written as the value of the <code>xmlns</code> attribute on the html element.
*/
private static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml";
/**
* Table of all the known document versions. Each entry carries, in constructor-argument order: the HTML name used in
* a public identifier, the corresponding XHTML ("voyager") name, a DTD system identifier and the
* <code>Dict.VERS_*</code> code.
* <p>
* Order matters: lookups by code return the first matching entry, and the name "HTML 4.01" appears twice (first and
* last entry) with different codes.
*/
private static final Lexer.W3CVersionInfo[] W3CVERSION = {
new W3CVersionInfo("HTML 4.01", "XHTML 1.0 Strict", VOYAGER_STRICT, Dict.VERS_HTML40_STRICT),
new W3CVersionInfo("HTML 4.01 Transitional", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML40_LOOSE),
new W3CVersionInfo("HTML 4.01 Frameset", "XHTML 1.0 Frameset", VOYAGER_FRAMESET, Dict.VERS_FRAMESET),
new W3CVersionInfo("HTML 4.0", "XHTML 1.0 Strict", VOYAGER_STRICT, Dict.VERS_HTML40_STRICT),
new W3CVersionInfo("HTML 4.0 Transitional", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML40_LOOSE),
new W3CVersionInfo("HTML 4.0 Frameset", "XHTML 1.0 Frameset", VOYAGER_FRAMESET, Dict.VERS_FRAMESET),
new W3CVersionInfo("HTML 3.2", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML32),
new W3CVersionInfo("HTML 3.2 Final", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML32),
new W3CVersionInfo("HTML 3.2 Draft", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML32),
new W3CVersionInfo("HTML 2.0", "XHTML 1.0 Strict", VOYAGER_STRICT, Dict.VERS_HTML20),
// NOTE(review): this XHTML 1.1 entry reuses the name "HTML 4.01" and the strict XHTML 1.0 system id rather than
// VOYAGER_11 - confirm whether that is intended.
new W3CVersionInfo("HTML 4.01", "XHTML 1.1", VOYAGER_STRICT, Dict.VERS_XHTML11)};
/**
* getToken state: reading content. This is the state a new Lexer starts in.
*/
private static final short LEX_CONTENT = 0;
/**
* getToken state: gt.
*/
private static final short LEX_GT = 1;
/**
* getToken state: end tag.
*/
private static final short LEX_ENDTAG = 2;
/**
* getToken state: start tag.
*/
private static final short LEX_STARTTAG = 3;
/**
* getToken state: comment.
*/
private static final short LEX_COMMENT = 4;
/**
* getToken state: doctype.
*/
private static final short LEX_DOCTYPE = 5;
/**
* getToken state: processing instruction.
*/
private static final short LEX_PROCINSTR = 6;
// Note: the value 7 is not assigned to any state in this list.
/**
* getToken state: cdata.
*/
private static final short LEX_CDATA = 8;
/**
* getToken state: section.
*/
private static final short LEX_SECTION = 9;
/**
* getToken state: asp.
*/
private static final short LEX_ASP = 10;
/**
* getToken state: jste.
*/
private static final short LEX_JSTE = 11;
/**
* getToken state: php.
*/
private static final short LEX_PHP = 12;
/**
* getToken state: xml declaration.
*/
private static final short LEX_XMLDECL = 13;
/**
* Input stream the lexer reads characters from; supplied to the constructor.
*/
protected StreamIn in;
/**
* Error output stream. Not assigned by the constructor.
*/
protected PrintWriter errout;
/**
* For accessibility errors.
*/
protected short badAccess;
/**
* For bad style errors.
*/
protected short badLayout;
/**
* For bad char encodings.
*/
protected short badChars;
/**
* For mismatched/mispositioned form tags.
*/
protected short badForm;
/**
* Count of warnings in this document.
*/
protected short warnings;
/**
* Count of errors in this document.
*/
protected short errors;
/**
* Lines seen. Starts at 1; also overwritten with the position to report just before some warnings are issued.
*/
protected int lines;
/**
* Column at start of current token. Starts at 1; also overwritten with the position to report just before some
* warnings are issued.
*/
protected int columns;
/**
* Used to collapse contiguous white space.
*/
protected boolean waswhite;
/**
* True after a token has been pushed back.
*/
protected boolean pushed;
/**
* Set when space is moved after an end tag.
*/
protected boolean insertspace;
/**
* Netscape compatibility.
*/
protected boolean excludeBlocks;
/**
* True if moved out of table.
*/
protected boolean exiled;
/**
* True if an xmlns attribute was found on the html element.
*/
protected boolean isvoyager;
/**
* Bit vector of the HTML versions (<code>Dict.VERS_*</code>) the document is still compatible with. Starts as
* <code>Dict.VERS_ALL | Dict.VERS_PROPRIETARY</code>.
*/
protected short versions;
/**
* Version as given by the doctype (if any). Starts as <code>Dict.VERS_UNKNOWN</code>.
*/
protected int doctype;
/**
* Set if html or PUBLIC is missing from the doctype declaration.
*/
protected boolean badDoctype;
/**
* Start offset of the current node's text in {@link #lexbuf}.
*/
protected int txtstart;
/**
* End offset of the current node's text in {@link #lexbuf}.
*/
protected int txtend;
/**
* State of the lexer's finite state machine (one of the <code>LEX_*</code> constants).
*/
protected short state;
/**
* Current node.
*/
protected Node token;
/**
* Lexer character buffer. Parse tree nodes span onto this buffer, which contains the concatenated text contents of
* all of the elements. Lexsize must be reset for each file. Byte buffer of UTF-8 chars. The array is allocated and
* enlarged on demand by <code>addByte</code>, which re-points the registered nodes to the new array.
*/
protected byte[] lexbuf;
/**
* Number of bytes allocated for {@link #lexbuf}.
*/
protected int lexlength;
/**
* Number of bytes of {@link #lexbuf} in use.
*/
protected int lexsize;
/**
* Inline stack for compatibility with Mosaic. For deferring text node.
*/
protected Node inode;
/**
* For inferring inline tags. Starts as -1.
*/
protected int insert;
/**
* The inline stack itself.
*/
protected Stack istack;
/**
* Start of frame within {@link #istack}.
*/
protected int istackbase;
/**
* Used for cleaning up presentation markup.
*/
protected Style styles;
/**
* Configuration in use; supplied to the constructor.
*/
protected Configuration configuration;
/**
* Already seen end body tag?
*/
protected boolean seenEndBody;
/**
* Already seen end html tag?
*/
protected boolean seenEndHtml;
/**
* Report used for issuing warnings and errors; supplied to the constructor.
*/
protected Report report;
/**
* Root node is saved here.
*/
protected Node root;
/**
* Every node created or cloned through this lexer (plus asp/php attribute nodes of clones), kept so that
* <code>updateNodeTextArrays</code> can re-point them when {@link #lexbuf} is reallocated.
*/
private List nodeList;
/**
 * Builds a lexer positioned at line 1, column 1, in the content state, with every HTML version still considered
 * possible and no doctype seen yet.
 * @param in stream the characters are read from
 * @param configuration configuration instance
 * @param report report instance, used for reporting errors and warnings
 */
public Lexer(StreamIn in, Configuration configuration, Report report)
{
    // collaborators
    this.in = in;
    this.configuration = configuration;
    this.report = report;

    // bookkeeping containers
    this.nodeList = new Vector<>();
    this.istack = new Stack<>();
    this.insert = -1;

    // initial position and state machine state
    this.state = LEX_CONTENT;
    this.columns = 1;
    this.lines = 1;

    // nothing is known about the document version yet
    this.doctype = Dict.VERS_UNKNOWN;
    this.versions = Dict.VERS_ALL | Dict.VERS_PROPRIETARY;
}
/**
 * Creates an empty node and registers it in the node list.
 * @return the newly created Node
 */
public Node newNode()
{
    final Node created = new Node();
    nodeList.add(created);
    return created;
}
/**
 * Creates a node of the given type over a slice of a text array and registers it in the node list.
 * @param type node type: Node.ROOT_NODE | Node.DOCTYPE_TAG | Node.COMMENT_TAG | Node.PROC_INS_TAG | Node.TEXT_NODE |
 * Node.START_TAG | Node.END_TAG | Node.START_END_TAG | Node.CDATA_TAG | Node.SECTION_TAG | Node.ASP_TAG |
 * Node.JSTE_TAG | Node.PHP_TAG | Node.XML_DECL
 * @param textarray array of bytes contained in the Node
 * @param start start position
 * @param end end position
 * @return the newly created Node
 */
public Node newNode(short type, byte[] textarray, int start, int end)
{
    final Node created = new Node(type, textarray, start, end);
    nodeList.add(created);
    return created;
}
/**
 * Creates a node of the given type and element name over a slice of a text array, using the configured tag table,
 * and registers it in the node list.
 * @param type node type: Node.ROOT_NODE | Node.DOCTYPE_TAG | Node.COMMENT_TAG | Node.PROC_INS_TAG | Node.TEXT_NODE |
 * Node.START_TAG | Node.END_TAG | Node.START_END_TAG | Node.CDATA_TAG | Node.SECTION_TAG | Node.ASP_TAG |
 * Node.JSTE_TAG | Node.PHP_TAG | Node.XML_DECL
 * @param textarray array of bytes contained in the Node
 * @param start start position
 * @param end end position
 * @param element tag name
 * @return the newly created Node
 */
public Node newNode(short type, byte[] textarray, int start, int end, String element)
{
    final Node created = new Node(type, textarray, start, end, element, configuration.tt);
    nodeList.add(created);
    return created;
}
/**
 * Makes a shallow clone of a node and registers the clone, together with any asp/php nodes hanging off its
 * attributes, in the node list.
 * @param node Node to clone
 * @return cloned Node
 */
public Node cloneNode(Node node)
{
    final Node copy = node.cloneNode(false);
    nodeList.add(copy);

    AttVal current = copy.attributes;
    while (current != null)
    {
        if (current.asp != null)
        {
            nodeList.add(current.asp);
        }
        if (current.php != null)
        {
            nodeList.add(current.php);
        }
        current = current.next;
    }
    return copy;
}
/**
 * Clones an attribute list and registers any asp or php node found on the cloned attributes in the node list.
 * @param attrs original AttVal (head of the list)
 * @return cloned AttVal
 */
public AttVal cloneAttributes(AttVal attrs)
{
    final AttVal copy = (AttVal) attrs.clone();

    AttVal current = copy;
    while (current != null)
    {
        if (current.asp != null)
        {
            nodeList.add(current.asp);
        }
        if (current.php != null)
        {
            nodeList.add(current.php);
        }
        current = current.next;
    }
    return copy;
}
/**
 * Re-points every registered node whose text array is <code>oldtextarray</code> (same array instance) to
 * <code>newtextarray</code>. Nodes backed by any other array are left alone.
 * @param oldtextarray previous text array
 * @param newtextarray new text array
 */
protected void updateNodeTextArrays(byte[] oldtextarray, byte[] newtextarray)
{
    for (Object entry : nodeList)
    {
        final Node candidate = (Node) entry;
        if (candidate.textarray != oldtextarray)
        {
            continue;
        }
        candidate.textarray = newtextarray;
    }
}
/**
 * Adds a new line node. Used for creating preformatted text from Word2000.
 * <p>
 * The node's text array is assigned after the newline has been appended. Appending may allocate the lexer buffer
 * for the first time, and a first allocation does not re-point registered nodes, so capturing the buffer beforehand
 * could leave the node with a <code>null</code> text array.
 * @return new line node spanning the single appended newline
 */
public Node newLineNode()
{
    Node node = newNode();
    node.start = this.lexsize;
    addCharToLexer('\n');
    // read lexbuf only now: it is guaranteed to be the buffer that holds the newline
    node.textarray = this.lexbuf;
    node.end = this.lexsize;
    return node;
}
/**
 * Tells whether the underlying input stream is exhausted.
 * @return <code>true</code> if the end of the input stream has been reached
 */
public boolean endOfInput()
{
    final boolean exhausted = in.isEndOfStream();
    return exhausted;
}
/**
 * Appends one byte to the lexer buffer, enlarging the buffer first when fewer than two free slots remain (one for
 * the byte, one for the trailing zero written after it). The capacity starts at 8192 and doubles until it is large
 * enough; when an existing buffer is replaced, its contents are copied and registered nodes are re-pointed.
 * @param c byte to add (only the low 8 bits are stored)
 */
public void addByte(int c)
{
    final int required = this.lexsize + 1;
    if (required >= this.lexlength)
    {
        int capacity = this.lexlength;
        do
        {
            capacity = (capacity == 0) ? 8192 : capacity * 2;
        }
        while (required >= capacity);
        this.lexlength = capacity;

        final byte[] previous = this.lexbuf;
        final byte[] enlarged = new byte[capacity];
        this.lexbuf = enlarged;
        if (previous != null)
        {
            System.arraycopy(previous, 0, enlarged, 0, previous.length);
            updateNodeTextArrays(previous, enlarged);
        }
    }

    this.lexbuf[this.lexsize] = (byte) c;
    this.lexsize++;
    this.lexbuf[this.lexsize] = (byte) '\0'; // debug
}
/**
 * Overwrites the last byte stored in the buffer. Does nothing while the buffer is empty.
 * @param c replacement byte
 */
public void changeChar(byte c)
{
    if (this.lexsize <= 0)
    {
        return;
    }
    final int last = this.lexsize - 1;
    this.lexbuf[last] = c;
}
/**
 * Stores char c in the lexer buffer as a UTF-8 encoded byte sequence.
 * <p>
 * When producing XML or XHTML output, characters outside the XML Char production are silently dropped. If the
 * character cannot be encoded, the UTF-8 form of the replacement character U+FFFD is stored instead.
 * @param c char (code point) to store
 */
public void addCharToLexer(int c)
{
    // Allow only valid XML characters. See: http://www.w3.org/TR/2004/REC-xml-20040204/#NT-Char
    // Fix by Pablo Mayrgundter 17-08-2004
    final boolean xmlOutput = this.configuration.xmlOut || this.configuration.xHTML;
    if (xmlOutput)
    {
        final boolean validXmlChar = (c >= 0x20 && c <= 0xD7FF) // the common case
            || c == 0x9
            || c == 0xA
            || c == 0xD // white space
            || (c >= 0xE000 && c <= 0xFFFD) // high-range unicode
            || (c >= 0x10000 && c <= 0x10FFFF);
        if (!validXmlChar)
        {
            return;
        }
    }

    final byte[] utf8 = new byte[10]; // unsigned char
    final int[] length = new int[]{0};
    final boolean failed = EncodingUtils.encodeCharToUTF8Bytes(c, utf8, null, length);
    if (failed)
    {
        // replacement char 0xFFFD encoded as UTF-8
        utf8[0] = (byte) 0xEF;
        utf8[1] = (byte) 0xBF;
        utf8[2] = (byte) 0xBD;
        length[0] = 3;
    }

    int pos = 0;
    while (pos < length[0])
    {
        addByte(utf8[pos]); // uint
        pos++;
    }
}
/**
 * Adds a string to the lexer buffer, one Unicode code point at a time.
 * <p>
 * The string is walked by code point rather than by UTF-16 code unit, so a character outside the Basic Multilingual
 * Plane (stored in the String as a surrogate pair) reaches {@link #addCharToLexer(int)} as a single value instead of
 * as two separate surrogates. Unpaired surrogates are still passed through unchanged.
 * @param str String to add
 */
public void addStringToLexer(String str)
{
    final int length = str.length();
    int i = 0;
    while (i < length)
    {
        final int codePoint = str.codePointAt(i);
        addCharToLexer(codePoint);
        i += Character.charCount(codePoint);
    }
}
/**
* Parses an entity or numeric character reference. On entry the "&amp;" has already been stored as the last byte of
* the lexer buffer; the characters forming the name are read from the input and appended to the buffer, and then
* either replaced by the character the reference denotes or left in place with a warning.
* @param mode lexer mode; only tested against {@link #PREFORMATTED}, to turn a non-breaking space (160) into a plain
* space
*/
public void parseEntity(short mode)
{
// No longer attempts to insert missing ';' for unknown
// entities unless one was present already, since this
// gives unexpected results.
//
// For example: <a href="something.htm?foo&bar&fred">
// was tidied to: <a href="something.htm?foo&amp;bar;&amp;fred;">
// rather than: <a href="something.htm?foo&amp;bar&amp;fred">
//
// My thanks for Maurice Buxton for spotting this.
//
// Also Randy Waki pointed out the following case for the
// 04 Aug 00 version (bug #433012):
//
// For example: <a href="something.htm?id=1&lang=en">
// was tidied to: <a href="something.htm?id=1&lang;=en">
// rather than: <a href="something.htm?id=1&amp;lang=en">
//
// where "lang" is a known entity (#9001), but browsers would
// misinterpret "&lang;" because it had a value > 256.
//
// So the case of an apparently known entity with a value > 256 and
// missing a semicolon is handled specially.
//
// "ParseEntity" is also a bit of a misnomer - it handles entities and
// numeric character references. Invalid NCR's are now reported.
int start;
boolean first = true;
boolean semicolon = false;
int c, ch, startcol;
String str;
start = this.lexsize - 1; // to start at "&"
startcol = this.in.getCurcol() - 1;
// collect the name: stops at ';' (consumed, not stored), at the first non-name character (pushed back) or at
// end of stream
while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
{
if (c == ';')
{
semicolon = true;
break;
}
if (first && c == '#')
{
// #431953 - start RJ
// numeric references are left untouched when ncr is off or the input encoding is BIG5/SHIFTJIS
if (!this.configuration.ncr
|| "BIG5".equals(this.configuration.getInCharEncodingName())
|| "SHIFTJIS".equals(this.configuration.getInCharEncodingName()))
{
this.in.ungetChar(c);
return;
}
// #431953 - end RJ
addCharToLexer(c);
first = false;
continue;
}
first = false;
if (TidyUtils.isNamechar((char) c))
{
addCharToLexer(c);
continue;
}
// otherwise put it back
this.in.ungetChar(c);
break;
}
// str is the reference as stored in the buffer, from the "&" up to (but excluding) any ';'
str = TidyUtils.getString(this.lexbuf, start, this.lexsize - start);
if ("&apos".equals(str) && !configuration.xmlOut && !this.isvoyager && !configuration.xHTML)
{
report.entityError(this, Report.APOS_UNDEFINED, str, 39);
}
ch = EntityTable.getDefaultEntityTable().entityCode(str);
// drops invalid numeric entities from XML mode. Fix by Pablo Mayrgundter 17-08-2004
// (disabled code, kept for reference)
// if ((this.configuration.xmlOut || this.configuration.xHTML) // only for xml output
// && !((ch >= 0x20 && ch <= 0xD7FF) // Check the common-case first.
// || ch == 0x9 || ch == 0xA || ch == 0xD // Then white-space.
// || (ch >= 0xE000 && ch <= 0xFFFD)))
// {
// this.lexsize = start;
// return;
// }
// deal with unrecognized or invalid entities
// #433012 - fix by Randy Waki 17 Feb 01
// report invalid NCR's - Terry Teague 01 Sep 01
// (c is the last character read: ';' only when the reference was properly terminated)
if (ch <= 0 || (ch >= 256 && c != ';'))
{
// set error position just before offending character
this.lines = this.in.getCurline();
this.columns = startcol;
if (this.lexsize > start + 1)
{
// Note: this range test sits inside the "ch <= 0 || (ch >= 256 && c != ';')" branch, whose condition excludes
// every value from 128 to 159.
if (ch >= 128 && ch <= 159)
{
// invalid numeric character reference
int c1 = 0;
if ("WIN1252".equals(configuration.replacementCharEncoding))
{
c1 = EncodingUtils.decodeWin1252(ch);
}
else if ("MACROMAN".equals(configuration.replacementCharEncoding))
{
c1 = EncodingUtils.decodeMacRoman(ch);
}
// "or" DISCARDED_CHAR with the other errors if discarding char; otherwise default is replacing
int replaceMode = c1 != 0 ? Report.REPLACED_CHAR : Report.DISCARDED_CHAR;
if (c != ';') /* issue warning if not terminated by ';' */
{
report.entityError(this, Report.MISSING_SEMICOLON_NCR, str, c);
}
report.encodingError(this, (short) (Report.INVALID_NCR | replaceMode), ch);
if (c1 != 0)
{
// make the replacement
this.lexsize = start;
addCharToLexer(c1);
semicolon = false;
}
else
{
/* discard */
this.lexsize = start;
semicolon = false;
}
}
else
{
report.entityError(this, Report.UNKNOWN_ENTITY, str, ch);
}
// the reference text stays in the buffer; restore the ';' that was consumed, if there was one
if (semicolon)
{
addCharToLexer(';');
}
}
else
{
// naked &
report.entityError(this, Report.UNESCAPED_AMPERSAND, str, ch);
}
}
else
{
// issue warning if not terminated by ';'
if (c != ';')
{
// set error position just before offending character
this.lines = this.in.getCurline();
this.columns = startcol;
report.entityError(this, Report.MISSING_SEMICOLON, str, c);
}
// rewind to the "&" and store the referenced character in place of the reference text
this.lexsize = start;
if (ch == 160 && TidyUtils.toBoolean(mode & PREFORMATTED))
{
ch = ' ';
}
addCharToLexer(ch);
// when quoteAmpersand is off, an ampersand reference is written back out as "&amp;"
if (ch == '&' && !this.configuration.quoteAmpersand)
{
addCharToLexer('a');
addCharToLexer('m');
addCharToLexer('p');
addCharToLexer(';');
}
}
}
/**
 * Parses a tag name. The first character of the name is expected to be in the buffer already, at
 * <code>txtstart</code>; the remaining name characters are read from the input and appended. Unless XML tags are
 * being processed, upper case characters are folded to lower case. On return <code>txtend</code> marks the end of
 * the name in the buffer.
 * @return first char after the tag name (the end-of-stream marker cast to char if the input ran out)
 */
public char parseTagName()
{
    // fold case of first char in buffer
    final int first = this.lexbuf[this.txtstart];
    if (!this.configuration.xmlTags && TidyUtils.isUpper((char) first))
    {
        final int folded = TidyUtils.toLower((char) first);
        this.lexbuf[this.txtstart] = (byte) folded;
    }

    int c;
    for (;;)
    {
        c = this.in.readChar();
        if (c == StreamIn.END_OF_STREAM || !TidyUtils.isNamechar((char) c))
        {
            break;
        }
        // fold case of subsequent chars
        if (!this.configuration.xmlTags && TidyUtils.isUpper((char) c))
        {
            c = TidyUtils.toLower((char) c);
        }
        addCharToLexer(c);
    }

    this.txtend = this.lexsize;
    return (char) c;
}
/**
 * Calls addCharToLexer for every Unicode code point in the string.
 * <p>
 * Walking by code point (rather than by UTF-16 code unit) keeps characters outside the Basic Multilingual Plane
 * intact: a surrogate pair is handed to {@link #addCharToLexer(int)} as one value. Unpaired surrogates are passed
 * through unchanged.
 * @param str input String
 */
public void addStringLiteral(String str)
{
    final int len = str.length();
    int i = 0;
    while (i < len)
    {
        final int codePoint = str.codePointAt(i);
        addCharToLexer(codePoint);
        i += Character.charCount(codePoint);
    }
}
/**
 * Calls addCharToLexer for the leading part of the string, up to <code>len</code> UTF-16 units (or the whole string
 * if it is shorter). A non-positive <code>len</code> adds nothing.
 * <p>
 * Surrogate pairs that lie completely inside the prefix are handed over as a single code point. A pair cut in half
 * by <code>len</code> contributes only its leading surrogate.
 * @param str input String
 * @param len length (in chars) of the substring to be added
 */
void addStringLiteralLen(String str, int len)
{
    final int limit = Math.min(str.length(), len);
    int i = 0;
    while (i < limit)
    {
        int codePoint = str.codePointAt(i);
        if (i + Character.charCount(codePoint) > limit)
        {
            // the pair straddles the cut-off: only the char inside the prefix is added
            codePoint = str.charAt(i);
        }
        addCharToLexer(codePoint);
        i += Character.charCount(codePoint);
    }
}
/**
 * Chooses what version to use for a new doctype, based on the versions the document is still compatible with.
 * Preference order: HTML 2.0, then HTML 3.2 (only when neither XML output, XML tags nor an XHTML input is involved),
 * then XHTML 1.1, HTML 4 strict, HTML 4 loose and finally frameset.
 * @return html version constant, or <code>Dict.VERS_UNKNOWN</code> if none applies
 */
public short htmlVersion()
{
    if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML20))
    {
        return Dict.VERS_HTML20;
    }

    final boolean xmlFlavoured = this.configuration.xmlOut || this.configuration.xmlTags || this.isvoyager;
    if (!xmlFlavoured && TidyUtils.toBoolean(this.versions & Dict.VERS_HTML32))
    {
        return Dict.VERS_HTML32;
    }

    final short[] remaining = {
        Dict.VERS_XHTML11,
        Dict.VERS_HTML40_STRICT,
        Dict.VERS_HTML40_LOOSE,
        Dict.VERS_FRAMESET};
    for (short candidate : remaining)
    {
        if (TidyUtils.toBoolean(this.versions & candidate))
        {
            return candidate;
        }
    }
    return Dict.VERS_UNKNOWN;
}
/**
 * Gives the display name of the version the document appears to use: the first entry of the version table whose code
 * equals {@link #apparentVersion()}. The XHTML name is used when the document carries an xmlns attribute on its html
 * element, the HTML name otherwise.
 * @return html version name, or <code>null</code> if the version is not in the table
 */
public String htmlVersionName()
{
    final short apparent = apparentVersion();
    for (W3CVersionInfo info : W3CVERSION)
    {
        if (info.code != apparent)
        {
            continue;
        }
        return this.isvoyager ? info.voyagerName : info.name;
    }
    return null;
}
/**
 * Adds the "generator" meta element for Tidy to the document head. If a generator meta element whose content starts
 * with "HTML Tidy" (ignoring case) is already present, its content is refreshed instead.
 * @param root root node
 * @return <code>true</code> if the tag has been added; <code>false</code> if there is no head element or an
 * existing tag was updated
 */
public boolean addGenerator(Node root)
{
    final Node head = root.findHEAD(this.configuration.tt);
    if (head == null)
    {
        return false;
    }

    final String meta = "HTML Tidy for Java (vers. " + Report.RELEASE_DATE_STRING + "), see jtidy.sourceforge.net";

    for (Node child = head.content; child != null; child = child.next)
    {
        if (child.tag != this.configuration.tt.tagMeta)
        {
            continue;
        }
        final AttVal nameAttr = child.getAttrByName("name");
        if (nameAttr == null || !"generator".equalsIgnoreCase(nameAttr.value))
        {
            continue;
        }
        final AttVal contentAttr = child.getAttrByName("content");
        if (contentAttr != null
            && contentAttr.value != null
            && contentAttr.value.regionMatches(true, 0, "HTML Tidy", 0, 9))
        {
            contentAttr.value = meta;
            return false;
        }
    }

    final Node generator = this.inferredTag("meta");
    generator.addAttribute("content", meta);
    generator.addAttribute("name", "generator");
    head.insertNodeAtStart(generator);
    return true;
}
/**
 * Checks the doctype keywords (they should be uppercase). The doctype text is searched, in this order, for badly
 * written occurrences of SYSTEM, PUBLIC, //DTD, //W3C and //EN; the search stops at the first one found.
 * @param doctype doctype node
 * @return true if doctype keywords are all uppercase
 */
public boolean checkDocTypeKeyWords(Node doctype)
{
    final String text = TidyUtils.getString(this.lexbuf, doctype.start, doctype.end - doctype.start);
    final String[] keywords = {"SYSTEM", "PUBLIC", "//DTD", "//W3C", "//EN"};
    for (String keyword : keywords)
    {
        if (TidyUtils.findBadSubString(keyword, text, text.length()))
        {
            return false;
        }
    }
    return true;
}
/**
 * Examines the DOCTYPE to identify the declared version.
 * <p>
 * The declaration must start with "html ". A SYSTEM-only declaration is never recognised, but its keyword is
 * upper-cased in the buffer; a PUBLIC keyword is upper-cased likewise, and a declaration with neither keyword sets
 * <code>badDoctype</code>. The text after the first double quote is then matched:
 * <ul>
 * <li><code>-//W3C//DTD name//...</code>: <code>name</code> is looked up in the whole version table, first match
 * wins, so "HTML 4.01" yields the HTML 4 strict code rather than the XHTML 1.1 entry that shares that name;</li>
 * <li><code>-//IETF//DTD name//...</code>: only "HTML 2.0" is recognised.</li>
 * </ul>
 * @param doctype doctype node
 * @return version code, or 0 if the version is not recognised
 */
public short findGivenVersion(Node doctype)
{
    // if root tag for doctype isn't html give up now
    String str1 = TidyUtils.getString(this.lexbuf, doctype.start, 5);
    if (!"html ".equalsIgnoreCase(str1))
    {
        return 0;
    }

    if (!checkDocTypeKeyWords(doctype))
    {
        report.warning(this, doctype, null, Report.DTYPE_NOT_UPPER_CASE);
    }

    // give up if all we are given is the system id for the doctype
    str1 = TidyUtils.getString(this.lexbuf, doctype.start + 5, 7);
    if ("SYSTEM ".equalsIgnoreCase(str1))
    {
        // but at least ensure the case is correct
        if (!str1.substring(0, 6).equals("SYSTEM"))
        {
            System.arraycopy(TidyUtils.getBytes("SYSTEM"), 0, this.lexbuf, doctype.start + 5, 6);
        }
        return 0; // unrecognized
    }

    if ("PUBLIC ".equalsIgnoreCase(str1))
    {
        if (!str1.substring(0, 6).equals("PUBLIC"))
        {
            System.arraycopy(TidyUtils.getBytes("PUBLIC "), 0, this.lexbuf, doctype.start + 5, 6);
        }
    }
    else
    {
        this.badDoctype = true;
    }

    for (int i = doctype.start; i < doctype.end; ++i)
    {
        if (this.lexbuf[i] != (byte) '"')
        {
            continue;
        }

        // only the identifier following the first quote is examined
        str1 = TidyUtils.getString(this.lexbuf, i + 1, 12);
        String str2 = TidyUtils.getString(this.lexbuf, i + 1, 13);
        if (str1.equals("-//W3C//DTD "))
        {
            // compute length of identifier e.g. "HTML 4.0 Transitional"
            int j = i + 13;
            while (j < doctype.end && this.lexbuf[j] != (byte) '/')
            {
                ++j;
            }
            String p = TidyUtils.getString(this.lexbuf, i + 13, j - i - 13);

            // scan the whole table, starting at the first entry
            for (W3CVersionInfo info : W3CVERSION)
            {
                if (info.name.equals(p))
                {
                    return info.code;
                }
            }
            // else unrecognized version
        }
        else if (str2.equals("-//IETF//DTD "))
        {
            // compute length of identifier e.g. "HTML 2.0"
            int j = i + 14;
            while (j < doctype.end && this.lexbuf[j] != (byte) '/')
            {
                ++j;
            }
            String p = TidyUtils.getString(this.lexbuf, i + 14, j - i - 14);

            // HTML 2.0 is the only IETF DTD known to the lexer
            if ("HTML 2.0".equals(p))
            {
                return Dict.VERS_HTML20;
            }
            // else unrecognized version
        }
        break;
    }
    return 0;
}
/**
 * Makes sure the html element carries an <code>xmlns</code> attribute whose value is <code>profile</code>. A
 * differing (or missing) value is replaced with a warning; if the attribute is absent it is added at the head of the
 * attribute list. Nothing happens when the root has no html element among its children.
 * <p>
 * Attributes without a name and an <code>xmlns</code> attribute without a value are tolerated instead of causing a
 * NullPointerException.
 * @param root root Node
 * @param profile namespace the html element should declare
 */
public void fixHTMLNameSpace(Node root, String profile)
{
    Node node = root.content;
    while (node != null && node.tag != this.configuration.tt.tagHtml)
    {
        node = node.next;
    }
    if (node == null)
    {
        return;
    }

    // constant-first comparison: an attribute entry may have no name
    AttVal attr = node.attributes;
    while (attr != null && !"xmlns".equals(attr.attribute))
    {
        attr = attr.next;
    }

    if (attr == null)
    {
        attr = new AttVal(node.attributes, null, '"', "xmlns", profile);
        attr.dict = AttributeTable.getDefaultAttributeTable().findAttribute(attr);
        node.attributes = attr;
        return;
    }

    // null-safe comparison: the attribute may have been written without a value
    final boolean matches = attr.value == null ? profile == null : attr.value.equals(profile);
    if (!matches)
    {
        report.warning(this, node, null, Report.INCONSISTENT_NAMESPACE);
        attr.value = profile;
    }
}
/**
 * Creates a DOCTYPE node and links it in immediately before the html element, i.e. between the
 * &lt;?xml version="1.0" ... ?&gt; declaration (if any) and the html tag. This also works for any comments, etc.
 * that may precede the html tag.
 * @param root root node
 * @return new doctype node, or <code>null</code> if the document has no html element
 */
Node newXhtmlDocTypeNode(Node root)
{
    final Node html = root.findHTML(this.configuration.tt);
    if (html == null)
    {
        return null;
    }

    final Node docTypeNode = newNode();
    docTypeNode.setType(Node.DOCTYPE_TAG);
    docTypeNode.parent = root;
    docTypeNode.next = html;

    if (root.content == html)
    {
        // No declaration: the doctype becomes the first child of the root.
        docTypeNode.prev = null;
        root.content = docTypeNode;
    }
    else
    {
        // Something precedes the html element: slot the doctype in after it.
        final Node before = html.prev;
        docTypeNode.prev = before;
        before.next = docTypeNode;
    }

    html.prev = docTypeNode;
    return docTypeNode;
}
/**
 * Writes an XHTML doctype into the document, after making sure the html element declares the XHTML namespace.
 * <p>
 * The public and system identifiers are chosen from the configured doctype mode (auto-detected from the versions
 * the document is compatible with, strict, loose, or a user supplied public identifier). An existing doctype node
 * is reused, keeping any internal DTD subset when producing XML/XHTML; otherwise a new node is inserted before the
 * html element. A user supplied identifier that is empty is written as an empty quoted string instead of raising a
 * StringIndexOutOfBoundsException.
 * <p>
 * Note on the return value: <code>true</code> is returned only in "omit" mode (after removing any existing
 * doctype); every other path, including the one that successfully writes the doctype, returns <code>false</code>.
 * @param root root node
 * @return <code>true</code> in omit mode, <code>false</code> otherwise
 */
public boolean setXHTMLDocType(Node root)
{
    String fpi = " ";
    String sysid = "";
    String dtdsub = null;
    int dtdlen = 0;

    Node doctype = root.findDocType();
    fixHTMLNameSpace(root, XHTML_NAMESPACE); // #427839 - fix by Evan Lenz 05 Sep 00

    if (this.configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
    {
        if (doctype != null)
        {
            Node.discardElement(doctype);
        }
        return true;
    }

    if (this.configuration.docTypeMode == Configuration.DOCTYPE_AUTO)
    {
        // see what flavor of XHTML this document matches
        if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_STRICT))
        {
            // use XHTML strict
            fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
            sysid = VOYAGER_STRICT;
        }
        else if (TidyUtils.toBoolean(this.versions & Dict.VERS_FRAMESET))
        {
            // use XHTML frames
            fpi = "-//W3C//DTD XHTML 1.0 Frameset//EN";
            sysid = VOYAGER_FRAMESET;
        }
        else if (TidyUtils.toBoolean(this.versions & Dict.VERS_LOOSE))
        {
            fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
            sysid = VOYAGER_LOOSE;
        }
        else if (TidyUtils.toBoolean(this.versions & Dict.VERS_XHTML11))
        {
            // use XHTML 1.1
            fpi = "-//W3C//DTD XHTML 1.1//EN";
            sysid = VOYAGER_11;
        }
        else
        {
            // proprietary
            fpi = null;
            sysid = "";
            if (doctype != null) // #473490 - fix by Bjoern Hoehrmann 10 Oct 01
            {
                Node.discardElement(doctype);
            }
        }
    }
    else if (this.configuration.docTypeMode == Configuration.DOCTYPE_STRICT)
    {
        fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
        sysid = VOYAGER_STRICT;
    }
    else if (this.configuration.docTypeMode == Configuration.DOCTYPE_LOOSE)
    {
        fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
        sysid = VOYAGER_LOOSE;
    }

    if (this.configuration.docTypeMode == Configuration.DOCTYPE_USER && this.configuration.docTypeStr != null)
    {
        fpi = this.configuration.docTypeStr;
        sysid = "";
    }

    if (fpi == null)
    {
        return false;
    }

    if (doctype != null)
    {
        // Look for internal DTD subset
        if (configuration.xHTML || configuration.xmlOut)
        {
            int len = doctype.end - doctype.start + 1;
            String start = TidyUtils.getString(this.lexbuf, doctype.start, len);
            int dtdbeg = start.indexOf('[');
            if (dtdbeg >= 0)
            {
                int dtdend = start.substring(dtdbeg).indexOf(']');
                if (dtdend >= 0)
                {
                    // keep everything from '[' up to and including the first ']'
                    dtdlen = dtdend + 1;
                    dtdsub = start.substring(dtdbeg);
                }
            }
        }
    }
    else
    {
        doctype = newXhtmlDocTypeNode(root);
        if (doctype == null)
        {
            return false;
        }
    }

    // the declaration text is assembled at the end of the lexer buffer ...
    this.txtstart = this.lexsize;
    this.txtend = this.lexsize;

    // add public identifier
    addStringLiteral("html PUBLIC ");

    // check if the fpi is quoted or not; startsWith is safe for an empty identifier
    if (fpi.startsWith("\""))
    {
        addStringLiteral(fpi);
    }
    else
    {
        addStringLiteral("\"");
        addStringLiteral(fpi);
        addStringLiteral("\"");
    }

    if (this.configuration.wraplen != 0 && sysid.length() + 6 >= this.configuration.wraplen)
    {
        addStringLiteral("\n\"");
    }
    else
    {
        // FG: don't wrap
        addStringLiteral(" \"");
    }

    // add system identifier
    addStringLiteral(sysid);
    addStringLiteral("\"");

    if (dtdlen > 0 && dtdsub != null)
    {
        addCharToLexer(' ');
        addStringLiteralLen(dtdsub, dtdlen);
    }

    this.txtend = this.lexsize;

    // ... and then copied into an array of its own, owned by the doctype node
    int length = this.txtend - this.txtstart;
    doctype.textarray = new byte[length];
    System.arraycopy(this.lexbuf, this.txtstart, doctype.textarray, 0, length);
    doctype.start = 0;
    doctype.end = length;

    return false;
}
/**
 * Returns the html version used in the document. When no doctype was given, the version is guessed with
 * {@link #htmlVersion()}. When the declared version is one of the known ones and the document content is compatible
 * with it, the declared version is returned. Otherwise an "inconsistent version" warning is issued (positioned at
 * line 1, column 1) and the guessed version is returned.
 * @return version code
 */
public short apparentVersion()
{
    switch (this.doctype)
    {
        case Dict.VERS_UNKNOWN :
            return htmlVersion();

        case Dict.VERS_HTML20 :
        case Dict.VERS_HTML32 :
        case Dict.VERS_HTML40_STRICT :
        case Dict.VERS_HTML40_LOOSE :
        case Dict.VERS_FRAMESET :
        case Dict.VERS_XHTML11 :
            // the declared version stands if the content is still compatible with it
            if (TidyUtils.toBoolean(this.versions & this.doctype))
            {
                return (short) this.doctype;
            }
            break; // to replace old version by new

        default :
            // should never reach here
            break;
    }

    // kludge to avoid error appearing at end of file
    // it would be better to note the actual position
    // when first encountering the doctype declaration
    this.lines = 1;
    this.columns = 1;

    report.warning(this, null, null, Report.INCONSISTENT_VERSION);
    return this.htmlVersion();
}
/**
 * Fixes up the doctype if it is missing or does not match the document.
 * <p>
 * In "omit" mode any doctype is removed. For XML output nothing is done. In strict/loose mode any existing doctype
 * is dropped and replaced by the HTML 4 strict/loose one. In auto mode an existing doctype is kept when its declared
 * version is compatible with the content; otherwise a version is guessed with {@link #htmlVersion()}. The resulting
 * declaration text is stored in an array owned by the doctype node.
 * <p>
 * A doctype is only discarded when one actually exists, matching the null checks used elsewhere in this class.
 * @param root root node
 * @return <code>false</code> if current version has not been identified (or no doctype node could be created)
 */
public boolean fixDocType(Node root)
{
    int guessed = Dict.VERS_HTML40_STRICT;

    if (this.badDoctype)
    {
        report.warning(this, null, null, Report.MALFORMED_DOCTYPE);
    }

    Node doctype = root.findDocType();

    if (this.configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
    {
        if (doctype != null)
        {
            Node.discardElement(doctype);
        }
        return true;
    }

    if (this.configuration.xmlOut)
    {
        return true;
    }

    if (this.configuration.docTypeMode == Configuration.DOCTYPE_STRICT)
    {
        if (doctype != null)
        {
            Node.discardElement(doctype);
        }
        doctype = null;
        guessed = Dict.VERS_HTML40_STRICT;
    }
    else if (this.configuration.docTypeMode == Configuration.DOCTYPE_LOOSE)
    {
        if (doctype != null)
        {
            Node.discardElement(doctype);
        }
        doctype = null;
        guessed = Dict.VERS_HTML40_LOOSE;
    }
    else if (this.configuration.docTypeMode == Configuration.DOCTYPE_AUTO)
    {
        if (doctype != null)
        {
            switch (this.doctype)
            {
                case Dict.VERS_UNKNOWN :
                    return false;

                case Dict.VERS_HTML20 :
                    if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML20))
                    {
                        return true;
                    }
                    break; // to replace old version by new

                case Dict.VERS_HTML32 :
                    if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML32))
                    {
                        return true;
                    }
                    break; // to replace old version by new

                case Dict.VERS_HTML40_STRICT :
                    if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_STRICT))
                    {
                        return true;
                    }
                    break; // to replace old version by new

                case Dict.VERS_HTML40_LOOSE :
                    if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_LOOSE))
                    {
                        return true;
                    }
                    break; // to replace old version by new

                case Dict.VERS_FRAMESET :
                    if (TidyUtils.toBoolean(this.versions & Dict.VERS_FRAMESET))
                    {
                        return true;
                    }
                    break; // to replace old version by new

                case Dict.VERS_XHTML11 :
                    if (TidyUtils.toBoolean(this.versions & Dict.VERS_XHTML11))
                    {
                        return true;
                    }
                    break; // to replace old version by new

                default :
                    // should never reach here
                    break;
            }
            // INCONSISTENT_VERSION warning is now issued by ApparentVersion()
        }

        // choose new doctype
        guessed = htmlVersion();
    }

    if (guessed == Dict.VERS_UNKNOWN)
    {
        return false;
    }

    // for XML use the Voyager system identifier
    if (this.configuration.xmlOut || this.configuration.xmlTags || this.isvoyager)
    {
        // NOTE(review): the discarded node is still the one written to below, because the local variable is not
        // reset to null here - confirm whether a fresh doctype node was intended.
        if (doctype != null)
        {
            Node.discardElement(doctype);
        }

        // Namespace is the same for all XHTML variants
        // Also, don't return yet. Still need to add DOCTYPE declaration.
        fixHTMLNameSpace(root, XHTML_NAMESPACE);
    }

    if (doctype == null)
    {
        doctype = newXhtmlDocTypeNode(root);
        if (doctype == null)
        {
            return false;
        }
    }

    // the declaration text is assembled at the end of the lexer buffer ...
    this.txtstart = this.lexsize;
    this.txtend = this.lexsize;

    // use the appropriate public identifier
    addStringLiteral("html PUBLIC ");

    if (this.configuration.docTypeMode == Configuration.DOCTYPE_USER
        && this.configuration.docTypeStr != null
        && this.configuration.docTypeStr.length() > 0)
    {
        // check if the fpi is quoted or not
        if (this.configuration.docTypeStr.charAt(0) == '"')
        {
            addStringLiteral(this.configuration.docTypeStr);
        }
        else
        {
            addStringLiteral("\""); // #431889 - fix by Dave Bryan 04 Jan 2001
            addStringLiteral(this.configuration.docTypeStr);
            addStringLiteral("\""); // #431889 - fix by Dave Bryan 04 Jan 2001
        }
    }
    else if (guessed == Dict.VERS_HTML20)
    {
        addStringLiteral("\"-//IETF//DTD HTML 2.0//EN\"");
    }
    else
    {
        addStringLiteral("\"-//W3C//DTD ");
        // first table entry carrying the guessed code supplies the name
        for (int i = 0; i < W3CVERSION.length; ++i)
        {
            if (guessed == W3CVERSION[i].code)
            {
                addStringLiteral(W3CVERSION[i].name);
                break;
            }
        }
        addStringLiteral("//EN\"");
    }

    this.txtend = this.lexsize;

    // ... and then copied into an array of its own, owned by the doctype node
    int length = this.txtend - this.txtstart;
    doctype.textarray = new byte[length];
    System.arraycopy(this.lexbuf, this.txtstart, doctype.textarray, 0, length);
    doctype.start = 0;
    doctype.end = length;

    return true;
}
/**
 * Ensure XML document starts with <code>&lt;?xml version="1.0"?&gt;</code>. When the declaration carries no
 * encoding attribute, one is added for ISO8859_1 and ISO2022 output.
 * @param root root node
 * @return always true
 */
public boolean fixXmlDecl(Node root)
{
    Node decl = root.content;

    if (decl == null || decl.type != Node.XML_DECL)
    {
        // no declaration yet: create one and link it in front of the current first child
        decl = newNode(Node.XML_DECL, this.lexbuf, 0, 0);
        decl.next = root.content;
        if (root.content != null)
        {
            root.content.prev = decl;
        }
        root.content = decl;
    }

    // look both attributes up before anything is added
    final boolean hasVersion = decl.getAttrByName("version") != null;
    final boolean hasEncoding = decl.getAttrByName("encoding") != null;

    // We need to insert a check if declared encoding and output encoding mismatch
    // and fix the Xml declaration accordingly!!!
    if (!hasEncoding)
    {
        final String outEncoding = this.configuration.getOutCharEncodingName();
        if ("ISO8859_1".equals(outEncoding))
        {
            decl.addAttribute("encoding", "iso-8859-1");
        }
        else if ("ISO2022".equals(outEncoding))
        {
            decl.addAttribute("encoding", "iso-2022");
        }
    }

    // NOTE(review): version is added after encoding; confirm the printer emits version first
    if (!hasVersion)
    {
        decl.addAttribute("version", "1.0");
    }

    return true;
}
/**
 * Creates a new start tag node with the given name and flags it as implicit. The node refers to the current
 * <code>txtstart</code>/<code>txtend</code> range of the lexer buffer.
 * @param name tag name
 * @return generated node
 */
public Node inferredTag(String name)
{
    final Node inferred = newNode(Node.START_TAG, this.lexbuf, this.txtstart, this.txtend, name);
    inferred.implicit = true;
    return inferred;
}
// Scanner states used by getCDATA(Node).
// Initial state: getCDATA starts here and, while in it, scans the input for a '<' character.
private static final int CDATA_INTERMEDIATE = 0;
// NOTE(review): presumably the state after a start tag was seen inside the content - usage not visible here, confirm.
private static final int CDATA_STARTTAG = 1;
// NOTE(review): presumably the state after an end tag was seen inside the content - usage not visible here, confirm.
private static final int CDATA_ENDTAG = 2;
/**
* Create a text node for the contents of a CDATA element like style or script which
* ends with </foo> for some foo.
* @param container container node
* @return cdata node
*/
public Node getCDATA(Node container)
{
int start = 0;
int nested = 0;
int state = CDATA_INTERMEDIATE;
int c;
boolean isEmpty = true;
boolean matches = false;
boolean hasSrc = container.getAttrByName("src") != null;
this.lines = this.in.getCurline();
this.columns = this.in.getCurcol();
this.waswhite = false;
this.txtstart = this.lexsize;
this.txtend = this.lexsize;
/* seen start tag, look for matching end tag */
while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM) {
addCharToLexer(c);
txtend = lexsize;
if (state == CDATA_INTERMEDIATE) {
if (c != '<') {
if (isEmpty && !TidyUtils.isWhite((char) c)) {
isEmpty = false;
}
continue;
}
c = in.readChar();
if (TidyUtils.isLetter((char) c)) {
/*