// org.w3c.tidy5.Lexer Maven / Gradle / Ivy
/*
* Java HTML Tidy - JTidy
* HTML parser and pretty printer
*
* Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
* Institute of Technology, Institut National de Recherche en
* Informatique et en Automatique, Keio University). All Rights
* Reserved.
*
* Contributing Author(s):
*
* Dave Raggett
* Andy Quick (translation to Java)
* Gary L Peskin (Java development)
* Sami Lempinen (release management)
* Fabrizio Giustina
*
* The contributing author(s) would like to thank all those who
* helped with testing, bug fixes, and patience. This wouldn't
* have been possible without all of you.
*
* COPYRIGHT NOTICE:
*
* This software and documentation is provided "as is," and
* the copyright holders and contributing author(s) make no
* representations or warranties, express or implied, including
* but not limited to, warranties of merchantability or fitness
* for any particular purpose or that the use of the software or
* documentation will not infringe any third party patents,
* copyrights, trademarks or other rights.
*
* The copyright holders and contributing author(s) will not be
* liable for any direct, indirect, special or consequential damages
* arising out of any use of the software or documentation, even if
* advised of the possibility of such damage.
*
* Permission is hereby granted to use, copy, modify, and distribute
* this source code, or portions hereof, documentation and executables,
* for any purpose, without fee, subject to the following restrictions:
*
* 1. The origin of this source code must not be misrepresented.
* 2. Altered versions must be plainly marked as such and must
* not be misrepresented as being the original source.
* 3. This Copyright notice may not be removed or altered from any
* source or altered source distribution.
*
* The copyright holders and contributing author(s) specifically
* permit, without fee, and encourage the use of this source code
* as a component for supporting the Hypertext Markup Language in
* commercial products. If you use this source code in a product,
* acknowledgment is not required but would be appreciated.
*
*/
package org.w3c.tidy5;
import java.io.PrintWriter;
import java.util.List;
import java.util.Stack;
import java.util.Vector;
/**
* Lexer for html parser.
*
* Given a file stream fp it returns a sequence of tokens. GetToken(fp) gets the next token; UngetToken(fp) provides
* one level of undo. The tags include an attribute list:
* <ul>
* <li>linked list of attribute/value nodes</li>
* <li>each node has 2 null-terminated strings</li>
* <li>entities are replaced in attribute values</li>
* </ul>
* White space is compacted if not in preformatted mode: leading white space is discarded and subsequent white space
* sequences are compacted to single space chars. If XmlTags is no then tag names are folded to lower case.
* Not yet done: Doctype subset and marked sections.
*
* @author Dave Raggett [email protected]
* @author Andy Quick [email protected] (translation to Java)
* @author Fabrizio Giustina
* @version $Revision: 927 $ ($Author: aditsu $)
*/
public class Lexer
{
/**
* Lexer state: ignore whitespace.
*/
public static final short IGNORE_WHITESPACE = 0;
/**
* Lexer state: mixed content.
*/
public static final short MIXED_CONTENT = 1;
/**
* Lexer state: preformatted text.
*/
public static final short PREFORMATTED = 2;
/**
* Lexer state: ignore markup.
*/
public static final short IGNORE_MARKUP = 3;
/**
* System identifier (URI) of the XHTML 1.0 transitional DTD.
*/
private static final String VOYAGER_LOOSE = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";
/**
* System identifier (URI) of the XHTML 1.0 strict DTD.
*/
private static final String VOYAGER_STRICT = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
/**
* System identifier (URI) of the XHTML 1.0 frameset DTD.
*/
private static final String VOYAGER_FRAMESET = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd";
/**
* System identifier (URI) of the XHTML 1.1 DTD.
*/
private static final String VOYAGER_11 = "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd";
// URI for XHTML Basic 1.0 (currently disabled, kept for reference):
// private static final String VOYAGER_BASIC = "http://www.w3.org/TR/xhtml-basic/xhtml-basic10.dtd";
/**
* The XHTML namespace URI, written to the xmlns attribute of the html element.
*/
private static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml";
/**
* Lists all the known versions. Each entry is built from an HTML version name, the corresponding XHTML ("voyager")
* name, a DTD URI and the Dict version code. Lookups by code in this class return the first matching entry, so the
* order of the entries is significant.
*/
private static final Lexer.W3CVersionInfo[] W3CVERSION = {
new W3CVersionInfo("HTML 4.01", "XHTML 1.0 Strict", VOYAGER_STRICT, Dict.VERS_HTML40_STRICT),
new W3CVersionInfo("HTML 4.01 Transitional", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML40_LOOSE),
new W3CVersionInfo("HTML 4.01 Frameset", "XHTML 1.0 Frameset", VOYAGER_FRAMESET, Dict.VERS_FRAMESET),
new W3CVersionInfo("HTML 4.0", "XHTML 1.0 Strict", VOYAGER_STRICT, Dict.VERS_HTML40_STRICT),
new W3CVersionInfo("HTML 4.0 Transitional", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML40_LOOSE),
new W3CVersionInfo("HTML 4.0 Frameset", "XHTML 1.0 Frameset", VOYAGER_FRAMESET, Dict.VERS_FRAMESET),
new W3CVersionInfo("HTML 3.2", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML32),
new W3CVersionInfo("HTML 3.2 Final", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML32),
new W3CVersionInfo("HTML 3.2 Draft", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML32),
new W3CVersionInfo("HTML 2.0", "XHTML 1.0 Strict", VOYAGER_STRICT, Dict.VERS_HTML20),
// NOTE(review): this entry reuses the name "HTML 4.01" and the strict DTD URI for the XHTML 1.1 code - confirm
new W3CVersionInfo("HTML 4.01", "XHTML 1.1", VOYAGER_STRICT, Dict.VERS_XHTML11)};
// Values for the lexer's finite state machine.
// NOTE: the value 7 is not assigned to any state in this list.
/**
* getToken state: content.
*/
private static final short LEX_CONTENT = 0;
/**
* getToken state: gt.
*/
private static final short LEX_GT = 1;
/**
* getToken state: end tag.
*/
private static final short LEX_ENDTAG = 2;
/**
* getToken state: start tag.
*/
private static final short LEX_STARTTAG = 3;
/**
* getToken state: comment.
*/
private static final short LEX_COMMENT = 4;
/**
* getToken state: doctype.
*/
private static final short LEX_DOCTYPE = 5;
/**
* getToken state: processing instruction.
*/
private static final short LEX_PROCINSTR = 6;
/**
* getToken state: cdata.
*/
private static final short LEX_CDATA = 8;
/**
* getToken state: section.
*/
private static final short LEX_SECTION = 9;
/**
* getToken state: asp.
*/
private static final short LEX_ASP = 10;
/**
* getToken state: jste.
*/
private static final short LEX_JSTE = 11;
/**
* getToken state: php.
*/
private static final short LEX_PHP = 12;
/**
* getToken state: xml declaration.
*/
private static final short LEX_XMLDECL = 13;
/**
* Input stream the lexer reads characters from.
*/
protected StreamIn in;
/**
* error output stream.
*/
protected PrintWriter errout;
/**
* for accessibility errors.
*/
protected short badAccess;
/**
* for bad style errors.
*/
protected short badLayout;
/**
* for bad char encodings.
*/
protected short badChars;
/**
* for mismatched/mispositioned form tags.
*/
protected short badForm;
/**
* count of warnings in this document.
*/
protected short warnings;
/**
* count of errors.
*/
protected short errors;
/**
* lines seen. Initialised to 1 by the constructor.
*/
protected int lines;
/**
* column at start of current token. Initialised to 1 by the constructor.
*/
protected int columns;
/**
* used to collapse contiguous white space.
*/
protected boolean waswhite;
/**
* true after token has been pushed back.
*/
protected boolean pushed;
/**
* when space is moved after end tag.
*/
protected boolean insertspace;
/**
* Netscape compatibility.
*/
protected boolean excludeBlocks;
/**
* true if moved out of table.
*/
protected boolean exiled;
/**
* true if xmlns attribute on html element.
*/
protected boolean isvoyager;
/**
* bit vector of HTML versions (Dict.VERS_* flags) the document is still compatible with.
*/
protected short versions;
/**
* version as given by doctype (if any); Dict.VERS_UNKNOWN until one is identified.
*/
protected int doctype;
/**
* set if html or PUBLIC is missing.
*/
protected boolean badDoctype;
/**
* start of current node (offset into lexbuf).
*/
protected int txtstart;
/**
* end of current node (offset into lexbuf).
*/
protected int txtend;
/**
* state of lexer's finite state machine.
*/
protected short state;
/**
* current node.
*/
protected Node token;
/**
* Lexer character buffer. Parse tree nodes span onto this buffer, which contains the concatenated text contents of
* all of the elements. Lexsize must be reset for each file. Byte buffer of UTF-8 chars.
*/
protected byte[] lexbuf;
/**
* Allocated length of lexbuf, in bytes.
*/
protected int lexlength;
/**
* Number of bytes of lexbuf currently in use.
*/
protected int lexsize;
/**
* Inline stack for compatibility with Mosaic. For deferring text node.
*/
protected Node inode;
/**
* for inferring inline tags. Initialised to -1 by the constructor.
*/
protected int insert;
/**
* stack.
*/
protected Stack istack;
/**
* start of frame.
*/
protected int istackbase;
/**
* used for cleaning up presentation markup.
*/
protected Style styles;
/**
* configuration.
*/
protected Configuration configuration;
/**
* already seen end body tag?
*/
protected boolean seenEndBody;
/**
* already seen end html tag?
*/
protected boolean seenEndHtml;
/**
* report, used for issuing warnings and errors.
*/
protected Report report;
/**
* Root node is saved here.
*/
protected Node root;
/**
* List of the nodes registered by this lexer; scanned by updateNodeTextArrays when lexbuf is reallocated.
*/
private List nodeList;
/**
 * Creates a lexer positioned at line 1, column 1, in the content state, with no doctype identified yet and every
 * HTML version (including proprietary) still considered possible.
 * @param in StreamIn to read characters from
 * @param configuration configuration instance
 * @param report report instance, for reporting errors
 */
public Lexer(StreamIn in, Configuration configuration, Report report)
{
    // collaborators
    this.in = in;
    this.configuration = configuration;
    this.report = report;

    // position and state machine
    this.lines = 1;
    this.columns = 1;
    this.state = LEX_CONTENT;

    // version tracking
    this.doctype = Dict.VERS_UNKNOWN;
    this.versions = (Dict.VERS_ALL | Dict.VERS_PROPRIETARY);

    // inline stack and node registry
    this.insert = -1;
    this.nodeList = new Vector();
    this.istack = new Stack();
}
/**
 * Creates an empty node and registers it in the node list.
 * @return the newly created Node
 */
public Node newNode()
{
    final Node created = new Node();
    this.nodeList.add(created);
    return created;
}
/**
 * Creates a node of the given type over a range of a byte array and registers it in the node list.
 * @param type node type: Node.ROOT_NODE | Node.DOCTYPE_TAG | Node.COMMENT_TAG | Node.PROC_INS_TAG | Node.TEXT_NODE |
 * Node.START_TAG | Node.END_TAG | Node.START_END_TAG | Node.CDATA_TAG | Node.SECTION_TAG | Node.ASP_TAG |
 * Node.JSTE_TAG | Node.PHP_TAG | Node.XML_DECL
 * @param textarray array of bytes contained in the Node
 * @param start start position
 * @param end end position
 * @return the newly created Node
 */
public Node newNode(short type, byte[] textarray, int start, int end)
{
    final Node created = new Node(type, textarray, start, end);
    this.nodeList.add(created);
    return created;
}
/**
 * Creates a named node of the given type over a range of a byte array and registers it in the node list. The tag
 * table of the current configuration is handed to the Node constructor together with the element name.
 * @param type node type: Node.ROOT_NODE | Node.DOCTYPE_TAG | Node.COMMENT_TAG | Node.PROC_INS_TAG | Node.TEXT_NODE |
 * Node.START_TAG | Node.END_TAG | Node.START_END_TAG | Node.CDATA_TAG | Node.SECTION_TAG | Node.ASP_TAG |
 * Node.JSTE_TAG | Node.PHP_TAG | Node.XML_DECL
 * @param textarray array of bytes contained in the Node
 * @param start start position
 * @param end end position
 * @param element tag name
 * @return the newly created Node
 */
public Node newNode(short type, byte[] textarray, int start, int end, String element)
{
    final Node created = new Node(type, textarray, start, end, element, this.configuration.tt);
    this.nodeList.add(created);
    return created;
}
/**
 * Clones a node (via <code>node.cloneNode(false)</code>) and registers the copy in the node list, together with any
 * asp or php node hanging off the copy's attributes.
 * @param node Node to clone
 * @return cloned Node
 */
public Node cloneNode(Node node)
{
    final Node copy = node.cloneNode(false);
    this.nodeList.add(copy);

    AttVal current = copy.attributes;
    while (current != null)
    {
        if (current.asp != null)
        {
            this.nodeList.add(current.asp);
        }
        if (current.php != null)
        {
            this.nodeList.add(current.php);
        }
        current = current.next;
    }
    return copy;
}
/**
 * Clones an attribute list and registers any asp or php node attached to the cloned attributes in the node list.
 * @param attrs original AttVal (head of the attribute list)
 * @return cloned AttVal
 */
public AttVal cloneAttributes(AttVal attrs)
{
    final AttVal copy = (AttVal) attrs.clone();

    AttVal current = copy;
    while (current != null)
    {
        if (current.asp != null)
        {
            this.nodeList.add(current.asp);
        }
        if (current.php != null)
        {
            this.nodeList.add(current.php);
        }
        current = current.next;
    }
    return copy;
}
/**
 * Re-points every registered node whose text array is <code>oldtextarray</code> (compared by identity) to
 * <code>newtextarray</code>.
 * @param oldtextarray previous text array
 * @param newtextarray new text array
 */
protected void updateNodeTextArrays(byte[] oldtextarray, byte[] newtextarray)
{
    int index = 0;
    while (index < this.nodeList.size())
    {
        final Node candidate = (Node) this.nodeList.get(index);
        index++;
        if (candidate.textarray != oldtextarray)
        {
            continue;
        }
        candidate.textarray = newtextarray;
    }
}
/**
 * Appends a line feed to the lexer buffer and returns a node spanning it. Used for creating preformatted text from
 * Word2000.
 * @return new line node
 */
public Node newLineNode()
{
    final Node lineBreak = newNode();
    lineBreak.textarray = this.lexbuf;

    // the node covers exactly the bytes appended for '\n'
    lineBreak.start = this.lexsize;
    addCharToLexer('\n');
    lineBreak.end = this.lexsize;

    return lineBreak;
}
/**
 * Tells whether the underlying input stream has been exhausted.
 * @return <code>true</code> if the end of the input stream has been reached
 */
public boolean endOfInput()
{
    final boolean exhausted = this.in.isEndOfStream();
    return exhausted;
}
/**
 * Appends one byte to the lexer buffer, growing the buffer when needed. The buffer starts at 8192 bytes and is
 * doubled until the new byte plus a trailing zero byte fit; nodes referring to the old buffer are re-pointed to
 * the new one.
 * @param c byte to add
 */
public void addByte(int c)
{
    final int needed = this.lexsize + 1;
    if (needed >= this.lexlength)
    {
        int capacity = this.lexlength;
        do
        {
            capacity = (capacity == 0) ? 8192 : capacity * 2;
        }
        while (needed >= capacity);
        this.lexlength = capacity;

        final byte[] previous = this.lexbuf;
        this.lexbuf = new byte[this.lexlength];
        if (previous != null)
        {
            System.arraycopy(previous, 0, this.lexbuf, 0, previous.length);
            updateNodeTextArrays(previous, this.lexbuf);
        }
    }

    this.lexbuf[this.lexsize] = (byte) c;
    this.lexsize++;
    this.lexbuf[this.lexsize] = (byte) '\0'; // debug
}
/**
 * Overwrites the last byte stored in the lexer buffer. Does nothing when the buffer is empty.
 * @param c new byte value
 */
public void changeChar(byte c)
{
    if (this.lexsize <= 0)
    {
        return;
    }
    final int last = this.lexsize - 1;
    this.lexbuf[last] = c;
}
/**
 * Stores char c in the lexer buffer as a UTF-8 encoded byte sequence. For xml/xhtml output, characters outside the
 * XML Char production are silently dropped. When encoding fails, the UTF-8 form of the replacement character
 * (U+FFFD) is stored instead.
 * @param c char to store
 */
public void addCharToLexer(int c)
{
    // Allow only valid XML characters. See: http://www.w3.org/TR/2004/REC-xml-20040204/#NT-Char
    // Fix by Pablo Mayrgundter 17-08-2004
    if (this.configuration.xmlOut || this.configuration.xHTML) // only for xml output
    {
        final boolean legalXmlChar = (c >= 0x20 && c <= 0xD7FF) // the common case first
            || c == 0x9
            || c == 0xA
            || c == 0xD // then white-space
            || (c >= 0xE000 && c <= 0xFFFD) // then high-range unicode
            || (c >= 0x10000 && c <= 0x10FFFF);
        if (!legalXmlChar)
        {
            return;
        }
    }

    final byte[] encoded = new byte[10]; // unsigned char
    final int[] length = new int[]{0};

    final boolean failed = EncodingUtils.encodeCharToUTF8Bytes(c, encoded, null, length);
    if (failed)
    {
        // replacement char 0xFFFD encoded as UTF-8
        encoded[0] = (byte) 0xEF;
        encoded[1] = (byte) 0xBF;
        encoded[2] = (byte) 0xBD;
        length[0] = 3;
    }

    int pos = 0;
    while (pos < length[0])
    {
        addByte(encoded[pos]); // uint
        pos++;
    }
}
/**
 * Appends every char of a string to the lexer buffer, one {@link #addCharToLexer(int)} call per char.
 * @param str String to add
 */
public void addStringToLexer(String str)
{
    final int total = str.length();
    int index = 0;
    while (index < total)
    {
        addCharToLexer(str.charAt(index));
        index++;
    }
}
/**
* Parses an html entity or numeric character reference. On entry the "&amp;" has already been stored as the last
* byte of the lexer buffer; the name that follows is read from the input and appended to the buffer. A recognised
* entity replaces the buffered "&amp;name" text with the character itself; unknown or invalid ones are reported and
* left in the buffer as text.
* @param mode lexer mode; when the PREFORMATTED bit is set a non-breaking space (160) is stored as a plain space
*/
public void parseEntity(short mode)
{
// No longer attempts to insert missing ';' for unknown
// entities unless one was present already, since this
// gives unexpected results.
//
// For example: <a href="something.htm?foo&bar&fred">
// was tidied to: <a href="something.htm?foo&amp;bar;&amp;fred;">
// rather than: <a href="something.htm?foo&amp;bar&amp;fred">
//
// My thanks for Maurice Buxton for spotting this.
//
// Also Randy Waki pointed out the following case for the
// 04 Aug 00 version (bug #433012):
//
// For example: <a href="something.htm?id=1&lang=en">
// was tidied to: <a href="something.htm?id=1&lang;=en">
// rather than: <a href="something.htm?id=1&amp;lang=en">
//
// where "lang" is a known entity (#9001), but browsers would
// misinterpret "&lang;" because it had a value > 256.
//
// So the case of an apparently known entity with a value > 256 and
// missing a semicolon is handled specially.
//
// "ParseEntity" is also a bit of a misnomer - it handles entities and
// numeric character references. Invalid NCR's are now reported.
int start;
boolean first = true;
boolean semicolon = false;
int c, ch, startcol;
String str;
start = this.lexsize - 1; // to start at "&"
startcol = this.in.getCurcol() - 1;
// collect the entity name: stops at ';', at the first non-name char (which is pushed back) or at end of stream
while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
{
if (c == ';')
{
semicolon = true;
break;
}
if (first && c == '#')
{
// #431953 - start RJ
// numeric references are left untouched when ncr is off or the input encoding is BIG5/SHIFTJIS
if (!this.configuration.ncr
|| "BIG5".equals(this.configuration.getInCharEncodingName())
|| "SHIFTJIS".equals(this.configuration.getInCharEncodingName()))
{
this.in.ungetChar(c);
return;
}
// #431953 - end RJ
addCharToLexer(c);
first = false;
continue;
}
first = false;
if (TidyUtils.isNamechar((char) c))
{
addCharToLexer(c);
continue;
}
// otherwise put it back
this.in.ungetChar(c);
break;
}
// str is the buffered text from "&" up to (not including) any terminating ';'
str = TidyUtils.getString(this.lexbuf, start, this.lexsize - start);
if ("&apos".equals(str) && !configuration.xmlOut && !this.isvoyager && !configuration.xHTML)
{
report.entityError(this, Report.APOS_UNDEFINED, str, 39);
}
ch = EntityTable.getDefaultEntityTable().entityCode(str);
// drops invalid numeric entities from XML mode. Fix by Pablo Mayrgundter 17-08-2004
// if ((this.configuration.xmlOut || this.configuration.xHTML) // only for xml output
// && !((ch >= 0x20 && ch <= 0xD7FF) // Check the common-case first.
// || ch == 0x9 || ch == 0xA || ch == 0xD // Then white-space.
// || (ch >= 0xE000 && ch <= 0xFFFD)))
// {
// this.lexsize = start;
// return;
// }
// deal with unrecognized or invalid entities
// #433012 - fix by Randy Waki 17 Feb 01
// report invalid NCR's - Terry Teague 01 Sep 01
if (ch <= 0 || (ch >= 256 && c != ';'))
{
// set error position just before offending character
this.lines = this.in.getCurline();
this.columns = startcol;
if (this.lexsize > start + 1)
{
// NOTE(review): this branch looks unreachable - the enclosing condition only admits ch <= 0 or ch >= 256,
// so ch can never be in 128..159 here. Confirm against the original C implementation.
if (ch >= 128 && ch <= 159)
{
// invalid numeric character reference
int c1 = 0;
if ("WIN1252".equals(configuration.replacementCharEncoding))
{
c1 = EncodingUtils.decodeWin1252(ch);
}
else if ("MACROMAN".equals(configuration.replacementCharEncoding))
{
c1 = EncodingUtils.decodeMacRoman(ch);
}
// "or" DISCARDED_CHAR with the other errors if discarding char; otherwise default is replacing
int replaceMode = c1 != 0 ? Report.REPLACED_CHAR : Report.DISCARDED_CHAR;
if (c != ';') /* issue warning if not terminated by ';' */
{
report.entityError(this, Report.MISSING_SEMICOLON_NCR, str, c);
}
report.encodingError(this, (short) (Report.INVALID_NCR | replaceMode), ch);
if (c1 != 0)
{
// make the replacement
this.lexsize = start;
addCharToLexer(c1);
semicolon = false;
}
else
{
/* discard */
this.lexsize = start;
semicolon = false;
}
}
else
{
report.entityError(this, Report.UNKNOWN_ENTITY, str, ch);
}
// keep the ';' that was consumed from the input so the original text is preserved
if (semicolon)
{
addCharToLexer(';');
}
}
else
{
// naked &
report.entityError(this, Report.UNESCAPED_AMPERSAND, str, ch);
}
}
else
{
// issue warning if not terminated by ';'
if (c != ';')
{
// set error position just before offending character
this.lines = this.in.getCurline();
this.columns = startcol;
report.entityError(this, Report.MISSING_SEMICOLON, str, c);
}
// rewind to the "&" and store the decoded character in place of the entity text
this.lexsize = start;
if (ch == 160 && TidyUtils.toBoolean(mode & PREFORMATTED))
{
ch = ' ';
}
addCharToLexer(ch);
// when quoteAmpersand is off, a decoded '&' is written back out as "&amp;"
if (ch == '&' && !this.configuration.quoteAmpersand)
{
addCharToLexer('a');
addCharToLexer('m');
addCharToLexer('p');
addCharToLexer(';');
}
}
}
/**
 * Reads the rest of a tag name from the input into the lexer buffer. Unless xmlTags is set, upper case chars are
 * folded to lower case, including the first char that is already in the buffer at <code>txtstart</code>. On return
 * <code>txtend</code> marks the end of the name.
 * @return first char after the tag name (the end-of-stream marker cast to char if the input ran out)
 */
public char parseTagName()
{
    final boolean foldCase = !this.configuration.xmlTags;

    // fold case of first char in buffer
    final char first = (char) this.lexbuf[this.txtstart];
    if (foldCase && TidyUtils.isUpper(first))
    {
        this.lexbuf[this.txtstart] = (byte) TidyUtils.toLower(first);
    }

    int c = this.in.readChar();
    while (c != StreamIn.END_OF_STREAM && TidyUtils.isNamechar((char) c))
    {
        // fold case of subsequent chars
        if (foldCase && TidyUtils.isUpper((char) c))
        {
            c = TidyUtils.toLower((char) c);
        }
        addCharToLexer(c);
        c = this.in.readChar();
    }

    this.txtend = this.lexsize;
    return (char) c;
}
/**
 * Appends a string to the lexer buffer by calling addCharToLexer for each of its chars, in order.
 * @param str input String
 */
public void addStringLiteral(String str)
{
    final char[] chars = str.toCharArray();
    for (int pos = 0; pos < chars.length; pos++)
    {
        addCharToLexer(chars[pos]);
    }
}
/**
 * Appends at most <code>len</code> leading chars of a string to the lexer buffer, calling addCharToLexer for each.
 * When the string is shorter than <code>len</code> the whole string is added.
 * @param str input String
 * @param len length of the substring to be added
 */
void addStringLiteralLen(String str, int len)
{
    final int limit = Math.min(len, str.length());
    int pos = 0;
    while (pos < limit)
    {
        addCharToLexer(str.charAt(pos));
        pos++;
    }
}
/**
 * Chooses what version to use for a new doctype from the versions the document is still compatible with. The
 * candidates are tried in this order: HTML 2.0, HTML 3.2 (only when not producing xml and not an XHTML document),
 * XHTML 1.1, HTML 4 strict, HTML 4 loose, frameset.
 * @return html version constant, or Dict.VERS_UNKNOWN when none of the candidates applies
 */
public short htmlVersion()
{
    final int compatible = this.versions;

    if (TidyUtils.toBoolean(compatible & Dict.VERS_HTML20))
    {
        return Dict.VERS_HTML20;
    }

    final boolean xmlFlavoured = this.configuration.xmlOut | this.configuration.xmlTags | this.isvoyager;
    if (!xmlFlavoured && TidyUtils.toBoolean(compatible & Dict.VERS_HTML32))
    {
        return Dict.VERS_HTML32;
    }

    final short[] remaining = {
        Dict.VERS_XHTML11,
        Dict.VERS_HTML40_STRICT,
        Dict.VERS_HTML40_LOOSE,
        Dict.VERS_FRAMESET};
    for (int k = 0; k < remaining.length; k++)
    {
        if (TidyUtils.toBoolean(compatible & remaining[k]))
        {
            return remaining[k];
        }
    }

    return Dict.VERS_UNKNOWN;
}
/**
 * Returns the display name of the apparent document version: the XHTML name when the document is XHTML
 * (<code>isvoyager</code>), the HTML name otherwise. The first table entry with a matching code wins.
 * @return html version name, or <code>null</code> when the apparent version is not in the table
 */
public String htmlVersionName()
{
    final short apparent = apparentVersion();

    int index = 0;
    while (index < W3CVERSION.length)
    {
        if (apparent == W3CVERSION[index].code)
        {
            return this.isvoyager ? W3CVERSION[index].voyagerName : W3CVERSION[index].name;
        }
        ++index;
    }

    return null;
}
/**
 * Adds a generator meta element for Tidy at the start of the head. If a generator meta whose content starts with
 * "HTML Tidy" is already present, only its content is updated.
 * @param root root node
 * @return <code>true</code> if the tag has been added; <code>false</code> if there is no head or an existing tag
 * was updated
 */
public boolean addGenerator(Node root)
{
    final Node head = root.findHEAD(this.configuration.tt);
    if (head == null)
    {
        return false;
    }

    final String meta = "HTML Tidy for Java (vers. " + Report.RELEASE_DATE_STRING + "), see https://github.com/xjl219/jtidy5";

    Node child = head.content;
    while (child != null)
    {
        if (child.tag == this.configuration.tt.tagMeta)
        {
            final AttVal nameAttr = child.getAttrByName("name");
            if (nameAttr != null && nameAttr.value != null && "generator".equalsIgnoreCase(nameAttr.value))
            {
                final AttVal contentAttr = child.getAttrByName("content");
                if (contentAttr != null
                    && contentAttr.value != null
                    && contentAttr.value.length() >= 9
                    && contentAttr.value.regionMatches(true, 0, "HTML Tidy", 0, 9))
                {
                    // an earlier Tidy run left its mark: refresh it instead of adding a second one
                    contentAttr.value = meta;
                    return false;
                }
            }
        }
        child = child.next;
    }

    final Node generator = this.inferredTag("meta");
    generator.addAttribute("content", meta);
    generator.addAttribute("name", "generator");
    head.insertNodeAtStart(generator);
    return true;
}
/**
 * Checks the doctype keywords SYSTEM, PUBLIC, //DTD, //W3C and //EN (keywords should be uppercase), using
 * TidyUtils.findBadSubString on the doctype text.
 * @param doctype doctype node
 * @return <code>true</code> if no badly written keyword was found in the doctype text
 */
public boolean checkDocTypeKeyWords(Node doctype)
{
    final String declaration = TidyUtils.getString(this.lexbuf, doctype.start, doctype.end - doctype.start);
    final int length = declaration.length();

    final String[] keywords = {"SYSTEM", "PUBLIC", "//DTD", "//W3C", "//EN"};
    for (int k = 0; k < keywords.length; k++)
    {
        if (TidyUtils.findBadSubString(keywords[k], declaration, length))
        {
            return false;
        }
    }
    return true;
}
/**
 * Examines the DOCTYPE declaration to identify the declared version. Only the first quoted string of the
 * declaration (the public identifier) is inspected. As a side effect the SYSTEM/PUBLIC keyword is rewritten in
 * upper case inside the lexer buffer, a warning is issued when doctype keywords are not upper case, and
 * <code>badDoctype</code> is set when neither SYSTEM nor PUBLIC follows "html".
 * <p>
 * W3C identifiers are looked up in the whole version table, first match wins, so that "HTML 4.01" resolves to the
 * HTML 4 strict entry. IETF identifiers are recognised for "HTML 2.0" only.
 * </p>
 * @param doctype doctype node
 * @return version code, or 0 when the version is not recognized
 */
public short findGivenVersion(Node doctype)
{
    String p, s;
    int i, j;
    int len;
    String str1;
    String str2;

    // if root tag for doctype isn't html give up now
    str1 = TidyUtils.getString(this.lexbuf, doctype.start, 5);
    if (!"html ".equalsIgnoreCase(str1))
    {
        return 0;
    }

    if (!checkDocTypeKeyWords(doctype))
    {
        report.warning(this, doctype, null, Report.DTYPE_NOT_UPPER_CASE);
    }

    // give up if all we are given is the system id for the doctype
    str1 = TidyUtils.getString(this.lexbuf, doctype.start + 5, 7);
    if ("SYSTEM ".equalsIgnoreCase(str1))
    {
        // but at least ensure the case is correct
        if (!str1.substring(0, 6).equals("SYSTEM"))
        {
            System.arraycopy(TidyUtils.getBytes("SYSTEM"), 0, this.lexbuf, doctype.start + 5, 6);
        }
        return 0; // unrecognized
    }

    if ("PUBLIC ".equalsIgnoreCase(str1))
    {
        if (!str1.substring(0, 6).equals("PUBLIC"))
        {
            System.arraycopy(TidyUtils.getBytes("PUBLIC "), 0, this.lexbuf, doctype.start + 5, 6);
        }
    }
    else
    {
        this.badDoctype = true;
    }

    for (i = doctype.start; i < doctype.end; ++i)
    {
        if (this.lexbuf[i] == (byte) '"')
        {
            str1 = TidyUtils.getString(this.lexbuf, i + 1, 12);
            str2 = TidyUtils.getString(this.lexbuf, i + 1, 13);
            if (str1.equals("-//W3C//DTD "))
            {
                // compute length of identifier e.g. "HTML 4.0 Transitional"
                for (j = i + 13; j < doctype.end && this.lexbuf[j] != (byte) '/'; ++j)
                {
                    //
                }
                len = j - i - 13;
                p = TidyUtils.getString(this.lexbuf, i + 13, len);

                // scan the whole table from the first entry: starting at 1 skipped "HTML 4.01" strict and let
                // the identifier fall through to the XHTML 1.1 entry that carries the same name
                for (j = 0; j < W3CVERSION.length; ++j)
                {
                    s = W3CVERSION[j].name;
                    if (len == s.length() && s.equals(p))
                    {
                        return W3CVERSION[j].code;
                    }
                }
                // else unrecognized version
            }
            else if (str2.equals("-//IETF//DTD "))
            {
                // compute length of identifier e.g. "HTML 2.0"
                for (j = i + 14; j < doctype.end && this.lexbuf[j] != (byte) '/'; ++j)
                {
                    //
                }
                len = j - i - 14;
                p = TidyUtils.getString(this.lexbuf, i + 14, len);

                // the only IETF version known to the table is HTML 2.0; look its entry up by name rather than
                // assuming it sits at index 0 (which holds "HTML 4.01")
                if ("HTML 2.0".equals(p))
                {
                    for (j = 0; j < W3CVERSION.length; ++j)
                    {
                        s = W3CVERSION[j].name;
                        if (len == s.length() && s.equals(p))
                        {
                            return W3CVERSION[j].code;
                        }
                    }
                }
                // else unrecognized version
            }
            // only the first quoted string is examined
            break;
        }
    }
    return 0;
}
/**
 * Makes sure the html element carries an xmlns attribute with the given value. An existing xmlns attribute with a
 * different (or missing) value is overwritten and an INCONSISTENT_NAMESPACE warning is issued; when no xmlns
 * attribute exists, one is prepended to the attribute list. Nothing happens if root has no html child.
 * @param root root Node
 * @param profile namespace URI to set on the html element
 */
public void fixHTMLNameSpace(Node root, String profile)
{
    // locate the html element among the children of root
    Node node = root.content;
    while (node != null && node.tag != this.configuration.tt.tagHtml)
    {
        node = node.next;
    }
    if (node == null)
    {
        return;
    }

    // constant-first comparison: an attribute without a name must not raise a NullPointerException
    AttVal attr = node.attributes;
    while (attr != null && !"xmlns".equals(attr.attribute))
    {
        attr = attr.next;
    }

    if (attr != null)
    {
        // null-safe: a valueless xmlns attribute is treated as an inconsistent namespace, not as a crash
        boolean matches = (attr.value == null) ? (profile == null) : attr.value.equals(profile);
        if (!matches)
        {
            report.warning(this, node, null, Report.INCONSISTENT_NAMESPACE);
            attr.value = profile;
        }
    }
    else
    {
        attr = new AttVal(node.attributes, null, '"', "xmlns", profile);
        attr.dict = AttributeTable.getDefaultAttributeTable().findAttribute(attr);
        node.attributes = attr;
    }
}
/**
 * Creates an empty DOCTYPE node and links it in immediately before the html element, i.e. between the
 * &lt;?xml version="1.0" ... ?&gt; declaration, if any, and the html tag. Should also work for any comments, etc.
 * that may precede the html tag.
 * @param root root node
 * @return new doctype node, or <code>null</code> when root has no html element
 */
Node newXhtmlDocTypeNode(Node root)
{
    final Node html = root.findHTML(this.configuration.tt);
    if (html == null)
    {
        return null;
    }

    final Node inserted = newNode();
    inserted.setType(Node.DOCTYPE_TAG);
    inserted.parent = root;
    inserted.next = html;

    if (root.content == html)
    {
        // No declaration: the doctype becomes the first child of root.
        inserted.prev = null;
        root.content = inserted;
    }
    else
    {
        // we have a declaration (or other nodes) before html: splice in after the predecessor.
        final Node predecessor = html.prev;
        inserted.prev = predecessor;
        predecessor.next = inserted;
    }

    html.prev = inserted;
    return inserted;
}
/**
 * Writes an XHTML doctype declaration for the document, creating the doctype node when the document has none. The
 * xmlns attribute of the html element is fixed first. The public and system identifiers are chosen from the
 * configured doctype mode; in auto mode they follow the versions the document is compatible with. An internal DTD
 * subset of an existing doctype is carried over for xml/xhtml output.
 * @param root root node
 * @return <code>true</code> only when the doctype mode is "omit" (any existing doctype is discarded);
 * <code>false</code> in every other case, including after a doctype declaration has been written
 */
public boolean setXHTMLDocType(Node root)
{
    String fpi = " ";
    String sysid = "";
    String namespace = XHTML_NAMESPACE;
    String dtdsub = null;
    Node doctype;
    int dtdlen = 0;

    doctype = root.findDocType();
    fixHTMLNameSpace(root, namespace); // #427839 - fix by Evan Lenz 05 Sep 00

    if (this.configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
    {
        if (doctype != null)
        {
            Node.discardElement(doctype);
        }
        return true;
    }

    if (this.configuration.docTypeMode == Configuration.DOCTYPE_AUTO)
    {
        // see what flavor of XHTML this document matches
        if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_STRICT))
        {
            // use XHTML strict
            fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
            sysid = VOYAGER_STRICT;
        }
        else if (TidyUtils.toBoolean(this.versions & Dict.VERS_FRAMESET))
        {
            // use XHTML frames
            fpi = "-//W3C//DTD XHTML 1.0 Frameset//EN";
            sysid = VOYAGER_FRAMESET;
        }
        else if (TidyUtils.toBoolean(this.versions & Dict.VERS_LOOSE))
        {
            fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
            sysid = VOYAGER_LOOSE;
        }
        else if (TidyUtils.toBoolean(this.versions & Dict.VERS_XHTML11))
        {
            // use XHTML 1.1
            fpi = "-//W3C//DTD XHTML 1.1//EN";
            sysid = VOYAGER_11;
        }
        else
        {
            // proprietary
            fpi = null;
            sysid = "";
            if (doctype != null) // #473490 - fix by Bjoern Hoehrmann 10 Oct 01
            {
                Node.discardElement(doctype);
            }
        }
    }
    else if (this.configuration.docTypeMode == Configuration.DOCTYPE_STRICT)
    {
        fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
        sysid = VOYAGER_STRICT;
    }
    else if (this.configuration.docTypeMode == Configuration.DOCTYPE_LOOSE)
    {
        fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
        sysid = VOYAGER_LOOSE;
    }

    if (this.configuration.docTypeMode == Configuration.DOCTYPE_USER && this.configuration.docTypeStr != null)
    {
        fpi = this.configuration.docTypeStr;
        sysid = "";
    }

    if (fpi == null)
    {
        return false;
    }

    if (doctype != null)
    {
        // Look for internal DTD subset
        if (configuration.xHTML || configuration.xmlOut)
        {
            int len = doctype.end - doctype.start + 1;
            String start = TidyUtils.getString(this.lexbuf, doctype.start, len);
            int dtdbeg = start.indexOf('[');
            if (dtdbeg >= 0)
            {
                int dtdend = start.substring(dtdbeg).indexOf(']');
                if (dtdend >= 0)
                {
                    dtdlen = dtdend + 1;
                    dtdsub = start.substring(dtdbeg);
                }
            }
        }
    }
    else
    {
        if ((doctype = newXhtmlDocTypeNode(root)) == null)
        {
            return false;
        }
    }

    // the declaration text is assembled at the end of the lexer buffer and copied into the node afterwards
    this.txtstart = this.lexsize;
    this.txtend = this.lexsize;

    // add public identifier
    addStringLiteral("html PUBLIC ");

    // check if the fpi is quoted or not; startsWith is safe for an empty user-supplied doctype string,
    // where charAt(0) would throw StringIndexOutOfBoundsException
    if (fpi.startsWith("\""))
    {
        addStringLiteral(fpi);
    }
    else
    {
        addStringLiteral("\"");
        addStringLiteral(fpi);
        addStringLiteral("\"");
    }

    if (this.configuration.wraplen != 0 && sysid.length() + 6 >= this.configuration.wraplen)
    {
        addStringLiteral("\n\"");
    }
    else
    {
        // FG: don't wrap
        addStringLiteral(" \"");
    }

    // add system identifier
    addStringLiteral(sysid);
    addStringLiteral("\"");

    if (dtdlen > 0 && dtdsub != null)
    {
        addCharToLexer(' ');
        addStringLiteralLen(dtdsub, dtdlen);
    }

    this.txtend = this.lexsize;

    // give the doctype node its own private copy of the declaration text
    int length = this.txtend - this.txtstart;
    doctype.textarray = new byte[length];
    System.arraycopy(this.lexbuf, this.txtstart, doctype.textarray, 0, length);
    doctype.start = 0;
    doctype.end = length;

    return false;
}
/**
 * Returns the html version used in the document. When no doctype was identified the version is chosen by
 * {@link #htmlVersion()}. When the declared doctype is one the document is still compatible with, that version is
 * returned. Otherwise an INCONSISTENT_VERSION warning is reported (positioned at line 1, column 1) and the version
 * chosen by {@link #htmlVersion()} is returned.
 * @return version code
 */
public short apparentVersion()
{
    if (this.doctype == Dict.VERS_UNKNOWN)
    {
        return htmlVersion();
    }

    final short[] declarable = {
        Dict.VERS_HTML20,
        Dict.VERS_HTML32,
        Dict.VERS_HTML40_STRICT,
        Dict.VERS_HTML40_LOOSE,
        Dict.VERS_FRAMESET,
        Dict.VERS_XHTML11};

    for (int k = 0; k < declarable.length; k++)
    {
        final short candidate = declarable[k];
        if (this.doctype != candidate)
        {
            continue;
        }
        if (TidyUtils.toBoolean(this.versions & candidate))
        {
            return candidate;
        }
        // declared version contradicts the content: replace old version by new
        break;
    }

    // kludge to avoid error appearing at end of file
    // it would be better to note the actual position
    // when first encountering the doctype declaration
    this.lines = 1;
    this.columns = 1;
    report.warning(this, null, null, Report.INCONSISTENT_VERSION);
    return this.htmlVersion();
}
/**
* Fixes up the doctype declaration: adds one if missing, or replaces it when it contradicts the document content
* or the configured doctype mode. A MALFORMED_DOCTYPE warning is issued first when <code>badDoctype</code> is set.
* @param root root node
* @return <code>true</code> when the doctype is omitted, when producing xml output, when the declared doctype is
* consistent with the document (auto mode), or after a doctype declaration has been written; <code>false</code>
* if the current version has not been identified or the document has no html element to attach a doctype to
*/
public boolean fixDocType(Node root)
{
Node doctype;
int guessed = Dict.VERS_HTML40_STRICT, i;
if (this.badDoctype)
{
report.warning(this, null, null, Report.MALFORMED_DOCTYPE);
}
doctype = root.findDocType();
// omit mode: drop any existing doctype and stop
if (this.configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
{
if (doctype != null)
{
Node.discardElement(doctype);
}
return true;
}
// xml output: the doctype is left as it is
if (this.configuration.xmlOut)
{
return true;
}
if (this.configuration.docTypeMode == Configuration.DOCTYPE_STRICT)
{
// NOTE(review): doctype may be null here - presumably discardElement tolerates null; verify
Node.discardElement(doctype);
doctype = null;
guessed = Dict.VERS_HTML40_STRICT;
}
else if (this.configuration.docTypeMode == Configuration.DOCTYPE_LOOSE)
{
// NOTE(review): doctype may be null here - presumably discardElement tolerates null; verify
Node.discardElement(doctype);
doctype = null;
guessed = Dict.VERS_HTML40_LOOSE;
}
else if (this.configuration.docTypeMode == Configuration.DOCTYPE_AUTO)
{
if (doctype != null)
{
// a doctype node exists but its version was not recognised: leave it alone
if (this.doctype == Dict.VERS_UNKNOWN)
{
return false;
}
// keep the declared doctype when the document is still compatible with it;
// the VERS_UNKNOWN case below repeats the check just above and cannot be reached
switch (this.doctype)
{
case Dict.VERS_UNKNOWN :
return false;
case Dict.VERS_HTML20 :
if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML20))
{
return true;
}
break; // to replace old version by new
case Dict.VERS_HTML32 :
if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML32))
{
return true;
}
break; // to replace old version by new
case Dict.VERS_HTML40_STRICT :
if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_STRICT))
{
return true;
}
break; // to replace old version by new
case Dict.VERS_HTML40_LOOSE :
if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_LOOSE))
{
return true;
}
break; // to replace old version by new
case Dict.VERS_FRAMESET :
if (TidyUtils.toBoolean(this.versions & Dict.VERS_FRAMESET))
{
return true;
}
break; // to replace old version by new
case Dict.VERS_XHTML11 :
if (TidyUtils.toBoolean(this.versions & Dict.VERS_XHTML11))
{
return true;
}
break; // to replace old version by new
default :
// should never reach here
break;
}
// INCONSISTENT_VERSION warning is now issued by ApparentVersion()
}
// choose new doctype
guessed = htmlVersion();
}
if (guessed == Dict.VERS_UNKNOWN)
{
return false;
}
// for XML use the Voyager system identifier
if (this.configuration.xmlOut || this.configuration.xmlTags || this.isvoyager)
{
// NOTE(review): the local doctype still refers to the discarded node afterwards, so the declaration built
// below is written into that node and no new doctype node is created. If discardElement detaches the node
// from the tree, the new declaration would be lost - confirm whether "doctype = null" is missing here.
if (doctype != null)
{
Node.discardElement(doctype);
}
fixHTMLNameSpace(root, XHTML_NAMESPACE);
// Namespace is the same for all XHTML variants
// Also, don't return yet. Still need to add DOCTYPE declaration.
//
// for (i = 0; i < W3CVersion.length; ++i)
// {
// if (guessed == W3CVersion[i].code)
// {
// fixHTMLNameSpace(root, W3CVersion[i].profile);
// break;
// }
// }
// return true;
}
if (doctype == null)
{
if ((doctype = newXhtmlDocTypeNode(root)) == null)
{
return false;
}
}
// the declaration text is assembled at the end of the lexer buffer and copied into the node afterwards
this.txtstart = this.lexsize;
this.txtend = this.lexsize;
// use the appropriate public identifier
addStringLiteral("html PUBLIC ");
if (this.configuration.docTypeMode == Configuration.DOCTYPE_USER
&& this.configuration.docTypeStr != null
&& this.configuration.docTypeStr.length() > 0)
{
// check if the fpi is quoted or not
if (this.configuration.docTypeStr.charAt(0) == '"')
{
addStringLiteral(this.configuration.docTypeStr);
}
else
{
addStringLiteral("\""); // #431889 - fix by Dave Bryan 04 Jan 2001
addStringLiteral(this.configuration.docTypeStr);
addStringLiteral("\""); // #431889 - fix by Dave Bryan 04 Jan 2001
}
}
else if (guessed == Dict.VERS_HTML20)
{
addStringLiteral("\"-//IETF//DTD HTML 2.0//EN\"");
}
else
{
addStringLiteral("\"-//W3C//DTD ");
// the first table entry with the guessed code supplies the version name
for (i = 0; i < W3CVERSION.length; ++i)
{
if (guessed == W3CVERSION[i].code)
{
addStringLiteral(W3CVERSION[i].name);
break;
}
}
addStringLiteral("//EN\"");
}
this.txtend = this.lexsize;
// give the doctype node its own private copy of the declaration text
int length = this.txtend - this.txtstart;
doctype.textarray = new byte[length];
System.arraycopy(this.lexbuf, this.txtstart, doctype.textarray, 0, length);
doctype.start = 0;
doctype.end = length;
return true;
}
/**
 * Ensures the document starts with an XML declaration such as <code>&lt;?xml version="1.0"?&gt;</code>.
 * <p>
 * If the first child of the root is not an XML declaration, a new one is put in front of the existing content. An
 * <code>encoding</code> attribute is added when the declaration has none and the output encoding name is
 * <code>ISO8859_1</code> or <code>ISO2022</code>; a <code>version</code> attribute is added when it is missing.
 * </p>
 * @param root root node
 * @return always true
 */
public boolean fixXmlDecl(Node root)
{
    Node decl;

    if (root.content == null || root.content.type != Node.XML_DECL)
    {
        // no declaration yet: create one and make it the first child of the root
        decl = newNode(Node.XML_DECL, this.lexbuf, 0, 0);
        decl.next = root.content;
        if (root.content != null)
        {
            root.content.prev = decl;
        }
        root.content = decl;
    }
    else
    {
        decl = root.content;
    }

    // look both attributes up before anything is added
    final AttVal versionAttr = decl.getAttrByName("version");
    final AttVal encodingAttr = decl.getAttrByName("encoding");

    // We need to insert a check if declared encoding and output encoding mismatch
    // and fix the Xml declaration accordingly!!!
    if (encodingAttr == null)
    {
        final String outEncoding = this.configuration.getOutCharEncodingName();

        if (!"UTF8".equals(outEncoding))
        {
            if ("ISO8859_1".equals(outEncoding))
            {
                decl.addAttribute("encoding", "iso-8859-1");
            }
            else if ("ISO2022".equals(outEncoding))
            {
                decl.addAttribute("encoding", "iso-2022");
            }
        }
    }

    if (versionAttr == null)
    {
        decl.addAttribute("version", "1.0");
    }

    return true;
}
/**
 * Creates a start-tag node for an inferred element and flags it as implicit. The node is built over the lexer
 * buffer using the current text start and end offsets; this method itself does not link it into a tree.
 * @param name tag name
 * @return the new node, with <code>implicit</code> set to <code>true</code>
 */
public Node inferredTag(String name)
{
    final Node inferred = newNode(Node.START_TAG, this.lexbuf, this.txtstart, this.txtend, name);
    inferred.implicit = true;
    return inferred;
}
/**
 * Scan state used by <code>getCDATA</code>: the state the scan starts in.
 */
private static final int CDATA_INTERMEDIATE = 0;
/**
 * Scan state used by <code>getCDATA</code>. NOTE(review): presumably set after a start tag has been seen inside the
 * CDATA content - confirm against the rest of getCDATA.
 */
private static final int CDATA_STARTTAG = 1;
/**
 * Scan state used by <code>getCDATA</code>. NOTE(review): presumably set after an end tag has been seen inside the
 * CDATA content - confirm against the rest of getCDATA.
 */
private static final int CDATA_ENDTAG = 2;
/**
* Create a text node for the contents of a CDATA element like style or script which ends with </foo> for some
* foo.
* @param container container node
* @return cdata node
*/
public Node getCDATA(Node container)
{
int start = 0;
int nested = 0;
int state = CDATA_INTERMEDIATE;
int c;
boolean isEmpty = true;
boolean matches = false;
boolean hasSrc = container.getAttrByName("src") != null;
this.lines = this.in.getCurline();
this.columns = this.in.getCurcol();
this.waswhite = false;
this.txtstart = this.lexsize;
this.txtend = this.lexsize;
/* seen start tag, look for matching end tag */
while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM) {
addCharToLexer(c);
txtend = lexsize;
if (state == CDATA_INTERMEDIATE) {
if (c != '<') {
if (isEmpty && !TidyUtils.isWhite((char) c)) {
isEmpty = false;
}
continue;
}
c = in.readChar();
if (TidyUtils.isLetter((char) c)) {
/*