Downloading a project requires significant server resources. Please understand that we have to cover our server costs. Thank you in advance. The project price is only $1.
Once you have bought this project, you can download and modify it as often as you want.
/*
* Java HTML Tidy - JTidy
* HTML parser and pretty printer
*
* Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
* Institute of Technology, Institut National de Recherche en
* Informatique et en Automatique, Keio University). All Rights
* Reserved.
*
* Contributing Author(s):
*
* Dave Raggett
* Andy Quick (translation to Java)
* Gary L Peskin (Java development)
* Sami Lempinen (release management)
* Fabrizio Giustina
*
* The contributing author(s) would like to thank all those who
* helped with testing, bug fixes, and patience. This wouldn't
* have been possible without all of you.
*
* COPYRIGHT NOTICE:
*
* This software and documentation is provided "as is," and
* the copyright holders and contributing author(s) make no
* representations or warranties, express or implied, including
* but not limited to, warranties of merchantability or fitness
* for any particular purpose or that the use of the software or
* documentation will not infringe any third party patents,
* copyrights, trademarks or other rights.
*
* The copyright holders and contributing author(s) will not be
* liable for any direct, indirect, special or consequential damages
* arising out of any use of the software or documentation, even if
* advised of the possibility of such damage.
*
* Permission is hereby granted to use, copy, modify, and distribute
* this source code, or portions hereof, documentation and executables,
* for any purpose, without fee, subject to the following restrictions:
*
* 1. The origin of this source code must not be misrepresented.
* 2. Altered versions must be plainly marked as such and must
* not be misrepresented as being the original source.
* 3. This Copyright notice may not be removed or altered from any
* source or altered source distribution.
*
* The copyright holders and contributing author(s) specifically
* permit, without fee, and encourage the use of this source code
* as a component for supporting the Hypertext Markup Language in
* commercial products. If you use this source code in a product,
* acknowledgment is not required but would be appreciated.
*
*/
package org.w3c.tidy5;
/**
* HTML Parser implementation.
* @author Dave Raggett [email protected]
* @author Andy Quick [email protected] (translation to Java)
* @author Fabrizio Giustina
* @version $Revision: 932 $ ($Author: aditsu $)
*/
public final class ParserImpl
{
/**
* Shared {@link ParseHTML} instance: parser for html.
*/
public static final Parser HTML = new ParseHTML();
/**
* Shared {@link ParseHead} instance: parser for head.
*/
public static final Parser HEAD = new ParseHead();
/**
* Shared {@link ParseTitle} instance: parser for title.
*/
public static final Parser TITLE = new ParseTitle();
/**
* Shared {@link ParseScript} instance: parser for script.
*/
public static final Parser SCRIPT = new ParseScript();
/**
* Shared {@link ParseBody} instance: parser for body.
*/
public static final Parser BODY = new ParseBody();
/**
* Shared {@link ParseFrameSet} instance: parser for frameset.
*/
public static final Parser FRAMESET = new ParseFrameSet();
/**
* Shared {@link ParseInline} instance: parser for inline.
*/
public static final Parser INLINE = new ParseInline();
/**
* Shared {@link ParseList} instance: parser for list.
*/
public static final Parser LIST = new ParseList();
/**
* Shared {@link ParseDefList} instance: parser for definition lists.
*/
public static final Parser DEFLIST = new ParseDefList();
/**
* Shared {@link ParsePre} instance: parser for pre.
*/
public static final Parser PRE = new ParsePre();
/**
* Shared {@link ParseBlock} instance: parser for block elements.
*/
public static final Parser BLOCK = new ParseBlock();
/**
* Shared {@link ParseTableTag} instance: parser for table.
*/
public static final Parser TABLETAG = new ParseTableTag();
/**
* Shared {@link ParseColGroup} instance: parser for colgroup.
*/
public static final Parser COLGROUP = new ParseColGroup();
/**
* Shared {@link ParseRowGroup} instance: parser for rowgroup.
*/
public static final Parser ROWGROUP = new ParseRowGroup();
/**
* Shared {@link ParseRow} instance: parser for row.
*/
public static final Parser ROW = new ParseRow();
/**
* Shared {@link ParseNoFrames} instance: parser for noframes.
*/
public static final Parser NOFRAMES = new ParseNoFrames();
/**
* Shared {@link ParseSelect} instance: parser for select.
*/
public static final Parser SELECT = new ParseSelect();
/**
* Shared {@link ParseText} instance: parser for text.
*/
public static final Parser TEXT = new ParseText();
/**
* Shared {@link ParseEmpty} instance: parser for empty elements.
*/
public static final Parser EMPTY = new ParseEmpty();
/**
* Shared {@link ParseOptGroup} instance: parser for optgroup.
*/
public static final Parser OPTGROUP = new ParseOptGroup();
/**
* ParserImpl should not be instantiated; the private constructor prevents creation from outside this class.
*/
private ParserImpl()
{
// unused: intentionally empty
}
/**
 * Hands an element over to the parser registered for its tag.
 * <p>
 * Before dispatching, the lexer's whitespace flags are adjusted (fix by GLP 2000-12-21, motivated by
 * tags such as base, link, meta, isindex, hr, area): for a tag whose model has {@code Dict.CM_EMPTY}
 * set, {@code lexer.waswhite} is cleared; otherwise, if the model lacks {@code Dict.CM_INLINE},
 * {@code lexer.insertspace} is cleared.
 * <p>
 * Nothing further happens when the tag has no parser. A {@code Node.START_END_TAG} node is not parsed
 * but passed to {@code Node.trimEmptyElement} instead.
 * @param lexer lexer whose whitespace flags are updated and which is passed on to the tag's parser
 * @param node element to parse; {@code node.tag} is dereferenced without a null check
 * @param mode mode value forwarded unchanged to the tag's parser
 */
protected static void parseTag(Lexer lexer, Node node, short mode)
{
    boolean emptyModel = (node.tag.model & Dict.CM_EMPTY) != 0;
    if (emptyModel)
    {
        lexer.waswhite = false;
    }
    else if ((node.tag.model & Dict.CM_INLINE) == 0)
    {
        lexer.insertspace = false;
    }

    Parser tagParser = node.tag.getParser();
    if (tagParser == null)
    {
        // no parser registered for this tag: nothing to do
        return;
    }

    if (node.type == Node.START_END_TAG)
    {
        // self-closed element: no content to parse
        Node.trimEmptyElement(lexer, node);
        return;
    }

    tagParser.parse(lexer, node, mode);
}
/**
 * Relocates a node into the head element. The search for head starts at {@code element} and walks up
 * to the enclosing html element. Normally called during parsing.
 * <p>
 * The node is always detached first. Nodes that are neither {@code Node.START_TAG} nor
 * {@code Node.START_END_TAG} are only reported as {@code Report.DISCARDING_UNEXPECTED} and are not
 * re-inserted anywhere.
 * @param lexer lexer providing the tag table and the report sink
 * @param element element in which the node was found; must be the html element or one of its
 * descendants, since the upward walk has no null check
 * @param node node to move; for element nodes {@code node.tag} is dereferenced without a null check
 */
protected static void moveToHead(Lexer lexer, Node element, Node node)
{
    // detach the node from wherever it currently sits
    node.removeNode();
    TagTable tt = lexer.configuration.tt;

    boolean isElement = node.type == Node.START_TAG || node.type == Node.START_END_TAG;
    if (!isElement)
    {
        lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
        return;
    }

    lexer.report.warning(lexer, element, node, Report.TAG_NOT_ALLOWED_IN);

    // climb to the html element
    Node ancestor = element;
    while (ancestor.tag != tt.tagHtml)
    {
        ancestor = ancestor.parent;
    }

    // find the first head child of html, if any, and append the node there
    Node candidate = ancestor.content;
    while (candidate != null && candidate.tag != tt.tagHead)
    {
        candidate = candidate.next;
    }
    if (candidate != null)
    {
        candidate.insertNodeAtEnd(node);
    }

    // the node's own content is parsed whether or not a head was found
    if (node.tag.getParser() != null)
    {
        parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
    }
}
/**
 * Detaches the given node from its current position and appends it to the element returned by
 * {@code lexer.root.findBody(...)}. The result of {@code findBody} is used without a null check.
 * @param lexer Lexer providing the document root and the tag table
 * @param node Node to insert
 */
static void moveNodeToBody(Lexer lexer, Node node)
{
    // detach first, then look up the body and append
    node.removeNode();
    lexer.root.findBody(lexer.configuration.tt).insertNodeAtEnd(node);
}
/**
* Parser for HTML.
* <p>
* Processes the content of the html element in two phases: first it locates (or infers) the head element and
* parses it, then it loops over the remaining tokens to handle body, frameset and noframes content.
*/
public static class ParseHTML implements Parser
{
/**
* Parses the content of the html element.
* @param lexer lexer supplying the tokens; its <code>seenEndBody</code>/<code>seenEndHtml</code> flags and
* <code>configuration.xmlTags</code> are updated here
* @param html the html element whose content is parsed
* @param mode mode value forwarded to the head, body and noframes/frameset parsers
* @see org.w3c.tidy5.Parser#parse(org.w3c.tidy5.Lexer, org.w3c.tidy5.Node, short)
*/
public void parse(Lexer lexer, Node html, short mode)
{
Node node, head;
// first frameset element seen, if any
Node frameset = null;
// noframes element that further noframes content is merged into, if any
Node noframes = null;
lexer.configuration.xmlTags = false;
lexer.seenEndBody = false;
TagTable tt = lexer.configuration.tt;
// phase 1: find the head element, inferring one if the next significant token is something else
while (true)
{
node = lexer.getToken(Lexer.IGNORE_WHITESPACE);
if (node == null)
{
// end of input before any head: infer one
node = lexer.inferredTag("head");
break;
}
if (node.tag == tt.tagHead)
{
break;
}
// stray html end tag before the head: report and drop it
if (node.tag == html.tag && node.type == Node.END_TAG)
{
lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
continue;
}
// deal with comments etc.
if (Node.insertMisc(html, node))
{
continue;
}
// anything else: push the token back and infer the head
lexer.ungetToken();
node = lexer.inferredTag("head");
break;
}
head = node;
html.insertNodeAtEnd(head);
HEAD.parse(lexer, head, mode);
// phase 2: handle what follows the head until a body is found/inferred or input ends
while (true)
{
node = lexer.getToken(Lexer.IGNORE_WHITESPACE);
if (node == null)
{
// end of input: a document without frameset still gets a body
if (frameset == null)
{
// implied body
node = lexer.inferredTag("body");
html.insertNodeAtEnd(node);
BODY.parse(lexer, node, mode);
}
return;
}
// robustly handle html tags
if (node.tag == html.tag)
{
// non-start html tags are reported only when no frameset has been seen
if (node.type != Node.START_TAG && frameset == null)
{
lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
}
else if (node.type == Node.END_TAG)
{
lexer.seenEndHtml = true;
}
continue;
}
// deal with comments etc.
if (Node.insertMisc(html, node))
{
continue;
}
// if frameset document coerce body to noframes
if (node.tag == tt.tagBody)
{
if (node.type != Node.START_TAG)
{
lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
continue;
}
if (frameset != null)
{
// push the body token back so that it is read again while parsing the noframes element
lexer.ungetToken();
if (noframes == null)
{
noframes = lexer.inferredTag("noframes");
frameset.insertNodeAtEnd(noframes);
lexer.report.warning(lexer, html, noframes, Report.INSERTING_TAG);
}
parseTag(lexer, noframes, mode);
continue;
}
lexer.constrainVersion(~Dict.VERS_FRAMESET);
break; // to parse body
}
// flag an error if we see more than one frameset
if (node.tag == tt.tagFrameset)
{
if (node.type != Node.START_TAG)
{
lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
continue;
}
if (frameset != null)
{
lexer.report.error(lexer, html, node, Report.DUPLICATE_FRAMESET);
}
else
{
frameset = node;
}
// a duplicate frameset is still inserted and parsed; only the first one is remembered in 'frameset'
html.insertNodeAtEnd(node);
parseTag(lexer, node, mode);
// see if it includes a noframes element so that we can merge subsequent noframes elements
// (the scan covers the children of the remembered frameset; the last noframes child wins)
for (node = frameset.content; node != null; node = node.next)
{
if (node.tag == tt.tagNoframes)
{
noframes = node;
}
}
continue;
}
// if not a frameset document coerce noframes to body
if (node.tag == tt.tagNoframes)
{
if (node.type != Node.START_TAG)
{
lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
continue;
}
if (frameset == null)
{
// noframes without frameset: drop the tag and continue with an inferred body
lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
node = lexer.inferredTag("body");
break;
}
if (noframes == null)
{
noframes = node;
frameset.insertNodeAtEnd(noframes);
}
// content is always parsed into the remembered noframes element, merging later ones into it
parseTag(lexer, noframes, mode);
continue;
}
if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
{
// elements whose model has CM_HEAD set are relocated into the head
if (node.tag != null && (node.tag.model & Dict.CM_HEAD) != 0)
{
moveToHead(lexer, html, node);
continue;
}
// #427675 - discard illegal frame element following a frameset - fix by Randy Waki 11 Oct 00
if (frameset != null && node.tag == tt.tagFrame)
{
lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
continue;
}
}
// any other token is pushed back and re-read by the noframes or body parser below
lexer.ungetToken();
// insert other content into noframes element
if (frameset != null)
{
if (noframes == null)
{
noframes = lexer.inferredTag("noframes");
frameset.insertNodeAtEnd(noframes);
}
else
{
lexer.report.warning(lexer, html, node, Report.NOFRAMES_CONTENT);
}
lexer.constrainVersion(Dict.VERS_FRAMESET);
parseTag(lexer, noframes, mode);
continue;
}
// no frameset: the content belongs to an inferred body
node = lexer.inferredTag("body");
lexer.constrainVersion(~Dict.VERS_FRAMESET);
break;
}
// node must be body
html.insertNodeAtEnd(node);
parseTag(lexer, node, mode);
lexer.seenEndHtml = true;
}
}
/**
 * Parser for HEAD.
 * <p>
 * Collects the children of the head element until the head end tag, a text node, or an element whose
 * model lacks {@code Dict.CM_HEAD} is met. In the latter two cases the token is pushed back for the caller.
 */
public static class ParseHead implements Parser
{
    /**
     * Parses the content of the head element. The {@code mode} argument is not used; tokens are always
     * read with {@code Lexer.IGNORE_WHITESPACE}.
     * @see org.w3c.tidy5.Parser#parse(org.w3c.tidy5.Lexer, org.w3c.tidy5.Node, short)
     */
    public void parse(Lexer lexer, Node head, short mode)
    {
        TagTable tt = lexer.configuration.tt;
        int titleCount = 0;
        int baseCount = 0;

        for (Node token = lexer.getToken(Lexer.IGNORE_WHITESPACE);
            token != null;
            token = lexer.getToken(Lexer.IGNORE_WHITESPACE))
        {
            // explicit end of head
            if (token.type == Node.END_TAG && token.tag == head.tag)
            {
                head.closed = true;
                break;
            }

            // text ends the head; leave the token for the caller
            if (token.type == Node.TEXT_NODE)
            {
                lexer.report.warning(lexer, head, token, Report.TAG_NOT_ALLOWED_IN);
                lexer.ungetToken();
                break;
            }

            // deal with comments etc.
            if (Node.insertMisc(head, token))
            {
                continue;
            }

            if (token.type == Node.DOCTYPE_TAG)
            {
                Node.insertDocType(lexer, head, token);
                continue;
            }

            // discard unknown tags
            if (token.tag == null)
            {
                lexer.report.warning(lexer, head, token, Report.DISCARDING_UNEXPECTED);
                continue;
            }

            // an element that does not belong in head implicitly closes it
            if (!TidyUtils.toBoolean(token.tag.model & Dict.CM_HEAD))
            {
                // #545067 Implicit closing of head broken - warn only for XHTML input
                if (lexer.isvoyager)
                {
                    lexer.report.warning(lexer, head, token, Report.TAG_NOT_ALLOWED_IN);
                }
                lexer.ungetToken();
                break;
            }

            boolean opensElement = token.type == Node.START_TAG || token.type == Node.START_END_TAG;
            if (!opensElement)
            {
                // discard unexpected end tags and other leftover tokens
                lexer.report.warning(lexer, head, token, Report.DISCARDING_UNEXPECTED);
                continue;
            }

            if (token.tag == tt.tagTitle)
            {
                titleCount++;
                if (titleCount > 1)
                {
                    lexer.report.warning(lexer, head, token, Report.TOO_MANY_ELEMENTS);
                }
            }
            else if (token.tag == tt.tagBase)
            {
                baseCount++;
                if (baseCount > 1)
                {
                    lexer.report.warning(lexer, head, token, Report.TOO_MANY_ELEMENTS);
                }
            }
            else if (token.tag == tt.tagNoscript)
            {
                lexer.report.warning(lexer, head, token, Report.TAG_NOT_ALLOWED_IN);
            }

            // reported or not, the element is kept in head and its content parsed
            head.insertNodeAtEnd(token);
            parseTag(lexer, token, Lexer.IGNORE_WHITESPACE);
        }
    }
}
/**
 * Parser for TITLE.
 * <p>
 * Collects text nodes (and comments etc.) into the title element until the title end tag is met. A second
 * title start tag inside the title is treated as a mistyped end tag and closes the element.
 */
public static class ParseTitle implements Parser
{
    /**
     * Parses the content of the title element. The {@code mode} argument is not used; tokens are always
     * read with {@code Lexer.MIXED_CONTENT}.
     * @param lexer lexer supplying the tokens and the report sink
     * @param title the title element whose content is parsed
     * @param mode ignored
     * @see org.w3c.tidy5.Parser#parse(org.w3c.tidy5.Lexer, org.w3c.tidy5.Node, short)
     */
    public void parse(Lexer lexer, Node title, short mode)
    {
        Node node;
        while ((node = lexer.getToken(Lexer.MIXED_CONTENT)) != null)
        {
            // [438658] : Missing / in title endtag makes 2 titles
            if (node.tag == title.tag && node.type == Node.START_TAG)
            {
                lexer.report.warning(lexer, title, node, Report.COERCE_TO_ENDTAG);
                node.type = Node.END_TAG;
                // Deliberately no "continue" here: the coerced token must be handled as the end tag
                // below. Continuing would fetch the next token and silently drop the coerced one,
                // leaving the title open.
            }

            if (node.tag == title.tag && node.type == Node.END_TAG)
            {
                title.closed = true;
                Node.trimSpaces(lexer, title);
                return;
            }

            if (node.type == Node.TEXT_NODE)
            {
                // only called for 1st child
                if (title.content == null)
                {
                    Node.trimInitialSpace(lexer, title, node);
                }

                // skip text nodes that are (or have become) empty
                if (node.start >= node.end)
                {
                    continue;
                }

                title.insertNodeAtEnd(node);
                continue;
            }

            // deal with comments etc.
            if (Node.insertMisc(title, node))
            {
                continue;
            }

            // discard unknown tags
            if (node.tag == null)
            {
                lexer.report.warning(lexer, title, node, Report.DISCARDING_UNEXPECTED);
                continue;
            }

            // pushback unexpected tokens
            lexer.report.warning(lexer, title, node, Report.MISSING_ENDTAG_BEFORE);
            lexer.ungetToken();
            Node.trimSpaces(lexer, title);
            return;
        }

        // input ended before the title was closed; node is null at this point
        lexer.report.warning(lexer, title, node, Report.MISSING_ENDTAG_FOR);
    }
}
/**
* Parser for SCRIPT.
*/
public static class ParseScript implements Parser
{
/**
* @see org.w3c.tidy5.Parser#parse(org.w3c.tidy5.Lexer, org.w3c.tidy5.Node, short)
*/
public void parse(Lexer lexer, Node script, short mode) {
Node node = lexer.getCDATA(script);
if (node != null) {
script.insertNodeAtEnd(node);
} else {
/* handle e.g. a document like "