com.caucho.xml.HtmlPolicy Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of resin Show documentation
Show all versions of resin Show documentation
Resin Java Application Server
/*
* Copyright (c) 1998-2018 Caucho Technology -- all rights reserved
*
* This file is part of Resin(R) Open Source
*
* Each copy or derived work must preserve the copyright notice and this
* notice unmodified.
*
* Resin Open Source is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* Resin Open Source is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, or any warranty
* of NON-INFRINGEMENT. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License
* along with Resin Open Source; if not, write to the
* Free SoftwareFoundation, Inc.
* 59 Temple Place, Suite 330
* Boston, MA 02111-1307 USA
*
* @author Scott Ferguson
*/
package com.caucho.xml;
import com.caucho.util.CharBuffer;
import com.caucho.util.CharCursor;
import com.caucho.util.CharScanner;
import com.caucho.util.IntMap;
import com.caucho.util.StringCharCursor;
import org.w3c.dom.Element;
import java.io.IOException;
import java.util.Locale;
/**
* Policy for parsing an HTML file.
*/
class HtmlPolicy extends Policy {
static final int DOCUMENT = 1;
static final int COMMENT = DOCUMENT + 1;
static final int TEXT = COMMENT + 1;
static final int JSP = TEXT + 1;
static final int WHITESPACE = JSP + 1;
static final int HTML = WHITESPACE + 1;
static final int HEAD = HTML + 1;
static final int TITLE = HEAD + 1;
static final int ISINDEX = TITLE + 1;
static final int BASE = ISINDEX + 1;
static final int SCRIPT = BASE + 1;
static final int STYLE = SCRIPT + 1;
static final int META = STYLE + 1;
static final int LINK = META + 1;
static final int OBJECT = LINK + 1;
static final int BODY = OBJECT + 1;
static final int BASEFONT = BODY + 1;
static final int BR = BASEFONT + 1;
static final int AREA = BR + 1;
static final int IMG = AREA + 1;
static final int PARAM = IMG + 1;
static final int HR = PARAM + 1;
static final int INPUT = HR + 1;
static final int P = INPUT + 1;
static final int DT = P + 1;
static final int DD = DT + 1;
static final int LI = DD + 1;
static final int OPTION = LI + 1;
static final int TABLE = OPTION + 1;
static final int CAPTION = TABLE + 1;
static final int THEAD = CAPTION + 1;
static final int TFOOT = THEAD + 1;
static final int COL = TFOOT + 1;
static final int COLGROUP = COL + 1;
static final int TBODY = COLGROUP + 1;
static final int TR = TBODY + 1;
static final int TD = TR + 1;
static final int TH = TD + 1;
static final int FRAME = TH + 1;
static final int FRAMESET = FRAME + 1;
static final int BLOCK = FRAMESET + 1;
static final int INLINE = BLOCK + 1;
static IntMap names;
static IntMap cbNames;
static QName htmlName = new QName(null, "html", null);
static QName headName = new QName(null, "head", null);
static QName bodyName = new QName(null, "body", null);
boolean toLower = true;
boolean isJsp = false;
boolean autoHtml = false;
boolean hasBody = false;
boolean autoHead = false;
CharBuffer cb = new CharBuffer();
public void init()
{
toLower = true;
isJsp = false;
autoHtml = false;
hasBody = false;
autoHead = false;
}
/**
* When true, HTML parsing normalizes HTML tags to lower case.
*/
public void setToLower(boolean toLower)
{
this.toLower = toLower;
}
/**
* When true, treat text before HTML specially.
*/
public void setJsp(boolean isJsp)
{
this.isJsp = isJsp;
}
/**
* Return the normalized name.
*
* @param tag the raw name in the XML file.
*
* @return the normalized name.
*/
QName getName(CharBuffer tag)
{
if (! toLower)
return super.getName(tag);
cb.clear();
cb.append(tag);
cb.toLowerCase();
int name = cbNames.get(cb);
if (name >= 0)
return super.getName(cb);
else
return super.getName(tag);
}
QName getAttributeName(CharBuffer eltName, CharBuffer source)
{
if (! toLower)
return super.getName(source);
cb.clear();
cb.append(eltName);
cb.toLowerCase();
int name = cbNames.get(cb);
if (name < 0)
return super.getName(source);
else {
source.toLowerCase();
return super.getName(source);
}
}
/**
* Returns the appropriate action when opening a HTML tag.
*
* @param parser the XML parser
* @param node the parent node
* @param next the next child
* @return the action code
*/
int openAction(XmlParser parser, QName node, QName next)
throws XmlParseException
{
String nodeName = node == null ? "#document" : node.getName();
String nextName = next.getName();
int nextCode = names.get(nextName);
switch (names.get(nodeName)) {
case DOCUMENT:
switch (nextCode) {
case HTML:
return PUSH;
case COMMENT:
return PUSH;
case HEAD: case TITLE: case ISINDEX: case BASE: case SCRIPT:
case STYLE: case META: case LINK: case OBJECT:
opt = htmlName;
return PUSH_OPT;
case WHITESPACE:
return IGNORE;
case JSP:
return PUSH;
default:
if (autoHtml)
return PUSH;
autoHtml = true;
opt = htmlName;
return PUSH_OPT;
}
case HTML:
switch (nextCode) {
case HTML:
return ERROR;
case HEAD:
case COMMENT:
case FRAMESET:
return PUSH;
case BODY:
hasBody = true;
return PUSH;
case TITLE: case ISINDEX: case BASE: case SCRIPT:
case STYLE: case META: case LINK: case OBJECT:
opt = headName;
autoHead = true;
return PUSH_OPT;
case WHITESPACE:
return PUSH;
case JSP:
return PUSH;
default:
if (hasBody)
return PUSH;
hasBody = true;
opt = bodyName;
return PUSH_OPT;
}
case HEAD:
switch (nextCode) {
case META:
// checkMetaEncoding((Element) next);
return PUSH_EMPTY;
case LINK: case ISINDEX: case BASE:
return PUSH_EMPTY;
case SCRIPT: case STYLE:
return PUSH_VERBATIM;
case TITLE:
case OBJECT:
return PUSH;
case WHITESPACE:
return PUSH;
case JSP:
case TEXT:
if (autoHead)
return POP;
else
return PUSH;
default:
return POP;
}
case LI:
switch (nextCode) {
case LI:
return POP;
case BASEFONT: case BR: case AREA: case LINK: case IMG: case PARAM:
case HR: case INPUT: case COL: case FRAME: case ISINDEX:
case BASE: case META:
return PUSH_EMPTY;
case SCRIPT: case STYLE:
return PUSH_VERBATIM;
default:
return PUSH;
}
case OPTION:
switch (nextCode) {
case WHITESPACE:
case TEXT:
return PUSH;
default:
return POP;
}
case DD:
switch (nextCode) {
case DD: case DT:
return POP;
case BASEFONT: case BR: case AREA: case LINK: case IMG: case PARAM:
case HR: case INPUT: case COL: case FRAME: case ISINDEX:
case BASE: case META:
return PUSH_EMPTY;
case SCRIPT: case STYLE:
return PUSH_VERBATIM;
default:
return PUSH;
}
case THEAD: case TFOOT: case COLGROUP:
switch (nextCode) {
case THEAD: case TFOOT: case TBODY: case COLGROUP: case COL:
return POP;
case BASEFONT: case BR: case AREA: case LINK: case IMG: case PARAM:
case HR: case INPUT: case FRAME: case ISINDEX:
case BASE: case META:
return PUSH_EMPTY;
case SCRIPT: case STYLE:
return PUSH_VERBATIM;
default:
return PUSH;
}
case TR:
switch (nextCode) {
case THEAD: case TFOOT: case TBODY: case COLGROUP: case COL: case TR:
return POP;
case BASEFONT: case BR: case AREA: case LINK: case IMG: case PARAM:
case HR: case INPUT: case FRAME: case ISINDEX:
case BASE: case META:
return PUSH_EMPTY;
case TD: case TH:
return PUSH;
case SCRIPT: case STYLE:
return PUSH_VERBATIM;
default:
return PUSH;
}
case TD: case TH:
switch (nextCode) {
case THEAD: case TFOOT: case TBODY: case COLGROUP: case COL: case TR:
case TD: case TH:
return POP;
case BASEFONT: case BR: case AREA: case LINK: case IMG: case PARAM:
case HR: case INPUT: case FRAME: case ISINDEX:
case BASE: case META:
return PUSH_EMPTY;
case SCRIPT: case STYLE:
return PUSH_VERBATIM;
default:
return PUSH;
}
case P: case DT:
switch (nextCode) {
case BLOCK: case P: case TABLE: case CAPTION: case THEAD:
case TFOOT: case COLGROUP: case TBODY: case TR: case TD:
case TH: case DT: case LI:
return POP;
case BASEFONT: case BR: case AREA: case LINK: case IMG: case PARAM:
case HR: case INPUT: case COL: case FRAME: case ISINDEX:
case BASE: case META:
return PUSH_EMPTY;
case SCRIPT: case STYLE:
return PUSH_VERBATIM;
default:
return PUSH;
}
case TABLE:
switch (nextCode) {
case CAPTION: case THEAD: case TFOOT: case COL: case COLGROUP:
case TBODY: case TR:
return PUSH;
case SCRIPT: case STYLE:
return PUSH_VERBATIM;
default:
/*
opt = "tr";
return PUSH_OPT;
*/
return PUSH;
}
default:
switch (nextCode) {
case BASEFONT: case BR: case AREA: case LINK: case IMG: case PARAM:
case HR: case INPUT: case COL: case FRAME: case ISINDEX:
case BASE: case META:
return PUSH_EMPTY;
case SCRIPT: case STYLE:
return PUSH_VERBATIM;
default:
return PUSH;
}
}
}
private static CharScanner charsetScanner = new CharScanner(" \t=;");
private void checkMetaEncoding(Element elt)
{
String http = elt.getAttribute("http-equiv");
String content = elt.getAttribute("content");
if (http.equals("") || content.equals("") ||
! http.equalsIgnoreCase("content-type"))
return;
CharCursor cursor = new StringCharCursor(content);
charsetScanner.scan(cursor);
charsetScanner.skip(cursor);
CharBuffer buf = CharBuffer.allocate();
while (cursor.current() != cursor.DONE) {
buf.clear();
charsetScanner.scan(cursor, buf);
if (buf.toString().equalsIgnoreCase("charset")) {
charsetScanner.skip(cursor);
buf.clear();
charsetScanner.scan(cursor, buf);
if (buf.length() > 0) {
try {
is.setEncoding(buf.close());
} catch (IOException e) {
}
return;
}
}
}
}
int elementCloseAction(XmlParser parser, QName node, String tagEnd)
throws XmlParseException
{
String nodeName = node.getName();
if (nodeName.equals(tagEnd))
return POP;
if (nodeName == "#document" && tagEnd.equals("")) {
/*
Document doc = (Document) node;
// If JSP, move any text into the body element
if (isJsp && doc.getDocumentElement() == null &&
node.getFirstChild() instanceof Text) {
Element html = doc.createElement("html");
doc.appendChild(html);
Element body = doc.createElement("body");
html.appendChild(body);
Node child;
while ((child = doc.getFirstChild()) instanceof Text ||
child instanceof Comment) {
body.appendChild(child);
}
}
*/
return POP;
}
switch (names.get(tagEnd)) {
case BASEFONT: case BR: case AREA: case LINK: case IMG: case PARAM:
case HR: case INPUT: case COL: case FRAME: case ISINDEX:
case BASE: case META:
String errorTagEnd;
if (tagEnd.equals(""))
errorTagEnd = L.l("end of file");
else
errorTagEnd = "`<" + tagEnd + ">'";
throw parser.error(L.l("{0} expects to be empty",
errorTagEnd));
}
switch (names.get(nodeName)) {
case BODY: case P:
case DT: case DD: case LI: case OPTION:
case THEAD: case TFOOT: case TBODY: case COLGROUP:
case TR: case TH: case TD:
return POP_AND_LOOP;
case HTML:
case HEAD:
// If JSP and missing a body, move any text into the body element
/*
if (isJsp && node.getLastChild() instanceof Text) {
Node child;
for (child = node.getLastChild();
child != null;
child = child.getPreviousSibling()) {
if (child.getNodeName().equals("body"))
return POP_AND_LOOP;
}
Document doc = node.getOwnerDocument();
Element body = doc.createElement("body");
while ((child = node.getLastChild()) instanceof Text ||
child instanceof Comment) {
body.insertBefore(child, body.getFirstChild());
}
doc.getDocumentElement().appendChild(body);
}
*/
return POP_AND_LOOP;
default:
if (forgiving) {
/*
Node parent = node;
for (; parent != null; parent = parent.getParentNode()) {
if (parent.getNodeName().equals(tagEnd))
return POP_AND_LOOP;
}
return IGNORE;
*/
return POP_AND_LOOP;
}
String errorTagEnd;
if (tagEnd.equals(""))
errorTagEnd = L.l("end of file");
else
errorTagEnd = "`" + tagEnd + ">'";
String expect;
if (nodeName.equals("#document")) {
throw parser.error(L.l("expected {0} at {1}",
L.l("end of document"), errorTagEnd));
}
else
expect = "`" + nodeName + ">'";
throw parser.error(L.l("expected {0} at {1} (open at {2})",
expect, errorTagEnd,
"" + parser.getNodeLine()));
}
}
private static void addName(String name, int code)
{
names.put(name, code);
cbNames.put(new CharBuffer(name), code);
String upper = name.toUpperCase(Locale.ENGLISH);
names.put(upper, code);
cbNames.put(new CharBuffer(upper), code);
}
static {
names = new IntMap();
cbNames = new IntMap();
addName("#document", DOCUMENT);
addName("#comment", COMMENT);
addName("#text", TEXT);
addName("#jsp", JSP);
addName("#whitespace", WHITESPACE);
addName("html", HTML);
addName("head", HEAD);
addName("title", TITLE);
addName("isindex", ISINDEX);
addName("base", BASE);
addName("script", SCRIPT);
addName("style", STYLE);
addName("meta", META);
addName("link", LINK);
addName("object", OBJECT);
addName("body", BODY);
addName("basefont", BASEFONT);
addName("br", BR);
addName("area", AREA);
addName("link", LINK);
addName("img", IMG);
addName("param", PARAM);
addName("hr", HR);
addName("input", INPUT);
addName("frame", FRAME);
addName("p", P);
addName("dt", DT);
addName("dd", DD);
addName("li", LI);
addName("option", OPTION);
addName("table", TABLE);
addName("caption", CAPTION);
addName("thead", THEAD);
addName("tfoot", TFOOT);
addName("col", COL);
addName("colgroup", COLGROUP);
addName("tbody", TBODY);
addName("tr", TR);
addName("th", TH);
addName("td", TD);
addName("h1", BLOCK);
addName("h2", BLOCK);
addName("h3", BLOCK);
addName("h4", BLOCK);
addName("h5", BLOCK);
addName("h6", BLOCK);
addName("ul", BLOCK);
addName("ol", BLOCK);
addName("dir", BLOCK);
addName("menu", BLOCK);
addName("pre", BLOCK);
addName("dl", BLOCK);
addName("div", BLOCK);
addName("center", BLOCK);
addName("noscript", BLOCK);
addName("noframes", BLOCK);
addName("blockquote", BLOCK);
addName("form", BLOCK);
addName("fieldset", BLOCK);
addName("address", BLOCK);
addName("tt", INLINE);
addName("i", INLINE);
addName("b", INLINE);
addName("u", INLINE);
addName("s", INLINE);
addName("strike", INLINE);
addName("big", INLINE);
addName("small", INLINE);
addName("em", INLINE);
addName("strong", INLINE);
addName("dfn", INLINE);
addName("code", INLINE);
addName("samp", INLINE);
addName("kbd", INLINE);
addName("var", INLINE);
addName("cite", INLINE);
addName("abbr", INLINE);
addName("acronym", INLINE);
addName("font", INLINE);
addName("iframe", INLINE);
addName("applet", INLINE);
addName("ins", INLINE);
addName("del", INLINE);
addName("a", INLINE);
addName("map", INLINE);
addName("q", INLINE);
addName("sub", INLINE);
addName("sup", INLINE);
addName("span", INLINE);
addName("bdo", INLINE);
addName("select", INLINE);
addName("textarea", INLINE);
addName("label", INLINE);
addName("optgroup", INLINE);
addName("button", INLINE);
addName("legend", INLINE);
addName("frameset", FRAMESET);
// CDATA -- STYLE, SCRIPT
}
}