All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.caucho.xml.HtmlPolicy Maven / Gradle / Ivy

There is a newer version: 4.0.66
Show newest version
/*
 * Copyright (c) 1998-2018 Caucho Technology -- all rights reserved
 *
 * This file is part of Resin(R) Open Source
 *
 * Each copy or derived work must preserve the copyright notice and this
 * notice unmodified.
 *
 * Resin Open Source is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * Resin Open Source is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, or any warranty
 * of NON-INFRINGEMENT.  See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Resin Open Source; if not, write to the
 *   Free SoftwareFoundation, Inc.
 *   59 Temple Place, Suite 330
 *   Boston, MA 02111-1307  USA
 *
 * @author Scott Ferguson
 */

package com.caucho.xml;

import com.caucho.util.CharBuffer;
import com.caucho.util.CharCursor;
import com.caucho.util.CharScanner;
import com.caucho.util.IntMap;
import com.caucho.util.StringCharCursor;

import org.w3c.dom.Element;

import java.io.IOException;
import java.util.Locale;

/**
 * Policy for parsing an HTML file.
 */
class HtmlPolicy extends Policy {
  // Classification codes.  Every name registered by addName() maps to one
  // of these values; openAction() and elementCloseAction() switch on them.
  // Each constant is defined as "previous + 1", so the order of the
  // declarations fixes the numeric values.

  // Pseudo-nodes (registered as "#document", "#comment", "#text", "#jsp"
  // and "#whitespace").
  static final int DOCUMENT = 1;
  static final int COMMENT = DOCUMENT + 1;
  static final int TEXT = COMMENT + 1;
  static final int JSP = TEXT + 1;
  static final int WHITESPACE = JSP + 1;

  // <html> and the elements openAction() places inside <head>.
  static final int HTML = WHITESPACE + 1;
  static final int HEAD = HTML + 1;
  static final int TITLE = HEAD + 1;
  static final int ISINDEX = TITLE + 1;
  static final int BASE = ISINDEX + 1;
  static final int SCRIPT = BASE + 1;
  static final int STYLE = SCRIPT + 1;
  static final int META = STYLE + 1;
  static final int LINK = META + 1;
  static final int OBJECT = LINK + 1;

  static final int BODY = OBJECT + 1;

  // Elements openAction() opens with PUSH_EMPTY (together with COL, FRAME,
  // LINK, ISINDEX, BASE and META declared elsewhere in this list).
  static final int BASEFONT = BODY + 1;
  static final int BR = BASEFONT + 1;
  static final int AREA = BR + 1;
  static final int IMG = AREA + 1;
  static final int PARAM = IMG + 1;
  static final int HR = PARAM + 1;
  static final int INPUT = HR + 1;

  // Elements that openAction() closes implicitly (POP) when certain
  // siblings open.
  static final int P = INPUT + 1;
  static final int DT = P + 1;
  static final int DD = DT + 1;
  static final int LI = DD + 1;
  static final int OPTION = LI + 1;

  // Table structure.
  static final int TABLE = OPTION + 1;
  static final int CAPTION = TABLE + 1;
  static final int THEAD = CAPTION + 1;
  static final int TFOOT = THEAD + 1;
  static final int COL = TFOOT + 1;
  static final int COLGROUP = COL + 1;
  static final int TBODY = COLGROUP + 1;
  static final int TR = TBODY + 1;
  static final int TD = TR + 1;
  static final int TH = TD + 1;

  static final int FRAME = TH + 1;
  static final int FRAMESET = FRAME + 1;

  // Catch-all classes shared by many tags (see the static initializer).
  static final int BLOCK = FRAMESET + 1;
  static final int INLINE = BLOCK + 1;

  // Name -> code tables filled by addName(): "names" is keyed by String,
  // "cbNames" by CharBuffer.  Both hold the lower-case and the upper-case
  // spelling of every registered name.
  static IntMap names;
  static IntMap cbNames;
  
  // Names of the elements openAction() asks the parser to open implicitly
  // (assigned to "opt" together with a PUSH_OPT result).
  static QName htmlName = new QName(null, "html", null);
  static QName headName = new QName(null, "head", null);
  static QName bodyName = new QName(null, "body", null);

  // When true, names of known HTML tags are normalized to lower case.
  boolean toLower = true;
  // Set through setJsp(); not read by any active code in this class.
  boolean isJsp = false;
  // Set once openAction() has requested an implicit <html> for content that
  // is not allowed directly under the document.
  boolean autoHtml = false;
  // Set once a <body> has been opened, explicitly or implicitly.
  boolean hasBody = false;
  // Set once openAction() has requested an implicit <head>.
  boolean autoHead = false;
  
  // Scratch buffer used by getName()/getAttributeName() to lower-case a
  // name without touching the caller's buffer.
  CharBuffer cb = new CharBuffer();

  /**
   * Restores the policy flags to their initial values: lower-casing is
   * enabled, and the JSP, implicit-html, body-seen and implicit-head flags
   * are cleared.
   */
  public void init()
  {
    this.toLower = true;
    this.isJsp = this.autoHtml = this.hasBody = this.autoHead = false;
  }

  /**
   * Enables or disables lower-case normalization of known HTML tag names
   * (and of the attribute names of those tags).
   *
   * @param toLower true to normalize, false to keep names as written
   */
  public void setToLower(boolean toLower)
  {
    this.toLower = toLower;
  }

  /**
   * Marks the input as JSP, so that text before the HTML is meant to be
   * treated specially.  The flag is only stored here; no active code in
   * this class reads it.
   *
   * @param isJsp true if the source is a JSP page
   */
  public void setJsp(boolean isJsp)
  {
    this.isJsp = isJsp;
  }

  /**
   * Returns the qualified name for a tag as written in the source.
   *
   * <p>With lower-casing enabled, a tag whose lower-cased spelling is a
   * registered HTML name is returned in lower case; every other tag keeps
   * its original spelling.
   *
   * @param tag the raw tag name read by the parser
   *
   * @return the normalized name
   */
  QName getName(CharBuffer tag)
  {
    if (toLower) {
      // Lower-case a copy so the caller's buffer stays untouched.
      cb.clear();
      cb.append(tag);
      cb.toLowerCase();

      boolean isKnownTag = cbNames.get(cb) >= 0;
      if (isKnownTag)
        return super.getName(cb);
    }

    return super.getName(tag);
  }

  /**
   * Returns the qualified name for an attribute.
   *
   * <p>With lower-casing enabled, attributes of registered HTML elements
   * are lower-cased; note that this is done in place, so
   * <code>source</code> itself is modified.  Attributes of unknown
   * elements keep their original spelling.
   *
   * @param eltName the name of the element owning the attribute
   * @param source the raw attribute name
   *
   * @return the normalized attribute name
   */
  QName getAttributeName(CharBuffer eltName, CharBuffer source)
  {
    if (toLower) {
      // Look the owning element up by its lower-cased name.
      cb.clear();
      cb.append(eltName);
      cb.toLowerCase();

      boolean isKnownElement = cbNames.get(cb) >= 0;
      if (isKnownElement)
        source.toLowerCase();
    }

    return super.getName(source);
  }

  /**
   * Returns the appropriate action when opening a HTML tag.
   *
   * <p>The decision depends on the code of the currently open element and
   * the code of the element about to open.  Besides returning an action,
   * the method may record that an implicit html, head or body element was
   * requested (<code>autoHtml</code>, <code>autoHead</code>,
   * <code>hasBody</code>) and store the name of that element in
   * <code>opt</code> when it returns PUSH_OPT.
   *
   * @param parser the XML parser
   * @param node the parent node, or null for the document itself
   * @param next the next child
   * @return the action code
   */
  int openAction(XmlParser parser, QName node, QName next)
    throws XmlParseException
  {
    String nodeName = node == null ? "#document" : node.getName();
    String nextName = next.getName();

    int nextCode = names.get(nextName);
    int nodeCode = names.get(nodeName);

    switch (nodeCode) {
    case DOCUMENT:
      switch (nextCode) {
      case HTML: case COMMENT: case JSP:
        return PUSH;

      case WHITESPACE:
        return IGNORE;

      case HEAD: case TITLE: case ISINDEX: case BASE: case SCRIPT:
      case STYLE: case META: case LINK: case OBJECT:
        // head-level content: ask for an implicit <html> first
        opt = htmlName;
        return PUSH_OPT;

      default:
        // anything else gets an implicit <html>, but only once
        if (! autoHtml) {
          autoHtml = true;
          opt = htmlName;
          return PUSH_OPT;
        }
        return PUSH;
      }

    case HTML:
      switch (nextCode) {
      case HTML:
        return ERROR;

      case BODY:
        hasBody = true;
        return PUSH;

      case HEAD: case COMMENT: case FRAMESET: case WHITESPACE: case JSP:
        return PUSH;

      case TITLE: case ISINDEX: case BASE: case SCRIPT:
      case STYLE: case META: case LINK: case OBJECT:
        // head-level content directly under <html>: implicit <head>
        autoHead = true;
        opt = headName;
        return PUSH_OPT;

      default:
        // body content directly under <html>: implicit <body>, once
        if (! hasBody) {
          hasBody = true;
          opt = bodyName;
          return PUSH_OPT;
        }
        return PUSH;
      }

    case HEAD:
      switch (nextCode) {
      case META: case LINK: case ISINDEX: case BASE:
        // (sniffing the charset of a <meta> through checkMetaEncoding()
        // is currently disabled)
        return PUSH_EMPTY;

      case SCRIPT: case STYLE:
        return PUSH_VERBATIM;

      case TITLE: case OBJECT: case WHITESPACE:
        return PUSH;

      case JSP: case TEXT:
        // text ends an implicit head but stays inside an explicit one
        return autoHead ? POP : PUSH;

      default:
        return POP;
      }

    case LI:
      return nextCode == LI ? POP : plainChildAction(nextCode);

    case OPTION:
      // an option only holds text; everything else closes it
      return (nextCode == WHITESPACE || nextCode == TEXT) ? PUSH : POP;

    case DD:
      if (nextCode == DD || nextCode == DT)
        return POP;
      return plainChildAction(nextCode);

    case THEAD: case TFOOT: case COLGROUP:
      switch (nextCode) {
      case THEAD: case TFOOT: case TBODY: case COLGROUP: case COL:
        return POP;

      default:
        return plainChildAction(nextCode);
      }

    case TR:
      switch (nextCode) {
      case THEAD: case TFOOT: case TBODY: case COLGROUP: case COL: case TR:
        return POP;

      default:
        return plainChildAction(nextCode);
      }

    case TD: case TH:
      switch (nextCode) {
      case THEAD: case TFOOT: case TBODY: case COLGROUP: case COL: case TR:
      case TD: case TH:
        return POP;

      default:
        return plainChildAction(nextCode);
      }

    case P: case DT:
      switch (nextCode) {
      case BLOCK: case P: case TABLE: case CAPTION: case THEAD:
      case TFOOT: case COLGROUP: case TBODY: case TR: case TD:
      case TH: case DT: case LI:
        return POP;

      default:
        return plainChildAction(nextCode);
      }

    case TABLE:
      // Directly inside a table nothing is opened as PUSH_EMPTY: script
      // and style are verbatim, every other child is a plain PUSH.
      return (nextCode == SCRIPT || nextCode == STYLE) ? PUSH_VERBATIM : PUSH;

    default:
      return plainChildAction(nextCode);
    }
  }

  /**
   * Action for a child that is not closed implicitly by its parent:
   * PUSH_EMPTY for the elements that take no content, PUSH_VERBATIM for
   * script and style, PUSH for everything else.
   */
  private int plainChildAction(int nextCode)
  {
    switch (nextCode) {
    case BASEFONT: case BR: case AREA: case LINK: case IMG: case PARAM:
    case HR: case INPUT: case COL: case FRAME: case ISINDEX:
    case BASE: case META:
      return PUSH_EMPTY;

    case SCRIPT: case STYLE:
      return PUSH_VERBATIM;

    default:
      return PUSH;
    }
  }

  // Scanner over the characters space, tab, '=' and ';' -- presumably the
  // delimiters between tokens of a meta "content" value (TODO confirm
  // against CharScanner).
  private static CharScanner charsetScanner = new CharScanner(" \t=;");

  /**
   * Looks for a charset in a
   * <code>&lt;meta http-equiv="content-type" content="..."&gt;</code>
   * element and, when one is found, applies it to the input stream
   * <code>is</code>.
   *
   * <p>Elements without both attributes, or whose http-equiv is not
   * "content-type" (compared case-insensitively), are ignored.  A failure
   * to switch the encoding is ignored as well.
   *
   * @param elt the meta element to inspect
   */
  private void checkMetaEncoding(Element elt)
  {
    String httpEquiv = elt.getAttribute("http-equiv");
    String contentValue = elt.getAttribute("content");

    boolean isContentType = ! httpEquiv.equals("")
                            && ! contentValue.equals("")
                            && httpEquiv.equalsIgnoreCase("content-type");
    if (! isContentType)
      return;

    CharCursor cursor = new StringCharCursor(contentValue);

    // Step over the leading token (the media type) and what follows it.
    charsetScanner.scan(cursor);
    charsetScanner.skip(cursor);

    CharBuffer token = CharBuffer.allocate();
    while (cursor.current() != cursor.DONE) {
      token.clear();
      charsetScanner.scan(cursor, token);

      if (! token.toString().equalsIgnoreCase("charset"))
        continue;

      // Read the value following "charset".
      charsetScanner.skip(cursor);
      token.clear();
      charsetScanner.scan(cursor, token);

      if (token.length() <= 0)
        continue;

      try {
        is.setEncoding(token.close());
      } catch (IOException e) {
        // deliberately ignored: the current encoding stays in effect
      }
      return;
    }
  }

  /**
   * Returns the appropriate action when a close tag, or the end of the
   * input, is reached while <code>node</code> is the open element.
   *
   * <ul>
   * <li>A close tag matching the open element pops it (POP), as does the
   *     end of input at document level.
   * <li>A close tag for an element that is always empty (br, img, ...) is
   *     an error.
   * <li>Elements whose end tag may be left out (body, p, li, td, ...,
   *     html, head) are closed implicitly and the close tag is looked at
   *     again (POP_AND_LOOP).
   * <li>Any other mismatch is closed implicitly when the policy is
   *     forgiving and reported as an error otherwise.
   * </ul>
   *
   * @param parser the XML parser, used to build the parse errors
   * @param node the currently open element; null is treated as the
   *             document, as in openAction()
   * @param tagEnd the name in the close tag, or "" at the end of the input
   * @return the action code
   * @throws XmlParseException if the close tag is not acceptable here
   */
  int elementCloseAction(XmlParser parser, QName node, String tagEnd)
    throws XmlParseException
  {
    // Same convention as openAction(): no node means the document.
    String nodeName = node == null ? "#document" : node.getName();

    if (nodeName.equals(tagEnd))
      return POP;

    // Compare by value: an identity test (==) only succeeds when both
    // strings happen to be the same interned instance.
    // (Moving leading JSP text into an implicit body is not implemented.)
    if (nodeName.equals("#document") && tagEnd.equals(""))
      return POP;

    switch (names.get(tagEnd)) {
    case BASEFONT: case BR: case AREA: case LINK: case IMG: case PARAM: 
    case HR: case INPUT: case COL: case FRAME: case ISINDEX: 
    case BASE: case META:
      // These elements never have content, so a close tag is invalid.
      String emptyTag;
      if (tagEnd.equals(""))
        emptyTag = L.l("end of file");
      else
        emptyTag = "`<" + tagEnd + ">'";

      throw parser.error(L.l("{0} expects to be empty",
                             emptyTag));
    }

    switch (names.get(nodeName)) {
    case BODY: case P:
    case DT: case DD: case LI: case OPTION:
    case THEAD: case TFOOT: case TBODY: case COLGROUP: 
    case TR: case TH: case TD:
    case HTML: case HEAD:
      // The end tag of these elements may be omitted: close the element
      // and let the parser retry the close tag against its parent.
      return POP_AND_LOOP;

    default:
      // Forgiving mode closes the open element no matter what.
      if (forgiving)
        return POP_AND_LOOP;

      String errorTagEnd;
      if (tagEnd.equals(""))
        errorTagEnd = L.l("end of file");
      else
        errorTagEnd = "`</" + tagEnd + ">'";

      if (nodeName.equals("#document")) {
        throw parser.error(L.l("expected {0} at {1}",
                               L.l("end of document"), errorTagEnd));
      }

      // Name both tags so the message says what was expected and found.
      String expect = "`</" + nodeName + ">'";

      throw parser.error(L.l("expected {0} at {1} (open at {2})",
                             expect, errorTagEnd,
                             "" + parser.getNodeLine()));
    }
  }

  /**
   * Registers a name under the given code in both lookup tables, once as
   * given and once upper-cased (using the English locale).
   *
   * @param name the name to register, as given by the caller
   * @param code the classification code for the name
   */
  private static void addName(String name, int code)
  {
    String[] spellings = { name, name.toUpperCase(Locale.ENGLISH) };

    for (String spelling : spellings) {
      names.put(spelling, code);
      cbNames.put(new CharBuffer(spelling), code);
    }
  }

  static {
    names = new IntMap();
    cbNames = new IntMap();
    
    addName("#document", DOCUMENT);
    addName("#comment", COMMENT);
    addName("#text", TEXT);
    addName("#jsp", JSP);
    addName("#whitespace", WHITESPACE);
    addName("html", HTML);

    addName("head", HEAD);
    addName("title", TITLE);
    addName("isindex", ISINDEX);
    addName("base", BASE);
    addName("script", SCRIPT);
    addName("style", STYLE);
    addName("meta", META);
    addName("link", LINK);
    addName("object", OBJECT);

    addName("body", BODY);

    addName("basefont", BASEFONT);
    addName("br", BR);
    addName("area", AREA);
    addName("link", LINK);
    addName("img", IMG);
    addName("param", PARAM);
    addName("hr", HR);
    addName("input", INPUT);
    addName("frame", FRAME);

    addName("p", P);
    addName("dt", DT);
    addName("dd", DD);
    addName("li", LI);
    addName("option", OPTION);

    addName("table", TABLE);
    addName("caption", CAPTION);
    addName("thead", THEAD);
    addName("tfoot", TFOOT);
    addName("col", COL);
    addName("colgroup", COLGROUP);
    addName("tbody", TBODY);
    addName("tr", TR);
    addName("th", TH);
    addName("td", TD);

    addName("h1", BLOCK);
    addName("h2", BLOCK);
    addName("h3", BLOCK);
    addName("h4", BLOCK);
    addName("h5", BLOCK);
    addName("h6", BLOCK);
    addName("ul", BLOCK);
    addName("ol", BLOCK);
    addName("dir", BLOCK);
    addName("menu", BLOCK);
    addName("pre", BLOCK);
    addName("dl", BLOCK);
    addName("div", BLOCK);
    addName("center", BLOCK);
    addName("noscript", BLOCK);
    addName("noframes", BLOCK);
    addName("blockquote", BLOCK);
    addName("form", BLOCK);
    addName("fieldset", BLOCK);
    addName("address", BLOCK);

    addName("tt", INLINE);
    addName("i", INLINE);
    addName("b", INLINE);
    addName("u", INLINE);
    addName("s", INLINE);
    addName("strike", INLINE);
    addName("big", INLINE);
    addName("small", INLINE);

    addName("em", INLINE);
    addName("strong", INLINE);
    addName("dfn", INLINE);
    addName("code", INLINE);
    addName("samp", INLINE);
    addName("kbd", INLINE);
    addName("var", INLINE);
    addName("cite", INLINE);
    addName("abbr", INLINE);
    addName("acronym", INLINE);
    addName("font", INLINE);
    addName("iframe", INLINE);
    addName("applet", INLINE);
    addName("ins", INLINE);
    addName("del", INLINE);

    addName("a", INLINE);
    addName("map", INLINE);
    addName("q", INLINE);
    addName("sub", INLINE);
    addName("sup", INLINE);
    addName("span", INLINE);
    addName("bdo", INLINE);

    addName("select", INLINE);
    addName("textarea", INLINE);
    addName("label", INLINE);
    addName("optgroup", INLINE);
    addName("button", INLINE);
    addName("legend", INLINE);
    addName("frameset", FRAMESET);

    // CDATA -- STYLE, SCRIPT
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy