All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.cobraparser.html.parser.HtmlParser Maven / Gradle / Ivy

There is a newer version: 1.0.2
Show newest version
/*
    Copyright (C) 2006 The Lobo Project

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License as published by the Free Software Foundation; either
    version 2.1 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

    Contact info: [email protected]
 */
/*
 * Created on Aug 28, 2005
 */
package org.cobraparser.html.parser;

import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.cobraparser.html.domimpl.DocumentTypeImpl;
import org.cobraparser.html.domimpl.HTMLDocumentImpl;
import org.cobraparser.util.ArrayUtilities;
import org.w3c.dom.DOMException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.xml.sax.ErrorHandler;
import org.xml.sax.SAXException;

/**
 * The HtmlParser class is an HTML DOM parser. This parser provides
 * the functionality for the standard DOM parser implementation
 * {@link org.cobraparser.html.parser.DocumentBuilderImpl}. This parser class
 * may be used directly when a different DOM implementation is preferred.
 */
public class HtmlParser {
  private static final Logger logger = Logger.getLogger(HtmlParser.class.getName());
  private final Document document;
  private final UserAgentContext ucontext;
  private final ErrorHandler errorHandler;
  private final boolean isXML;

  // TODO: The quirks mode should go
  private static final boolean QUIRKS_MODE = true;
  private Node lastRootElement = null;
  private Node lastHeadElement = null;
  private Node lastBodyElement = null;
  private boolean needRoot;

  // Lookup tables filled by the static initializer.
  // ENTITIES maps an entity name (without '&' and ';', e.g. "amp") to its character.
  // ELEMENT_INFOS maps an upper-case tag name (e.g. "TD") to its parsing rules.
  // The type arguments are required: callers assign ELEMENT_INFOS.get(...) directly to an
  // ElementInfo variable, which does not compile against a raw Map.
  private static final Map<String, Character> ENTITIES = new HashMap<>(256);
  private static final Map<String, ElementInfo> ELEMENT_INFOS = new HashMap<>(35);

  /**
   * A node UserData key used to tell nodes that their content may
   * be about to be modified. Elements could use this to temporarily suspend
   * notifications. The value set will be either Boolean.TRUE or
   * Boolean.FALSE.
   */
  public static final String MODIFYING_KEY = "cobra.suspend";

  static {
    final Map entities = ENTITIES;
    entities.put("amp", new Character('&'));
    entities.put("lt", new Character('<'));
    entities.put("gt", new Character('>'));
    entities.put("quot", new Character('"'));
    entities.put("nbsp", new Character((char) 160));

    entities.put("lsquo", new Character('\u2018'));
    entities.put("rsquo", new Character('\u2019'));

    // HTML 4 defines &frasl; as FRACTION SLASH, U+2044 (8260); 47 is the plain ASCII solidus.
    entities.put("frasl", new Character((char) 8260));
    entities.put("ndash", new Character((char) 8211));
    entities.put("mdash", new Character((char) 8212));
    entities.put("iexcl", new Character((char) 161));
    entities.put("cent", new Character((char) 162));
    entities.put("pound", new Character((char) 163));
    entities.put("curren", new Character((char) 164));
    entities.put("yen", new Character((char) 165));
    entities.put("brvbar", new Character((char) 166));
    entities.put("brkbar", new Character((char) 166));
    entities.put("sect", new Character((char) 167));
    entities.put("uml", new Character((char) 168));
    entities.put("die", new Character((char) 168));
    entities.put("copy", new Character((char) 169));
    entities.put("ordf", new Character((char) 170));
    entities.put("laquo", new Character((char) 171));
    entities.put("not", new Character((char) 172));
    entities.put("shy", new Character((char) 173));
    entities.put("reg", new Character((char) 174));
    entities.put("macr", new Character((char) 175));
    entities.put("hibar", new Character((char) 175));
    entities.put("deg", new Character((char) 176));
    entities.put("plusmn", new Character((char) 177));
    entities.put("sup2", new Character((char) 178));
    entities.put("sup3", new Character((char) 179));
    entities.put("acute", new Character((char) 180));
    entities.put("micro", new Character((char) 181));
    entities.put("para", new Character((char) 182));
    entities.put("middot", new Character((char) 183));
    entities.put("cedil", new Character((char) 184));
    entities.put("sup1", new Character((char) 185));
    entities.put("ordm", new Character((char) 186));
    entities.put("raquo", new Character((char) 187));
    entities.put("frac14", new Character((char) 188));
    entities.put("frac12", new Character((char) 189));
    entities.put("frac34", new Character((char) 190));
    entities.put("iquest", new Character((char) 191));
    entities.put("Agrave", new Character((char) 192));
    entities.put("Aacute", new Character((char) 193));
    entities.put("Acirc", new Character((char) 194));
    entities.put("Atilde", new Character((char) 195));
    entities.put("Auml", new Character((char) 196));
    entities.put("Aring", new Character((char) 197));
    entities.put("AElig", new Character((char) 198));
    entities.put("Ccedil", new Character((char) 199));
    entities.put("Egrave", new Character((char) 200));
    entities.put("Eacute", new Character((char) 201));
    entities.put("Ecirc", new Character((char) 202));
    entities.put("Euml", new Character((char) 203));
    entities.put("Igrave", new Character((char) 204));
    entities.put("Iacute", new Character((char) 205));
    entities.put("Icirc", new Character((char) 206));
    entities.put("Iuml", new Character((char) 207));
    entities.put("ETH", new Character((char) 208));
    entities.put("Ntilde", new Character((char) 209));
    entities.put("Ograve", new Character((char) 210));
    entities.put("Oacute", new Character((char) 211));
    entities.put("Ocirc", new Character((char) 212));
    entities.put("Otilde", new Character((char) 213));
    entities.put("Ouml", new Character((char) 214));
    entities.put("times", new Character((char) 215));
    entities.put("Oslash", new Character((char) 216));
    entities.put("Ugrave", new Character((char) 217));
    entities.put("Uacute", new Character((char) 218));
    entities.put("Ucirc", new Character((char) 219));
    entities.put("Uuml", new Character((char) 220));
    entities.put("Yacute", new Character((char) 221));
    entities.put("THORN", new Character((char) 222));
    entities.put("szlig", new Character((char) 223));
    entities.put("agrave", new Character((char) 224));
    entities.put("aacute", new Character((char) 225));
    entities.put("acirc", new Character((char) 226));
    entities.put("atilde", new Character((char) 227));
    entities.put("auml", new Character((char) 228));
    entities.put("aring", new Character((char) 229));
    entities.put("aelig", new Character((char) 230));
    entities.put("ccedil", new Character((char) 231));
    entities.put("egrave", new Character((char) 232));
    entities.put("eacute", new Character((char) 233));
    entities.put("ecirc", new Character((char) 234));
    entities.put("euml", new Character((char) 235));
    entities.put("igrave", new Character((char) 236));
    entities.put("iacute", new Character((char) 237));
    entities.put("icirc", new Character((char) 238));
    entities.put("iuml", new Character((char) 239));
    entities.put("eth", new Character((char) 240));
    entities.put("ntilde", new Character((char) 241));
    entities.put("ograve", new Character((char) 242));
    entities.put("oacute", new Character((char) 243));
    entities.put("ocirc", new Character((char) 244));
    entities.put("otilde", new Character((char) 245));
    entities.put("ouml", new Character((char) 246));
    entities.put("divide", new Character((char) 247));
    entities.put("oslash", new Character((char) 248));
    entities.put("ugrave", new Character((char) 249));
    entities.put("uacute", new Character((char) 250));
    entities.put("ucirc", new Character((char) 251));
    entities.put("uuml", new Character((char) 252));
    entities.put("yacute", new Character((char) 253));
    entities.put("thorn", new Character((char) 254));
    entities.put("yuml", new Character((char) 255));

    // symbols from

    // greek letters
    entities.put("Alpha", new Character((char) 913));
    entities.put("Beta", new Character((char) 914));
    entities.put("Gamma", new Character((char) 915));
    entities.put("Delta", new Character((char) 916));
    entities.put("Epsilon", new Character((char) 917));
    entities.put("Zeta", new Character((char) 918));
    entities.put("Eta", new Character((char) 919));
    entities.put("Theta", new Character((char) 920));
    entities.put("Iota", new Character((char) 921));
    entities.put("Kappa", new Character((char) 922));
    entities.put("Lambda", new Character((char) 923));
    entities.put("Mu", new Character((char) 924));
    entities.put("Nu", new Character((char) 925));
    entities.put("Xi", new Character((char) 926));
    entities.put("Omicron", new Character((char) 927));
    entities.put("Pi", new Character((char) 928));
    entities.put("Rho", new Character((char) 929));
    // GREEK CAPITAL LETTER SIGMA is U+03A3 (931); 930 (U+03A2) is an unassigned code point.
    entities.put("Sigma", new Character((char) 931));
    // "Sigmaf" is not an HTML entity (there is no capital final sigma). The existing mapping
    // is kept unchanged so that input relying on it still decodes as before.
    entities.put("Sigmaf", new Character((char) 931));
    entities.put("Tau", new Character((char) 932));
    entities.put("Upsilon", new Character((char) 933));
    entities.put("Phi", new Character((char) 934));
    entities.put("Chi", new Character((char) 935));
    entities.put("Psi", new Character((char) 936));
    entities.put("Omega", new Character((char) 937));

    entities.put("alpha", new Character((char) 945));
    entities.put("beta", new Character((char) 946));
    entities.put("gamma", new Character((char) 947));
    entities.put("delta", new Character((char) 948));
    entities.put("epsilon", new Character((char) 949));
    entities.put("zeta", new Character((char) 950));
    entities.put("eta", new Character((char) 951));
    entities.put("theta", new Character((char) 952));
    entities.put("iota", new Character((char) 953));
    entities.put("kappa", new Character((char) 954));
    entities.put("lambda", new Character((char) 955));
    entities.put("mu", new Character((char) 956));
    entities.put("nu", new Character((char) 957));
    entities.put("xi", new Character((char) 958));
    entities.put("omicron", new Character((char) 959));
    entities.put("pi", new Character((char) 960));
    entities.put("rho", new Character((char) 961));
    // The two values were swapped: sigma is U+03C3 (963), final sigma (sigmaf) is U+03C2 (962).
    entities.put("sigma", new Character((char) 963));
    entities.put("sigmaf", new Character((char) 962));
    entities.put("tau", new Character((char) 964));
    entities.put("upsilon", new Character((char) 965));
    entities.put("phi", new Character((char) 966));
    entities.put("chi", new Character((char) 967));
    entities.put("psi", new Character((char) 968));
    entities.put("omega", new Character((char) 969));
    entities.put("thetasym", new Character((char) 977));
    entities.put("upsih", new Character((char) 978));
    entities.put("piv", new Character((char) 982));

    // math symbols
    entities.put("forall", new Character((char) 8704));
    entities.put("part", new Character((char) 8706));
    entities.put("exist", new Character((char) 8707));
    entities.put("empty", new Character((char) 8709));
    entities.put("nabla", new Character((char) 8711));
    entities.put("isin", new Character((char) 8712));
    entities.put("notin", new Character((char) 8713));
    entities.put("ni", new Character((char) 8715));
    entities.put("prod", new Character((char) 8719));
    entities.put("sum", new Character((char) 8721));
    entities.put("minus", new Character((char) 8722));
    entities.put("lowast", new Character((char) 8727));
    entities.put("radic", new Character((char) 8730));
    entities.put("prop", new Character((char) 8733));
    entities.put("infin", new Character((char) 8734));
    entities.put("ang", new Character((char) 8736));
    entities.put("and", new Character((char) 8743));
    entities.put("or", new Character((char) 8744));
    entities.put("cap", new Character((char) 8745));
    entities.put("cup", new Character((char) 8746));
    entities.put("int", new Character((char) 8747));
    entities.put("there4", new Character((char) 8756));
    entities.put("sim", new Character((char) 8764));
    entities.put("cong", new Character((char) 8773));
    entities.put("asymp", new Character((char) 8776));
    entities.put("ne", new Character((char) 8800));
    entities.put("equiv", new Character((char) 8801));
    entities.put("le", new Character((char) 8804));
    entities.put("ge", new Character((char) 8805));
    entities.put("sub", new Character((char) 8834));
    entities.put("sup", new Character((char) 8835));
    entities.put("nsub", new Character((char) 8836));
    entities.put("sube", new Character((char) 8838));
    entities.put("supe", new Character((char) 8839));
    entities.put("oplus", new Character((char) 8853));
    entities.put("otimes", new Character((char) 8855));
    entities.put("perp", new Character((char) 8869));
    entities.put("sdot", new Character((char) 8901));
    entities.put("loz", new Character((char) 9674));

    // technical symbols
    entities.put("lceil", new Character((char) 8968));
    entities.put("rceil", new Character((char) 8969));
    entities.put("lfloor", new Character((char) 8970));
    entities.put("rfloor", new Character((char) 8971));
    entities.put("lang", new Character((char) 9001));
    entities.put("rang", new Character((char) 9002));

    // arrow symbols
    entities.put("larr", new Character((char) 8592));
    entities.put("uarr", new Character((char) 8593));
    entities.put("rarr", new Character((char) 8594));
    entities.put("darr", new Character((char) 8595));
    entities.put("harr", new Character((char) 8596));
    entities.put("crarr", new Character((char) 8629));
    entities.put("lArr", new Character((char) 8656));
    entities.put("uArr", new Character((char) 8657));
    entities.put("rArr", new Character((char) 8658));
    entities.put("dArr", new Character((char) 8659));
    // LEFT RIGHT DOUBLE ARROW is U+21D4 (8660), directly after dArr (8659).
    // The previous value 8960 is U+2300 DIAMETER SIGN.
    entities.put("hArr", new Character((char) 8660));

    // divers symbols
    entities.put("bull", new Character((char) 8226));
    entities.put("prime", new Character((char) 8242));
    entities.put("Prime", new Character((char) 8243));
    entities.put("oline", new Character((char) 8254));
    entities.put("weierp", new Character((char) 8472));
    entities.put("image", new Character((char) 8465));
    entities.put("real", new Character((char) 8476));
    entities.put("trade", new Character((char) 8482));
    entities.put("euro", new Character((char) 8364));
    entities.put("alefsym", new Character((char) 8501));
    entities.put("spades", new Character((char) 9824));
    entities.put("clubs", new Character((char) 9827));
    entities.put("hearts", new Character((char) 9829));
    entities.put("diams", new Character((char) 9830));

    // ext lat symbols
    entities.put("OElig", new Character((char) 338));
    entities.put("oelig", new Character((char) 339));
    entities.put("Scaron", new Character((char) 352));
    entities.put("scaron", new Character((char) 353));
    entities.put("fnof", new Character((char) 402));

    // interpunction
    entities.put("ensp", new Character((char) 8194));
    entities.put("emsp", new Character((char) 8195));
    entities.put("thinsp", new Character((char) 8201));
    entities.put("zwnj", new Character((char) 8204));
    entities.put("zwj", new Character((char) 8205));
    entities.put("lrm", new Character((char) 8206));
    entities.put("rlm", new Character((char) 8207));

    entities.put("sbquo", new Character((char) 8218));
    entities.put("ldquo", new Character((char) 8220));
    entities.put("rdquo", new Character((char) 8221));
    entities.put("bdquo", new Character((char) 8222));
    entities.put("dagger", new Character((char) 8224));
    entities.put("Dagger", new Character((char) 8225));
    entities.put("hellip", new Character((char) 8230));
    entities.put("permil", new Character((char) 8240));
    entities.put("lsaquo", new Character((char) 8249));
    entities.put("rsaquo", new Character((char) 8250));

    // diacrit symb
    entities.put("circ", new Character((char) 710));
    entities.put("tilde", new Character((char) 732));

    final Map elementInfos = ELEMENT_INFOS;

    elementInfos.put("NOSCRIPT", new ElementInfo(true, ElementInfo.END_ELEMENT_REQUIRED, null, true));

    final ElementInfo optionalEndElement = new ElementInfo(true, ElementInfo.END_ELEMENT_OPTIONAL);
    final ElementInfo forbiddenEndElement = new ElementInfo(false, ElementInfo.END_ELEMENT_FORBIDDEN);
    final ElementInfo onlyTextDE = new ElementInfo(false, ElementInfo.END_ELEMENT_REQUIRED, true);
    final ElementInfo onlyText = new ElementInfo(false, ElementInfo.END_ELEMENT_REQUIRED, false);

    final Set tableCellStopElements = new HashSet<>();
    final ElementInfo tableCellElement = new ElementInfo(true, ElementInfo.END_ELEMENT_OPTIONAL, tableCellStopElements);

    final Set headStopElements = new HashSet<>();
    final ElementInfo headElement = new ElementInfo(true, ElementInfo.END_ELEMENT_OPTIONAL, headStopElements);

    final Set optionStopElements = new HashSet<>();
    final ElementInfo optionElement = new ElementInfo(true, ElementInfo.END_ELEMENT_OPTIONAL, optionStopElements);

    final Set paragraphStopElements = new HashSet<>();
    final ElementInfo paragraphElement = new ElementInfo(true, ElementInfo.END_ELEMENT_OPTIONAL, paragraphStopElements);

    // Set liStopElements = new HashSet();
    // liStopElements.add("LI");
    // liStopElements.add("UL");
    // liStopElements.add("OL");

    elementInfos.put("SCRIPT", onlyText);
    elementInfos.put("STYLE", onlyText);
    elementInfos.put("TEXTAREA", onlyTextDE);
    elementInfos.put("IMG", forbiddenEndElement);
    elementInfos.put("META", forbiddenEndElement);
    elementInfos.put("LINK", forbiddenEndElement);
    elementInfos.put("BASE", forbiddenEndElement);
    elementInfos.put("INPUT", forbiddenEndElement);
    elementInfos.put("FRAME", forbiddenEndElement);
    elementInfos.put("BR", forbiddenEndElement);
    elementInfos.put("HR", forbiddenEndElement);
    elementInfos.put("EMBED", forbiddenEndElement);
    elementInfos.put("SPACER", forbiddenEndElement);

    elementInfos.put("P", paragraphElement);
    elementInfos.put("LI", optionalEndElement);
    elementInfos.put("DT", optionalEndElement);
    elementInfos.put("DD", optionalEndElement);
    elementInfos.put("TR", optionalEndElement);
    elementInfos.put("TH", tableCellElement);
    elementInfos.put("TD", tableCellElement);
    elementInfos.put("HEAD", headElement);
    elementInfos.put("OPTION", optionElement);

    // Note: The specification states anchors have
    // a required end element, but browsers generally behave
    // as if it's optional.
    elementInfos.put("A", optionalEndElement);
    elementInfos.put("ANCHOR", optionalEndElement);
    // TODO: Keep adding tags here

  /**
   * Constructs a HtmlParser.
   *
   * @param document
   *          A W3C Document instance.
   * @param errorHandler
   *          The error handler.
   * @param publicId
   *          The public ID of the document.
   * @param systemId
   *          The system ID of the document.
   * @deprecated UserAgentContext should be passed in constructor.
   */
  public HtmlParser(final Document document, final ErrorHandler errorHandler, final String publicId, final String systemId) {
    this.ucontext = null;
    this.document = document;
    this.errorHandler = errorHandler;
    this.isXML = false;
    this.needRoot = true;

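  /**
   * Constructs a HtmlParser.
   *
   * @param ucontext
   *          The user agent context.
   * @param document
   *          A W3C Document instance.
   * @param errorHandler
   *          The error handler.
   * @param publicId
   *          The public ID of the document.
   * @param systemId
   *          The system ID of the document.
   * @param isXML
   *          Whether the input is treated as XML; when true, entities are
   *          decoded in every element.
   * @param needRoot
   *          Whether the parser should re-parent nodes under HTML, HEAD and
   *          BODY elements while appending.
   */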
  public HtmlParser(final UserAgentContext ucontext, final Document document, final ErrorHandler errorHandler, final String publicId,
      final String systemId, final boolean isXML, final boolean needRoot) {
    this.ucontext = ucontext;
    this.document = document;
    this.errorHandler = errorHandler;
    this.isXML = isXML;
    this.needRoot = needRoot;

  /**
   * Constructs a HtmlParser.
   *
   * @param ucontext
   *          The user agent context.
   * @param document
   *          A W3C Document instance.
   */
  public HtmlParser(final UserAgentContext ucontext, final Document document) {
    this.ucontext = ucontext;
    this.document = document;
    this.errorHandler = null;
    this.isXML = false;
    this.needRoot = true;

  /**
   * Tells whether character entities should be decoded in the content of the
   * named element.
   *
   * @param elementName
   *          The element name; it is upper-cased before the lookup.
   * @return The decodeEntities flag registered for the element, or true when
   *         no ElementInfo is registered for it.
   */
  public static boolean isDecodeEntities(final String elementName) {
    final ElementInfo einfo = ELEMENT_INFOS.get(elementName.toUpperCase());
    return einfo == null ? true : einfo.decodeEntities;

  /**
   * Instance-level entity-decoding check. XML input is always decoded;
   * otherwise the given element's decodeEntities flag applies, defaulting to
   * true when einfo is null.
   */
  private boolean shouldDecodeEntities(final ElementInfo einfo) {
    return isXML || (einfo == null ? true : einfo.decodeEntities);

  /**
   * Parses HTML from an input stream, assuming the character set is ISO-8859-1.
   *
   * @param in
   *          The input stream.
   * @throws IOException
   *           Thrown when there are errors reading the stream.
   * @throws SAXException
   *           Thrown when there are parse errors.
   */
  public void parse(final InputStream in) throws IOException, SAXException, UnsupportedEncodingException {
    this.parse(in, "ISO-8859-1");

  /**
   * Parses HTML from an input stream, using the given character set.
   *
   * @param in
   *          The input stream.
   * @param charset
   *          The character set.
   * @throws IOException
   *           Thrown when there's an error reading from the stream.
   * @throws SAXException
   *           Thrown when there is a parser error.
   * @throws UnsupportedEncodingException
   *           Thrown if the character set is not supported.
   */
  public void parse(final InputStream in, final String charset) throws IOException, SAXException, UnsupportedEncodingException {
    final WritableLineReader reader = new WritableLineReader(new InputStreamReader(in, charset));
    // NOTE(review): `reader` is never used in the visible code. The statement that hands it
    // to one of the parse(...) overloads appears to be missing — confirm against upstream.

  /**
   * Parses HTML given by a Reader. This method appends nodes to
   * the document provided to the parser.
   *
   * @param reader
   *          An instance of Reader.
   * @throws IOException
   *           Thrown if there are errors reading the input stream.
   * @throws SAXException
   *           Thrown if there are parse errors.
   */
  public void parse(final Reader reader) throws IOException, SAXException {
    this.parse(new LineNumberReader(reader));

  /**
   * Parses HTML from a LineNumberReader, building the DOM directly under the
   * document that was passed to the constructor.
   *
   * @param reader
   *          A LineNumberReader for the document.
   */
  public void parse(final LineNumberReader reader) throws IOException, SAXException {
    final Document doc = this.document;
    this.parse(reader, doc);

  /**
   * This method may be used when the DOM should be built under a given node,
   * such as when innerHTML is used in Javascript.
   *
   * @param reader
   *          A document reader.
   * @param parent
   *          The root node for the parsed DOM.
   * @throws IOException
   *           Thrown if there are errors reading the input.
   * @throws SAXException
   *           Thrown if there are parse errors.
   */
  public void parse(final Reader reader, final Node parent) throws IOException, SAXException {
    this.parse(new LineNumberReader(reader), parent);

  /**
   * This method may be used when the DOM should be built under a given node,
   * such as when innerHTML is used in Javascript.
   *
   * @param reader
   *          A LineNumberReader for the document.
   * @param parent
   *          The root node for the parsed DOM.
   * @throws IOException
   *           Thrown if there are errors reading the input.
   * @throws SAXException
   *           Thrown if there are parse errors.
   */
  public void parse(final LineNumberReader reader, final Node parent) throws IOException, SAXException {

    // Note: Parser does not clear document. It could be used incrementally.
    try {
      parent.setUserData(MODIFYING_KEY, Boolean.TRUE, null);
      try {
        while (this.parseToken(parent, reader, null, new LinkedList()) != TOKEN_EOD) {
      } catch (final StopException se) {
        throw new SAXException("Unexpected flow exception", se);
    } finally {
      if (QUIRKS_MODE && needRoot) {
      parent.setUserData(MODIFYING_KEY, Boolean.FALSE, null);

    // dumpTree(parent);

  /**
   * Debugging aid. For every node visited by Nodes.forEachNode it prints an
   * indentation of ". " markers followed by the node name, the simple name of
   * the node's class, and at most the first five characters of its text
   * content (trimmed).
   */
  private static void dumpTree(final Node parent) {
    org.cobraparser.util.Nodes.forEachNode(parent, (node) -> {
      int depth = 0;
      Node p = node.getParentNode();
      // NOTE(review): the visible loop only walks up the parent chain; `depth` is never
      // incremented here, so no indentation would be printed. An increment appears to be
      // missing — confirm against upstream.
      while (p != null) {
        p = p.getParentNode();
      for (int i = 0; i < depth; i++) {
        System.out.print(". ");
      // NOTE(review): Node.getTextContent() is specified to return null for document and
      // doctype nodes; if such a node is visited, the substring call below would throw.
      final String textContent = node.getTextContent();
      System.out.println(node.getNodeName() + ":    " + node.getClass().getSimpleName() + " : " + textContent.substring(0, Math.min(5,textContent.length())).trim());

  // Result codes returned by parseToken to describe what was just consumed.
  // TOKEN_EOD ends the parse loop in parse(LineNumberReader, Node).
  private static final int TOKEN_EOD = 0;
  private static final int TOKEN_COMMENT = 1;
  private static final int TOKEN_TEXT = 2;
  private static final int TOKEN_BEGIN_ELEMENT = 3;
  private static final int TOKEN_END_ELEMENT = 4;
  private static final int TOKEN_FULL_ELEMENT = 5;
  private static final int TOKEN_BAD = 6;

  private String normalLastTag = null;
  private boolean justReadTagBegin = false;
  private boolean justReadTagEnd = false;

  /**
   * Only set when readAttribute returns false.
   */
  private boolean justReadEmptyElement = false;
  private final static String[] elementsThatNeedBodyElement = {
    // TODO: More tags
  private final static String[] elementsThatDontNeedBodyElement = {
    // TODO: More tags

  private final static String[] elementsThatDontNeedHeadElement = {
    // TODO: More tags

  /**
   * Checks whether the given node, or any node reachable from it through
   * getParentNode(), has the given tag name. Names are compared
   * case-insensitively.
   *
   * @param node
   *          The node to start from; null yields false.
   * @param tag
   *          The tag name to look for.
   * @return true if the node itself or one of its ancestors matches.
   */
  private static boolean hasAncestorTag(final Node node, final String tag) {
    if (node == null) {
      return false;
    } else if (tag.equalsIgnoreCase(node.getNodeName())) {
      return true;
    } else {
      return hasAncestorTag(node.getParentNode(), tag);

  /**
   * Entry point for attaching a parsed node. When root handling is active
   * (QUIRKS_MODE and needRoot), an HTML child is remembered as
   * lastRootElement. Any other Element whose parent has no parent and no HTML
   * node in its ancestry is redirected to lastRootElement. The chosen parent
   * is then passed on to ensureBodyAppendChild.
   */
  private void safeAppendChild(final Node parent, final Node child) {
    Node newParent = parent;
    if (QUIRKS_MODE && needRoot) {
      final String nodeName = child.getNodeName();
      if ("HTML".equalsIgnoreCase(nodeName)) {
        lastRootElement = child;
      } else if ((child instanceof Element) && (depthAtMost(parent, 1)) && (!hasAncestorTag(parent, "HTML"))) {
        // NOTE(review): lastRootElement is null until an HTML element has been seen or
        // created. A call to ensureRootElement(parent) presumably preceded this assignment
        // but is not visible here — confirm against upstream.
        newParent = lastRootElement;

    ensureBodyAppendChild(newParent, child);

  /**
   * Creates an HTML element and records it in lastRootElement, unless one has
   * already been recorded.
   */
  private void ensureRootElement(final Node parent) {
    if (lastRootElement == null) {
      // System.out.println("Inserting HTML");
      lastRootElement = document.createElement("HTML");
      // NOTE(review): `parent` is unused in the visible code; the statement attaching the
      // new element to the tree appears to be missing — confirm against upstream.

  /**
   * Tells whether the node sits close enough to the top of its tree.
   *
   * @param n
   *          The node to inspect.
   * @param maxDepth
   *          The depth budget; values of zero or less always yield false.
   * @return true if n has fewer than maxDepth ancestors, false otherwise.
   */
  private static boolean depthAtMost(final Node n, final int maxDepth) {
    if (maxDepth <= 0) {
      return false;
    } else {
      final Node parent = n.getParentNode();
      return parent == null ? true : depthAtMost(parent, maxDepth - 1);

  /**
   * Second stage of node attachment. When root handling is active
   * (QUIRKS_MODE and needRoot), BODY and HEAD children are remembered in
   * lastBodyElement and lastHeadElement. Other Elements near the top of the
   * tree are redirected to the remembered BODY or HEAD element, unless their
   * upper-cased name is listed in the corresponding "don't need" array.
   */
  private void ensureBodyAppendChild(final Node parent, final Node child) {
    Node newParent = parent;
    if (QUIRKS_MODE && needRoot) {
      // final String nodeName = child.getNodeName();
      final String nodeNameTU = child.getNodeName().toUpperCase();
      if ("BODY".equals(nodeNameTU)) {
        lastBodyElement = child;
        // System.out.println("Found body elem: " + child);
      } else if ("HEAD".equals(nodeNameTU)) {
        lastHeadElement = child;
      } else if ((child instanceof Element) && (depthAtMost(parent, 2))) {
        final boolean dontNeedBody = ArrayUtilities.contains(elementsThatDontNeedBodyElement, nodeNameTU);
        final boolean dontNeedHead = ArrayUtilities.contains(elementsThatDontNeedHeadElement, nodeNameTU);
        // BODY is tried first; HEAD is only considered when the BODY branch does not apply.
        if((!hasAncestorTag(parent, "BODY")) && (!dontNeedBody)) {
          newParent = lastBodyElement;
        } else if((!hasAncestorTag(parent, "HEAD")) && (!dontNeedHead)) {
          newParent = lastHeadElement;
          // NOTE(review): no appendChild call on newParent is visible in this method, and
          // lastBodyElement / lastHeadElement may still be null at these assignments. The
          // ensure*Element calls and the final append appear to be missing — confirm
          // against upstream.

  /**
   * Creates a BODY element and records it in lastBodyElement, unless one has
   * already been recorded.
   */
  private void ensureBodyElement(final Node parent) {
    if (lastBodyElement == null) {
      // System.out.println("Inserting BODY");
      lastBodyElement = document.createElement("BODY");
      // NOTE(review): `parent` is unused in the visible code; the statement attaching the
      // new element to the tree appears to be missing — confirm against upstream.

  /**
   * Creates a HEAD element and records it in lastHeadElement, unless one has
   * already been recorded.
   */
  private void ensureHeadElement(final Node parent) {
    if (lastHeadElement == null) {
      // System.out.println("Inserting HEAD");
      lastHeadElement = document.createElement("HEAD");
      // NOTE(review): `parent` is unused in the visible code; the statement attaching the
      // new element to the tree appears to be missing — confirm against upstream.

  private final static Pattern doctypePattern = Pattern.compile("(\\S+)\\s+PUBLIC\\s+\"([^\"]*)\"\\s+\"([^\"]*)\".*>");

  /**
   * Parses text followed by one element.
   *
   * @param parent
   * @param reader
   * @param stopAtTagUC
   *          If this tag is encountered, the method throws StopException.
   * @param stopTags
   *          If tags in this set are encountered, the method throws
   *          StopException.
   * @return One of the TOKEN_* constants describing what was parsed.
   * @throws IOException
   * @throws StopException
   * @throws SAXException
   */
  private final int parseToken(final Node parent, final LineNumberReader reader, final Set stopTags,
      final LinkedList ancestors)
      throws IOException, StopException, SAXException {
    final Document doc = this.document;
    final HTMLDocumentImpl htmlDoc = (HTMLDocumentImpl) doc;
    final StringBuffer textSb = this.readUpToTagBegin(reader);
    if (textSb == null) {
      return TOKEN_EOD;
    if (textSb.length() != 0) {
      // int textLine = reader.getLineNumber();
      final StringBuffer decText = entityDecode(textSb);
      final Node textNode = doc.createTextNode(decText.toString());
      try {
        safeAppendChild(parent, textNode);
      } catch (final DOMException de) {
        if ((parent.getNodeType() != Node.DOCUMENT_NODE) || (de.code != DOMException.HIERARCHY_REQUEST_ERR)) {
          logger.log(Level.WARNING, "parseToken(): Unable to append child to " + parent + ".", de);
    if (this.justReadTagBegin) {
      String tag = this.readTag(parent, reader);
      if (tag == null) {
        return TOKEN_EOD;
      String normalTag = htmlDoc.isXML() ? tag : tag.toUpperCase();
      try {
        if (tag.startsWith("!")) {
          if ("!--".equals(tag)) {
            // int commentLine = reader.getLineNumber();
            final StringBuffer comment = this.passEndOfComment(reader);
            final StringBuffer decText = entityDecode(comment);

            safeAppendChild(parent, doc.createComment(decText.toString()));

            return TOKEN_COMMENT;
          } else if ("!DOCTYPE".equals(tag)) {
            final String doctypeStr = this.parseEndOfTag(reader);
            final Matcher doctypeMatcher = doctypePattern.matcher(doctypeStr);
            if (doctypeMatcher.matches()) {
              final String qName =;
              final String publicId =;
              final String systemId =;
              final DocumentTypeImpl doctype = new DocumentTypeImpl(qName, publicId, systemId);
              needRoot = false;
            return TOKEN_BAD;
          } else {
            return TOKEN_BAD;
        } else if (tag.startsWith("/")) {
          tag = tag.substring(1);
          normalTag = normalTag.substring(1);
          return TOKEN_END_ELEMENT;
        } else if (tag.startsWith("?")) {
          tag = tag.substring(1);
          final StringBuffer data = readProcessingInstruction(reader);

          safeAppendChild(parent, doc.createProcessingInstruction(tag, data.toString()));

          return TOKEN_FULL_ELEMENT;
        } else {
          final int localIndex = normalTag.indexOf(':');
          final boolean tagHasPrefix = localIndex > 0;
          final String localName = tagHasPrefix ? normalTag.substring(localIndex + 1) : normalTag;
          Element element = doc.createElement(localName);
          element.setUserData(MODIFYING_KEY, Boolean.TRUE, null);
          try {
            if (!this.justReadTagEnd) {
              while (this.readAttribute(reader, element)) {
                // EMPTY LOOP
            if ((stopTags != null) && stopTags.contains(normalTag)) {
              // Throw before appending to parent.
              // After attributes are set.
              // After MODIFYING_KEY is set.
              throw new StopException(element);
            // Add element to parent before children are added.
            // This is necessary for incremental rendering.
            safeAppendChild(parent, element);
            if (!this.justReadEmptyElement) {
              ElementInfo einfo = ELEMENT_INFOS.get(localName.toUpperCase());
              int endTagType = einfo == null ? ElementInfo.END_ELEMENT_REQUIRED : einfo.endElementType;
              if (endTagType != ElementInfo.END_ELEMENT_FORBIDDEN) {
                boolean childrenOk = einfo == null ? true : einfo.childElementOk;
                Set newStopSet = einfo == null ? null : einfo.stopTags;
                if (newStopSet == null) {
                  if (endTagType == ElementInfo.END_ELEMENT_OPTIONAL) {
                    newStopSet = Collections.singleton(normalTag);
                if (stopTags != null) {
                  if (newStopSet != null) {
                    final Set newStopSet2 = new HashSet<>();
                    newStopSet = newStopSet2;
                  } else {
                    newStopSet = endTagType == ElementInfo.END_ELEMENT_REQUIRED ? null : stopTags;
                try {
                  for (;;) {
                    try {
                      int token;
                      if ((einfo != null) && einfo.noScriptElement) {
                        final UserAgentContext ucontext = this.ucontext;
                        if ((ucontext == null) || ucontext.isScriptingEnabled()) {
                          token = this.parseForEndTag(parent, reader, tag, false, shouldDecodeEntities(einfo));
                        } else {
                          token = this.parseToken(element, reader, newStopSet, ancestors);
                      } else {
                        token = childrenOk ? this.parseToken(element, reader, newStopSet, ancestors) : this.parseForEndTag(element, reader,
                            tag, true, shouldDecodeEntities(einfo));
                      if (token == TOKEN_END_ELEMENT) {
                        final String normalLastTag = this.normalLastTag;
                        if (normalTag.equalsIgnoreCase(normalLastTag)) {
                          return TOKEN_FULL_ELEMENT;
                        } else {
                          final ElementInfo closeTagInfo = ELEMENT_INFOS.get(normalLastTag.toUpperCase());
                          if ((closeTagInfo == null) || (closeTagInfo.endElementType != ElementInfo.END_ELEMENT_FORBIDDEN)) {
                            // TODO: Rather inefficient algorithm, but it's
                            // probably executed infrequently?
                            final Iterator i = ancestors.iterator();
                            if (i.hasNext()) {
                              while (i.hasNext()) {
                                final String normalAncestorTag =;
                                if (normalLastTag.equals(normalAncestorTag)) {
                                  normalTag = normalLastTag;
                                  return TOKEN_END_ELEMENT;
                          // TODO: Working here
                      } else if (token == TOKEN_EOD) {
                        return TOKEN_EOD;
                    } catch (final StopException se) {
                      // newElement does not have a parent.
                      final Element newElement = se.getElement();
                      tag = newElement.getTagName();
                      normalTag = tag.toUpperCase();
                      // If a subelement throws StopException with
                      // a tag matching the current stop tag, the exception
                      // is rethrown (e.g. blahblah)
                      if ((stopTags != null) && stopTags.contains(normalTag)) {
                        throw se;
                      einfo = ELEMENT_INFOS.get(normalTag);
                      endTagType = einfo == null ? ElementInfo.END_ELEMENT_REQUIRED : einfo.endElementType;
                      childrenOk = einfo == null ? true : einfo.childElementOk;
                      newStopSet = einfo == null ? null : einfo.stopTags;
                      if (newStopSet == null) {
                        if (endTagType == ElementInfo.END_ELEMENT_OPTIONAL) {
                          newStopSet = Collections.singleton(normalTag);
                      if ((stopTags != null) && (newStopSet != null)) {
                        final Set newStopSet2 = new HashSet<>();
                        newStopSet = newStopSet2;
                      // Switch element
                      element.setUserData(MODIFYING_KEY, Boolean.FALSE, null);
                      // newElement should have been suspended.
                      element = newElement;
                      // Add to parent
                      safeAppendChild(parent, element);
                      if (this.justReadEmptyElement) {
                        return TOKEN_BEGIN_ELEMENT;
                } finally {
            return TOKEN_BEGIN_ELEMENT;
          } finally {
            // This can inform elements to continue with notifications.
            // It can also cause Javascript to be loaded / processed.
            // Update: Elements now use Document.addJob() to delay processing
            element.setUserData(MODIFYING_KEY, Boolean.FALSE, null);
      } finally {
        this.normalLastTag = normalTag;
    } else {
      this.normalLastTag = null;
      return TOKEN_TEXT;

  /**
   * Reads text until the beginning of the next tag. Leaves the reader offset
   * just past the opening angle bracket. Returns null only when EOF is reached
   * before any character has been read.
   */
  private final StringBuffer readUpToTagBegin(final LineNumberReader reader) throws IOException, SAXException {
    StringBuffer sb = null;
    int intCh;
    while ((intCh = != -1) {
      final char ch = (char) intCh;
      if (ch == '<') {
        this.justReadTagBegin = true;
        this.justReadTagEnd = false;
        this.justReadEmptyElement = false;
        if (sb == null) {
          sb = new StringBuffer(0);
        return sb;
      if (sb == null) {
        sb = new StringBuffer();
    this.justReadTagBegin = false;
    this.justReadTagEnd = false;
    this.justReadEmptyElement = false;
    return sb;

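  /**
   * Assumes that the content is completely made up of text, and parses until
   * an end tag matching the given tag name is found.
   *
   * @param parent node to which the collected text is appended as a text node
   *               when {@code addTextNode} is set
   * @param reader source of characters
   * @param tagName name of the tag whose end tag stops the scan (compared
   *                ignoring case)
   * @param addTextNode whether the collected text is appended to the parent
   * @param decodeEntities whether entities in the collected text are decoded
   *                       before the text node is created
   * @return the token type; {@code TOKEN_END_ELEMENT} once the matching end
   *         tag has been read
   * @throws IOException if reading from the reader fails
   */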
  private final int parseForEndTag(Node parent, final LineNumberReader reader, final String tagName, final boolean addTextNode,
      final boolean decodeEntities)
      throws IOException, SAXException {
    // Scans character content looking for "</" + name + ">" where the name
    // matches tagName ignoring case; everything else is treated as text.
    //
    // NOTE(review): this listing was damaged during extraction. The right-hand
    // sides of the read assignments are missing, lines holding only a closing
    // brace are gone, and the line starting with sb.append(" below has lost
    // everything up to a later '>'. The comments describe only what is still
    // visible; restore the method from the upstream source before relying on it.
    final Document doc = this.document;
    int intCh;
    // Accumulates the text content seen so far.
    StringBuffer sb = new StringBuffer();
    while ((intCh = != -1) {
      char ch = (char) intCh;
      if (ch == '<') {
        intCh =;
        if (intCh != -1) {
          ch = (char) intCh;
          if (ch == '/') {
            // Candidate end tag: its name is gathered separately so it can be
            // compared with tagName once '>' is reached.
            final StringBuffer tempBuffer = new StringBuffer();
            INNER: while ((intCh = != -1) {
              ch = (char) intCh;
              if (ch == '>') {
                final String thisTag = tempBuffer.toString().trim();
                if (thisTag.equalsIgnoreCase(tagName)) {
                  // Matching end tag found: record the tokenizer state.
                  this.justReadTagBegin = false;
                  this.justReadTagEnd = true;
                  this.justReadEmptyElement = false;
                  this.normalLastTag = thisTag;
                  if (addTextNode) {
                    if (decodeEntities) {
                      sb = entityDecode(sb);
                    final String text = sb.toString();
                    // Empty text produces no text node.
                    if (text.length() != 0) {
                      final Node textNode = doc.createTextNode(text);
                      safeAppendChild(parent, textNode);
                  return TOKEN_END_ELEMENT;
                } else {
                  // An end tag for some other name: leave the name-reading loop.
                  break INNER;
              } else {
            // NOTE(review): the next line is truncated. The rest of this method
            // and the header of the following method are missing; the lines
            // after it appear to belong to a loop that looks for a "]]>"
            // terminator using readN -- TODO confirm against upstream.
            sb.append("= 0 ) {
      final char nextCh = (char) next;
      if (nextCh == ']') {
        final String next2 = readN(reader, 2);
        if (next2 != null) {
          if ("]>".equals(next2)) {
          } else {
            next =;
        } else {
      } else {
        next =;

  // Tries to read at most n characters.
  private static String readN(final LineNumberReader reader, final int n) {
    char[] chars = new char[n];
    int i = 0;
    while(i < n) {
      int ich = -1;
      try {
        ich =;
      } catch (IOException e) {
      if (ich >= 0) {
        chars[i] = (char) ich;
        i += 1;
      } else {

    if (i == 0) {
      return null;
    } else {
      return new String(chars, 0, i);

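  /**
   * Reads the name of the tag at the current reader position. The reader
   * offset should presumably be just past the opening angle bracket, which is
   * where readUpToTagBegin leaves it.
   *
   * @param parent node used when stray characters are turned into text nodes
   * @param reader source of characters
   * @return the tag name collected so far; may be empty
   */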
  private final String readTag(final Node parent, final LineNumberReader reader) throws IOException {
    // Returns the contents of sb as the tag name. The method has two phases:
    //  1. the LOOP block classifies the first character(s) after '<';
    //  2. when cont is still true, the second loop reads up to whitespace or
    //     '>' while tracking a trailing '/'.
    //
    // NOTE(review): this listing was damaged during extraction. The right-hand
    // sides of the read assignments, lines holding only a closing brace, the
    // bodies of both try blocks and (presumably) the appends into sb and
    // ltText are missing. The comments describe only what is still visible.
    final StringBuffer sb = new StringBuffer();
    int chInt;
    chInt =;
    if (chInt != -1) {
      // cont == false means the name is already complete or input ran out, so
      // the second phase is skipped.
      boolean cont = true;
      char ch;
      LOOP: for (;;) {
        ch = (char) chInt;
        if (Character.isLetter(ch)) {
          // Speed up normal case
          break LOOP;
        } else if (ch == '!') {
          // '!' followed by two '-' characters stops name reading here;
          // presumably this is the "!--" comment opener -- TODO confirm.
          chInt =;
          if (chInt != -1) {
            ch = (char) chInt;
            if (ch == '-') {
              chInt =;
              if (chInt != -1) {
                ch = (char) chInt;
                if (ch == '-') {
                  cont = false;
              } else {
                cont = false;
          } else {
            cont = false;
        } else if (ch == '/') {
          // End-tag marker: move on to the character after the slash.
          chInt =;
          if (chInt != -1) {
            ch = (char) chInt;
          } else {
            cont = false;
        } else if (ch == '<') {
          // A '<' directly after the tag opener: a run of '<' characters is
          // turned into a text node built from ltText.
          final StringBuffer ltText = new StringBuffer(3);
          while ((chInt = == '<') {
          final Document doc = this.document;
          final Node textNode = doc.createTextNode(ltText.toString());
          try {
          } catch (final DOMException de) {
            // Only a hierarchy error raised on a document-node parent is
            // silent; every other failure is logged.
            if ((parent.getNodeType() != Node.DOCUMENT_NODE) || (de.code != DOMException.HIERARCHY_REQUEST_ERR)) {
              logger.log(Level.WARNING, "parseToken(): Unable to append child to " + parent + ".", de);
          if (chInt == -1) {
            cont = false;
          } else {
            // Re-classify the character that ended the run.
            continue LOOP;
        } else if (Character.isWhitespace(ch)) {
          // Whitespace directly after the tag opener: the following characters
          // are gathered into ltText and turned into a text node.
          final StringBuffer ltText = new StringBuffer();
          while ((chInt = != -1) {
            ch = (char) chInt;
            if (ch == '<') {
              chInt =;
          final Document doc = this.document;
          final Node textNode = doc.createTextNode(ltText.toString());
          try {
          } catch (final DOMException de) {
            if ((parent.getNodeType() != Node.DOCUMENT_NODE) || (de.code != DOMException.HIERARCHY_REQUEST_ERR)) {
              logger.log(Level.WARNING, "parseToken(): Unable to append child to " + parent + ".", de);
          if (chInt == -1) {
            cont = false;
          } else {
            continue LOOP;
        break LOOP;
      if (cont) {
        // Tracks whether the most recent character was '/', so that a name
        // ending in "/>" marks the element as empty.
        boolean lastCharSlash = false;
        for (;;) {
          if (Character.isWhitespace(ch)) {
          } else if (ch == '>') {
            this.justReadTagEnd = true;
            this.justReadTagBegin = false;
            this.justReadEmptyElement = lastCharSlash;
            final String tag = sb.toString();
            return tag;
          } else if (ch == '/') {
            lastCharSlash = true;
          } else {
            if (lastCharSlash) {
            lastCharSlash = false;
          chInt =;
          if (chInt == -1) {
          ch = (char) chInt;
    // Reached when the tag name did not end with '>': the flags are cleared
    // only if something was collected.
    if (sb.length() > 0) {
      this.justReadTagEnd = false;
      this.justReadTagBegin = false;
      this.justReadEmptyElement = false;
    final String tag = sb.toString();
    return tag;

  /**
   * Reads the body of a comment up to its terminator and returns the buffer
   * {@code sb}. Returns an empty buffer immediately when the end of the tag
   * has already been read.
   *
   * NOTE(review): this listing was damaged during extraction. The right-hand
   * sides of the read assignments, lines holding only a closing brace and the
   * bodies of several branches (presumably the appends into sb and extra) are
   * missing. The comments describe only what is still visible.
   *
   * @param reader source of characters
   * @return the buffer holding the comment text
   * @throws IOException if reading from the reader fails
   */
  private final StringBuffer passEndOfComment(final LineNumberReader reader) throws IOException {
    if (this.justReadTagEnd) {
      return new StringBuffer(0);
    final StringBuffer sb = new StringBuffer();
    OUTER: for (;;) {
      int chInt =;
      if (chInt == -1) {
        break OUTER;
      char ch = (char) chInt;
      if (ch == '-') {
        chInt =;
        if (chInt == -1) {
          break OUTER;
        ch = (char) chInt;
        if (ch == '-') {
          // Two dashes seen: look for the closing '>'. Characters met before
          // it are tracked through extra, which is created lazily.
          StringBuffer extra = null;
          INNER: for (;;) {
            chInt =;
            if (chInt == -1) {
              if (extra != null) {
              break OUTER;
            ch = (char) chInt;
            if (ch == '>') {
              // Comment terminator reached.
              this.justReadTagBegin = false;
              this.justReadTagEnd = true;
              return sb;
            } else if (ch == '-') {
              // Allow any number of dashes at the end
              if (extra == null) {
                extra = new StringBuffer();
            } else if (Character.isWhitespace(ch)) {
              // Whitespace between the dashes and '>' is tolerated as well.
              if (extra == null) {
                extra = new StringBuffer();
            } else {
              // Any other character: this was not the terminator; leave the
              // inner loop.
              if (extra != null) {
              break INNER;
        } else {
      } else {
    // EOF before a terminator was found.
    if (sb.length() > 0) {
      this.justReadTagBegin = false;
      this.justReadTagEnd = false;
    return sb;

  private final String parseEndOfTag(final Reader reader) throws IOException {
    if (this.justReadTagEnd) {
      return "";
    StringBuilder result = new StringBuilder();
    boolean readSomething = false;
    for (;;) {
      final int chInt =;
      if (chInt == -1) {
      result.append((char) chInt);
      readSomething = true;
      final char ch = (char) chInt;
      if (ch == '>') {
        this.justReadTagEnd = true;
        this.justReadTagBegin = false;
        return result.toString();
    if (readSomething) {
      this.justReadTagBegin = false;
      this.justReadTagEnd = false;
    return result.toString();

  private final void passEndOfTag(final Reader reader) throws IOException {
    if (this.justReadTagEnd) {
    boolean readSomething = false;
    for (;;) {
      final int chInt =;
      if (chInt == -1) {
      readSomething = true;
      final char ch = (char) chInt;
      if (ch == '>') {
        this.justReadTagEnd = true;
        this.justReadTagBegin = false;
    if (readSomething) {
      this.justReadTagBegin = false;
      this.justReadTagEnd = false;

  private final StringBuffer readProcessingInstruction(final LineNumberReader reader) throws IOException {
    final StringBuffer pidata = new StringBuffer();
    if (this.justReadTagEnd) {
      return pidata;
    int ch;
    for (ch =; (ch != -1) && (ch != '>'); ch = {
      pidata.append((char) ch);
    this.justReadTagBegin = false;
    this.justReadTagEnd = ch != -1;
    return pidata;

  /**
   * Reads one attribute from the reader and sets it on the element. The work
   * is done in three consecutive loops: attribute name, blanks before the
   * value, and the value itself.
   *
   * An attribute name that has no value is stored with its own name as the
   * value. A value is passed through entityDecode before being stored.
   *
   * NOTE(review): this listing was damaged during extraction. The right-hand
   * sides of the read assignments, lines holding only a closing brace and
   * several short statements (presumably the appends into attributeName and
   * attributeValue and the loop exits) are missing. The comments describe only
   * what is still visible.
   *
   * @param reader source of characters
   * @param element element that receives the attribute
   * @return true when an attribute was read and the tag has not ended yet, so
   *         the caller may try to read another one; false when the tag end or
   *         EOF was reached
   * @throws IOException if reading from the reader fails
   * @throws SAXException declared by entityDecode
   */
  private final boolean readAttribute(final LineNumberReader reader, final Element element) throws IOException, SAXException {
    if (this.justReadTagEnd) {
      return false;

    // Read attribute name up to '=' character.
    // May read several attribute names without explicit values.

    StringBuffer attributeName = null;
    // Set once whitespace or '/' follows a name; the next name character then
    // flushes the previous name as a valueless attribute.
    boolean blankFound = false;
    // Remembers a '/' seen directly before '>' so the element can be flagged
    // as empty.
    boolean lastCharSlash = false;
    for (;;) {
      final int chInt =;
      if (chInt == -1) {
        // EOF inside the name: store what was collected as a valueless
        // attribute and clear all flags.
        if ((attributeName != null) && (attributeName.length() != 0)) {
          final String attributeNameStr = attributeName.toString();
          element.setAttribute(attributeNameStr, attributeNameStr);
        this.justReadTagBegin = false;
        this.justReadTagEnd = false;
        this.justReadEmptyElement = false;
        return false;
      final char ch = (char) chInt;
      if (ch == '=') {
        lastCharSlash = false;
        blankFound = false;
      } else if (ch == '>') {
        // Tag ends right after a name: valueless attribute.
        if ((attributeName != null) && (attributeName.length() != 0)) {
          final String attributeNameStr = attributeName.toString();
          element.setAttribute(attributeNameStr, attributeNameStr);
        this.justReadTagBegin = false;
        this.justReadTagEnd = true;
        this.justReadEmptyElement = lastCharSlash;
        return false;
      } else if (ch == '/') {
        blankFound = true;
        lastCharSlash = true;
      } else if (Character.isWhitespace(ch)) {
        lastCharSlash = false;
        blankFound = true;
      } else {
        lastCharSlash = false;
        if (blankFound) {
          blankFound = false;
          if ((attributeName != null) && (attributeName.length() != 0)) {
            final String attributeNameStr = attributeName.toString();
            element.setAttribute(attributeNameStr, attributeNameStr);
        if (attributeName == null) {
          attributeName = new StringBuffer(6);
    // Read blanks up to open quote or first non-blank.
    StringBuffer attributeValue = null;
    // The quote character that opened the value, or -1 for an unquoted value.
    int openQuote = -1;
    for (;;) {
      final int chInt =;
      if (chInt == -1) {
      final char ch = (char) chInt;
      if (ch == '>') {
        // Tag ends before any value: the attribute is stored with its own
        // name as the value.
        if ((attributeName != null) && (attributeName.length() != 0)) {
          final String attributeNameStr = attributeName.toString();
          element.setAttribute(attributeNameStr, attributeNameStr);
        this.justReadTagBegin = false;
        this.justReadTagEnd = true;
        this.justReadEmptyElement = lastCharSlash;
        return false;
      } else if (ch == '/') {
        lastCharSlash = true;
      } else if (Character.isWhitespace(ch)) {
        lastCharSlash = false;
      } else {
        if (ch == '"') {
          openQuote = '"';
        } else if (ch == '\'') {
          openQuote = '\'';
        } else {
          // First character of an unquoted value.
          openQuote = -1;
          attributeValue = new StringBuffer(6);
          if (lastCharSlash) {
        lastCharSlash = false;

    // Read attribute value

    for (;;) {
      final int chInt =;
      if (chInt == -1) {
      final char ch = (char) chInt;
      if ((openQuote != -1) && (ch == openQuote)) {
        // Closing quote: the value is complete.
        lastCharSlash = false;
        if (attributeName != null) {
          final String attributeNameStr = attributeName.toString();
          if (attributeValue == null) {
            // Quotes are closed. There's a distinction
            // between blank values and null in HTML, as
            // processed by major browsers.
            element.setAttribute(attributeNameStr, "");
          } else {
            final StringBuffer actualAttributeValue = entityDecode(attributeValue);
            element.setAttribute(attributeNameStr, actualAttributeValue.toString());
        this.justReadTagBegin = false;
        this.justReadTagEnd = false;
        return true;
      } else if ((openQuote == -1) && (ch == '>')) {
        // Unquoted value ended by the end of the tag.
        if (attributeName != null) {
          final String attributeNameStr = attributeName.toString();
          if (attributeValue == null) {
            element.setAttribute(attributeNameStr, null);
          } else {
            final StringBuffer actualAttributeValue = entityDecode(attributeValue);
            element.setAttribute(attributeNameStr, actualAttributeValue.toString());
        this.justReadTagBegin = false;
        this.justReadTagEnd = true;
        this.justReadEmptyElement = lastCharSlash;
        return false;
      } else if ((openQuote == -1) && Character.isWhitespace(ch)) {
        // Unquoted value ended by whitespace; more attributes may follow.
        lastCharSlash = false;
        if (attributeName != null) {
          final String attributeNameStr = attributeName.toString();
          if (attributeValue == null) {
            element.setAttribute(attributeNameStr, null);
          } else {
            final StringBuffer actualAttributeValue = entityDecode(attributeValue);
            element.setAttribute(attributeNameStr, actualAttributeValue.toString());
        this.justReadTagBegin = false;
        this.justReadTagEnd = false;
        return true;
      } else {
        if (attributeValue == null) {
          attributeValue = new StringBuffer(6);
        if (lastCharSlash) {
        lastCharSlash = false;
    // Reached when input ran out before the value was terminated: store
    // whatever was collected and report that no further attribute follows.
    this.justReadTagBegin = false;
    this.justReadTagEnd = false;
    if (attributeName != null) {
      final String attributeNameStr = attributeName.toString();
      if (attributeValue == null) {
        element.setAttribute(attributeNameStr, null);
      } else {
        final StringBuffer actualAttributeValue = entityDecode(attributeValue);
        element.setAttribute(attributeNameStr, actualAttributeValue.toString());
    return false;

  /**
   * Replaces entity references of the form {@code &name;}, {@code &#NNN;} and
   * {@code &#xHHH;} in the given text with the characters they stand for.
   *
   * When the text contains no {@code '&'} at all, the input buffer itself is
   * returned; otherwise a new buffer is built.
   *
   * NOTE(review): this listing was damaged during extraction. Lines holding
   * only a closing brace and several short statements are missing, notably in
   * the branches for an '&' without a following ';' and for an unknown entity
   * name. The comments describe only what is still visible.
   *
   * @param rawText text that may contain entity references
   * @return the decoded text, or {@code rawText} itself if nothing needed
   *         decoding
   * @throws SAXException declared but not thrown by any statement visible here
   */
  private final static StringBuffer entityDecode(final StringBuffer rawText) throws SAXException {
    // Index in rawText from which the next search for '&' starts.
    int startIdx = 0;
    // Output buffer, created lazily on the first '&'.
    StringBuffer sb = null;
    for (;;) {
      final int ampIdx = rawText.indexOf("&", startIdx);
      if (ampIdx == -1) {
        if (sb == null) {
          return rawText;
        } else {
          return sb;
      if (sb == null) {
        sb = new StringBuffer();
      // Copy the literal text that precedes the '&'.
      sb.append(rawText.substring(startIdx, ampIdx));
      final int colonIdx = rawText.indexOf(";", ampIdx);
      if (colonIdx == -1) {
        // No terminating ';': resume the search just after this '&'.
        startIdx = ampIdx + 1;
      // The entity body between '&' and ';'.
      final String spec = rawText.substring(ampIdx + 1, colonIdx);
      if (spec.startsWith("#")) {
        // Numeric reference: hexadecimal after an 'x' prefix, decimal otherwise.
        final String number = spec.substring(1).toLowerCase();
        int decimal;
        try {
          if (number.startsWith("x")) {
            decimal = Integer.parseInt(number.substring(1), 16);
          } else {
            decimal = Integer.parseInt(number);
        } catch (final NumberFormatException nfe) {
          // A malformed number is logged and replaced with code 0.
          logger.log(Level.WARNING, "entityDecode()", nfe);
          decimal = 0;
        // NOTE(review): the cast keeps only the low 16 bits, so values above
        // 0xFFFF do not become the intended character.
        sb.append((char) decimal);
      } else {
        // Named reference: getEntityChar returns -1 for an unknown name.
        final int chInt = getEntityChar(spec);
        if (chInt == -1) {
        } else {
          sb.append((char) chInt);
      startIdx = colonIdx + 1;

  private final static int getEntityChar(final String spec) {
    // TODO: Declared entities
    Character c = ENTITIES.get(spec);
    if (c == null) {
      final String specTL = spec.toLowerCase();
      c = ENTITIES.get(specTL);
      if (c == null) {
        return -1;
    return c.charValue();

© 2015 - 2024 Weber Informatics LLC | Privacy Policy