// org.sonar.plugins.html.lex.PageLexer Maven / Gradle / Ivy
//
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of sonar-html-plugin Show documentation
// Code analyzer for HTML
// The newest version!
/*
 * SonarQube HTML
 * Copyright (C) 2010-2024 SonarSource SA
 * mailto:info AT sonarsource DOT com
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the Sonar Source-Available License Version 1, as published by SonarSource SA.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the Sonar Source-Available License for more details.
 *
 * You should have received a copy of the Sonar Source-Available License
 * along with this program; if not, see https://sonarsource.com/license/ssal/
 */
package org.sonar.plugins.html.lex;

import java.io.Reader;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Deque;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import org.sonar.plugins.html.node.Node;
import org.sonar.plugins.html.node.NodeType;
import org.sonar.plugins.html.node.TagNode;
import org.sonar.sslr.channel.Channel;
import org.sonar.sslr.channel.ChannelDispatcher;
import org.sonar.sslr.channel.CodeReader;

/**
 * Lexical analysis of a web page.
 *
 * <p>The input is split into {@link Node}s by an ordered list of tokenizers. {@link #parse(Reader)}
 * additionally links every tag node to its parent tag, applying the HTML rules for void elements and
 * optional end tags.
 */
@SuppressWarnings("unchecked")
public class PageLexer {

  /**
   * The order of the tokenizers is significant, as they are processed in this order.
   *
   * TextTokenizer must be last, it will always consume the characters until the next token arrives.
   */
  @SuppressWarnings("rawtypes")
  private static final List tokenizers = Arrays.asList(
    /* HTML Comments */
    new CommentTokenizer("<!--", "-->", true),
    /* JSP Comments */
    new CommentTokenizer("<%--", "--%>", false),
    /* HTML Directive */
    new DoctypeTokenizer("<!DOCTYPE", ">"),
    /* XML Directives */
    new DirectiveTokenizer("<?", "?>"),
    /* JSP Directives */
    new DirectiveTokenizer("<%@", "%>"),
    /* JSP Expressions */
    new ExpressionTokenizer("<%", "%>"),
    /* CDATA */
    new CdataTokenizer(),
    /* XML and HTML Tags */
    new NormalElementTokenizer(),
    /* Text (for everything else) */
    new TextTokenizer());

  /**
   * Void elements can't have any content
   * See https://html.spec.whatwg.org/multipage/syntax.html#void-elements
   */
  private static final Set<String> VOID_ELEMENTS = new HashSet<>(Arrays.asList("area", "base", "br", "col", "embed",
    "hr", "img", "input", "link", "meta", "param", "source", "track", "wbr"));

  /** Elements that do not implicitly close an open {@code head} element. */
  private static final Set<String> METADATA_CONTENT = new HashSet<>(Arrays.asList("base", "link", "meta", "noscript",
    "script", "style", "template", "title"));

  /** Elements whose start tag implicitly closes an open {@code p} element. */
  private static final Set<String> PARAGRAPH_CLOSING = new HashSet<>(Arrays.asList("address", "article", "aside",
    "blockquote", "details", "div", "dl", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4",
    "h5", "h6", "header", "hr", "main", "nav", "ol", "p", "pre", "section", "table", "ul"));

  /** Elements whose start tag implicitly closes an open {@code rb}, {@code rp} or {@code rt} element. */
  private static final Set<String> RUBY_CLOSING = new HashSet<>(Arrays.asList("rtc", "rb", "rp", "rt"));

  /** Elements whose start tag implicitly closes an open {@code caption} element. */
  private static final Set<String> TABLE_DESCENDANTS = new HashSet<>(Arrays.asList("caption", "colgroup", "thead",
    "tbody", "tr", "tfoot"));

  /**
   * Names treated as HTML elements; only these may be closed implicitly when a non-matching end tag
   * is encountered.
   */
  private static final Set<String> HTML_ELEMENTS = new HashSet<>(Arrays.asList("a", "abbr", "acronym", "address",
    "applet", "area", "article", "aside", "audio", "b", "base", "basefont", "bdi", "bdo", "bgsound", "big", "blink",
    "blockquote", "body", "br", "button", "canvas", "caption", "center", "cite", "code", "col", "colgroup", "content",
    "data", "datalist", "dd", "del", "details", "dfn", "dialog", "dir", "div", "dl", "dt", "em", "embed", "fieldset",
    "figcaption", "figure", "font", "footer", "form", "frame", "frameset", "h1", "h2", "h3", "h4", "h5", "h6", "head",
    "header", "hgroup", "hr", "html", "i", "iframe", "img", "input", "ins", "isindex", "kbd", "keygen", "label",
    "legend", "li", "link", "listing", "main", "map", "mark", "marquee", "menu", "menuitem", "meta", "meter", "nav",
    "nobr", "noframes", "noscript", "object", "ol", "optgroup", "option", "output", "p", "param", "picture",
    "plaintext", "pre", "progress", "q", "rb", "rp", "rt", "rtc", "ruby", "s", "samp", "script", "section", "select",
    "shadow", "slot", "small", "source", "spacer", "span", "strike", "strong", "style", "sub", "summary", "sup", "table",
    "tbody", "td", "template", "textarea", "tfoot", "th", "thead", "time", "title", "tr", "track", "tt", "u", "ul",
    "var", "video", "wbr", "xmp"));

  /**
   * Parse a nested node.
   *
   * <p>The tokenizers are tried in their declared order and the scan stops at the first one whose
   * {@code consume} call returns {@code true}. No parent/child hierarchy is built here.
   *
   * @param reader the reader positioned at the nested content
   * @return the list handed to the tokenizers, holding whatever nodes they added to it
   */
  @SuppressWarnings("rawtypes")
  public List<Node> nestedParse(CodeReader reader) {
    List<Node> nodeList = new ArrayList<>();
    for (AbstractTokenizer tokenizer : (List<AbstractTokenizer>) tokenizers) {
      if (tokenizer.consume(reader, nodeList)) {
        break;
      }
    }
    return nodeList;
  }

  /**
   * Parse the input into a list of tokens, with parent/child relations between the tokens.
   *
   * @param reader the character source of the page
   * @return the nodes produced by the tokenizers, with tag nodes linked to their parent tag
   */
  public List<Node> parse(Reader reader) {

    // CodeReader reads the file stream
    CodeReader codeReader = new CodeReader(reader);

    // ArrayList collects the nodes
    List<Node> nodeList = new ArrayList<>();

    // ChannelDispatcher manages the tokenizers
    ChannelDispatcher<List<Node>> channelDispatcher = ChannelDispatcher.builder()
      .addChannels((Channel[]) tokenizers.toArray(new Channel[tokenizers.size()]))
      .build();
    channelDispatcher.consume(codeReader, nodeList);

    createNodeHierarchy(nodeList);

    return nodeList;
  }

  /**
   * Scan the nodes and build the hierarchy of parent and child nodes.
   *
   * <p>Only nodes of type {@link NodeType#TAG} take part; all other nodes are skipped. A stack of
   * currently open elements is maintained while walking the list in document order.
   *
   * @param nodeList the flat list of nodes in document order; tag nodes get their parent set in place
   */
  private static void createNodeHierarchy(List<Node> nodeList) {
    Deque<TagNode> openElementStack = new ArrayDeque<>();
    for (Node node : nodeList) {
      if (node.getNodeType() != NodeType.TAG) {
        continue;
      }
      TagNode element = (TagNode) node;

      // start element
      if (!element.isEndElement()) {
        // Drop every open element that cannot contain this one: either its end tag is optional
        // and implied by this start tag, or it is a void element that never has content.
        TagNode parent = openElementStack.peek();
        while (parent != null
          && (shouldCloseParent(nodeName(element), nodeName(parent)) || isVoidElement(parent))) {
          openElementStack.pop();
          parent = openElementStack.peek();
        }
        // parent is null when no open element remains
        element.setParent(parent);
        openElementStack.push(element);
      }

      // end element
      // NOTE: this is also entered for a start tag whose hasEnd() is true; such a tag was just
      // pushed above and is therefore the top of the stack here.
      if (isEndElement(element) && !openElementStack.isEmpty()) {
        TagNode openElement = openElementStack.peek();
        if (openElement.equalsElementName(element.getNodeName())) {
          openElementStack.pop();
        } else if (openElementStack.stream().anyMatch(tag -> tag.equalsElementName(element.getNodeName()))) {
          // non-well formed, close HTML elements if there is matching open element
          // The first pop returns the element already held in openElement; popping continues
          // until the popped element matches the end tag or is not an HTML element.
          while (!openElement.equalsElementName(element.getNodeName()) && isHtmlElement(openElement)) {
            openElement = openElementStack.pop();
          }
        }
      }
    }
  }

  /**
   * @param parent the tag to test
   * @return {@code true} if the lower-cased tag name is one of the HTML void elements
   */
  private static boolean isVoidElement(TagNode parent) {
    return VOID_ELEMENTS.contains(nodeName(parent));
  }

  /**
   * @param parent the tag to test
   * @return {@code true} if the lower-cased tag name is a known HTML element name
   */
  private static boolean isHtmlElement(TagNode parent) {
    return HTML_ELEMENTS.contains(nodeName(parent));
  }

  /**
   * @param element the tag to test
   * @return {@code true} if the tag is an end tag or reports that it carries its own end
   */
  private static boolean isEndElement(TagNode element) {
    return element.isEndElement() || element.hasEnd();
  }

  /**
   * Tells whether the start tag of {@code element} implicitly closes an open {@code parent}.
   *
   * @param element lower-cased name of the element being opened
   * @param parent lower-cased name of the element currently on top of the open element stack
   * @return {@code true} if {@code parent} must be closed before {@code element} is opened
   */
  private static boolean shouldCloseParent(String element, String parent) {
    // see https://www.w3.org/TR/html52/syntax.html#optional-start-and-end-tags
    return switch (parent) {
      case "head" -> !METADATA_CONTENT.contains(element);
      case "li" -> "li".equals(element);
      case "dt", "dd" -> "dt".equals(element) || "dd".equals(element);
      // note that we don't validate the parent of the <p> as described in spec
      case "p" -> PARAGRAPH_CLOSING.contains(element);
      case "rb", "rp", "rt" -> RUBY_CLOSING.contains(element);
      case "rtc" -> "rb".equals(element) || "rtc".equals(element);
      case "optgroup" -> "optgroup".equals(element);
      case "option" -> "option".equals(element) || "optgroup".equals(element);
      case "colgroup" -> !("col".equals(element) || "template".equals(element));
      case "caption" -> TABLE_DESCENDANTS.contains(element);
      case "thead", "tbody" -> "tbody".equals(element) || "tfoot".equals(element);
      case "tr" -> !("td".equals(element) || "th".equals(element));
      case "td", "th" -> "td".equals(element) || "th".equals(element);
      default -> false;
    };
  }

  /**
   * @param node the tag whose name is wanted
   * @return the tag name lower-cased with {@link Locale#ROOT}, for case-insensitive lookups
   */
  private static String nodeName(TagNode node) {
    return node.getNodeName().toLowerCase(Locale.ROOT);
  }
}