
// org.sonar.plugins.html.lex.PageLexer Maven / Gradle / Ivy
/*
* SonarQube HTML
* Copyright (C) 2010-2024 SonarSource SA
* mailto:info AT sonarsource DOT com
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the Sonar Source-Available License Version 1, as published by SonarSource SA.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the Sonar Source-Available License for more details.
*
* You should have received a copy of the Sonar Source-Available License
* along with this program; if not, see https://sonarsource.com/license/ssal/
*/
package org.sonar.plugins.html.lex;
import java.io.Reader;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Deque;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import org.sonar.plugins.html.node.Node;
import org.sonar.plugins.html.node.NodeType;
import org.sonar.plugins.html.node.TagNode;
import org.sonar.sslr.channel.Channel;
import org.sonar.sslr.channel.ChannelDispatcher;
import org.sonar.sslr.channel.CodeReader;
/**
 * Lexical analysis of a web page.
 *
 * <p>The input is split into {@link Node}s by an ordered list of tokenizers; afterwards every start tag is
 * linked to its parent tag (see {@link #parse(Reader)}).
 */
@SuppressWarnings("unchecked")
public class PageLexer {

  /**
   * The order of the tokenizers is significant, as they are processed in this order.
   *
   * TextTokenizer must be last, it will always consume the characters until the next token arrives.
   */
  @SuppressWarnings("rawtypes")
  private static final List tokenizers = Arrays.asList(
    /* HTML Comments */
    new CommentTokenizer("<!--", "-->", true),
    /* JSP Comments */
    new CommentTokenizer("<%--", "--%>", false),
    /* HTML Directive */
    new DoctypeTokenizer("<!DOCTYPE", ">"),
    /* XML Directives */
    new DirectiveTokenizer("<?", "?>"),
    /* JSP Directives */
    new DirectiveTokenizer("<%@", "%>"),
    /* JSP Expressions */
    new ExpressionTokenizer("<%", "%>"),
    /* CDATA */
    new CdataTokenizer(),
    /* XML and HTML Tags */
    new NormalElementTokenizer(),
    /* Text (for everything else) */
    new TextTokenizer());

  /**
   * Void elements can't have any content
   * See https://html.spec.whatwg.org/multipage/syntax.html#void-elements
   */
  private static final Set<String> VOID_ELEMENTS = new HashSet<>(Arrays.asList("area", "base", "br", "col", "embed",
    "hr", "img", "input", "link", "meta", "param", "source", "track", "wbr"));

  /** Start tags that do NOT implicitly close an open {@code head} element. */
  private static final Set<String> METADATA_CONTENT = new HashSet<>(Arrays.asList("base", "link", "meta", "noscript",
    "script", "style", "template", "title"));

  /** Start tags that implicitly close an open {@code p} element. */
  private static final Set<String> PARAGRAPH_CLOSING = new HashSet<>(Arrays.asList("address", "article", "aside",
    "blockquote", "details", "div", "dl", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4",
    "h5", "h6", "header", "hr", "main", "nav", "ol", "p", "pre", "section", "table", "ul"));

  /** Start tags that implicitly close an open {@code rb}, {@code rp} or {@code rt} element. */
  private static final Set<String> RUBY_CLOSING = new HashSet<>(Arrays.asList("rtc", "rb", "rp", "rt"));

  /** Start tags that implicitly close an open {@code caption} element. */
  private static final Set<String> TABLE_DESCENDANTS = new HashSet<>(Arrays.asList("caption", "colgroup", "thead",
    "tbody", "tr", "tfoot"));

  /** Lower-case names treated as HTML elements when recovering from unbalanced end tags. */
  private static final Set<String> HTML_ELEMENTS = new HashSet<>(Arrays.asList("a", "abbr", "acronym", "address",
    "applet", "area", "article", "aside", "audio", "b", "base", "basefont", "bdi", "bdo", "bgsound", "big", "blink",
    "blockquote", "body", "br", "button", "canvas", "caption", "center", "cite", "code", "col", "colgroup", "content",
    "data", "datalist", "dd", "del", "details", "dfn", "dialog", "dir", "div", "dl", "dt", "em", "embed", "fieldset",
    "figcaption", "figure", "font", "footer", "form", "frame", "frameset", "h1", "h2", "h3", "h4", "h5", "h6", "head",
    "header", "hgroup", "hr", "html", "i", "iframe", "img", "input", "ins", "isindex", "kbd", "keygen", "label",
    "legend", "li", "link", "listing", "main", "map", "mark", "marquee", "menu", "menuitem", "meta", "meter", "nav",
    "nobr", "noframes", "noscript", "object", "ol", "optgroup", "option", "output", "p", "param", "picture",
    "plaintext", "pre", "progress", "q", "rb", "rp", "rt", "rtc", "ruby", "s", "samp", "script", "section", "select",
    "shadow", "slot", "small", "source", "spacer", "span", "strike", "strong", "style", "sub", "summary", "sup", "table",
    "tbody", "td", "template", "textarea", "tfoot", "th", "thead", "time", "title", "tr", "track", "tt", "u", "ul",
    "var", "video", "wbr", "xmp"));

  /**
   * Parse a nested node.
   *
   * <p>The tokenizers are tried in their declared order; the first one whose {@code consume} call returns
   * {@code true} ends the attempt.
   *
   * @param reader the reader positioned at the nested content
   * @return the nodes the tokenizers added to the list (empty when no tokenizer consumed anything)
   */
  @SuppressWarnings("rawtypes")
  public List<Node> nestedParse(CodeReader reader) {
    List<Node> nodeList = new ArrayList<>();
    for (AbstractTokenizer tokenizer : (List<AbstractTokenizer>) tokenizers) {
      if (tokenizer.consume(reader, nodeList)) {
        break;
      }
    }
    return nodeList;
  }

  /**
   * Parse the input into a list of tokens, with parent/child relations between the tokens.
   *
   * @param reader the character stream of the page
   * @return the flat list of nodes in the order they were produced; tag nodes have their parent set
   */
  public List<Node> parse(Reader reader) {
    // CodeReader reads the file stream
    CodeReader codeReader = new CodeReader(reader);

    // ArrayList collects the nodes
    List<Node> nodeList = new ArrayList<>();

    // ChannelDispatcher manages the tokenizers
    ChannelDispatcher<List<Node>> channelDispatcher = ChannelDispatcher.builder()
      .addChannels((Channel[]) tokenizers.toArray(new Channel[tokenizers.size()]))
      .build();
    channelDispatcher.consume(codeReader, nodeList);

    createNodeHierarchy(nodeList);

    return nodeList;
  }

  /**
   * Scan the nodes and build the hierarchy of parent and child nodes.
   *
   * <p>Only nodes of type {@link NodeType#TAG} take part. A stack of currently open elements is maintained:
   * start tags are pushed (after implicitly closing parents where applicable) and end tags pop their match.
   */
  private static void createNodeHierarchy(List<Node> nodeList) {
    Deque<TagNode> openElementStack = new ArrayDeque<>();
    for (Node node : nodeList) {
      if (node.getNodeType() != NodeType.TAG) {
        continue;
      }
      TagNode element = (TagNode) node;

      // start element
      if (!element.isEndElement()) {
        openStartElement(element, openElementStack);
      }

      // end element (this also covers a start tag that carries its own end, which was just pushed above)
      if (isEndElement(element) && !openElementStack.isEmpty()) {
        closeEndElement(element, openElementStack);
      }
    }
  }

  /**
   * Pops every open element that the new start tag implicitly closes (or that is a void element), then sets the
   * remaining top of the stack as the parent of {@code element} and pushes {@code element}.
   */
  private static void openStartElement(TagNode element, Deque<TagNode> openElementStack) {
    TagNode parent = openElementStack.peek();
    while (parent != null
      && (shouldCloseParent(nodeName(element), nodeName(parent)) || isVoidElement(parent))) {
      openElementStack.pop();
      parent = openElementStack.peek();
    }
    // parent is null when the stack ran empty, i.e. the element has no enclosing open tag
    element.setParent(parent);
    openElementStack.push(element);
  }

  /**
   * Removes the open element(s) closed by the end tag {@code element}. The stack must not be empty.
   */
  private static void closeEndElement(TagNode element, Deque<TagNode> openElementStack) {
    String closedName = element.getNodeName();
    TagNode openElement = openElementStack.peek();
    if (openElement.equalsElementName(closedName)) {
      openElementStack.pop();
      return;
    }

    // non-well formed, close HTML elements if there is matching open element
    if (openElementStack.stream().anyMatch(tag -> tag.equalsElementName(closedName))) {
      // openElement starts as the still-stacked top element, so the first pop() returns that same element
      while (!openElement.equalsElementName(closedName) && isHtmlElement(openElement)) {
        openElement = openElementStack.pop();
      }
    }
  }

  /** Returns whether the tag's lower-cased name is one of the void elements. */
  private static boolean isVoidElement(TagNode parent) {
    return VOID_ELEMENTS.contains(nodeName(parent));
  }

  /** Returns whether the tag's lower-cased name is listed in {@link #HTML_ELEMENTS}. */
  private static boolean isHtmlElement(TagNode parent) {
    return HTML_ELEMENTS.contains(nodeName(parent));
  }

  /** Returns whether the tag is an end tag or a tag that reports having its own end. */
  private static boolean isEndElement(TagNode element) {
    return element.isEndElement() || element.hasEnd();
  }

  /**
   * Tells whether a start tag named {@code element} implicitly closes the open element named {@code parent}.
   *
   * @param element lower-case name of the start tag being opened
   * @param parent lower-case name of the element currently on top of the open-element stack
   * @return {@code true} if {@code parent} must be closed before {@code element} is opened
   */
  private static boolean shouldCloseParent(String element, String parent) {
    // see https://www.w3.org/TR/html52/syntax.html#optional-start-and-end-tags
    return switch (parent) {
      case "head" -> !METADATA_CONTENT.contains(element);
      case "li" -> "li".equals(element);
      case "dt", "dd" -> "dt".equals(element) || "dd".equals(element);
      // note that we don't validate the parent of the <p> as described in spec
      case "p" -> PARAGRAPH_CLOSING.contains(element);
      case "rb", "rp", "rt" -> RUBY_CLOSING.contains(element);
      case "rtc" -> "rb".equals(element) || "rtc".equals(element);
      case "optgroup" -> "optgroup".equals(element);
      case "option" -> "option".equals(element) || "optgroup".equals(element);
      case "colgroup" -> !("col".equals(element) || "template".equals(element));
      case "caption" -> TABLE_DESCENDANTS.contains(element);
      case "thead", "tbody" -> "tbody".equals(element) || "tfoot".equals(element);
      case "tr" -> !("td".equals(element) || "th".equals(element));
      case "td", "th" -> "td".equals(element) || "th".equals(element);
      default -> false;
    };
  }

  /** Returns the tag name lower-cased with {@link Locale#ROOT}, for case-insensitive lookups. */
  private static String nodeName(TagNode node) {
    return node.getNodeName().toLowerCase(Locale.ROOT);
  }
}