com.google.common.css.compiler.ast.CssStringNode Maven / Gradle / Ivy
Show all versions of closure-stylesheets Show documentation
/*
* Copyright 2011 Google Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.common.css.compiler.ast;
import com.google.common.base.CharMatcher;
import com.google.common.css.SourceCodeLocation;
import java.util.function.Function;
import java.util.function.UnaryOperator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Node corresponding to a string value.
*
* TODO(user): refactor the encoding/decoding logic out of this
* class, thereby restoring the architectural integrity of the CssNode
* type hierarchy; all these classes should be value types with
* no elaborate behaviors.
*
* This node represents both a CSS fragment and the abstract value
* represented by that concrete syntax. For example, the following
* three declarations mean precisely the same thing in CSS:
* {@code
* .warning { content-before: 'warning:'; }
* .warning { content-before: "warning:"; }
* .warning { content-before: 'war\06e ing:"; }
* }
*
* Some clients care about concrete CSS syntax. For high-fidelity
* roundtrip CSS processing it is necessary to preserve the original
* author's choice of quote character. On the other hand, some clients
* care about abstract values. For purposes of machine translation or
* typeface resolution, we are uninterested in the differences that
* distinguish the cases shown above; in these applications we would
* like to deal in terms of the simple Java String {@code warning:}.
*
* Java's {@code Character} and {@code String} classes represent
* values in the UTF-16 encoding; some codepoints are represented by
* surrogate pairs of {@code Character} instances. CSS escape sequences are
* designed without a particular encoding in mind; there are CSS
* escape sequences that correspond to a single Unicode character
* and multiple Java {@code Character} instances.
*
* Java's {@code Character} repertoire is a strict subset of the
* codepoints that can be represented in CSS. When decoding CSS
* escape sequences, this class substitutes the Unicode replacement
* character for characters that cannot be represented in Java
* {@code Strings}, as permitted by
* http://www.w3.org/TR/CSS2/syndata.html#characters
*/
public class CssStringNode extends CssValueNode {
private static final String LINE_BREAK_PATTERN_STRING =
"\\n|\\r\\n|\\r|\\f";
private static final CharMatcher CONSUMABLE_WHITESPACE =
CharMatcher.anyOf("\n\t ");
private static final Pattern ESCAPE_CHAR_STRING_CONTINUATION_PATTERN =
Pattern.compile("\\\\" + LINE_BREAK_PATTERN_STRING);
private static final Pattern ESCAPE_CHAR_NOT_SPECIAL =
Pattern.compile("\\\\([^0-9a-fA-F\\n\\r\\f])");
private static final Pattern ESCAPE_CHAR_HARD_TO_TYPE =
Pattern.compile("\\\\([0-9a-fA-F]{1,6})(\\r\\n|[ \\t\\r\\n\\f])?");
/**
* The pattern for HTML markup special characters
*/
private static final Pattern HTML_PATTERN =
Pattern.compile("[<>\"&']");
private static final Pattern LINE_BREAK_PATTERN =
Pattern.compile(LINE_BREAK_PATTERN_STRING);
private static final Pattern HEX_PATTERN =
Pattern.compile("[0-9a-fA-F]+");
private static final Pattern WIDE_NONASCII_PATTERN =
Pattern.compile("\\P{ASCII}");
private final Type type;
/**
* The characters between the quotes in concrete CSS syntax. Some
* clients will want exact control and high fidelity for values in
* these nodes. Some will expect an AST to disregard unimportant
* detail and provide convenient access to a normalized
* representation of the stylesheet. This field stores a verbatim
* snippet of CSS corresponding to this node.
*/
private String concreteValue;
/**
* Constructor of a string node.
*
* @param type CSS provides multiple syntax alternatives for strings;
* which was used for this term?
* @param location The location in source code corresponding to this node
*/
public CssStringNode(Type type, SourceCodeLocation location) {
super("", location);
setConcreteValue(location.getSourceCode().getFileContents()
.substring(location.getBeginCharacterIndex()
// for the quote
+ 1,
// we end after the quote, so adjust
location.getEndCharacterIndex() - 1));
this.type = type;
}
/**
* Constructor of a string node.
*
* @param type CSS provides multiple syntax alternatives for strings;
* which was used for this term?
* @param value the Java String representation of this string (not its
* concrete CSS syntax)
*/
public CssStringNode(Type type, String value) {
super(value, /* location */ null);
this.type = type;
setValue(value);
}
/**
* Copy constructor.
*/
public CssStringNode(CssStringNode node) {
super(node);
type = node.type;
this.concreteValue = node.getConcreteValue();
}
public Type getType() {
return type;
}
/**
* Specifies the characters that should appear between the quotes
* when this node is written as CSS, and updates this node's value
* accordingly.
*
* For example, the Java method invocation: {@code
* n.setConcreteValue("hi\\\"");
* }
* could result in the CSS: {@code
* p { content-after: "hi\""; }
* } or perhaps {@code
* p { content-after: 'hi\"'; }
* }, depending on the {@code CssStringNode.Type} of {@code n} and
* the {@code CssTree} in which it occurs, but it would never
* result in {@code
* p { content-after: "hi\000022"; }
* }
*/
public String setConcreteValue(String concreteValue) {
this.concreteValue = concreteValue;
super.setValue(unescape(concreteValue));
return concreteValue;
}
/**
* Retrieves the characters that should appear between the quotes
* when this node is written in concrete CSS syntax.
*/
public String getConcreteValue() {
return concreteValue;
}
/**
* Establishes a value for this node by conservatively escaping
* {@code value} and delegating to {@link #setConcreteValue} to
* maintain consistency between the {@link #value} and the
* {@link #concreteValue}.
*
* This function stores a normalized representation of the given
* {@code value}; if you want to work in more exact terms, try
* {@link #setConcreteValue}.
*
* For example, the Java snippet: {@code
* n.setValue("Senator's Response")
* } could result in the CSS snippet: {@code
* p { content-before: "Senator's Response"; }
* }
* or {@code
* p { content-before: 'Senator\'s Response'; }
* }
* or {@code
* p { content-before: 'Senator\27 s Response'; }
* }, depending on the {@code CssStringNode.Type} of {@code n} and
* the {@code CssTree} in which it occurs and the choice of the
* {@code CssTreeVisitor} that renders the output.
*
* Note that the {@code value} parameter here will normally not
* begin or end with a quotation mark.
*/
@Override
public void setValue(String value) {
setConcreteValue(escape(type, HTML_ESCAPER, value));
}
@Override
public CssStringNode deepCopy() {
return new CssStringNode(this);
}
@Override
public String toString() {
return type.toString(getValue(), SHORT_ESCAPER);
}
/**
* Determines the canonical Java String representation of a value encoded
* in CSS syntax.
*
* @param escaped whatever lies between the quotes (excluding the quotes
* themselves).
*/
public static String unescape(String escaped) {
String result =
ESCAPE_CHAR_STRING_CONTINUATION_PATTERN.matcher(escaped).replaceAll("");
result = ESCAPE_CHAR_NOT_SPECIAL.matcher(result).replaceAll("$1");
Matcher unicode = ESCAPE_CHAR_HARD_TO_TYPE.matcher(result);
StringBuffer sb = new StringBuffer();
while (unicode.find()) {
// CSS allows us to substitute characters above 0x110000. Java
// requires us to stay at or below MAX_CODE_POINT. If we are
// allowed to substitute, and Java requires us to substitute,
// then we substitute. Otherwise: (a) everything's fine without
// substitution or (b) CSS does not permit a substitution we
// need to make for Java's happiness or (c) CSS allows a
// substitution but we don't need it. For (a) and (c) we don't
// substitute and that's fine. For (b) we don't substitute,
// probably that will produce an exception below, and then we'll
// know it's worth thinking about that case some more.
int codepoint = Integer.parseInt(unicode.group(1), 16);
if (codepoint > 0x10FFFF && codepoint > Character.MAX_CODE_POINT) {
// CSS allows us to substitute, and Java requires us not to use the
// character we were given, so here is a character specifically
// for replacements:
codepoint = 0xfffd;
// TODO(user): this would be a good spot for a warning.
}
String replacement =
codepoint == 0 ? "" : new String(Character.toChars(codepoint));
unicode.appendReplacement(sb, replacement);
}
unicode.appendTail(sb);
result = sb.toString();
return result;
}
private static String escapeLineBreaks(String input) {
Matcher linebreak = LINE_BREAK_PATTERN.matcher(input);
StringBuilder sb = new StringBuilder(input.length());
int left = 0;
while (linebreak.find()) {
if (linebreak.start() > left) {
sb.append(input.subSequence(left, linebreak.start()));
}
sb.append("\\00000a");
int right = linebreak.end();
if (right < input.length()) {
char c = input.charAt(right);
if (CONSUMABLE_WHITESPACE.matches(c)) {
// add sacrificial whitespace to preserve the original
sb.append(" ");
}
}
left = right;
}
if (left < input.length()) {
sb.append(input.subSequence(left, input.length()));
}
return sb.toString();
}
/**
* Encodes a CSS term denoting {@code raw}. In general, there are multiple
* representations in CSS of the same value; we allow clients to influence
* this choice through {@code discretionaryEscaper}.
*
* @see #HTML_ESCAPER
* @see #SHORT_ESCAPER
*/
public static String escape(
Type type, Function super String, String> discretionaryEscaper,
String raw) {
String result = raw.replaceAll(
// the Java String encoding of the regex encoding of a slash
"\\\\",
// the Java String encoding of a regex replacement encoding of a
// CSS-escaped slash
"\\\\\\\\");
result = escapeLineBreaks(result);
result = discretionaryEscaper.apply(result);
result = type.escapeForDelimiters(result);
return result;
}
/**
* Represents this node's value in CSS syntax that is also safe for
* inclusion in HTML attribute values and element contents. This is a
* good choice when you want defense in depth against client code that
* fails to escape things properly.
*/
public static final UnaryOperator HTML_ESCAPER =
input -> paranoidEscapeChars(WIDE_NONASCII_PATTERN, paranoidEscapeChars(HTML_PATTERN, input));
/**
* Replaces characters of questionable safety in {@code context} by
* CSS escape sequences that are safe for DOUBLE_QUOTED_STRING and
* SINGLE_QUOTED_STRING nodes and also in HTML attribute values and
* element content. This implementation's code is especially simple
* in hopes of improving safety.
*
* @param banned a {@code Pattern} matching strings of length one
* that should be escaped in the output.
* @param context a {@code String} input potentially containing
* codepoints that are {@code banned}.
*/
private static String paranoidEscapeChars(Pattern banned, String context) {
StringBuffer sb = new StringBuffer();
Matcher markup = banned.matcher(context);
while (markup.find()) {
String match = markup.group(0);
assert (
// We don't insert characters from whole cloth
match.length() > 0
// Our replacement accounts for the entire banned snippet,
// which is one codepoint but potentially multiple UTF-16
// Java Characters.
&& match.length() == match.offsetByCodePoints(0, 1));
markup.appendReplacement(
sb,
String.format("\\\\%06x", markup.group(0).codePointAt(0)));
if (markup.end() < context.length()
&& CONSUMABLE_WHITESPACE.matches(context.charAt(markup.end()))) {
// a whitespace after an escaped character requires the
// insertion of an additional whitespace between the
// escape sequence and the original whitespace.
sb.append(" ");
}
}
markup.appendTail(sb);
return sb.toString();
}
/**
* Replaces characters that have no literal representation in CSS with
* their escape codes. This implementation compromises computational
* efficiency in order to produce the shortest possible output for each
* replaced character. This is a good choice for readability.
*/
public static final Function SHORT_ESCAPER =
input -> {
StringBuffer sb = new StringBuffer();
Matcher m = WIDE_NONASCII_PATTERN.matcher(input);
while (m.find()) {
String match = m.group(0);
assert (
// We don't insert characters from whole cloth
match.length() > 0
// Our replacement accounts for the entire banned snippet,
// which is one codepoint but potentially multiple UTF-16
// Java Characters.
&& match.length() == match.offsetByCodePoints(0, 1));
/* Escape codes can have up to 6 digits. We are allowed to pad
* with 0s on the left.
* When the escaped character ends the string, we simply
* substitute the escape code for the escaped character.
* Otherwise, there are two cases in which we must insert a
* whitespace after our escape sequence:
* (1) We have fewer than 6 digits and the escaped character
* appears immediately before a hexadecimal digit in the
* input.
* (2) The escaped character appears immediately before a
* whitespace in the input Adding the space never results in
* longer CSS than adding zero padding, and sometimes it
* shortens our output, so we never pad with zeroes.
*/
String hexDigits = String.format("%x", match.codePointAt(0));
String trailer;
if (input.length() <= m.end()) {
// simple: the end of the escape sequence is the end of the string.
trailer = "";
} else if (hexDigits.length() < 6 && HEX_PATTERN.matcher(
input.subSequence(m.end(), m.end() + 1)).matches()) {
// a hex digit after a short escape sequence requires
// separation by an inserted whitespace.
trailer = " ";
} else if (CONSUMABLE_WHITESPACE.matches(input.charAt(m.end()))) {
// a whitespace after an escaped character requires the
// insertion of an additional whitespace between the
// escape sequence and the original whitespace.
trailer = " ";
} else {
trailer = "";
}
m.appendReplacement(
sb, String.format("\\\\%s%s", hexDigits, trailer));
}
m.appendTail(sb);
return sb.toString();
};
/**
* Generates a CSS snippet representing this node.
* This may differ in semantically unimportant ways from the snippet
* from which this node was originally parsed.
*
* You might reasonably {@code n.setConcreteValue(n.toString(ESC))}
* because that will not change what you get from {@code n.getValue()}.
* But it is probably an error to write
* {@code n.setValue(n.toString(ESC))}; you can pump the string
* to unbounded length by putting the latter snippet in the body
* of a loop.
*
* @return a {@code String} corresponding to this node's
* abstract value, but suitable for inclusion in CSS.
* @see #getValue
* @see #getConcreteValue
*/
public String toString(
Function super String, String> discretionaryEscaper) {
return this.type.toString(this.getValue(), discretionaryEscaper);
}
/**
* CSS syntax permits strings to be expressed either using
* single-quotes or double-quotes.
*/
public enum Type {
/* double-quoted string */
DOUBLE_QUOTED_STRING("\""),
/* single-quoted string */
SINGLE_QUOTED_STRING("'");
public final String delimiter;
public final String format;
Type(String delimiter) {
this.delimiter = delimiter;
this.format = String.format("%s%%s%s", delimiter, delimiter);
}
public String toString(
String value, Function super String, String> discretionaryEscaper) {
return String.format(format, escape(this, discretionaryEscaper, value));
}
/**
* Escape delimiters found in input so that they will not begin
* or end new lexemes.
*/
public String escapeForDelimiters(String input) {
return input.replaceAll(
delimiter,
// Java String literal encoding of a regex-replacement encoding of
// a slash used to cancel the meaning of the special CSS character
// (the delimiter) that follows.
"\\\\" + delimiter);
}
}
}