// com.puppycrawl.tools.checkstyle.checks.AvoidEscapedUnicodeCharactersCheck Maven / Gradle / Ivy
// Show all versions of checkstyle Show documentation
////////////////////////////////////////////////////////////////////////////////
// checkstyle: Checks Java source code for adherence to a set of rules.
// Copyright (C) 2001-2021 the original author or authors.
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
////////////////////////////////////////////////////////////////////////////////
package com.puppycrawl.tools.checkstyle.checks;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.puppycrawl.tools.checkstyle.FileStatefulCheck;
import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
import com.puppycrawl.tools.checkstyle.api.DetailAST;
import com.puppycrawl.tools.checkstyle.api.TextBlock;
import com.puppycrawl.tools.checkstyle.api.TokenTypes;
import com.puppycrawl.tools.checkstyle.utils.CheckUtil;
import com.puppycrawl.tools.checkstyle.utils.CommonUtil;
/**
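* <p>
* Restricts the use of
* <a href="https://docs.oracle.com/javase/specs/jls/se11/html/jls-3.html#jls-3.3">
* Unicode escapes</a>
* (such as &#92;u221e). It is possible to allow escapes for
* <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
* non-printable, control characters</a>.
* Also, this check can be configured to allow escapes
* if a trailing comment is present. Another option allows
* escapes if the literal contains nothing but escapes.
* </p>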
* <ul>
* <li>
* Property {@code allowEscapesForControlCharacters} - Allow the use of escapes for
* non-printable, control characters.
* Type is {@code boolean}.
* Default value is {@code false}.
* </li>
* <li>
* Property {@code allowByTailComment} - Allow the use of escapes if a trailing comment
* is present.
* Type is {@code boolean}.
* Default value is {@code false}.
* </li>
* <li>
* Property {@code allowIfAllCharactersEscaped} - Allow escapes if all characters in the
* literal are escaped.
* Type is {@code boolean}.
* Default value is {@code false}.
* </li>
* <li>
* Property {@code allowNonPrintableEscapes} - Allow the use of escapes for
* non-printable, whitespace characters.
* Type is {@code boolean}.
* Default value is {@code false}.
* </li>
* </ul>
* <p>
* To configure the check:
* </p>
* <pre>
* &lt;module name="AvoidEscapedUnicodeCharacters"/&gt;
* </pre>
* <p>
* Examples of using Unicode:
* </p>
* <pre>
* String unitAbbrev = "μs"; // OK, perfectly clear even without a comment.
* String unitAbbrev = "&#92;u03bcs";// violation, the reader has no idea what this is.
* return '&#92;ufeff' + content; // OK, an example of non-printable,
*                            // control characters (byte order mark).
* </pre>
* <p>
* An example of how to configure the check to allow escapes
* for non-printable, control characters:
* </p>
* <pre>
* &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
*   &lt;property name="allowEscapesForControlCharacters" value="true"/&gt;
* &lt;/module&gt;
* </pre>
* <p>
* Example of using escapes for non-printable, control characters:
* </p>
* <pre>
* String unitAbbrev = "μs"; // OK, a normal String
* String unitAbbrev = "&#92;u03bcs"; // violation, "&#92;u03bcs" is a printable character.
* return '&#92;ufeff' + content; // OK, non-printable control character.
* </pre>
* <p>
* An example of how to configure the check to allow escapes
* if a trailing comment is present:
* </p>
* <pre>
* &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
*   &lt;property name="allowByTailComment" value="true"/&gt;
* &lt;/module&gt;
* </pre>
* <p>
* Example of using escapes if a trailing comment is present:
* </p>
* <pre>
* String unitAbbrev = "μs"; // OK, a normal String
* String unitAbbrev = "&#92;u03bcs"; // OK, Greek letter mu, "s"
* return '&#92;ufeff' + content;
* // -----^--------------------- violation, comment is not used within same line.
* </pre>
* <p>
* An example of how to configure the check to allow escapes if
* all characters in the literal are escaped:
* </p>
* <pre>
* &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
*   &lt;property name="allowIfAllCharactersEscaped" value="true"/&gt;
* &lt;/module&gt;
* </pre>
* <p>
* Example of using escapes if all characters in the literal are escaped:
* </p>
* <pre>
* String unitAbbrev = "μs"; // OK, a normal String
* String unitAbbrev = "&#92;u03bcs"; // violation, not all characters are escaped ('s').
* String unitAbbrev = "&#92;u03bc&#92;u03bc&#92;u03bc"; // OK
* String unitAbbrev = "&#92;u03bc&#92;u03bcs";// violation, not all characters are escaped ('s').
* return '&#92;ufeff' + content; // OK, all control characters are escaped
* </pre>
* <p>
* An example of how to configure the check to allow escapes
* for non-printable whitespace characters:
* </p>
* <pre>
* &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
*   &lt;property name="allowNonPrintableEscapes" value="true"/&gt;
* &lt;/module&gt;
* </pre>
* <p>
* Example of using escapes for non-printable whitespace characters:
* </p>
* <pre>
* String unitAbbrev = "μs"; // OK, a normal String
* String unitAbbrev1 = "&#92;u03bcs"; // violation, printable escape character.
* String unitAbbrev2 = "&#92;u03bc&#92;u03bc&#92;u03bc"; // violation, printable escape character.
* String unitAbbrev3 = "&#92;u03bc&#92;u03bcs";// violation, printable escape character.
* return '&#92;ufeff' + content; // OK, non-printable escape character.
* </pre>
* <p>
* Parent is {@code com.puppycrawl.tools.checkstyle.TreeWalker}
* </p>
* <p>
* Violation Message Keys:
* </p>
* <ul>
* <li>
* {@code forbid.escaped.unicode.char}
* </li>
* </ul>
*
* @since 5.8
*/
@FileStatefulCheck
public class AvoidEscapedUnicodeCharactersCheck
extends AbstractCheck {
/**
* A key is pointing to the warning message text in "messages.properties"
* file.
*/
public static final String MSG_KEY = "forbid.escaped.unicode.char";
/** Regular expression for Unicode chars. */
private static final Pattern UNICODE_REGEXP = Pattern.compile("\\\\u+[a-fA-F0-9]{4}");
/**
* Regular expression Unicode control characters.
*
* @see
* Appendix:Control characters
*/
private static final Pattern UNICODE_CONTROL = Pattern.compile("\\\\u+"
+ "(00[0-1][0-9A-Fa-f]"
+ "|00[8-9][0-9A-Fa-f]"
+ "|00[aA][dD]"
+ "|034[fF]"
+ "|070[fF]"
+ "|180[eE]"
+ "|200[b-fB-F]"
+ "|202[a-eA-E]"
+ "|206[0-4a-fA-F]"
+ "|[fF]{3}[9a-bA-B]"
+ "|[fF][eE][fF]{2})");
/**
* Regular expression for all escaped chars.
* See "EscapeSequence" at
* https://docs.oracle.com/javase/specs/jls/se15/html/jls-3.html#jls-3.10.7
*/
private static final Pattern ALL_ESCAPED_CHARS = Pattern.compile("^("
+ UNICODE_REGEXP.pattern()
+ "|\""
+ "|'"
+ "|\\\\"
+ "|\\\\b"
+ "|\\\\f"
+ "|\\\\n"
+ "|\\R"
+ "|\\\\r"
+ "|\\\\s"
+ "|\\\\t"
+ ")+$");
/** Regular expression for escaped backslash. */
private static final Pattern ESCAPED_BACKSLASH = Pattern.compile("\\\\\\\\");
/** Regular expression for non-printable unicode chars. */
private static final Pattern NON_PRINTABLE_CHARS = Pattern.compile("\\\\u0000"
+ "|\\\\u0009"
+ "|\\\\u000[bB]"
+ "|\\\\u000[cC]"
+ "|\\\\u0020"
+ "|\\\\u007[fF]"
+ "|\\\\u0085"
+ "|\\\\u009[fF]"
+ "|\\\\u00[aA]0"
+ "|\\\\u00[aA][dD]"
+ "|\\\\u04[fF]9"
+ "|\\\\u05[bB][eE]"
+ "|\\\\u05[dD]0"
+ "|\\\\u05[eE][aA]"
+ "|\\\\u05[fF]3"
+ "|\\\\u05[fF]4"
+ "|\\\\u0600"
+ "|\\\\u0604"
+ "|\\\\u061[cC]"
+ "|\\\\u06[dD]{2}"
+ "|\\\\u06[fF]{2}"
+ "|\\\\u070[fF]"
+ "|\\\\u0750"
+ "|\\\\u077[fF]"
+ "|\\\\u0[eE]00"
+ "|\\\\u0[eE]7[fF]"
+ "|\\\\u1680"
+ "|\\\\u180[eE]"
+ "|\\\\u1[eE]00"
+ "|\\\\u2000"
+ "|\\\\u2001"
+ "|\\\\u2002"
+ "|\\\\u2003"
+ "|\\\\u2004"
+ "|\\\\u2005"
+ "|\\\\u2006"
+ "|\\\\u2007"
+ "|\\\\u2008"
+ "|\\\\u2009"
+ "|\\\\u200[aA]"
+ "|\\\\u200[fF]"
+ "|\\\\u2025"
+ "|\\\\u2028"
+ "|\\\\u2029"
+ "|\\\\u202[fF]"
+ "|\\\\u205[fF]"
+ "|\\\\u2064"
+ "|\\\\u2066"
+ "|\\\\u2067"
+ "|\\\\u2068"
+ "|\\\\u2069"
+ "|\\\\u206[aA]"
+ "|\\\\u206[fF]"
+ "|\\\\u20[aA][fF]"
+ "|\\\\u2100"
+ "|\\\\u213[aA]"
+ "|\\\\u3000"
+ "|\\\\u[dD]800"
+ "|\\\\u[fF]8[fF]{2}"
+ "|\\\\u[fF][bB]50"
+ "|\\\\u[fF][dD][fF]{2}"
+ "|\\\\u[fF][eE]70"
+ "|\\\\u[fF][eE][fF]{2}"
+ "|\\\\u[fF]{2}0[eE]"
+ "|\\\\u[fF]{2}61"
+ "|\\\\u[fF]{2}[dD][cC]"
+ "|\\\\u[fF]{3}9"
+ "|\\\\u[fF]{3}[aA]"
+ "|\\\\u[fF]{3}[bB]"
+ "|\\\\u[fF]{4}");
/** Cpp style comments. */
private Map singlelineComments;
/** C style comments. */
private Map> blockComments;
/** Allow use escapes for non-printable, control characters. */
private boolean allowEscapesForControlCharacters;
/** Allow use escapes if trail comment is present. */
private boolean allowByTailComment;
/** Allow if all characters in literal are escaped. */
private boolean allowIfAllCharactersEscaped;
/** Allow use escapes for non-printable, whitespace characters. */
private boolean allowNonPrintableEscapes;
/**
* Setter to allow use escapes for non-printable, control characters.
*
* @param allow user's value.
*/
public final void setAllowEscapesForControlCharacters(boolean allow) {
allowEscapesForControlCharacters = allow;
}
/**
* Setter to allow use escapes if trail comment is present.
*
* @param allow user's value.
*/
public final void setAllowByTailComment(boolean allow) {
allowByTailComment = allow;
}
/**
* Setter to allow if all characters in literal are escaped.
*
* @param allow user's value.
*/
public final void setAllowIfAllCharactersEscaped(boolean allow) {
allowIfAllCharactersEscaped = allow;
}
/**
* Setter to allow use escapes for non-printable, whitespace characters.
*
* @param allow user's value.
*/
public final void setAllowNonPrintableEscapes(boolean allow) {
allowNonPrintableEscapes = allow;
}
@Override
public int[] getDefaultTokens() {
return getRequiredTokens();
}
@Override
public int[] getAcceptableTokens() {
return getRequiredTokens();
}
@Override
public int[] getRequiredTokens() {
return new int[] {
TokenTypes.STRING_LITERAL,
TokenTypes.CHAR_LITERAL,
TokenTypes.TEXT_BLOCK_CONTENT,
};
}
@Override
public void beginTree(DetailAST rootAST) {
singlelineComments = getFileContents().getSingleLineComments();
blockComments = getFileContents().getBlockComments();
}
@Override
public void visitToken(DetailAST ast) {
final String literal =
CheckUtil.stripIndentAndInitialNewLineFromTextBlock(ast.getText());
if (hasUnicodeChar(literal) && !(allowByTailComment && hasTrailComment(ast)
|| isAllCharactersEscaped(literal)
|| allowEscapesForControlCharacters
&& isOnlyUnicodeValidChars(literal, UNICODE_CONTROL)
|| allowNonPrintableEscapes
&& isOnlyUnicodeValidChars(literal, NON_PRINTABLE_CHARS))) {
log(ast, MSG_KEY);
}
}
/**
* Checks if literal has Unicode chars.
*
* @param literal String literal.
* @return true if literal has Unicode chars.
*/
private static boolean hasUnicodeChar(String literal) {
final String literalWithoutEscapedBackslashes =
ESCAPED_BACKSLASH.matcher(literal).replaceAll("");
return UNICODE_REGEXP.matcher(literalWithoutEscapedBackslashes).find();
}
/**
* Check if String literal contains Unicode control chars.
*
* @param literal String literal.
* @param pattern RegExp for valid characters.
* @return true, if String literal contains Unicode control chars.
*/
private static boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) {
final int unicodeMatchesCounter =
countMatches(UNICODE_REGEXP, literal);
final int unicodeValidMatchesCounter =
countMatches(pattern, literal);
return unicodeMatchesCounter - unicodeValidMatchesCounter == 0;
}
/**
* Check if trail comment is present after ast token.
*
* @param ast current token.
* @return true if trail comment is present after ast token.
*/
private boolean hasTrailComment(DetailAST ast) {
int lineNo = ast.getLineNo();
// Since the trailing comment in the case of text blocks must follow the """ delimiter,
// we need to look for it after TEXT_BLOCK_LITERAL_END.
if (ast.getType() == TokenTypes.TEXT_BLOCK_CONTENT) {
lineNo = ast.getNextSibling().getLineNo();
}
boolean result = false;
if (singlelineComments.containsKey(lineNo)) {
result = true;
}
else {
final List commentList = blockComments.get(lineNo);
if (commentList != null) {
final TextBlock comment = commentList.get(commentList.size() - 1);
final String line = getLines()[lineNo - 1];
result = isTrailingBlockComment(comment, line);
}
}
return result;
}
/**
* Whether the C style comment is trailing.
*
* @param comment the comment to check.
* @param line the line where the comment starts.
* @return true if the comment is trailing.
*/
private static boolean isTrailingBlockComment(TextBlock comment, String line) {
return comment.getText().length != 1
|| CommonUtil.isBlank(line.substring(comment.getEndColNo() + 1));
}
/**
* Count regexp matches into String literal.
*
* @param pattern pattern.
* @param target String literal.
* @return count of regexp matches.
*/
private static int countMatches(Pattern pattern, String target) {
int matcherCounter = 0;
final Matcher matcher = pattern.matcher(target);
while (matcher.find()) {
matcherCounter++;
}
return matcherCounter;
}
/**
* Checks if all characters in String literal is escaped.
*
* @param literal current literal.
* @return true if all characters in String literal is escaped.
*/
private boolean isAllCharactersEscaped(String literal) {
return allowIfAllCharactersEscaped
&& ALL_ESCAPED_CHARS.matcher(literal).find();
}
}