///////////////////////////////////////////////////////////////////////////////////////////////
// checkstyle: Checks Java source code and other text files for adherence to a set of rules.
// Copyright (C) 2001-2025 the original author or authors.
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
///////////////////////////////////////////////////////////////////////////////////////////////

package com.puppycrawl.tools.checkstyle.checks;

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.puppycrawl.tools.checkstyle.FileStatefulCheck;
import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
import com.puppycrawl.tools.checkstyle.api.DetailAST;
import com.puppycrawl.tools.checkstyle.api.TextBlock;
import com.puppycrawl.tools.checkstyle.api.TokenTypes;
import com.puppycrawl.tools.checkstyle.utils.CheckUtil;
import com.puppycrawl.tools.checkstyle.utils.CodePointUtil;

/**
 * <div>
 * Restricts using
 * <a href = "https://docs.oracle.com/javase/specs/jls/se11/html/jls-3.html#jls-3.3">
 * Unicode escapes</a>
 * (such as &#92;u221e).
 * It is possible to allow using escapes for
 * <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
 * non-printable, control characters</a>.
 * Also, this check can be configured to allow using escapes
 * if a trailing comment is present. By the option it is possible to
 * allow using escapes if the literal contains only them.
 * </div>
 *
 * @since 5.8
 */
@FileStatefulCheck
public class AvoidEscapedUnicodeCharactersCheck
    extends AbstractCheck {

    /**
     * A key is pointing to the warning message text in "messages.properties"
     * file.
     */
    public static final String MSG_KEY = "forbid.escaped.unicode.char";

    /** Regular expression for Unicode chars. */
    private static final Pattern UNICODE_REGEXP = Pattern.compile("\\\\u+[a-fA-F\\d]{4}");

    /**
     * Regular expression for Unicode control characters.
     *
     * @see <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
     *     Appendix:Control characters</a>
     */
    private static final Pattern UNICODE_CONTROL = Pattern.compile("\\\\u+"
            + "(00[0-1][\\dA-Fa-f]"
            + "|00[8-9][\\dA-Fa-f]"
            + "|00[aA][dD]"
            + "|034[fF]"
            + "|070[fF]"
            + "|180[eE]"
            + "|200[b-fB-F]"
            + "|202[a-eA-E]"
            + "|206[0-4a-fA-F]"
            + "|[fF]{3}[9a-bA-B]"
            + "|[fF][eE][fF]{2})");

    /**
     * Regular expression for all escaped chars.
     * See <a href="https://docs.oracle.com/javase/specs/jls/se15/html/jls-3.html#jls-3.10.7">
     * EscapeSequence</a>
     */
    private static final Pattern ALL_ESCAPED_CHARS = Pattern.compile("^("
            + UNICODE_REGEXP.pattern()
            + "|\""
            + "|'"
            + "|\\\\"
            + "|\\\\b"
            + "|\\\\f"
            + "|\\\\n"
            + "|\\R"
            + "|\\\\r"
            + "|\\\\s"
            + "|\\\\t"
            + ")+$");

    /** Regular expression for escaped backslash. */
    private static final Pattern ESCAPED_BACKSLASH = Pattern.compile("\\\\\\\\");

    /** Regular expression for non-printable unicode chars. */
    private static final Pattern NON_PRINTABLE_CHARS = Pattern.compile("\\\\u0000"
            + "|\\\\u0009"
            + "|\\\\u000[bB]"
            + "|\\\\u000[cC]"
            + "|\\\\u0020"
            + "|\\\\u007[fF]"
            + "|\\\\u0085"
            + "|\\\\u009[fF]"
            + "|\\\\u00[aA]0"
            + "|\\\\u00[aA][dD]"
            + "|\\\\u04[fF]9"
            + "|\\\\u05[bB][eE]"
            + "|\\\\u05[dD]0"
            + "|\\\\u05[eE][aA]"
            + "|\\\\u05[fF]3"
            + "|\\\\u05[fF]4"
            + "|\\\\u0600"
            + "|\\\\u0604"
            + "|\\\\u061[cC]"
            + "|\\\\u06[dD]{2}"
            + "|\\\\u06[fF]{2}"
            + "|\\\\u070[fF]"
            + "|\\\\u0750"
            + "|\\\\u077[fF]"
            + "|\\\\u0[eE]00"
            + "|\\\\u0[eE]7[fF]"
            + "|\\\\u1680"
            + "|\\\\u180[eE]"
            + "|\\\\u1[eE]00"
            + "|\\\\u2000"
            + "|\\\\u2001"
            + "|\\\\u2002"
            + "|\\\\u2003"
            + "|\\\\u2004"
            + "|\\\\u2005"
            + "|\\\\u2006"
            + "|\\\\u2007"
            + "|\\\\u2008"
            + "|\\\\u2009"
            + "|\\\\u200[aA]"
            + "|\\\\u200[fF]"
            + "|\\\\u2025"
            + "|\\\\u2028"
            + "|\\\\u2029"
            + "|\\\\u202[fF]"
            + "|\\\\u205[fF]"
            + "|\\\\u2064"
            + "|\\\\u2066"
            + "|\\\\u2067"
            + "|\\\\u2068"
            + "|\\\\u2069"
            + "|\\\\u206[aA]"
            + "|\\\\u206[fF]"
            + "|\\\\u20[aA][fF]"
            + "|\\\\u2100"
            + "|\\\\u213[aA]"
            + "|\\\\u3000"
            + "|\\\\u[dD]800"
            + "|\\\\u[fF]8[fF]{2}"
            + "|\\\\u[fF][bB]50"
            + "|\\\\u[fF][dD][fF]{2}"
            + "|\\\\u[fF][eE]70"
            + "|\\\\u[fF][eE][fF]{2}"
            + "|\\\\u[fF]{2}0[eE]"
            + "|\\\\u[fF]{2}61"
            + "|\\\\u[fF]{2}[dD][cC]"
            + "|\\\\u[fF]{3}9"
            + "|\\\\u[fF]{3}[aA]"
            + "|\\\\u[fF]{3}[bB]"
            + "|\\\\u[fF]{4}");

    /** Cpp style comments. */
    private Map<Integer, TextBlock> singlelineComments;
    /** C style comments. */
    private Map<Integer, List<TextBlock>> blockComments;

    /** Allow use escapes for non-printable, control characters. */
    private boolean allowEscapesForControlCharacters;

    /** Allow use escapes if trail comment is present. */
    private boolean allowByTailComment;

    /** Allow if all characters in literal are escaped. */
    private boolean allowIfAllCharactersEscaped;

    /** Allow use escapes for non-printable, whitespace characters. */
    private boolean allowNonPrintableEscapes;

    /**
     * Setter to allow use escapes for non-printable, control characters.
     *
     * @param allow user's value.
     * @since 5.8
     */
    public final void setAllowEscapesForControlCharacters(boolean allow) {
        allowEscapesForControlCharacters = allow;
    }

    /**
     * Setter to allow use escapes if trail comment is present.
     *
     * @param allow user's value.
     * @since 5.8
     */
    public final void setAllowByTailComment(boolean allow) {
        allowByTailComment = allow;
    }

    /**
     * Setter to allow if all characters in literal are escaped.
     *
     * @param allow user's value.
     * @since 5.8
     */
    public final void setAllowIfAllCharactersEscaped(boolean allow) {
        allowIfAllCharactersEscaped = allow;
    }

    /**
     * Setter to allow use escapes for non-printable, whitespace characters.
     *
     * @param allow user's value.
     * @since 5.8
     */
    public final void setAllowNonPrintableEscapes(boolean allow) {
        allowNonPrintableEscapes = allow;
    }

    @Override
    public int[] getDefaultTokens() {
        return getRequiredTokens();
    }

    @Override
    public int[] getAcceptableTokens() {
        return getRequiredTokens();
    }

    @Override
    public int[] getRequiredTokens() {
        return new int[] {
            TokenTypes.STRING_LITERAL,
            TokenTypes.CHAR_LITERAL,
            TokenTypes.TEXT_BLOCK_CONTENT,
        };
    }

    // suppress deprecation until https://github.com/checkstyle/checkstyle/issues/11166
    @SuppressWarnings("deprecation")
    @Override
    public void beginTree(DetailAST rootAST) {
        // Refresh the per-file comment maps consulted by hasTrailComment.
        singlelineComments = getFileContents().getSingleLineComments();
        blockComments = getFileContents().getBlockComments();
    }

    @Override
    public void visitToken(DetailAST ast) {
        final String literal =
            CheckUtil.stripIndentAndInitialNewLineFromTextBlock(ast.getText());

        // A violation is logged when the literal has a Unicode escape and none of the
        // enabled "allow" options excuses it.
        if (hasUnicodeChar(literal) && !(allowByTailComment && hasTrailComment(ast)
                || isAllCharactersEscaped(literal)
                || allowEscapesForControlCharacters
                        && isOnlyUnicodeValidChars(literal, UNICODE_CONTROL)
                || allowNonPrintableEscapes
                        && isOnlyUnicodeValidChars(literal, NON_PRINTABLE_CHARS))) {
            log(ast, MSG_KEY);
        }
    }

    /**
     * Checks if literal has Unicode chars.
     * Escaped backslashes are removed first, so that an escaped backslash followed
     * by the letter 'u' and four hex digits is not mistaken for a Unicode escape.
     *
     * @param literal String literal.
     * @return true if literal has Unicode chars.
     */
    private static boolean hasUnicodeChar(String literal) {
        return UNICODE_REGEXP.matcher(removeEscapedBackslashes(literal)).find();
    }

    /**
     * Checks that every Unicode escape found in the literal is also matched by the
     * given pattern of allowed escapes. A literal without any Unicode escape
     * trivially satisfies the check.
     *
     * <p>Escaped backslashes are removed before counting, exactly as in
     * {@link #hasUnicodeChar(String)}. Otherwise an escaped backslash followed by
     * the letter 'u' and four hex digits would be counted as a Unicode escape
     * that can never be an allowed one, and the literal would be reported even
     * though all of its real escapes are allowed.
     *
     * @param literal String literal.
     * @param pattern RegExp for valid characters.
     * @return true, if all Unicode escapes of the literal match the pattern.
     */
    private static boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) {
        final String literalWithoutEscapedBackslashes = removeEscapedBackslashes(literal);
        final int unicodeMatchesCounter =
                countMatches(UNICODE_REGEXP, literalWithoutEscapedBackslashes);
        final int unicodeValidMatchesCounter =
                countMatches(pattern, literalWithoutEscapedBackslashes);
        return unicodeMatchesCounter == unicodeValidMatchesCounter;
    }

    /**
     * Removes every escaped backslash (two consecutive backslash characters)
     * from the literal.
     *
     * @param literal String literal.
     * @return the literal without escaped backslashes.
     */
    private static String removeEscapedBackslashes(String literal) {
        return ESCAPED_BACKSLASH.matcher(literal).replaceAll("");
    }

    /**
     * Check if trail comment is present after ast token.
     *
     * @param ast current token.
     * @return true if trail comment is present after ast token.
     */
    private boolean hasTrailComment(DetailAST ast) {
        int lineNo = ast.getLineNo();

        // Since the trailing comment in the case of text blocks must follow the """ delimiter,
        // we need to look for it after TEXT_BLOCK_LITERAL_END.
        if (ast.getType() == TokenTypes.TEXT_BLOCK_CONTENT) {
            lineNo = ast.getNextSibling().getLineNo();
        }
        boolean result = false;
        if (singlelineComments.containsKey(lineNo)) {
            result = true;
        }
        else {
            final List<TextBlock> commentList = blockComments.get(lineNo);
            if (commentList != null) {
                // Only the last block comment registered for the line is examined.
                final TextBlock comment = commentList.get(commentList.size() - 1);
                final int[] codePoints = getLineCodePoints(lineNo - 1);
                result = isTrailingBlockComment(comment, codePoints);
            }
        }
        return result;
    }

    /**
     * Whether the C style comment is trailing. The comment is treated as trailing
     * when its text does not consist of exactly one element (presumably one element
     * per line of the comment), or when only blank code points follow its end column.
     *
     * @param comment the comment to check.
     * @param codePoints the first line of the comment, in unicode code points
     * @return true if the comment is trailing.
     */
    private static boolean isTrailingBlockComment(TextBlock comment, int... codePoints) {
        return comment.getText().length != 1
            || CodePointUtil.isBlank(Arrays.copyOfRange(codePoints,
                comment.getEndColNo() + 1, codePoints.length));
    }

    /**
     * Count regexp matches into String literal.
     *
     * @param pattern pattern.
     * @param target String literal.
     * @return count of regexp matches.
     */
    private static int countMatches(Pattern pattern, String target) {
        int matcherCounter = 0;
        final Matcher matcher = pattern.matcher(target);
        while (matcher.find()) {
            matcherCounter++;
        }
        return matcherCounter;
    }

    /**
     * Checks if all characters in String literal are escaped.
     * Always false unless the {@code allowIfAllCharactersEscaped} option is enabled.
     *
     * @param literal current literal.
     * @return true if all characters in String literal are escaped.
     */
    private boolean isAllCharactersEscaped(String literal) {
        return allowIfAllCharactersEscaped
                && ALL_ESCAPED_CHARS.matcher(literal).find();
    }

}