///////////////////////////////////////////////////////////////////////////////////////////////
// checkstyle: Checks Java source code and other text files for adherence to a set of rules.
// Copyright (C) 2001-2026 the original author or authors.
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
///////////////////////////////////////////////////////////////////////////////////////////////

package com.puppycrawl.tools.checkstyle.checks.coding;

import java.util.HashSet;
import java.util.Set;

import com.puppycrawl.tools.checkstyle.FileStatefulCheck;
import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
import com.puppycrawl.tools.checkstyle.api.DetailAST;
import com.puppycrawl.tools.checkstyle.api.TokenTypes;
import com.puppycrawl.tools.checkstyle.utils.CommonUtil;

/**
 * <div>
 * Checks that specified symbols (by Unicode code points or ranges) are not used in code.
 * By default, blocks common symbol ranges.
 * </div>
 *
 * <p>
 * Rationale: This check helps prevent emoji symbols and special characters in code
 * (commonly added by AI tools), enforce coding standards, or forbid specific Unicode characters.
 * </p>
 *
 * <p>
 * Default ranges cover:
 * </p>
 * <ul>
 * <li>U+2190–U+27BF: Arrows, Mathematical Operators, Box Drawing, Geometric Shapes,
 * Miscellaneous Symbols, and Dingbats</li>
 * <li>U+1F600–U+1F64F: Emoticons</li>
 * <li>U+1F680–U+1F6FF: Transport and Map Symbols</li>
 * <li>U+1F700–U+10FFFF: Alchemical Symbols and other pictographic symbols. The configured
 * upper bound is {@code 0x1FFFFF}, which lies beyond U+10FFFF, the highest Unicode
 * code point, so the range effectively ends at U+10FFFF.</li>
 * </ul>
 *
 * <p>
 * For a complete list of Unicode characters and ranges, see:
 * <a href="https://en.wikipedia.org/wiki/List_of_Unicode_characters">
 * List of Unicode characters</a>
 * </p>
 *
 * <p>
 * A token is reported at most once: scanning of its text stops at the first
 * configured symbol found.
 * </p>
 *
 * <ul>
 * <li>
 * Property {@code symbolCodes} - Specify the symbols to check for, as Unicode code points
 * or ranges. Format: comma-separated list of hex codes or ranges
 * (e.g., {@code "0x2705, 0x1F600-0x1F64F"}). To allow only ASCII characters,
 * use {@code "0x0080-0x10FFFF"}.
 * Type is {@code java.lang.String}.
 * Default value is {@code "0x2190-0x27BF, 0x1F600-0x1F64F, 0x1F680-0x1F6FF, 0x1F700-0x1FFFFF"}.
 * </li>
 * </ul>
 *
 * @since 13.3.0
 */
@FileStatefulCheck
public class IllegalSymbolCheck extends AbstractCheck {

    /**
     * A key is pointing to the warning message text in "messages.properties" file.
     */
    public static final String MSG_KEY = "illegal.symbol";

    /** Separator between the start and the end of a range. */
    private static final String RANGE_SEPARATOR = "-";

    /** Radix used to parse every code point. */
    private static final int HEX_RADIX = 16;

    /** Length shared by all recognised code point prefixes. */
    private static final int PREFIX_LENGTH = 2;

    /** Prefixes that may precede the hexadecimal digits of a code point. */
    private static final Set<String> CODE_POINT_PREFIXES =
            Set.of("\\u", "0x", "0X", "U+", "u+");

    /** Individually configured code points, parsed from {@code symbolCodes}. */
    private final Set<Integer> singleCodePoints = new HashSet<>();

    /** Configured code point ranges, parsed from {@code symbolCodes}. */
    private final Set<CodePointRange> codePointRanges = new HashSet<>();

    /** Specify the symbols to check for, as Unicode code points or ranges. */
    private String symbolCodes = "0x2190-0x27BF, 0x1F600-0x1F64F, 0x1F680-0x1F6FF, "
            + "0x1F700-0x1FFFFF";

    /** Whether {@code symbolCodes} has already been parsed into the lookup sets. */
    private boolean initialized;

    /**
     * Setter to specify the symbols to check for.
     * Format: comma-separated list of hex codes or ranges
     * (e.g., "0x2705, 0x1F600-0x1F64F"). Blank entries between commas are skipped.
     *
     * @param symbols the symbols specification
     * @throws IllegalArgumentException if the format is invalid
     * @since 13.3.0
     */
    public void setSymbolCodes(String symbols) {
        symbolCodes = symbols;
        singleCodePoints.clear();
        codePointRanges.clear();
        initialized = true;

        // An empty specification splits into one blank entry, which is skipped below.
        for (String rawEntry : symbols.split(",", -1)) {
            final String entry = rawEntry.trim();
            if (!entry.isEmpty()) {
                registerEntry(entry);
            }
        }
    }

    @Override
    public int[] getDefaultTokens() {
        return new int[] {TokenTypes.COMMENT_CONTENT};
    }

    @Override
    public int[] getAcceptableTokens() {
        return new int[] {
            TokenTypes.COMMENT_CONTENT,
            TokenTypes.STRING_LITERAL,
            TokenTypes.CHAR_LITERAL,
            TokenTypes.TEXT_BLOCK_CONTENT,
            TokenTypes.IDENT,
        };
    }

    @Override
    public int[] getRequiredTokens() {
        return CommonUtil.EMPTY_INT_ARRAY;
    }

    @Override
    public boolean isCommentNodesRequired() {
        return true;
    }

    @Override
    public void visitToken(DetailAST ast) {
        checkText(ast.getText(), ast);
    }

    /**
     * Stores one trimmed, non-empty entry of the specification either as a range
     * or as a single code point.
     *
     * @param entry the entry to parse
     * @throws IllegalArgumentException if the entry cannot be parsed
     */
    private void registerEntry(String entry) {
        try {
            if (entry.contains(RANGE_SEPARATOR)) {
                parseRange(entry);
            }
            else {
                singleCodePoints.add(parseCodePoint(entry));
            }
        }
        catch (NumberFormatException exception) {
            throw new IllegalArgumentException(
                    "Invalid symbol code format: " + entry, exception);
        }
    }

    /**
     * Logs a single violation on the node if the text holds any configured symbol.
     * The text is examined code point by code point, so a surrogate pair counts as
     * one symbol, and the scan ends at the first match.
     *
     * @param text the text to check
     * @param ast the AST node
     */
    private void checkText(String text, DetailAST ast) {
        if (text.codePoints().anyMatch(this::isIllegalSymbol)) {
            log(ast, MSG_KEY);
        }
    }

    /**
     * Tells whether a code point is forbidden by the current configuration.
     * Nothing is forbidden when the specification is empty.
     *
     * @param codePoint the code point to check
     * @return true if the code point is illegal
     */
    private boolean isIllegalSymbol(int codePoint) {
        final boolean nothingConfigured = symbolCodes.isEmpty();
        return !nothingConfigured && isInSymbolCodes(codePoint);
    }

    /**
     * Tells whether a code point matches a configured single code point or falls
     * inside a configured range. The specification is parsed on first use when the
     * setter has not been called yet.
     *
     * @param codePoint the code point to check
     * @return true if in symbol codes
     */
    private boolean isInSymbolCodes(int codePoint) {
        if (!initialized) {
            setSymbolCodes(symbolCodes);
        }

        return singleCodePoints.contains(codePoint)
                || codePointRanges.stream().anyMatch(range -> range.contains(codePoint));
    }

    /**
     * Parses a {@code start-end} entry and stores it as a range.
     *
     * @param rangeStr the range string
     * @throws IllegalArgumentException if range format is invalid
     */
    private void parseRange(String rangeStr) {
        final String[] bounds = rangeStr.split(RANGE_SEPARATOR, -1);
        final boolean wellFormed = bounds.length == 2
                && !CommonUtil.isBlank(bounds[0])
                && !CommonUtil.isBlank(bounds[1]);
        if (!wellFormed) {
            throw new IllegalArgumentException("Invalid range format: " + rangeStr);
        }

        final int start = parseCodePoint(bounds[0].trim());
        final int end = parseCodePoint(bounds[1].trim());

        if (end < start) {
            throw new IllegalArgumentException(
                    "Range start must be <= end: " + rangeStr);
        }

        codePointRanges.add(new CodePointRange(start, end));
    }

    /**
     * Parse a code point from string representation.
     * Supports formats: 0x1234, \\u1234, U+1234, or plain hex.
     *
     * @param str the string to parse
     * @return the code point value
     * @throws NumberFormatException if the string cannot be parsed
     */
    private static int parseCodePoint(String str) {
        final String cleaned = str.trim();
        final String digits;

        if (CODE_POINT_PREFIXES.stream().anyMatch(cleaned::startsWith)) {
            // A bare prefix carries no digits at all.
            if (cleaned.length() == PREFIX_LENGTH) {
                throw new NumberFormatException("Invalid code point format: " + cleaned);
            }
            digits = cleaned.substring(PREFIX_LENGTH);
        }
        else {
            digits = cleaned;
        }
        return Integer.parseInt(digits, HEX_RADIX);
    }

    /**
     * Represents a parsed Unicode range with inclusive bounds.
     *
     * @param start start of range
     * @param end end of range
     */
    private record CodePointRange(int start, int end) {

        /**
         * Check if code point is in this range, both bounds included.
         *
         * @param codePoint code point to check
         * @return true if in range
         */
        /* package */ boolean contains(int codePoint) {
            return start <= codePoint && codePoint <= end;
        }
    }
}