001///////////////////////////////////////////////////////////////////////////////////////////////
002// checkstyle: Checks Java source code and other text files for adherence to a set of rules.
003// Copyright (C) 2001-2026 the original author or authors.
004//
005// This library is free software; you can redistribute it and/or
006// modify it under the terms of the GNU Lesser General Public
007// License as published by the Free Software Foundation; either
008// version 2.1 of the License, or (at your option) any later version.
009//
010// This library is distributed in the hope that it will be useful,
011// but WITHOUT ANY WARRANTY; without even the implied warranty of
012// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
013// Lesser General Public License for more details.
014//
015// You should have received a copy of the GNU Lesser General Public
016// License along with this library; if not, write to the Free Software
017// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
018///////////////////////////////////////////////////////////////////////////////////////////////
019
020package com.puppycrawl.tools.checkstyle.checks.coding;
021
import java.util.Arrays;

import com.puppycrawl.tools.checkstyle.StatelessCheck;
import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
import com.puppycrawl.tools.checkstyle.api.DetailAST;
import com.puppycrawl.tools.checkstyle.api.TokenTypes;
import com.puppycrawl.tools.checkstyle.utils.CommonUtil;
027
028/**
029 * <div>
030 * Checks that specified symbols (by Unicode code points or ranges) are not used in code.
031 * By default, blocks common symbol ranges (U+2190–U+27BF and U+1F700–U+10FFFF).
032 * </div>
033 *
034 * <p>
035 * Rationale: This check helps prevent emoji symbols and special characters in code
036 * (commonly added by AI tools), enforce coding standards, or forbid specific Unicode characters.
037 * </p>
038 *
039 * <p>
040 * Default ranges cover:
041 * </p>
042 * <ul>
043 * <li>U+2190–U+27BF: Arrows, Mathematical Operators, Box Drawing, Geometric Shapes,
044 * Miscellaneous Symbols, and Dingbats</li>
045 * <li>U+1F700–U+10FFFF: Alchemical Symbols, Emoticons, Transport Symbols,
046 * and all other pictographic symbols</li>
047 * </ul>
048 *
049 * <p>
050 * For a complete list of Unicode characters and ranges, see:
051 * <a href="https://en.wikipedia.org/wiki/List_of_Unicode_characters">
052 * List of Unicode characters</a>
053 * </p>
054 *
055 * <ul>
056 * <li>
057 * Property {@code symbolCodes} - Specify the symbols to check for, as Unicode code points
058 * or ranges. Format: comma-separated list of hex codes or ranges
059 * (e.g., {@code "0x2705, 0x1F600-0x1F64F"}). To allow only ASCII characters,
060 * use {@code "0x0080-0x10FFFF"}.
061 * Type is {@code java.lang.String}.
062 * Default value is {@code "0x2190-0x27BF, 0x1F700-0x10FFFF"}.
063 * </li>
064 * </ul>
065 *
066 * @since 13.1.0
067 */
068@StatelessCheck
069public class IllegalSymbolCheck extends AbstractCheck {
070
071    /**
072     * A key is pointing to the warning message text in "messages.properties" file.
073     */
074    public static final String MSG_KEY = "illegal.symbol";
075
076    /** String Range Separator. */
077    private static final String RANGE_SEPARATOR = "-";
078
079    /** Specify the symbols to check for, as Unicode code points or ranges. */
080    private String symbolCodes = "0x2190-0x27BF, 0x1F700-0x10FFFF";
081
082    /**
083     * Setter to specify the symbols to check for.
084     * Format: comma-separated list of hex codes or ranges
085     * (e.g., "0x2705, 0xd83c-0xd83e").
086     *
087     * @param symbols the symbols specification
088     * @since 13.1.0
089     */
090    public void setSymbolCodes(String symbols) {
091        symbolCodes = symbols;
092    }
093
094    @Override
095    public int[] getDefaultTokens() {
096        return new int[] {
097            TokenTypes.COMMENT_CONTENT,
098        };
099    }
100
101    @Override
102    public int[] getAcceptableTokens() {
103        return new int[] {
104            TokenTypes.COMMENT_CONTENT,
105            TokenTypes.STRING_LITERAL,
106            TokenTypes.CHAR_LITERAL,
107            TokenTypes.TEXT_BLOCK_CONTENT,
108            TokenTypes.IDENT,
109        };
110    }
111
112    @Override
113    public int[] getRequiredTokens() {
114        return CommonUtil.EMPTY_INT_ARRAY;
115    }
116
117    @Override
118    public boolean isCommentNodesRequired() {
119        return true;
120    }
121
122    @Override
123    public void visitToken(DetailAST ast) {
124        final String text = ast.getText();
125        checkText(text, ast);
126    }
127
128    /**
129     * Check the text for illegal symbols.
130     *
131     * @param text the text to check
132     * @param ast the AST node
133     */
134    private void checkText(String text, DetailAST ast) {
135        final int length = text.length();
136        int offset = 0;
137
138        while (offset < length) {
139            final int codePoint = text.codePointAt(offset);
140
141            if (isIllegalSymbol(codePoint)) {
142                log(ast, MSG_KEY);
143                break;
144            }
145
146            offset += Character.charCount(codePoint);
147        }
148    }
149
150    /**
151     * Check if a code point is illegal based on configured ranges.
152     *
153     * @param codePoint the code point to check
154     * @return true if the code point is illegal
155     */
156    private boolean isIllegalSymbol(int codePoint) {
157        return !symbolCodes.isEmpty() && isInSymbolCodes(codePoint);
158    }
159
160    /**
161     * Check if code point is in the configured symbol codes.
162     *
163     * @param codePoint the code point to check
164     * @return true if in symbol codes
165     */
166    private boolean isInSymbolCodes(int codePoint) {
167        boolean found = false;
168        final String[] parts = symbolCodes.split(",", -1);
169
170        for (String part : parts) {
171            final String trimmed = part.trim();
172            if (trimmed.contains(RANGE_SEPARATOR)) {
173                // Range format
174                found = isInRange(codePoint, trimmed);
175            }
176            else {
177                // Single code point
178                final int checkPoint = parseCodePoint(trimmed);
179                found = codePoint == checkPoint;
180            }
181
182            if (found) {
183                break;
184            }
185        }
186
187        return found;
188    }
189
190    /**
191     * Check if code point is in the specified range.
192     *
193     * @param codePoint the code point to check
194     * @param rangeStr the range string (e.g., "0x1F600-0x1F64F")
195     * @return true if in range
196     */
197    private static boolean isInRange(int codePoint, String rangeStr) {
198        final String[] range = rangeStr.split(RANGE_SEPARATOR, -1);
199        boolean result = false;
200
201        if (range.length == 2) {
202            final int start = parseCodePoint(range[0].trim());
203            final int end = parseCodePoint(range[1].trim());
204            result = codePoint >= start && codePoint <= end;
205        }
206
207        return result;
208    }
209
210    /**
211     * Parse a code point from string representation.
212     * Supports formats: 0x1234, \\u1234, U+1234, or decimal.
213     *
214     * @param str the string to parse
215     * @return the code point value
216     */
217    private static int parseCodePoint(String str) {
218        final String cleaned = str.trim();
219        final int hexRadix = 16;
220        final int result;
221
222        if (cleaned.startsWith("\\u")
223                || cleaned.startsWith("0x")
224                || cleaned.startsWith("0X")
225                || cleaned.startsWith("U+")
226                || cleaned.startsWith("u+")) {
227            result = Integer.parseInt(cleaned.substring(2), hexRadix);
228        }
229        else {
230            result = Integer.parseInt(cleaned, hexRadix);
231        }
232        return result;
233    }
234}