001///////////////////////////////////////////////////////////////////////////////////////////////
002// checkstyle: Checks Java source code and other text files for adherence to a set of rules.
003// Copyright (C) 2001-2026 the original author or authors.
004//
005// This library is free software; you can redistribute it and/or
006// modify it under the terms of the GNU Lesser General Public
007// License as published by the Free Software Foundation; either
008// version 2.1 of the License, or (at your option) any later version.
009//
010// This library is distributed in the hope that it will be useful,
011// but WITHOUT ANY WARRANTY; without even the implied warranty of
012// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
013// Lesser General Public License for more details.
014//
015// You should have received a copy of the GNU Lesser General Public
016// License along with this library; if not, write to the Free Software
017// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
018///////////////////////////////////////////////////////////////////////////////////////////////
019
020package com.puppycrawl.tools.checkstyle.checks.coding;
021
022import java.util.HashSet;
023import java.util.Set;
024
025import com.puppycrawl.tools.checkstyle.FileStatefulCheck;
026import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
027import com.puppycrawl.tools.checkstyle.api.DetailAST;
028import com.puppycrawl.tools.checkstyle.api.TokenTypes;
029import com.puppycrawl.tools.checkstyle.utils.CommonUtil;
030
031/**
032 * <div>
033 * Checks that specified symbols (by Unicode code points or ranges) are not used in code.
034 * By default, blocks common symbol ranges.
035 * </div>
036 *
037 * <p>
038 * Rationale: This check helps prevent emoji symbols and special characters in code
039 * (commonly added by AI tools), enforce coding standards, or forbid specific Unicode characters.
040 * </p>
041 *
042 * <p>
043 * Default ranges cover:
044 * </p>
045 * <ul>
046 * <li>U+2190–U+27BF: Arrows, Mathematical Operators, Box Drawing, Geometric Shapes,
047 * Miscellaneous Symbols, and Dingbats</li>
048 * <li>U+1F600–U+1F64F: Emoticons</li>
049 * <li>U+1F680–U+1F6FF: Transport and Map Symbols</li>
050 * <li>U+1F700–U+10FFFF: Alchemical Symbols and other pictographic symbols</li>
051 * </ul>
052 *
053 * <p>
054 * For a complete list of Unicode characters and ranges, see:
055 * <a href="https://en.wikipedia.org/wiki/List_of_Unicode_characters">
056 * List of Unicode characters</a>
057 * </p>
058 *
059 * <ul>
060 * <li>
061 * Property {@code symbolCodes} - Specify the symbols to check for, as Unicode code points
062 * or ranges. Format: comma-separated list of hex codes or ranges
063 * (e.g., {@code "0x2705, 0x1F600-0x1F64F"}). To allow only ASCII characters,
064 * use {@code "0x0080-0x10FFFF"}.
065 * Type is {@code java.lang.String}.
066 * Default value is {@code "0x2190-0x27BF, 0x1F600-0x1F64F, 0x1F680-0x1F6FF, 0x1F700-0x1FFFFF"}.
067 * </li>
068 * </ul>
069 *
070 * @since 13.3.0
071 */
072@FileStatefulCheck
073public class IllegalSymbolCheck extends AbstractCheck {
074
075    /**
076     * A key is pointing to the warning message text in "messages.properties" file.
077     */
078    public static final String MSG_KEY = "illegal.symbol";
079
080    /** String Range Separator. */
081    private static final String RANGE_SEPARATOR = "-";
082
083    /** Precomputed single code points. */
084    private final Set<Integer> singleCodePoints = new HashSet<>();
085
086    /** Precomputed code point ranges. */
087    private final Set<CodePointRange> codePointRanges = new HashSet<>();
088
089    /** Specify the symbols to check for, as Unicode code points or ranges. */
090    private String symbolCodes = "0x2190-0x27BF, 0x1F600-0x1F64F, 0x1F680-0x1F6FF, "
091            + "0x1F700-0x1FFFFF";
092
093    /** Flag to track if ranges have been initialized. */
094    private boolean initialized;
095
096    /**
097     * Setter to specify the symbols to check for.
098     * Format: comma-separated list of hex codes or ranges
099     * (e.g., "0x2705, 0x1F600-0x1F64F").
100     *
101     * @param symbols the symbols specification
102     * @throws IllegalArgumentException if the format is invalid
103     * @since 13.3.0
104     */
105    public void setSymbolCodes(String symbols) {
106        symbolCodes = symbols;
107        singleCodePoints.clear();
108        codePointRanges.clear();
109        initialized = true;
110
111        if (!symbols.isEmpty()) {
112            final String[] parts = symbols.split(",", -1);
113            for (String part : parts) {
114                final String trimmed = part.trim();
115                if (!trimmed.isEmpty()) {
116                    try {
117                        if (trimmed.contains(RANGE_SEPARATOR)) {
118                            parseRange(trimmed);
119                        }
120                        else {
121                            singleCodePoints.add(parseCodePoint(trimmed));
122                        }
123                    }
124                    catch (NumberFormatException exception) {
125                        throw new IllegalArgumentException(
126                                "Invalid symbol code format: " + trimmed, exception);
127                    }
128                }
129            }
130        }
131    }
132
133    @Override
134    public int[] getDefaultTokens() {
135        return new int[] {
136            TokenTypes.COMMENT_CONTENT,
137        };
138    }
139
140    @Override
141    public int[] getAcceptableTokens() {
142        return new int[] {
143            TokenTypes.COMMENT_CONTENT,
144            TokenTypes.STRING_LITERAL,
145            TokenTypes.CHAR_LITERAL,
146            TokenTypes.TEXT_BLOCK_CONTENT,
147            TokenTypes.IDENT,
148        };
149    }
150
151    @Override
152    public int[] getRequiredTokens() {
153        return CommonUtil.EMPTY_INT_ARRAY;
154    }
155
156    @Override
157    public boolean isCommentNodesRequired() {
158        return true;
159    }
160
161    @Override
162    public void visitToken(DetailAST ast) {
163        final String text = ast.getText();
164        checkText(text, ast);
165    }
166
167    /**
168     * Check the text for illegal symbols.
169     *
170     * @param text the text to check
171     * @param ast the AST node
172     */
173    private void checkText(String text, DetailAST ast) {
174        final int length = text.length();
175        int offset = 0;
176
177        while (offset < length) {
178            final int codePoint = text.codePointAt(offset);
179
180            if (isIllegalSymbol(codePoint)) {
181                log(ast, MSG_KEY);
182                break;
183            }
184
185            offset += Character.charCount(codePoint);
186        }
187    }
188
189    /**
190     * Check if a code point is illegal based on configured ranges.
191     *
192     * @param codePoint the code point to check
193     * @return true if the code point is illegal
194     */
195    private boolean isIllegalSymbol(int codePoint) {
196        return !symbolCodes.isEmpty() && isInSymbolCodes(codePoint);
197    }
198
199    /**
200     * Check if code point is in the configured symbol codes.
201     *
202     * @param codePoint the code point to check
203     * @return true if in symbol codes
204     */
205    private boolean isInSymbolCodes(int codePoint) {
206        if (!initialized) {
207            setSymbolCodes(symbolCodes);
208        }
209
210        boolean found = false;
211
212        // Check single code points
213        if (singleCodePoints.contains(codePoint)) {
214            found = true;
215        }
216        else {
217            // Check ranges
218            for (CodePointRange range : codePointRanges) {
219                if (range.contains(codePoint)) {
220                    found = true;
221                    break;
222                }
223            }
224        }
225
226        return found;
227    }
228
229    /**
230     * Parse and store a range.
231     *
232     * @param rangeStr the range string
233     * @throws IllegalArgumentException if range format is invalid
234     */
235    private void parseRange(String rangeStr) {
236        final String[] range = rangeStr.split(RANGE_SEPARATOR, -1);
237        if (range.length != 2
238                || CommonUtil.isBlank(range[0])
239                || CommonUtil.isBlank(range[1])) {
240            throw new IllegalArgumentException("Invalid range format: " + rangeStr);
241        }
242
243        final int start = parseCodePoint(range[0].trim());
244        final int end = parseCodePoint(range[1].trim());
245
246        if (start > end) {
247            throw new IllegalArgumentException(
248                    "Range start must be <= end: " + rangeStr);
249        }
250
251        codePointRanges.add(new CodePointRange(start, end));
252    }
253
254    /**
255     * Parse a code point from string representation.
256     * Supports formats: 0x1234, \\u1234, U+1234, or plain hex.
257     *
258     * @param str the string to parse
259     * @return the code point value
260     * @throws NumberFormatException if the string cannot be parsed
261     */
262    private static int parseCodePoint(String str) {
263        final String cleaned = str.trim();
264        final int hexRadix = 16;
265        final int result;
266
267        if (cleaned.startsWith("\\u")
268                || cleaned.startsWith("0x")
269                || cleaned.startsWith("0X")
270                || cleaned.startsWith("U+")
271                || cleaned.startsWith("u+")) {
272            if (cleaned.length() == 2) {
273                throw new NumberFormatException("Invalid code point format: " + cleaned);
274            }
275            result = Integer.parseInt(cleaned.substring(2), hexRadix);
276        }
277        else {
278            result = Integer.parseInt(cleaned, hexRadix);
279        }
280        return result;
281    }
282
283    /**
284     * Represents a parsed Unicode range.
285     *
286     * @param start start of range
287     * @param end end of range
288     */
289    private record CodePointRange(int start, int end) {
290
291        /**
292         * Check if code point is in this range.
293         *
294         * @param codePoint code point to check
295         * @return true if in range
296         */
297        /* package */ boolean contains(int codePoint) {
298            return codePoint >= start && codePoint <= end;
299        }
300    }
301}