001///////////////////////////////////////////////////////////////////////////////////////////////
002// checkstyle: Checks Java source code and other text files for adherence to a set of rules.
003// Copyright (C) 2001-2025 the original author or authors.
004//
005// This library is free software; you can redistribute it and/or
006// modify it under the terms of the GNU Lesser General Public
007// License as published by the Free Software Foundation; either
008// version 2.1 of the License, or (at your option) any later version.
009//
010// This library is distributed in the hope that it will be useful,
011// but WITHOUT ANY WARRANTY; without even the implied warranty of
012// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
013// Lesser General Public License for more details.
014//
015// You should have received a copy of the GNU Lesser General Public
016// License along with this library; if not, write to the Free Software
017// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
018///////////////////////////////////////////////////////////////////////////////////////////////
019
020package com.puppycrawl.tools.checkstyle.checks;
021
022import java.util.Arrays;
023import java.util.List;
024import java.util.Map;
025import java.util.regex.Matcher;
026import java.util.regex.Pattern;
027
028import com.puppycrawl.tools.checkstyle.FileStatefulCheck;
029import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
030import com.puppycrawl.tools.checkstyle.api.DetailAST;
031import com.puppycrawl.tools.checkstyle.api.TextBlock;
032import com.puppycrawl.tools.checkstyle.api.TokenTypes;
033import com.puppycrawl.tools.checkstyle.utils.CheckUtil;
034import com.puppycrawl.tools.checkstyle.utils.CodePointUtil;
035
036/**
037 * <div>
038 * Restricts using
039 * <a href = "https://docs.oracle.com/javase/specs/jls/se11/html/jls-3.html#jls-3.3">
040 * Unicode escapes</a>
041 * (such as &#92;u221e). It is possible to allow using escapes for
042 * <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
043 * non-printable, control characters</a>.
044 * Also, this check can be configured to allow using escapes
045 * if trail comment is present. By the option it is possible to
046 * allow using escapes if literal contains only them.
047 * </div>
048 *
049 * @since 5.8
050 */
051@FileStatefulCheck
052public class AvoidEscapedUnicodeCharactersCheck
053    extends AbstractCheck {
054
055    /**
056     * A key is pointing to the warning message text in "messages.properties"
057     * file.
058     */
059    public static final String MSG_KEY = "forbid.escaped.unicode.char";
060
061    /** Regular expression for Unicode chars. */
062    private static final Pattern UNICODE_REGEXP = Pattern.compile("\\\\u+[a-fA-F\\d]{4}");
063
064    /**
065     * Regular expression Unicode control characters.
066     *
067     * @see <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
068     *     Appendix:Control characters</a>
069     */
070    private static final Pattern UNICODE_CONTROL = Pattern.compile("\\\\u+"
071            + "(00[0-1][\\dA-Fa-f]"
072            + "|00[8-9][\\dA-Fa-f]"
073            + "|00[aA][dD]"
074            + "|034[fF]"
075            + "|070[fF]"
076            + "|180[eE]"
077            + "|200[b-fB-F]"
078            + "|202[a-eA-E]"
079            + "|206[0-4a-fA-F]"
080            + "|[fF]{3}[9a-bA-B]"
081            + "|[fF][eE][fF]{2})");
082
083    /**
084     * Regular expression for all escaped chars.
085     * See <a href="https://docs.oracle.com/javase/specs/jls/se15/html/jls-3.html#jls-3.10.7">
086     * EscapeSequence</a>
087     */
088    private static final Pattern ALL_ESCAPED_CHARS = Pattern.compile("^("
089            + UNICODE_REGEXP.pattern()
090            + "|\""
091            + "|'"
092            + "|\\\\"
093            + "|\\\\b"
094            + "|\\\\f"
095            + "|\\\\n"
096            + "|\\R"
097            + "|\\\\r"
098            + "|\\\\s"
099            + "|\\\\t"
100            + ")+$");
101
102    /** Regular expression for escaped backslash. */
103    private static final Pattern ESCAPED_BACKSLASH = Pattern.compile("\\\\\\\\");
104
105    /** Regular expression for non-printable unicode chars. */
106    private static final Pattern NON_PRINTABLE_CHARS = Pattern.compile("\\\\u0000"
107            + "|\\\\u0009"
108            + "|\\\\u000[bB]"
109            + "|\\\\u000[cC]"
110            + "|\\\\u0020"
111            + "|\\\\u007[fF]"
112            + "|\\\\u0085"
113            + "|\\\\u009[fF]"
114            + "|\\\\u00[aA]0"
115            + "|\\\\u00[aA][dD]"
116            + "|\\\\u04[fF]9"
117            + "|\\\\u05[bB][eE]"
118            + "|\\\\u05[dD]0"
119            + "|\\\\u05[eE][aA]"
120            + "|\\\\u05[fF]3"
121            + "|\\\\u05[fF]4"
122            + "|\\\\u0600"
123            + "|\\\\u0604"
124            + "|\\\\u061[cC]"
125            + "|\\\\u06[dD]{2}"
126            + "|\\\\u06[fF]{2}"
127            + "|\\\\u070[fF]"
128            + "|\\\\u0750"
129            + "|\\\\u077[fF]"
130            + "|\\\\u0[eE]00"
131            + "|\\\\u0[eE]7[fF]"
132            + "|\\\\u1680"
133            + "|\\\\u180[eE]"
134            + "|\\\\u1[eE]00"
135            + "|\\\\u2000"
136            + "|\\\\u2001"
137            + "|\\\\u2002"
138            + "|\\\\u2003"
139            + "|\\\\u2004"
140            + "|\\\\u2005"
141            + "|\\\\u2006"
142            + "|\\\\u2007"
143            + "|\\\\u2008"
144            + "|\\\\u2009"
145            + "|\\\\u200[aA]"
146            + "|\\\\u200[fF]"
147            + "|\\\\u2025"
148            + "|\\\\u2028"
149            + "|\\\\u2029"
150            + "|\\\\u202[fF]"
151            + "|\\\\u205[fF]"
152            + "|\\\\u2064"
153            + "|\\\\u2066"
154            + "|\\\\u2067"
155            + "|\\\\u2068"
156            + "|\\\\u2069"
157            + "|\\\\u206[aA]"
158            + "|\\\\u206[fF]"
159            + "|\\\\u20[aA][fF]"
160            + "|\\\\u2100"
161            + "|\\\\u213[aA]"
162            + "|\\\\u3000"
163            + "|\\\\u[dD]800"
164            + "|\\\\u[fF]8[fF]{2}"
165            + "|\\\\u[fF][bB]50"
166            + "|\\\\u[fF][dD][fF]{2}"
167            + "|\\\\u[fF][eE]70"
168            + "|\\\\u[fF][eE][fF]{2}"
169            + "|\\\\u[fF]{2}0[eE]"
170            + "|\\\\u[fF]{2}61"
171            + "|\\\\u[fF]{2}[dD][cC]"
172            + "|\\\\u[fF]{3}9"
173            + "|\\\\u[fF]{3}[aA]"
174            + "|\\\\u[fF]{3}[bB]"
175            + "|\\\\u[fF]{4}");
176
177    /** Cpp style comments. */
178    private Map<Integer, TextBlock> singlelineComments;
179    /** C style comments. */
180    private Map<Integer, List<TextBlock>> blockComments;
181
182    /** Allow use escapes for non-printable, control characters. */
183    private boolean allowEscapesForControlCharacters;
184
185    /** Allow use escapes if trail comment is present. */
186    private boolean allowByTailComment;
187
188    /** Allow if all characters in literal are escaped. */
189    private boolean allowIfAllCharactersEscaped;
190
191    /** Allow use escapes for non-printable, whitespace characters. */
192    private boolean allowNonPrintableEscapes;
193
194    /**
195     * Setter to allow use escapes for non-printable, control characters.
196     *
197     * @param allow user's value.
198     * @since 5.8
199     */
200    public final void setAllowEscapesForControlCharacters(boolean allow) {
201        allowEscapesForControlCharacters = allow;
202    }
203
204    /**
205     * Setter to allow use escapes if trail comment is present.
206     *
207     * @param allow user's value.
208     * @since 5.8
209     */
210    public final void setAllowByTailComment(boolean allow) {
211        allowByTailComment = allow;
212    }
213
214    /**
215     * Setter to allow if all characters in literal are escaped.
216     *
217     * @param allow user's value.
218     * @since 5.8
219     */
220    public final void setAllowIfAllCharactersEscaped(boolean allow) {
221        allowIfAllCharactersEscaped = allow;
222    }
223
224    /**
225     * Setter to allow use escapes for non-printable, whitespace characters.
226     *
227     * @param allow user's value.
228     * @since 5.8
229     */
230    public final void setAllowNonPrintableEscapes(boolean allow) {
231        allowNonPrintableEscapes = allow;
232    }
233
234    @Override
235    public int[] getDefaultTokens() {
236        return getRequiredTokens();
237    }
238
239    @Override
240    public int[] getAcceptableTokens() {
241        return getRequiredTokens();
242    }
243
244    @Override
245    public int[] getRequiredTokens() {
246        return new int[] {
247            TokenTypes.STRING_LITERAL,
248            TokenTypes.CHAR_LITERAL,
249            TokenTypes.TEXT_BLOCK_CONTENT,
250        };
251    }
252
253    // suppress deprecation until https://github.com/checkstyle/checkstyle/issues/11166
254    @SuppressWarnings("deprecation")
255    @Override
256    public void beginTree(DetailAST rootAST) {
257        singlelineComments = getFileContents().getSingleLineComments();
258        blockComments = getFileContents().getBlockComments();
259    }
260
261    @Override
262    public void visitToken(DetailAST ast) {
263        final String literal =
264            CheckUtil.stripIndentAndInitialNewLineFromTextBlock(ast.getText());
265
266        if (hasUnicodeChar(literal) && !(allowByTailComment && hasTrailComment(ast)
267                || isAllCharactersEscaped(literal)
268                || allowEscapesForControlCharacters
269                        && isOnlyUnicodeValidChars(literal, UNICODE_CONTROL)
270                || allowNonPrintableEscapes
271                        && isOnlyUnicodeValidChars(literal, NON_PRINTABLE_CHARS))) {
272            log(ast, MSG_KEY);
273        }
274    }
275
276    /**
277     * Checks if literal has Unicode chars.
278     *
279     * @param literal String literal.
280     * @return true if literal has Unicode chars.
281     */
282    private static boolean hasUnicodeChar(String literal) {
283        final String literalWithoutEscapedBackslashes =
284                ESCAPED_BACKSLASH.matcher(literal).replaceAll("");
285        return UNICODE_REGEXP.matcher(literalWithoutEscapedBackslashes).find();
286    }
287
288    /**
289     * Check if String literal contains Unicode control chars.
290     *
291     * @param literal String literal.
292     * @param pattern RegExp for valid characters.
293     * @return true, if String literal contains Unicode control chars.
294     */
295    private static boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) {
296        final int unicodeMatchesCounter =
297                countMatches(UNICODE_REGEXP, literal);
298        final int unicodeValidMatchesCounter =
299                countMatches(pattern, literal);
300        return unicodeMatchesCounter - unicodeValidMatchesCounter == 0;
301    }
302
303    /**
304     * Check if trail comment is present after ast token.
305     *
306     * @param ast current token.
307     * @return true if trail comment is present after ast token.
308     */
309    private boolean hasTrailComment(DetailAST ast) {
310        int lineNo = ast.getLineNo();
311
312        // Since the trailing comment in the case of text blocks must follow the """ delimiter,
313        // we need to look for it after TEXT_BLOCK_LITERAL_END.
314        if (ast.getType() == TokenTypes.TEXT_BLOCK_CONTENT) {
315            lineNo = ast.getNextSibling().getLineNo();
316        }
317        boolean result = false;
318        if (singlelineComments.containsKey(lineNo)) {
319            result = true;
320        }
321        else {
322            final List<TextBlock> commentList = blockComments.get(lineNo);
323            if (commentList != null) {
324                final TextBlock comment = commentList.get(commentList.size() - 1);
325                final int[] codePoints = getLineCodePoints(lineNo - 1);
326                result = isTrailingBlockComment(comment, codePoints);
327            }
328        }
329        return result;
330    }
331
332    /**
333     * Whether the C style comment is trailing.
334     *
335     * @param comment the comment to check.
336     * @param codePoints the first line of the comment, in unicode code points
337     * @return true if the comment is trailing.
338     */
339    private static boolean isTrailingBlockComment(TextBlock comment, int... codePoints) {
340        return comment.getText().length != 1
341            || CodePointUtil.isBlank(Arrays.copyOfRange(codePoints,
342                comment.getEndColNo() + 1, codePoints.length));
343    }
344
345    /**
346     * Count regexp matches into String literal.
347     *
348     * @param pattern pattern.
349     * @param target String literal.
350     * @return count of regexp matches.
351     */
352    private static int countMatches(Pattern pattern, String target) {
353        int matcherCounter = 0;
354        final Matcher matcher = pattern.matcher(target);
355        while (matcher.find()) {
356            matcherCounter++;
357        }
358        return matcherCounter;
359    }
360
361    /**
362     * Checks if all characters in String literal is escaped.
363     *
364     * @param literal current literal.
365     * @return true if all characters in String literal is escaped.
366     */
367    private boolean isAllCharactersEscaped(String literal) {
368        return allowIfAllCharactersEscaped
369                && ALL_ESCAPED_CHARS.matcher(literal).find();
370    }
371
372}