001////////////////////////////////////////////////////////////////////////////////
002// checkstyle: Checks Java source code for adherence to a set of rules.
003// Copyright (C) 2001-2018 the original author or authors.
004//
005// This library is free software; you can redistribute it and/or
006// modify it under the terms of the GNU Lesser General Public
007// License as published by the Free Software Foundation; either
008// version 2.1 of the License, or (at your option) any later version.
009//
010// This library is distributed in the hope that it will be useful,
011// but WITHOUT ANY WARRANTY; without even the implied warranty of
012// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
013// Lesser General Public License for more details.
014//
015// You should have received a copy of the GNU Lesser General Public
016// License along with this library; if not, write to the Free Software
017// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
018////////////////////////////////////////////////////////////////////////////////
019
020package com.puppycrawl.tools.checkstyle.checks;
021
022import java.util.List;
023import java.util.Map;
024import java.util.regex.Matcher;
025import java.util.regex.Pattern;
026
027import com.puppycrawl.tools.checkstyle.FileStatefulCheck;
028import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
029import com.puppycrawl.tools.checkstyle.api.DetailAST;
030import com.puppycrawl.tools.checkstyle.api.TextBlock;
031import com.puppycrawl.tools.checkstyle.api.TokenTypes;
032import com.puppycrawl.tools.checkstyle.utils.CommonUtils;
033
034/**
035 * <p>
036 * Restrict using <a href =
037 * "https://docs.oracle.com/javase/specs/jls/se8/html/jls-3.html#jls-3.3">
038 * Unicode escapes</a> (such as <code>&#92;u221e</code>).
039 * It is possible to allow using escapes for
040 * <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
041 * non-printable(control) characters</a>.
042 * Also, this check can be configured to allow using escapes
043 * if trail comment is present. By the option it is possible to
044 * allow using escapes if literal contains only them. By the option it
045 * is possible to allow using escapes for space literals.
046 * </p>
047 * <p>
048 * Examples of using Unicode:</p>
049 * <pre>
050 * String unitAbbrev = "μs";      // Best: perfectly clear even without a comment.
051 * String unitAbbrev = "&#92;u03bcs"; // Poor: the reader has no idea what this is.
052 * </pre>
053 * <p>
054 * An example of how to configure the check is:
055 * </p>
056 * <pre>
057 * &lt;module name="AvoidEscapedUnicodeCharacters"/&gt;
058 * </pre>
059 * <p>
060 * An example of non-printable(control) characters.
061 * </p>
062 * <pre>
063 * return '&#92;ufeff' + content; // byte order mark
064 * </pre>
065 * <p>
066 * An example of how to configure the check to allow using escapes
067 * for non-printable(control) characters:
068 * </p>
069 * <pre>
070 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
071 *     &lt;property name="allowEscapesForControlCharacters" value="true"/&gt;
072 * &lt;/module&gt;
073 * </pre>
074 * <p>
075 * Example of using escapes with trail comment:
076 * </p>
077 * <pre>
078 * String unitAbbrev = "&#92;u03bcs"; // Greek letter mu, "s"
079 * </pre>
080 * <p>An example of how to configure the check to allow using escapes
081 * if trail comment is present:
082 * </p>
083 * <pre>
084 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
085 *     &lt;property name="allowByTailComment" value="true"/&gt;
086 * &lt;/module&gt;
087 * </pre>
088 * <p>Example of using escapes if literal contains only them:
089 * </p>
090 * <pre>
091 * String unitAbbrev = "&#92;u03bc&#92;u03bc&#92;u03bc";
092 * </pre>
093 * <p>An example of how to configure the check to allow escapes
094 * if literal contains only them:
095 * </p>
096 * <pre>
097 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
098 *    &lt;property name="allowIfAllCharactersEscaped" value="true"/&gt;
099 * &lt;/module&gt;
100 * </pre>
101 * <p>An example of how to configure the check to allow non-printable escapes:
102 * </p>
103 * <pre>
104 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
105 *    &lt;property name="allowNonPrintableEscapes" value="true"/&gt;
106 * &lt;/module&gt;
107 * </pre>
108 *
109 * @noinspection HtmlTagCanBeJavadocTag
110 */
111@FileStatefulCheck
112public class AvoidEscapedUnicodeCharactersCheck
113    extends AbstractCheck {
114
115    /**
116     * A key is pointing to the warning message text in "messages.properties"
117     * file.
118     */
119    public static final String MSG_KEY = "forbid.escaped.unicode.char";
120
121    /** Regular expression for Unicode chars. */
122    private static final Pattern UNICODE_REGEXP = Pattern.compile("\\\\u[a-fA-F0-9]{4}");
123
124    /**
125     * Regular expression Unicode control characters.
126     *
127     * @see <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
128     *     Appendix:Control characters</a>
129     */
130    private static final Pattern UNICODE_CONTROL = Pattern.compile("\\\\[uU]"
131            + "(00[0-1][0-9A-Fa-f]"
132            + "|00[8-9][0-9A-Fa-f]"
133            + "|00[aA][dD]"
134            + "|034[fF]"
135            + "|070[fF]"
136            + "|180[eE]"
137            + "|200[b-fB-F]"
138            + "|202[a-eA-E]"
139            + "|206[0-4a-fA-F]"
140            + "|[fF]{3}[9a-bA-B]"
141            + "|[fF][eE][fF]{2})");
142
143    /** Regular expression for all escaped chars. */
144    private static final Pattern ALL_ESCAPED_CHARS = Pattern.compile("^((\\\\u)[a-fA-F0-9]{4}"
145            + "|\""
146            + "|\'"
147            + "|\\\\"
148            + "|\\\\b"
149            + "|\\\\f"
150            + "|\\\\n"
151            + "|\\\\r"
152            + "|\\\\t"
153            + ")+$");
154
155    /** Regular expression for escaped backslash. */
156    private static final Pattern ESCAPED_BACKSLASH = Pattern.compile("\\\\\\\\");
157
158    /** Regular expression for non-printable unicode chars. */
159    private static final Pattern NON_PRINTABLE_CHARS = Pattern.compile("\\\\u0000"
160            + "|\\\\u0009"
161            + "|\\\\u000[bB]"
162            + "|\\\\u000[cC]"
163            + "|\\\\u0020"
164            + "|\\\\u007[fF]"
165            + "|\\\\u0085"
166            + "|\\\\u009[fF]"
167            + "|\\\\u00[aA]0"
168            + "|\\\\u00[aA][dD]"
169            + "|\\\\u04[fF]9"
170            + "|\\\\u05[bB][eE]"
171            + "|\\\\u05[dD]0"
172            + "|\\\\u05[eE][aA]"
173            + "|\\\\u05[fF]3"
174            + "|\\\\u05[fF]4"
175            + "|\\\\u0600"
176            + "|\\\\u0604"
177            + "|\\\\u061[cC]"
178            + "|\\\\u06[dD]{2}"
179            + "|\\\\u06[fF]{2}"
180            + "|\\\\u070[fF]"
181            + "|\\\\u0750"
182            + "|\\\\u077[fF]"
183            + "|\\\\u0[eE]00"
184            + "|\\\\u0[eE]7[fF]"
185            + "|\\\\u1680"
186            + "|\\\\u180[eE]"
187            + "|\\\\u1[eE]00"
188            + "|\\\\u2000"
189            + "|\\\\u2001"
190            + "|\\\\u2002"
191            + "|\\\\u2003"
192            + "|\\\\u2004"
193            + "|\\\\u2005"
194            + "|\\\\u2006"
195            + "|\\\\u2007"
196            + "|\\\\u2008"
197            + "|\\\\u2009"
198            + "|\\\\u200[aA]"
199            + "|\\\\u200[fF]"
200            + "|\\\\u2025"
201            + "|\\\\u2028"
202            + "|\\\\u2029"
203            + "|\\\\u202[fF]"
204            + "|\\\\u205[fF]"
205            + "|\\\\u2064"
206            + "|\\\\u2066"
207            + "|\\\\u2067"
208            + "|\\\\u2068"
209            + "|\\\\u2069"
210            + "|\\\\u206[aA]"
211            + "|\\\\u206[fF]"
212            + "|\\\\u20[aA][fF]"
213            + "|\\\\u2100"
214            + "|\\\\u213[aA]"
215            + "|\\\\u3000"
216            + "|\\\\u[dD]800"
217            + "|\\\\u[fF]8[fF]{2}"
218            + "|\\\\u[fF][bB]50"
219            + "|\\\\u[fF][dD][fF]{2}"
220            + "|\\\\u[fF][eE]70"
221            + "|\\\\u[fF][eE][fF]{2}"
222            + "|\\\\u[fF]{2}0[eE]"
223            + "|\\\\u[fF]{2}61"
224            + "|\\\\u[fF]{2}[dD][cC]"
225            + "|\\\\u[fF]{3}9"
226            + "|\\\\u[fF]{3}[aA]"
227            + "|\\\\u[fF]{3}[bB]"
228            + "|\\\\u[fF]{4}");
229
230    /** Cpp style comments. */
231    private Map<Integer, TextBlock> singlelineComments;
232    /** C style comments. */
233    private Map<Integer, List<TextBlock>> blockComments;
234
235    /** Allow use escapes for non-printable(control) characters.  */
236    private boolean allowEscapesForControlCharacters;
237
238    /** Allow use escapes if trail comment is present. */
239    private boolean allowByTailComment;
240
241    /** Allow if all characters in literal are escaped. */
242    private boolean allowIfAllCharactersEscaped;
243
244    /** Allow escapes for space literals. */
245    private boolean allowNonPrintableEscapes;
246
247    /**
248     * Set allowIfAllCharactersEscaped.
249     * @param allow user's value.
250     */
251    public final void setAllowEscapesForControlCharacters(boolean allow) {
252        allowEscapesForControlCharacters = allow;
253    }
254
255    /**
256     * Set allowByTailComment.
257     * @param allow user's value.
258     */
259    public final void setAllowByTailComment(boolean allow) {
260        allowByTailComment = allow;
261    }
262
263    /**
264     * Set allowIfAllCharactersEscaped.
265     * @param allow user's value.
266     */
267    public final void setAllowIfAllCharactersEscaped(boolean allow) {
268        allowIfAllCharactersEscaped = allow;
269    }
270
271    /**
272     * Set allowSpaceEscapes.
273     * @param allow user's value.
274     */
275    public final void setAllowNonPrintableEscapes(boolean allow) {
276        allowNonPrintableEscapes = allow;
277    }
278
279    @Override
280    public int[] getDefaultTokens() {
281        return getRequiredTokens();
282    }
283
284    @Override
285    public int[] getAcceptableTokens() {
286        return getRequiredTokens();
287    }
288
289    @Override
290    public int[] getRequiredTokens() {
291        return new int[] {TokenTypes.STRING_LITERAL, TokenTypes.CHAR_LITERAL};
292    }
293
294    @Override
295    public void beginTree(DetailAST rootAST) {
296        singlelineComments = getFileContents().getSingleLineComments();
297        blockComments = getFileContents().getBlockComments();
298    }
299
300    @Override
301    public void visitToken(DetailAST ast) {
302        final String literal = ast.getText();
303
304        if (hasUnicodeChar(literal) && !(allowByTailComment && hasTrailComment(ast)
305                || isAllCharactersEscaped(literal)
306                || allowEscapesForControlCharacters
307                        && isOnlyUnicodeValidChars(literal, UNICODE_CONTROL)
308                || allowNonPrintableEscapes
309                        && isOnlyUnicodeValidChars(literal, NON_PRINTABLE_CHARS))) {
310            log(ast.getLineNo(), MSG_KEY);
311        }
312    }
313
314    /**
315     * Checks if literal has Unicode chars.
316     * @param literal String literal.
317     * @return true if literal has Unicode chars.
318     */
319    private static boolean hasUnicodeChar(String literal) {
320        final String literalWithoutEscapedBackslashes =
321                ESCAPED_BACKSLASH.matcher(literal).replaceAll("");
322        return UNICODE_REGEXP.matcher(literalWithoutEscapedBackslashes).find();
323    }
324
325    /**
326     * Check if String literal contains Unicode control chars.
327     * @param literal String literal.
328     * @param pattern RegExp for valid characters.
329     * @return true, if String literal contains Unicode control chars.
330     */
331    private static boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) {
332        final int unicodeMatchesCounter =
333                countMatches(UNICODE_REGEXP, literal);
334        final int unicodeValidMatchesCounter =
335                countMatches(pattern, literal);
336        return unicodeMatchesCounter - unicodeValidMatchesCounter == 0;
337    }
338
339    /**
340     * Check if trail comment is present after ast token.
341     * @param ast current token.
342     * @return true if trail comment is present after ast token.
343     */
344    private boolean hasTrailComment(DetailAST ast) {
345        boolean result = false;
346        final int lineNo = ast.getLineNo();
347        if (singlelineComments.containsKey(lineNo)) {
348            result = true;
349        }
350        else {
351            final List<TextBlock> commentList = blockComments.get(lineNo);
352            if (commentList != null) {
353                final TextBlock comment = commentList.get(commentList.size() - 1);
354                final String line = getLines()[lineNo - 1];
355                result = isTrailingBlockComment(comment, line);
356            }
357        }
358        return result;
359    }
360
361    /**
362     * Whether the C style comment is trailing.
363     * @param comment the comment to check.
364     * @param line the line where the comment starts.
365     * @return true if the comment is trailing.
366     */
367    private static boolean isTrailingBlockComment(TextBlock comment, String line) {
368        return comment.getText().length != 1
369            || CommonUtils.isBlank(line.substring(comment.getEndColNo() + 1));
370    }
371
372    /**
373     * Count regexp matches into String literal.
374     * @param pattern pattern.
375     * @param target String literal.
376     * @return count of regexp matches.
377     */
378    private static int countMatches(Pattern pattern, String target) {
379        int matcherCounter = 0;
380        final Matcher matcher = pattern.matcher(target);
381        while (matcher.find()) {
382            matcherCounter++;
383        }
384        return matcherCounter;
385    }
386
387    /**
388     * Checks if all characters in String literal is escaped.
389     * @param literal current literal.
390     * @return true if all characters in String literal is escaped.
391     */
392    private boolean isAllCharactersEscaped(String literal) {
393        return allowIfAllCharactersEscaped
394                && ALL_ESCAPED_CHARS.matcher(literal.substring(1,
395                        literal.length() - 1)).find();
396    }
397
398}