View Javadoc
1   ////////////////////////////////////////////////////////////////////////////////
2   // checkstyle: Checks Java source code for adherence to a set of rules.
3   // Copyright (C) 2001-2018 the original author or authors.
4   //
5   // This library is free software; you can redistribute it and/or
6   // modify it under the terms of the GNU Lesser General Public
7   // License as published by the Free Software Foundation; either
8   // version 2.1 of the License, or (at your option) any later version.
9   //
10  // This library is distributed in the hope that it will be useful,
11  // but WITHOUT ANY WARRANTY; without even the implied warranty of
12  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  // Lesser General Public License for more details.
14  //
15  // You should have received a copy of the GNU Lesser General Public
16  // License along with this library; if not, write to the Free Software
17  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18  ////////////////////////////////////////////////////////////////////////////////
19  
20  package com.puppycrawl.tools.checkstyle.checks;
21  
22  import java.util.List;
23  import java.util.Map;
24  import java.util.regex.Matcher;
25  import java.util.regex.Pattern;
26  
27  import com.puppycrawl.tools.checkstyle.FileStatefulCheck;
28  import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
29  import com.puppycrawl.tools.checkstyle.api.DetailAST;
30  import com.puppycrawl.tools.checkstyle.api.TextBlock;
31  import com.puppycrawl.tools.checkstyle.api.TokenTypes;
32  import com.puppycrawl.tools.checkstyle.utils.CommonUtils;
33  
34  /**
35   * <p>
36   * Restrict using <a href =
37   * "https://docs.oracle.com/javase/specs/jls/se8/html/jls-3.html#jls-3.3">
38   * Unicode escapes</a> (such as <code>&#92;u221e</code>).
39   * It is possible to allow using escapes for
40   * <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
41   * non-printable(control) characters</a>.
42   * Also, this check can be configured to allow using escapes
43   * if trail comment is present. By the option it is possible to
44   * allow using escapes if literal contains only them. By the option it
45   * is possible to allow using escapes for space literals.
46   * </p>
47   * <p>
48   * Examples of using Unicode:</p>
49   * <pre>
50   * String unitAbbrev = "μs";      // Best: perfectly clear even without a comment.
51   * String unitAbbrev = "&#92;u03bcs"; // Poor: the reader has no idea what this is.
52   * </pre>
53   * <p>
54   * An example of how to configure the check is:
55   * </p>
56   * <pre>
57   * &lt;module name="AvoidEscapedUnicodeCharacters"/&gt;
58   * </pre>
59   * <p>
60   * An example of non-printable(control) characters.
61   * </p>
62   * <pre>
63   * return '&#92;ufeff' + content; // byte order mark
64   * </pre>
65   * <p>
66   * An example of how to configure the check to allow using escapes
67   * for non-printable(control) characters:
68   * </p>
69   * <pre>
70   * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
71   *     &lt;property name="allowEscapesForControlCharacters" value="true"/&gt;
72   * &lt;/module&gt;
73   * </pre>
74   * <p>
75   * Example of using escapes with trail comment:
76   * </p>
77   * <pre>
78   * String unitAbbrev = "&#92;u03bcs"; // Greek letter mu, "s"
79   * </pre>
80   * <p>An example of how to configure the check to allow using escapes
81   * if trail comment is present:
82   * </p>
83   * <pre>
84   * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
85   *     &lt;property name="allowByTailComment" value="true"/&gt;
86   * &lt;/module&gt;
87   * </pre>
88   * <p>Example of using escapes if literal contains only them:
89   * </p>
90   * <pre>
91   * String unitAbbrev = "&#92;u03bc&#92;u03bc&#92;u03bc";
92   * </pre>
93   * <p>An example of how to configure the check to allow escapes
94   * if literal contains only them:
95   * </p>
96   * <pre>
97   * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
98   *    &lt;property name="allowIfAllCharactersEscaped" value="true"/&gt;
99   * &lt;/module&gt;
100  * </pre>
101  * <p>An example of how to configure the check to allow non-printable escapes:
102  * </p>
103  * <pre>
104  * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
105  *    &lt;property name="allowNonPrintableEscapes" value="true"/&gt;
106  * &lt;/module&gt;
107  * </pre>
108  *
109  * @author maxvetrenko
110  * @noinspection HtmlTagCanBeJavadocTag
111  */
112 @FileStatefulCheck
113 public class AvoidEscapedUnicodeCharactersCheck
114     extends AbstractCheck {
115 
116     /**
117      * A key is pointing to the warning message text in "messages.properties"
118      * file.
119      */
120     public static final String MSG_KEY = "forbid.escaped.unicode.char";
121 
122     /** Regular expression for Unicode chars. */
123     private static final Pattern UNICODE_REGEXP = Pattern.compile("\\\\u[a-fA-F0-9]{4}");
124 
125     /**
126      * Regular expression Unicode control characters.
127      *
128      * @see <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
129      *     Appendix:Control characters</a>
130      */
131     private static final Pattern UNICODE_CONTROL = Pattern.compile("\\\\[uU]"
132             + "(00[0-1][0-9A-Fa-f]"
133             + "|00[8-9][0-9A-Fa-f]"
134             + "|00[aA][dD]"
135             + "|034[fF]"
136             + "|070[fF]"
137             + "|180[eE]"
138             + "|200[b-fB-F]"
139             + "|202[a-eA-E]"
140             + "|206[0-4a-fA-F]"
141             + "|[fF]{3}[9a-bA-B]"
142             + "|[fF][eE][fF]{2})");
143 
144     /** Regular expression for all escaped chars. */
145     private static final Pattern ALL_ESCAPED_CHARS = Pattern.compile("^((\\\\u)[a-fA-F0-9]{4}"
146             + "|\""
147             + "|\'"
148             + "|\\\\"
149             + "|\\\\b"
150             + "|\\\\f"
151             + "|\\\\n"
152             + "|\\\\r"
153             + "|\\\\t"
154             + ")+$");
155 
156     /** Regular expression for escaped backslash. */
157     private static final Pattern ESCAPED_BACKSLASH = Pattern.compile("\\\\\\\\");
158 
159     /** Regular expression for non-printable unicode chars. */
160     private static final Pattern NON_PRINTABLE_CHARS = Pattern.compile("\\\\u0000"
161             + "|\\\\u0009"
162             + "|\\\\u000[bB]"
163             + "|\\\\u000[cC]"
164             + "|\\\\u0020"
165             + "|\\\\u007[fF]"
166             + "|\\\\u0085"
167             + "|\\\\u009[fF]"
168             + "|\\\\u00[aA]0"
169             + "|\\\\u00[aA][dD]"
170             + "|\\\\u04[fF]9"
171             + "|\\\\u05[bB][eE]"
172             + "|\\\\u05[dD]0"
173             + "|\\\\u05[eE][aA]"
174             + "|\\\\u05[fF]3"
175             + "|\\\\u05[fF]4"
176             + "|\\\\u0600"
177             + "|\\\\u0604"
178             + "|\\\\u061[cC]"
179             + "|\\\\u06[dD]{2}"
180             + "|\\\\u06[fF]{2}"
181             + "|\\\\u070[fF]"
182             + "|\\\\u0750"
183             + "|\\\\u077[fF]"
184             + "|\\\\u0[eE]00"
185             + "|\\\\u0[eE]7[fF]"
186             + "|\\\\u1680"
187             + "|\\\\u180[eE]"
188             + "|\\\\u1[eE]00"
189             + "|\\\\u2000"
190             + "|\\\\u2001"
191             + "|\\\\u2002"
192             + "|\\\\u2003"
193             + "|\\\\u2004"
194             + "|\\\\u2005"
195             + "|\\\\u2006"
196             + "|\\\\u2007"
197             + "|\\\\u2008"
198             + "|\\\\u2009"
199             + "|\\\\u200[aA]"
200             + "|\\\\u200[fF]"
201             + "|\\\\u2025"
202             + "|\\\\u2028"
203             + "|\\\\u2029"
204             + "|\\\\u202[fF]"
205             + "|\\\\u205[fF]"
206             + "|\\\\u2064"
207             + "|\\\\u2066"
208             + "|\\\\u2067"
209             + "|\\\\u2068"
210             + "|\\\\u2069"
211             + "|\\\\u206[aA]"
212             + "|\\\\u206[fF]"
213             + "|\\\\u20[aA][fF]"
214             + "|\\\\u2100"
215             + "|\\\\u213[aA]"
216             + "|\\\\u3000"
217             + "|\\\\u[dD]800"
218             + "|\\\\u[fF]8[fF]{2}"
219             + "|\\\\u[fF][bB]50"
220             + "|\\\\u[fF][dD][fF]{2}"
221             + "|\\\\u[fF][eE]70"
222             + "|\\\\u[fF][eE][fF]{2}"
223             + "|\\\\u[fF]{2}0[eE]"
224             + "|\\\\u[fF]{2}61"
225             + "|\\\\u[fF]{2}[dD][cC]"
226             + "|\\\\u[fF]{3}9"
227             + "|\\\\u[fF]{3}[aA]"
228             + "|\\\\u[fF]{3}[bB]"
229             + "|\\\\u[fF]{4}");
230 
231     /** Cpp style comments. */
232     private Map<Integer, TextBlock> singlelineComments;
233     /** C style comments. */
234     private Map<Integer, List<TextBlock>> blockComments;
235 
236     /** Allow use escapes for non-printable(control) characters.  */
237     private boolean allowEscapesForControlCharacters;
238 
239     /** Allow use escapes if trail comment is present. */
240     private boolean allowByTailComment;
241 
242     /** Allow if all characters in literal are escaped. */
243     private boolean allowIfAllCharactersEscaped;
244 
245     /** Allow escapes for space literals. */
246     private boolean allowNonPrintableEscapes;
247 
248     /**
249      * Set allowIfAllCharactersEscaped.
250      * @param allow user's value.
251      */
252     public final void setAllowEscapesForControlCharacters(boolean allow) {
253         allowEscapesForControlCharacters = allow;
254     }
255 
256     /**
257      * Set allowByTailComment.
258      * @param allow user's value.
259      */
260     public final void setAllowByTailComment(boolean allow) {
261         allowByTailComment = allow;
262     }
263 
264     /**
265      * Set allowIfAllCharactersEscaped.
266      * @param allow user's value.
267      */
268     public final void setAllowIfAllCharactersEscaped(boolean allow) {
269         allowIfAllCharactersEscaped = allow;
270     }
271 
272     /**
273      * Set allowSpaceEscapes.
274      * @param allow user's value.
275      */
276     public final void setAllowNonPrintableEscapes(boolean allow) {
277         allowNonPrintableEscapes = allow;
278     }
279 
280     @Override
281     public int[] getDefaultTokens() {
282         return getRequiredTokens();
283     }
284 
285     @Override
286     public int[] getAcceptableTokens() {
287         return getRequiredTokens();
288     }
289 
290     @Override
291     public int[] getRequiredTokens() {
292         return new int[] {TokenTypes.STRING_LITERAL, TokenTypes.CHAR_LITERAL};
293     }
294 
295     @Override
296     public void beginTree(DetailAST rootAST) {
297         singlelineComments = getFileContents().getSingleLineComments();
298         blockComments = getFileContents().getBlockComments();
299     }
300 
301     @Override
302     public void visitToken(DetailAST ast) {
303         final String literal = ast.getText();
304 
305         if (hasUnicodeChar(literal) && !(allowByTailComment && hasTrailComment(ast)
306                 || isAllCharactersEscaped(literal)
307                 || allowEscapesForControlCharacters
308                         && isOnlyUnicodeValidChars(literal, UNICODE_CONTROL)
309                 || allowNonPrintableEscapes
310                         && isOnlyUnicodeValidChars(literal, NON_PRINTABLE_CHARS))) {
311             log(ast.getLineNo(), MSG_KEY);
312         }
313     }
314 
315     /**
316      * Checks if literal has Unicode chars.
317      * @param literal String literal.
318      * @return true if literal has Unicode chars.
319      */
320     private static boolean hasUnicodeChar(String literal) {
321         final String literalWithoutEscapedBackslashes =
322                 ESCAPED_BACKSLASH.matcher(literal).replaceAll("");
323         return UNICODE_REGEXP.matcher(literalWithoutEscapedBackslashes).find();
324     }
325 
326     /**
327      * Check if String literal contains Unicode control chars.
328      * @param literal String literal.
329      * @param pattern RegExp for valid characters.
330      * @return true, if String literal contains Unicode control chars.
331      */
332     private static boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) {
333         final int unicodeMatchesCounter =
334                 countMatches(UNICODE_REGEXP, literal);
335         final int unicodeValidMatchesCounter =
336                 countMatches(pattern, literal);
337         return unicodeMatchesCounter - unicodeValidMatchesCounter == 0;
338     }
339 
340     /**
341      * Check if trail comment is present after ast token.
342      * @param ast current token.
343      * @return true if trail comment is present after ast token.
344      */
345     private boolean hasTrailComment(DetailAST ast) {
346         boolean result = false;
347         final int lineNo = ast.getLineNo();
348         if (singlelineComments.containsKey(lineNo)) {
349             result = true;
350         }
351         else {
352             final List<TextBlock> commentList = blockComments.get(lineNo);
353             if (commentList != null) {
354                 final TextBlock comment = commentList.get(commentList.size() - 1);
355                 final String line = getLines()[lineNo - 1];
356                 result = isTrailingBlockComment(comment, line);
357             }
358         }
359         return result;
360     }
361 
362     /**
363      * Whether the C style comment is trailing.
364      * @param comment the comment to check.
365      * @param line the line where the comment starts.
366      * @return true if the comment is trailing.
367      */
368     private static boolean isTrailingBlockComment(TextBlock comment, String line) {
369         return comment.getText().length != 1
370             || CommonUtils.isBlank(line.substring(comment.getEndColNo() + 1));
371     }
372 
373     /**
374      * Count regexp matches into String literal.
375      * @param pattern pattern.
376      * @param target String literal.
377      * @return count of regexp matches.
378      */
379     private static int countMatches(Pattern pattern, String target) {
380         int matcherCounter = 0;
381         final Matcher matcher = pattern.matcher(target);
382         while (matcher.find()) {
383             matcherCounter++;
384         }
385         return matcherCounter;
386     }
387 
388     /**
389      * Checks if all characters in String literal is escaped.
390      * @param literal current literal.
391      * @return true if all characters in String literal is escaped.
392      */
393     private boolean isAllCharactersEscaped(String literal) {
394         return allowIfAllCharactersEscaped
395                 && ALL_ESCAPED_CHARS.matcher(literal.substring(1,
396                         literal.length() - 1)).find();
397     }
398 
399 }