View Javadoc
1   ////////////////////////////////////////////////////////////////////////////////
2   // checkstyle: Checks Java source code for adherence to a set of rules.
3   // Copyright (C) 2001-2017 the original author or authors.
4   //
5   // This library is free software; you can redistribute it and/or
6   // modify it under the terms of the GNU Lesser General Public
7   // License as published by the Free Software Foundation; either
8   // version 2.1 of the License, or (at your option) any later version.
9   //
10  // This library is distributed in the hope that it will be useful,
11  // but WITHOUT ANY WARRANTY; without even the implied warranty of
12  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  // Lesser General Public License for more details.
14  //
15  // You should have received a copy of the GNU Lesser General Public
16  // License along with this library; if not, write to the Free Software
17  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18  ////////////////////////////////////////////////////////////////////////////////
19  
20  package com.puppycrawl.tools.checkstyle.checks;
21  
22  import java.util.List;
23  import java.util.Map;
24  import java.util.regex.Matcher;
25  import java.util.regex.Pattern;
26  
27  import com.puppycrawl.tools.checkstyle.FileStatefulCheck;
28  import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
29  import com.puppycrawl.tools.checkstyle.api.DetailAST;
30  import com.puppycrawl.tools.checkstyle.api.TextBlock;
31  import com.puppycrawl.tools.checkstyle.api.TokenTypes;
32  import com.puppycrawl.tools.checkstyle.utils.CommonUtils;
33  
34  /**
35   * <p>
36   * Restrict using <a href =
37   * "https://docs.oracle.com/javase/specs/jls/se8/html/jls-3.html#jls-3.3">
38   * Unicode escapes</a> (such as {@code &#92;u221e}).
39   * It is possible to allow using escapes for
40   * <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
41   * non-printable(control) characters</a>.
42   * Also, this check can be configured to allow escapes
43   * if a trailing comment is present, to allow escapes if the literal
44   * consists only of them, and to allow escapes for
45   * non-printable (whitespace) characters.
46   * </p>
47   * <p>
48   * Examples of using Unicode:</p>
49   * <pre>
50   * String unitAbbrev = "μs";      // Best: perfectly clear even without a comment.
51   * String unitAbbrev = "&#92;u03bcs"; // Poor: the reader has no idea what this is.
52   * </pre>
53   * <p>
54   * An example of how to configure the check is:
55   * </p>
56   * <pre>
57   * &lt;module name="AvoidEscapedUnicodeCharacters"/&gt;
58   * </pre>
59   * <p>
60   * An example of non-printable(control) characters.
61   * </p>
62   * <pre>
63   * return '&#92;ufeff' + content; // byte order mark
64   * </pre>
65   * <p>
66   * An example of how to configure the check to allow using escapes
67   * for non-printable(control) characters:
68   * </p>
69   * <pre>
70   * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
71   *     &lt;property name="allowEscapesForControlCharacters" value="true"/&gt;
72   * &lt;/module&gt;
73   * </pre>
74   * <p>
75   * Example of using escapes with trail comment:
76   * </p>
77   * <pre>
78   * String unitAbbrev = "&#92;u03bcs"; // Greek letter mu, "s"
79   * </pre>
80   * <p>An example of how to configure the check to allow using escapes
81   * if trail comment is present:
82   * </p>
83   * <pre>
84   * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
85   *     &lt;property name="allowByTailComment" value="true"/&gt;
86   * &lt;/module&gt;
87   * </pre>
88   * <p>Example of using escapes if literal contains only them:
89   * </p>
90   * <pre>
91   * String unitAbbrev = "&#92;u03bc&#92;u03bc&#92;u03bc";
92   * </pre>
93   * <p>An example of how to configure the check to allow escapes
94   * if literal contains only them:
95   * </p>
96   * <pre>
97   * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
98   *    &lt;property name="allowIfAllCharactersEscaped" value="true"/&gt;
99   * &lt;/module&gt;
100  * </pre>
101  * <p>An example of how to configure the check to allow non-printable escapes:
102  * </p>
103  * <pre>
104  * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
105  *    &lt;property name="allowNonPrintableEscapes" value="true"/&gt;
106  * &lt;/module&gt;
107  * </pre>
108  *
109  * @author maxvetrenko
110  *
111  */
112 @FileStatefulCheck
113 public class AvoidEscapedUnicodeCharactersCheck
114     extends AbstractCheck {
115     /**
116      * A key is pointing to the warning message text in "messages.properties"
117      * file.
118      */
119     public static final String MSG_KEY = "forbid.escaped.unicode.char";
120 
121     /** Regular expression for Unicode chars. */
122     private static final Pattern UNICODE_REGEXP = Pattern.compile("\\\\u[a-fA-F0-9]{4}");
123 
124     /**
125      * Regular expression Unicode control characters.
126      *
127      * @see <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
128      *     Appendix:Control characters</a>
129      */
130     private static final Pattern UNICODE_CONTROL = Pattern.compile("\\\\(u|U)"
131             + "(00[0-1][0-9A-Fa-f]|00[8-9][0-9A-Fa-f]|00(a|A)(d|D)|034(f|F)|070(f|F)"
132             + "|180(e|E)|200[b-fB-F]|202[a-eA-E]|206[0-4a-fA-F]"
133             + "|[fF]{3}[9a-bA-B]|[fF][eE][fF]{2})");
134 
135     /** Regular expression for all escaped chars. */
136     private static final Pattern ALL_ESCAPED_CHARS =
137             Pattern.compile("^((\\\\u)[a-fA-F0-9]{4}"
138                     + "||\\\\b|\\\\t|\\\\n|\\\\f|\\\\r|\\\\|\"|\')+$");
139 
140     /** Regular expression for escaped backslash. */
141     private static final Pattern ESCAPED_BACKSLASH = Pattern.compile("\\\\\\\\");
142 
143     /** Regular expression for non-printable unicode chars. */
144     private static final Pattern NON_PRINTABLE_CHARS = Pattern.compile("\\\\u1680|\\\\u2028"
145             + "|\\\\u2029|\\\\u205(f|F)|\\\\u3000|\\\\u2007|\\\\u2000|\\\\u200(a|A)"
146             + "|\\\\u007(F|f)|\\\\u009(f|F)|\\\\u(f|F){4}|\\\\u007(F|f)|\\\\u00(a|A)(d|D)"
147             + "|\\\\u0600|\\\\u061(c|C)|\\\\u06(d|D){2}|\\\\u070(f|F)|\\\\u1680|\\\\u180(e|E)"
148             + "|\\\\u2000|\\\\u2028|\\\\u205(f|F)|\\\\u2066|\\\\u2067|\\\\u2068|\\\\u2069"
149             + "|\\\\u206(a|A)|\\\\u(d|D)800|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){3}9"
150             + "|\\\\u(f|F){3}(a|A)|\\\\u0020|\\\\u00(a|A)0|\\\\u00(a|A)(d|D)|\\\\u0604"
151             + "|\\\\u061(c|C)|\\\\u06(d|D){2}|\\\\u070(f|F)|\\\\u1680|\\\\u180(e|E)|\\\\u200(f|F)"
152             + "|\\\\u202(f|F)|\\\\u2064|\\\\u2066|\\\\u2067|\\\\u2068|\\\\u2069|\\\\u206(f|F)"
153             + "|\\\\u(f|F)8(f|F){2}|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){3}9|\\\\u(f|F){3}(b|B)"
154             + "|\\\\u05(d|D)0|\\\\u05(f|F)3|\\\\u0600|\\\\u0750|\\\\u0(e|E)00|\\\\u1(e|E)00"
155             + "|\\\\u2100|\\\\u(f|F)(b|B)50|\\\\u(f|F)(e|E)70|\\\\u(F|f){2}61|\\\\u04(f|F)9"
156             + "|\\\\u05(b|B)(e|E)|\\\\u05(e|E)(a|A)|\\\\u05(f|F)4|\\\\u06(f|F){2}"
157             + "|\\\\u077(f|F)|\\\\u0(e|E)7(f|F)|\\\\u20(a|A)(f|F)|\\\\u213(a|A)|\\\\u0000"
158             + "|\\\\u(f|F)(d|D)(f|F){2}|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){2}(d|D)(c|C)"
159             + "|\\\\u2002|\\\\u0085|\\\\u200(a|A)|\\\\u2005|\\\\u2000|\\\\u2029|\\\\u000(B|b)"
160             + "|\\\\u2008|\\\\u2003|\\\\u205(f|F)|\\\\u1680|\\\\u0009|\\\\u0020|\\\\u2006"
161             + "|\\\\u2001|\\\\u202(f|F)|\\\\u00(a|A)0|\\\\u000(c|C)|\\\\u2009|\\\\u2004|\\\\u2028"
162             + "|\\\\u2028|\\\\u2007|\\\\u2004|\\\\u2028|\\\\u2007|\\\\u2025"
163             + "|\\\\u(f|F){2}0(e|E)|\\\\u(f|F){2}61");
164 
165     /** Cpp style comments. */
166     private Map<Integer, TextBlock> singlelineComments;
167     /** C style comments. */
168     private Map<Integer, List<TextBlock>> blockComments;
169 
170     /** Allow use escapes for non-printable(control) characters.  */
171     private boolean allowEscapesForControlCharacters;
172 
173     /** Allow use escapes if trail comment is present. */
174     private boolean allowByTailComment;
175 
176     /** Allow if all characters in literal are escaped. */
177     private boolean allowIfAllCharactersEscaped;
178 
179     /** Allow escapes for space literals. */
180     private boolean allowNonPrintableEscapes;
181 
182     /**
183      * Set allowIfAllCharactersEscaped.
184      * @param allow user's value.
185      */
186     public final void setAllowEscapesForControlCharacters(boolean allow) {
187         allowEscapesForControlCharacters = allow;
188     }
189 
190     /**
191      * Set allowByTailComment.
192      * @param allow user's value.
193      */
194     public final void setAllowByTailComment(boolean allow) {
195         allowByTailComment = allow;
196     }
197 
198     /**
199      * Set allowIfAllCharactersEscaped.
200      * @param allow user's value.
201      */
202     public final void setAllowIfAllCharactersEscaped(boolean allow) {
203         allowIfAllCharactersEscaped = allow;
204     }
205 
206     /**
207      * Set allowSpaceEscapes.
208      * @param allow user's value.
209      */
210     public final void setAllowNonPrintableEscapes(boolean allow) {
211         allowNonPrintableEscapes = allow;
212     }
213 
214     @Override
215     public int[] getDefaultTokens() {
216         return getRequiredTokens();
217     }
218 
219     @Override
220     public int[] getAcceptableTokens() {
221         return getRequiredTokens();
222     }
223 
224     @Override
225     public int[] getRequiredTokens() {
226         return new int[] {TokenTypes.STRING_LITERAL, TokenTypes.CHAR_LITERAL};
227     }
228 
229     @Override
230     public void beginTree(DetailAST rootAST) {
231         singlelineComments = getFileContents().getSingleLineComments();
232         blockComments = getFileContents().getBlockComments();
233     }
234 
235     @Override
236     public void visitToken(DetailAST ast) {
237 
238         final String literal = ast.getText();
239 
240         if (hasUnicodeChar(literal) && !(allowByTailComment && hasTrailComment(ast)
241                 || isAllCharactersEscaped(literal)
242                 || allowEscapesForControlCharacters
243                         && isOnlyUnicodeValidChars(literal, UNICODE_CONTROL)
244                 || allowNonPrintableEscapes
245                         && isOnlyUnicodeValidChars(literal, NON_PRINTABLE_CHARS))) {
246             log(ast.getLineNo(), MSG_KEY);
247         }
248     }
249 
250     /**
251      * Checks if literal has Unicode chars.
252      * @param literal String literal.
253      * @return true if literal has Unicode chars.
254      */
255     private static boolean hasUnicodeChar(String literal) {
256         final String literalWithoutEscapedBackslashes =
257                 ESCAPED_BACKSLASH.matcher(literal).replaceAll("");
258         return UNICODE_REGEXP.matcher(literalWithoutEscapedBackslashes).find();
259     }
260 
261     /**
262      * Check if String literal contains Unicode control chars.
263      * @param literal String literal.
264      * @param pattern RegExp for valid characters.
265      * @return true, if String literal contains Unicode control chars.
266      */
267     private static boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) {
268         final int unicodeMatchesCounter =
269                 countMatches(UNICODE_REGEXP, literal);
270         final int unicodeValidMatchesCounter =
271                 countMatches(pattern, literal);
272         return unicodeMatchesCounter - unicodeValidMatchesCounter == 0;
273     }
274 
275     /**
276      * Check if trail comment is present after ast token.
277      * @param ast current token.
278      * @return true if trail comment is present after ast token.
279      */
280     private boolean hasTrailComment(DetailAST ast) {
281         boolean result = false;
282         final int lineNo = ast.getLineNo();
283         if (singlelineComments.containsKey(lineNo)) {
284             result = true;
285         }
286         else {
287             final List<TextBlock> commentList = blockComments.get(lineNo);
288             if (commentList != null) {
289                 final TextBlock comment = commentList.get(commentList.size() - 1);
290                 final String line = getLines()[lineNo - 1];
291                 result = isTrailingBlockComment(comment, line);
292             }
293         }
294         return result;
295     }
296 
297     /**
298      * Whether the C style comment is trailing.
299      * @param comment the comment to check.
300      * @param line the line where the comment starts.
301      * @return true if the comment is trailing.
302      */
303     private static boolean isTrailingBlockComment(TextBlock comment, String line) {
304         return comment.getText().length != 1
305             || CommonUtils.isBlank(line.substring(comment.getEndColNo() + 1));
306     }
307 
308     /**
309      * Count regexp matches into String literal.
310      * @param pattern pattern.
311      * @param target String literal.
312      * @return count of regexp matches.
313      */
314     private static int countMatches(Pattern pattern, String target) {
315         int matcherCounter = 0;
316         final Matcher matcher = pattern.matcher(target);
317         while (matcher.find()) {
318             matcherCounter++;
319         }
320         return matcherCounter;
321     }
322 
323     /**
324      * Checks if all characters in String literal is escaped.
325      * @param literal current literal.
326      * @return true if all characters in String literal is escaped.
327      */
328     private boolean isAllCharactersEscaped(String literal) {
329         return allowIfAllCharactersEscaped
330                 && ALL_ESCAPED_CHARS.matcher(literal.substring(1,
331                         literal.length() - 1)).find();
332     }
333 }