Coverage Report - com.puppycrawl.tools.checkstyle.checks.AvoidEscapedUnicodeCharactersCheck
 
Classes in this File Line Coverage Branch Coverage Complexity
AvoidEscapedUnicodeCharactersCheck
100%
56/56
100%
32/32
1.867
 
 1  
 ////////////////////////////////////////////////////////////////////////////////
 2  
 // checkstyle: Checks Java source code for adherence to a set of rules.
 3  
 // Copyright (C) 2001-2017 the original author or authors.
 4  
 //
 5  
 // This library is free software; you can redistribute it and/or
 6  
 // modify it under the terms of the GNU Lesser General Public
 7  
 // License as published by the Free Software Foundation; either
 8  
 // version 2.1 of the License, or (at your option) any later version.
 9  
 //
 10  
 // This library is distributed in the hope that it will be useful,
 11  
 // but WITHOUT ANY WARRANTY; without even the implied warranty of
 12  
 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 13  
 // Lesser General Public License for more details.
 14  
 //
 15  
 // You should have received a copy of the GNU Lesser General Public
 16  
 // License along with this library; if not, write to the Free Software
 17  
 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 18  
 ////////////////////////////////////////////////////////////////////////////////
 19  
 
 20  
 package com.puppycrawl.tools.checkstyle.checks;
 21  
 
 22  
 import java.util.List;
 23  
 import java.util.Map;
 24  
 import java.util.regex.Matcher;
 25  
 import java.util.regex.Pattern;
 26  
 
 27  
 import com.puppycrawl.tools.checkstyle.FileStatefulCheck;
 28  
 import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
 29  
 import com.puppycrawl.tools.checkstyle.api.DetailAST;
 30  
 import com.puppycrawl.tools.checkstyle.api.TextBlock;
 31  
 import com.puppycrawl.tools.checkstyle.api.TokenTypes;
 32  
 import com.puppycrawl.tools.checkstyle.utils.CommonUtils;
 33  
 
 34  
 /**
 35  
  * <p>
 36  
  * Restrict using <a href =
 37  
  * "https://docs.oracle.com/javase/specs/jls/se8/html/jls-3.html#jls-3.3">
 38  
  * Unicode escapes</a> (such as {@code &#92;u221e}).
 39  
  * It is possible to allow using escapes for
 40  
  * <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
 41  
  * non-printable(control) characters</a>.
 42  
  * Also, this check can be configured to allow using escapes
 43  
  * if a trailing comment is present. One option allows
 44  
  * using escapes if the literal contains only them. Another option
 45  
  * allows using escapes for non-printable (whitespace) characters.
 46  
  * </p>
 47  
  * <p>
 48  
  * Examples of using Unicode:</p>
 49  
  * <pre>
 50  
  * String unitAbbrev = "μs";      // Best: perfectly clear even without a comment.
 51  
  * String unitAbbrev = "&#92;u03bcs"; // Poor: the reader has no idea what this is.
 52  
  * </pre>
 53  
  * <p>
 54  
  * An example of how to configure the check is:
 55  
  * </p>
 56  
  * <pre>
 57  
  * &lt;module name="AvoidEscapedUnicodeCharacters"/&gt;
 58  
  * </pre>
 59  
  * <p>
 60  
  * An example of non-printable(control) characters.
 61  
  * </p>
 62  
  * <pre>
 63  
  * return '&#92;ufeff' + content; // byte order mark
 64  
  * </pre>
 65  
  * <p>
 66  
  * An example of how to configure the check to allow using escapes
 67  
  * for non-printable(control) characters:
 68  
  * </p>
 69  
  * <pre>
 70  
  * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
 71  
  *     &lt;property name="allowEscapesForControlCharacters" value="true"/&gt;
 72  
  * &lt;/module&gt;
 73  
  * </pre>
 74  
  * <p>
 75  
  * Example of using escapes with a trailing comment:
 76  
  * </p>
 77  
  * <pre>
 78  
  * String unitAbbrev = "&#92;u03bcs"; // Greek letter mu, "s"
 79  
  * </pre>
 80  
  * <p>An example of how to configure the check to allow using escapes
 81  
  * if a trailing comment is present:
 82  
  * </p>
 83  
  * <pre>
 84  
  * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
 85  
  *     &lt;property name="allowByTailComment" value="true"/&gt;
 86  
  * &lt;/module&gt;
 87  
  * </pre>
 88  
  * <p>Example of using escapes if literal contains only them:
 89  
  * </p>
 90  
  * <pre>
 91  
  * String unitAbbrev = "&#92;u03bc&#92;u03bc&#92;u03bc";
 92  
  * </pre>
 93  
  * <p>An example of how to configure the check to allow escapes
 94  
  * if literal contains only them:
 95  
  * </p>
 96  
  * <pre>
 97  
  * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
 98  
  *    &lt;property name="allowIfAllCharactersEscaped" value="true"/&gt;
 99  
  * &lt;/module&gt;
 100  
  * </pre>
 101  
  * <p>An example of how to configure the check to allow non-printable escapes:
 102  
  * </p>
 103  
  * <pre>
 104  
  * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
 105  
  *    &lt;property name="allowNonPrintableEscapes" value="true"/&gt;
 106  
  * &lt;/module&gt;
 107  
  * </pre>
 108  
  *
 109  
  * @author maxvetrenko
 110  
  *
 111  
  */
 112  
 @FileStatefulCheck
 113  22
 public class AvoidEscapedUnicodeCharactersCheck
 114  
     extends AbstractCheck {
 115  
     /**
 116  
      * A key is pointing to the warning message text in "messages.properties"
 117  
      * file.
 118  
      */
 119  
     public static final String MSG_KEY = "forbid.escaped.unicode.char";
 120  
 
 121  
     /** Regular expression for Unicode chars. */
 122  2
     private static final Pattern UNICODE_REGEXP = Pattern.compile("\\\\u[a-fA-F0-9]{4}");
 123  
 
 124  
     /**
 125  
      * Regular expression Unicode control characters.
 126  
      *
 127  
      * @see <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
 128  
      *     Appendix:Control characters</a>
 129  
      */
 130  2
     private static final Pattern UNICODE_CONTROL = Pattern.compile("\\\\(u|U)"
 131  
             + "(00[0-1][0-9A-Fa-f]|00[8-9][0-9A-Fa-f]|00(a|A)(d|D)|034(f|F)|070(f|F)"
 132  
             + "|180(e|E)|200[b-fB-F]|202[a-eA-E]|206[0-4a-fA-F]"
 133  
             + "|[fF]{3}[9a-bA-B]|[fF][eE][fF]{2})");
 134  
 
 135  
     /** Regular expression for all escaped chars. */
 136  2
     private static final Pattern ALL_ESCAPED_CHARS =
 137  2
             Pattern.compile("^((\\\\u)[a-fA-F0-9]{4}"
 138  
                     + "||\\\\b|\\\\t|\\\\n|\\\\f|\\\\r|\\\\|\"|\')+$");
 139  
 
 140  
     /** Regular expression for escaped backslash. */
 141  2
     private static final Pattern ESCAPED_BACKSLASH = Pattern.compile("\\\\\\\\");
 142  
 
 143  
     /** Regular expression for non-printable unicode chars. */
 144  2
     private static final Pattern NON_PRINTABLE_CHARS = Pattern.compile("\\\\u1680|\\\\u2028"
 145  
             + "|\\\\u2029|\\\\u205(f|F)|\\\\u3000|\\\\u2007|\\\\u2000|\\\\u200(a|A)"
 146  
             + "|\\\\u007(F|f)|\\\\u009(f|F)|\\\\u(f|F){4}|\\\\u007(F|f)|\\\\u00(a|A)(d|D)"
 147  
             + "|\\\\u0600|\\\\u061(c|C)|\\\\u06(d|D){2}|\\\\u070(f|F)|\\\\u1680|\\\\u180(e|E)"
 148  
             + "|\\\\u2000|\\\\u2028|\\\\u205(f|F)|\\\\u2066|\\\\u2067|\\\\u2068|\\\\u2069"
 149  
             + "|\\\\u206(a|A)|\\\\u(d|D)800|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){3}9"
 150  
             + "|\\\\u(f|F){3}(a|A)|\\\\u0020|\\\\u00(a|A)0|\\\\u00(a|A)(d|D)|\\\\u0604"
 151  
             + "|\\\\u061(c|C)|\\\\u06(d|D){2}|\\\\u070(f|F)|\\\\u1680|\\\\u180(e|E)|\\\\u200(f|F)"
 152  
             + "|\\\\u202(f|F)|\\\\u2064|\\\\u2066|\\\\u2067|\\\\u2068|\\\\u2069|\\\\u206(f|F)"
 153  
             + "|\\\\u(f|F)8(f|F){2}|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){3}9|\\\\u(f|F){3}(b|B)"
 154  
             + "|\\\\u05(d|D)0|\\\\u05(f|F)3|\\\\u0600|\\\\u0750|\\\\u0(e|E)00|\\\\u1(e|E)00"
 155  
             + "|\\\\u2100|\\\\u(f|F)(b|B)50|\\\\u(f|F)(e|E)70|\\\\u(F|f){2}61|\\\\u04(f|F)9"
 156  
             + "|\\\\u05(b|B)(e|E)|\\\\u05(e|E)(a|A)|\\\\u05(f|F)4|\\\\u06(f|F){2}"
 157  
             + "|\\\\u077(f|F)|\\\\u0(e|E)7(f|F)|\\\\u20(a|A)(f|F)|\\\\u213(a|A)|\\\\u0000"
 158  
             + "|\\\\u(f|F)(d|D)(f|F){2}|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){2}(d|D)(c|C)"
 159  
             + "|\\\\u2002|\\\\u0085|\\\\u200(a|A)|\\\\u2005|\\\\u2000|\\\\u2029|\\\\u000(B|b)"
 160  
             + "|\\\\u2008|\\\\u2003|\\\\u205(f|F)|\\\\u1680|\\\\u0009|\\\\u0020|\\\\u2006"
 161  
             + "|\\\\u2001|\\\\u202(f|F)|\\\\u00(a|A)0|\\\\u000(c|C)|\\\\u2009|\\\\u2004|\\\\u2028"
 162  
             + "|\\\\u2028|\\\\u2007|\\\\u2004|\\\\u2028|\\\\u2007|\\\\u2025"
 163  
             + "|\\\\u(f|F){2}0(e|E)|\\\\u(f|F){2}61");
 164  
 
 165  
     /** Cpp style comments. */
 166  
     private Map<Integer, TextBlock> singlelineComments;
 167  
     /** C style comments. */
 168  
     private Map<Integer, List<TextBlock>> blockComments;
 169  
 
 170  
     /** Allow use escapes for non-printable(control) characters.  */
 171  
     private boolean allowEscapesForControlCharacters;
 172  
 
 173  
     /** Allow use escapes if trail comment is present. */
 174  
     private boolean allowByTailComment;
 175  
 
 176  
     /** Allow if all characters in literal are escaped. */
 177  
     private boolean allowIfAllCharactersEscaped;
 178  
 
 179  
     /** Allow escapes for space literals. */
 180  
     private boolean allowNonPrintableEscapes;
 181  
 
 182  
     /**
 183  
      * Set allowIfAllCharactersEscaped.
 184  
      * @param allow user's value.
 185  
      */
 186  
     public final void setAllowEscapesForControlCharacters(boolean allow) {
 187  4
         allowEscapesForControlCharacters = allow;
 188  4
     }
 189  
 
 190  
     /**
 191  
      * Set allowByTailComment.
 192  
      * @param allow user's value.
 193  
      */
 194  
     public final void setAllowByTailComment(boolean allow) {
 195  3
         allowByTailComment = allow;
 196  3
     }
 197  
 
 198  
     /**
 199  
      * Set allowIfAllCharactersEscaped.
 200  
      * @param allow user's value.
 201  
      */
 202  
     public final void setAllowIfAllCharactersEscaped(boolean allow) {
 203  2
         allowIfAllCharactersEscaped = allow;
 204  2
     }
 205  
 
 206  
     /**
 207  
      * Set allowSpaceEscapes.
 208  
      * @param allow user's value.
 209  
      */
 210  
     public final void setAllowNonPrintableEscapes(boolean allow) {
 211  3
         allowNonPrintableEscapes = allow;
 212  3
     }
 213  
 
 214  
     @Override
 215  
     public int[] getDefaultTokens() {
 216  31
         return getRequiredTokens();
 217  
     }
 218  
 
 219  
     @Override
 220  
     public int[] getAcceptableTokens() {
 221  6
         return getRequiredTokens();
 222  
     }
 223  
 
 224  
     @Override
 225  
     public int[] getRequiredTokens() {
 226  69
         return new int[] {TokenTypes.STRING_LITERAL, TokenTypes.CHAR_LITERAL};
 227  
     }
 228  
 
 229  
     @Override
 230  
     public void beginTree(DetailAST rootAST) {
 231  8
         singlelineComments = getFileContents().getSingleLineComments();
 232  8
         blockComments = getFileContents().getBlockComments();
 233  8
     }
 234  
 
 235  
     @Override
 236  
     public void visitToken(DetailAST ast) {
 237  
 
 238  66067
         final String literal = ast.getText();
 239  
 
 240  66067
         if (hasUnicodeChar(literal) && !(allowByTailComment && hasTrailComment(ast)
 241  65704
                 || isAllCharactersEscaped(literal)
 242  
                 || allowEscapesForControlCharacters
 243  65571
                         && isOnlyUnicodeValidChars(literal, UNICODE_CONTROL)
 244  
                 || allowNonPrintableEscapes
 245  37
                         && isOnlyUnicodeValidChars(literal, NON_PRINTABLE_CHARS))) {
 246  65568
             log(ast.getLineNo(), MSG_KEY);
 247  
         }
 248  66067
     }
 249  
 
 250  
     /**
 251  
      * Checks if literal has Unicode chars.
 252  
      * @param literal String literal.
 253  
      * @return true if literal has Unicode chars.
 254  
      */
 255  
     private static boolean hasUnicodeChar(String literal) {
 256  66067
         final String literalWithoutEscapedBackslashes =
 257  66067
                 ESCAPED_BACKSLASH.matcher(literal).replaceAll("");
 258  66067
         return UNICODE_REGEXP.matcher(literalWithoutEscapedBackslashes).find();
 259  
     }
 260  
 
 261  
     /**
 262  
      * Check if String literal contains Unicode control chars.
 263  
      * @param literal String literal.
 264  
      * @param pattern RegExp for valid characters.
 265  
      * @return true, if String literal contains Unicode control chars.
 266  
      */
 267  
     private static boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) {
 268  65608
         final int unicodeMatchesCounter =
 269  65608
                 countMatches(UNICODE_REGEXP, literal);
 270  65608
         final int unicodeValidMatchesCounter =
 271  65608
                 countMatches(pattern, literal);
 272  65608
         return unicodeMatchesCounter - unicodeValidMatchesCounter == 0;
 273  
     }
 274  
 
 275  
     /**
 276  
      * Check if trail comment is present after ast token.
 277  
      * @param ast current token.
 278  
      * @return true if trail comment is present after ast token.
 279  
      */
 280  
     private boolean hasTrailComment(DetailAST ast) {
 281  37
         boolean result = false;
 282  37
         final int lineNo = ast.getLineNo();
 283  37
         if (singlelineComments.containsKey(lineNo)) {
 284  12
             result = true;
 285  
         }
 286  
         else {
 287  25
             final List<TextBlock> commentList = blockComments.get(lineNo);
 288  25
             if (commentList != null) {
 289  4
                 final TextBlock comment = commentList.get(commentList.size() - 1);
 290  4
                 final String line = getLines()[lineNo - 1];
 291  4
                 result = isTrailingBlockComment(comment, line);
 292  
             }
 293  
         }
 294  37
         return result;
 295  
     }
 296  
 
 297  
     /**
 298  
      * Whether the C style comment is trailing.
 299  
      * @param comment the comment to check.
 300  
      * @param line the line where the comment starts.
 301  
      * @return true if the comment is trailing.
 302  
      */
 303  
     private static boolean isTrailingBlockComment(TextBlock comment, String line) {
 304  8
         return comment.getText().length != 1
 305  3
             || CommonUtils.isBlank(line.substring(comment.getEndColNo() + 1));
 306  
     }
 307  
 
 308  
     /**
 309  
      * Count regexp matches into String literal.
 310  
      * @param pattern pattern.
 311  
      * @param target String literal.
 312  
      * @return count of regexp matches.
 313  
      */
 314  
     private static int countMatches(Pattern pattern, String target) {
 315  131217
         int matcherCounter = 0;
 316  131217
         final Matcher matcher = pattern.matcher(target);
 317  197030
         while (matcher.find()) {
 318  65813
             matcherCounter++;
 319  
         }
 320  131217
         return matcherCounter;
 321  
     }
 322  
 
 323  
     /**
 324  
      * Checks if all characters in String literal is escaped.
 325  
      * @param literal current literal.
 326  
      * @return true if all characters in String literal is escaped.
 327  
      */
 328  
     private boolean isAllCharactersEscaped(String literal) {
 329  131408
         return allowIfAllCharactersEscaped
 330  37
                 && ALL_ESCAPED_CHARS.matcher(literal.substring(1,
 331  74
                         literal.length() - 1)).find();
 332  
     }
 333  
 }