001////////////////////////////////////////////////////////////////////////////////
002// checkstyle: Checks Java source code for adherence to a set of rules.
003// Copyright (C) 2001-2017 the original author or authors.
004//
005// This library is free software; you can redistribute it and/or
006// modify it under the terms of the GNU Lesser General Public
007// License as published by the Free Software Foundation; either
008// version 2.1 of the License, or (at your option) any later version.
009//
010// This library is distributed in the hope that it will be useful,
011// but WITHOUT ANY WARRANTY; without even the implied warranty of
012// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
013// Lesser General Public License for more details.
014//
015// You should have received a copy of the GNU Lesser General Public
016// License along with this library; if not, write to the Free Software
017// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
018////////////////////////////////////////////////////////////////////////////////
019
020package com.puppycrawl.tools.checkstyle.checks;
021
022import java.util.List;
023import java.util.Map;
024import java.util.regex.Matcher;
025import java.util.regex.Pattern;
026
027import com.puppycrawl.tools.checkstyle.FileStatefulCheck;
028import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
029import com.puppycrawl.tools.checkstyle.api.DetailAST;
030import com.puppycrawl.tools.checkstyle.api.TextBlock;
031import com.puppycrawl.tools.checkstyle.api.TokenTypes;
032import com.puppycrawl.tools.checkstyle.utils.CommonUtils;
033
034/**
035 * <p>
036 * Restrict using <a href =
037 * "https://docs.oracle.com/javase/specs/jls/se8/html/jls-3.html#jls-3.3">
038 * Unicode escapes</a> (such as {@code &#92;u221e}).
039 * It is possible to allow using escapes for
040 * <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
041 * non-printable(control) characters</a>.
042 * Also, this check can be configured to allow using escapes
043 * if trail comment is present. By the option it is possible to
044 * allow using escapes if literal contains only them. By the option it
045 * is possible to allow using escapes for space literals.
046 * </p>
047 * <p>
048 * Examples of using Unicode:</p>
049 * <pre>
050 * String unitAbbrev = "μs";      // Best: perfectly clear even without a comment.
051 * String unitAbbrev = "&#92;u03bcs"; // Poor: the reader has no idea what this is.
052 * </pre>
053 * <p>
054 * An example of how to configure the check is:
055 * </p>
056 * <pre>
057 * &lt;module name="AvoidEscapedUnicodeCharacters"/&gt;
058 * </pre>
059 * <p>
060 * An example of non-printable(control) characters.
061 * </p>
062 * <pre>
063 * return '&#92;ufeff' + content; // byte order mark
064 * </pre>
065 * <p>
066 * An example of how to configure the check to allow using escapes
067 * for non-printable(control) characters:
068 * </p>
069 * <pre>
070 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
071 *     &lt;property name="allowEscapesForControlCharacters" value="true"/&gt;
072 * &lt;/module&gt;
073 * </pre>
074 * <p>
075 * Example of using escapes with trail comment:
076 * </p>
077 * <pre>
078 * String unitAbbrev = "&#92;u03bcs"; // Greek letter mu, "s"
079 * </pre>
080 * <p>An example of how to configure the check to allow using escapes
081 * if trail comment is present:
082 * </p>
083 * <pre>
084 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
085 *     &lt;property name="allowByTailComment" value="true"/&gt;
086 * &lt;/module&gt;
087 * </pre>
088 * <p>Example of using escapes if literal contains only them:
089 * </p>
090 * <pre>
091 * String unitAbbrev = "&#92;u03bc&#92;u03bc&#92;u03bc";
092 * </pre>
093 * <p>An example of how to configure the check to allow escapes
094 * if literal contains only them:
095 * </p>
096 * <pre>
097 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
098 *    &lt;property name="allowIfAllCharactersEscaped" value="true"/&gt;
099 * &lt;/module&gt;
100 * </pre>
101 * <p>An example of how to configure the check to allow non-printable escapes:
102 * </p>
103 * <pre>
104 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
105 *    &lt;property name="allowNonPrintableEscapes" value="true"/&gt;
106 * &lt;/module&gt;
107 * </pre>
108 *
109 * @author maxvetrenko
110 *
111 */
112@FileStatefulCheck
113public class AvoidEscapedUnicodeCharactersCheck
114    extends AbstractCheck {
115    /**
116     * A key is pointing to the warning message text in "messages.properties"
117     * file.
118     */
119    public static final String MSG_KEY = "forbid.escaped.unicode.char";
120
121    /** Regular expression for Unicode chars. */
122    private static final Pattern UNICODE_REGEXP = Pattern.compile("\\\\u[a-fA-F0-9]{4}");
123
124    /**
125     * Regular expression Unicode control characters.
126     *
127     * @see <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
128     *     Appendix:Control characters</a>
129     */
130    private static final Pattern UNICODE_CONTROL = Pattern.compile("\\\\(u|U)"
131            + "(00[0-1][0-9A-Fa-f]|00[8-9][0-9A-Fa-f]|00(a|A)(d|D)|034(f|F)|070(f|F)"
132            + "|180(e|E)|200[b-fB-F]|202[a-eA-E]|206[0-4a-fA-F]"
133            + "|[fF]{3}[9a-bA-B]|[fF][eE][fF]{2})");
134
135    /** Regular expression for all escaped chars. */
136    private static final Pattern ALL_ESCAPED_CHARS =
137            Pattern.compile("^((\\\\u)[a-fA-F0-9]{4}"
138                    + "||\\\\b|\\\\t|\\\\n|\\\\f|\\\\r|\\\\|\"|\')+$");
139
140    /** Regular expression for escaped backslash. */
141    private static final Pattern ESCAPED_BACKSLASH = Pattern.compile("\\\\\\\\");
142
143    /** Regular expression for non-printable unicode chars. */
144    private static final Pattern NON_PRINTABLE_CHARS = Pattern.compile("\\\\u1680|\\\\u2028"
145            + "|\\\\u2029|\\\\u205(f|F)|\\\\u3000|\\\\u2007|\\\\u2000|\\\\u200(a|A)"
146            + "|\\\\u007(F|f)|\\\\u009(f|F)|\\\\u(f|F){4}|\\\\u007(F|f)|\\\\u00(a|A)(d|D)"
147            + "|\\\\u0600|\\\\u061(c|C)|\\\\u06(d|D){2}|\\\\u070(f|F)|\\\\u1680|\\\\u180(e|E)"
148            + "|\\\\u2000|\\\\u2028|\\\\u205(f|F)|\\\\u2066|\\\\u2067|\\\\u2068|\\\\u2069"
149            + "|\\\\u206(a|A)|\\\\u(d|D)800|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){3}9"
150            + "|\\\\u(f|F){3}(a|A)|\\\\u0020|\\\\u00(a|A)0|\\\\u00(a|A)(d|D)|\\\\u0604"
151            + "|\\\\u061(c|C)|\\\\u06(d|D){2}|\\\\u070(f|F)|\\\\u1680|\\\\u180(e|E)|\\\\u200(f|F)"
152            + "|\\\\u202(f|F)|\\\\u2064|\\\\u2066|\\\\u2067|\\\\u2068|\\\\u2069|\\\\u206(f|F)"
153            + "|\\\\u(f|F)8(f|F){2}|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){3}9|\\\\u(f|F){3}(b|B)"
154            + "|\\\\u05(d|D)0|\\\\u05(f|F)3|\\\\u0600|\\\\u0750|\\\\u0(e|E)00|\\\\u1(e|E)00"
155            + "|\\\\u2100|\\\\u(f|F)(b|B)50|\\\\u(f|F)(e|E)70|\\\\u(F|f){2}61|\\\\u04(f|F)9"
156            + "|\\\\u05(b|B)(e|E)|\\\\u05(e|E)(a|A)|\\\\u05(f|F)4|\\\\u06(f|F){2}"
157            + "|\\\\u077(f|F)|\\\\u0(e|E)7(f|F)|\\\\u20(a|A)(f|F)|\\\\u213(a|A)|\\\\u0000"
158            + "|\\\\u(f|F)(d|D)(f|F){2}|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){2}(d|D)(c|C)"
159            + "|\\\\u2002|\\\\u0085|\\\\u200(a|A)|\\\\u2005|\\\\u2000|\\\\u2029|\\\\u000(B|b)"
160            + "|\\\\u2008|\\\\u2003|\\\\u205(f|F)|\\\\u1680|\\\\u0009|\\\\u0020|\\\\u2006"
161            + "|\\\\u2001|\\\\u202(f|F)|\\\\u00(a|A)0|\\\\u000(c|C)|\\\\u2009|\\\\u2004|\\\\u2028"
162            + "|\\\\u2028|\\\\u2007|\\\\u2004|\\\\u2028|\\\\u2007|\\\\u2025"
163            + "|\\\\u(f|F){2}0(e|E)|\\\\u(f|F){2}61");
164
165    /** Cpp style comments. */
166    private Map<Integer, TextBlock> singlelineComments;
167    /** C style comments. */
168    private Map<Integer, List<TextBlock>> blockComments;
169
170    /** Allow use escapes for non-printable(control) characters.  */
171    private boolean allowEscapesForControlCharacters;
172
173    /** Allow use escapes if trail comment is present. */
174    private boolean allowByTailComment;
175
176    /** Allow if all characters in literal are escaped. */
177    private boolean allowIfAllCharactersEscaped;
178
179    /** Allow escapes for space literals. */
180    private boolean allowNonPrintableEscapes;
181
182    /**
183     * Set allowIfAllCharactersEscaped.
184     * @param allow user's value.
185     */
186    public final void setAllowEscapesForControlCharacters(boolean allow) {
187        allowEscapesForControlCharacters = allow;
188    }
189
190    /**
191     * Set allowByTailComment.
192     * @param allow user's value.
193     */
194    public final void setAllowByTailComment(boolean allow) {
195        allowByTailComment = allow;
196    }
197
198    /**
199     * Set allowIfAllCharactersEscaped.
200     * @param allow user's value.
201     */
202    public final void setAllowIfAllCharactersEscaped(boolean allow) {
203        allowIfAllCharactersEscaped = allow;
204    }
205
206    /**
207     * Set allowSpaceEscapes.
208     * @param allow user's value.
209     */
210    public final void setAllowNonPrintableEscapes(boolean allow) {
211        allowNonPrintableEscapes = allow;
212    }
213
214    @Override
215    public int[] getDefaultTokens() {
216        return getRequiredTokens();
217    }
218
219    @Override
220    public int[] getAcceptableTokens() {
221        return getRequiredTokens();
222    }
223
224    @Override
225    public int[] getRequiredTokens() {
226        return new int[] {TokenTypes.STRING_LITERAL, TokenTypes.CHAR_LITERAL};
227    }
228
229    @Override
230    public void beginTree(DetailAST rootAST) {
231        singlelineComments = getFileContents().getSingleLineComments();
232        blockComments = getFileContents().getBlockComments();
233    }
234
235    @Override
236    public void visitToken(DetailAST ast) {
237
238        final String literal = ast.getText();
239
240        if (hasUnicodeChar(literal) && !(allowByTailComment && hasTrailComment(ast)
241                || isAllCharactersEscaped(literal)
242                || allowEscapesForControlCharacters
243                        && isOnlyUnicodeValidChars(literal, UNICODE_CONTROL)
244                || allowNonPrintableEscapes
245                        && isOnlyUnicodeValidChars(literal, NON_PRINTABLE_CHARS))) {
246            log(ast.getLineNo(), MSG_KEY);
247        }
248    }
249
250    /**
251     * Checks if literal has Unicode chars.
252     * @param literal String literal.
253     * @return true if literal has Unicode chars.
254     */
255    private static boolean hasUnicodeChar(String literal) {
256        final String literalWithoutEscapedBackslashes =
257                ESCAPED_BACKSLASH.matcher(literal).replaceAll("");
258        return UNICODE_REGEXP.matcher(literalWithoutEscapedBackslashes).find();
259    }
260
261    /**
262     * Check if String literal contains Unicode control chars.
263     * @param literal String literal.
264     * @param pattern RegExp for valid characters.
265     * @return true, if String literal contains Unicode control chars.
266     */
267    private static boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) {
268        final int unicodeMatchesCounter =
269                countMatches(UNICODE_REGEXP, literal);
270        final int unicodeValidMatchesCounter =
271                countMatches(pattern, literal);
272        return unicodeMatchesCounter - unicodeValidMatchesCounter == 0;
273    }
274
275    /**
276     * Check if trail comment is present after ast token.
277     * @param ast current token.
278     * @return true if trail comment is present after ast token.
279     */
280    private boolean hasTrailComment(DetailAST ast) {
281        boolean result = false;
282        final int lineNo = ast.getLineNo();
283        if (singlelineComments.containsKey(lineNo)) {
284            result = true;
285        }
286        else {
287            final List<TextBlock> commentList = blockComments.get(lineNo);
288            if (commentList != null) {
289                final TextBlock comment = commentList.get(commentList.size() - 1);
290                final String line = getLines()[lineNo - 1];
291                result = isTrailingBlockComment(comment, line);
292            }
293        }
294        return result;
295    }
296
297    /**
298     * Whether the C style comment is trailing.
299     * @param comment the comment to check.
300     * @param line the line where the comment starts.
301     * @return true if the comment is trailing.
302     */
303    private static boolean isTrailingBlockComment(TextBlock comment, String line) {
304        return comment.getText().length != 1
305            || CommonUtils.isBlank(line.substring(comment.getEndColNo() + 1));
306    }
307
308    /**
309     * Count regexp matches into String literal.
310     * @param pattern pattern.
311     * @param target String literal.
312     * @return count of regexp matches.
313     */
314    private static int countMatches(Pattern pattern, String target) {
315        int matcherCounter = 0;
316        final Matcher matcher = pattern.matcher(target);
317        while (matcher.find()) {
318            matcherCounter++;
319        }
320        return matcherCounter;
321    }
322
323    /**
324     * Checks if all characters in String literal is escaped.
325     * @param literal current literal.
326     * @return true if all characters in String literal is escaped.
327     */
328    private boolean isAllCharactersEscaped(String literal) {
329        return allowIfAllCharactersEscaped
330                && ALL_ESCAPED_CHARS.matcher(literal.substring(1,
331                        literal.length() - 1)).find();
332    }
333}