View Javadoc
1   /*
2    * Copyright (C) 2008 The Guava Authors
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
5    * in compliance with the License. You may obtain a copy of the License at
6    *
7    * http://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software distributed under the License
10   * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
11   * or implied. See the License for the specific language governing permissions and limitations under
12   * the License.
13   */
14  
15  package com.google.common.escape;
16  
17  import com.google.common.annotations.GwtCompatible;
18  import com.google.common.base.Function;
19  
20  /**
21   * An object that converts literal text into a format safe for inclusion in a particular context
22   * (such as an XML document). Typically (but not always), the inverse process of "unescaping" the
23   * text is performed automatically by the relevant parser.
24   *
25   * <p>For example, an XML escaper would convert the literal string {@code "Foo<Bar>"} into {@code
26   * "Foo&lt;Bar&gt;"} to prevent {@code "<Bar>"} from being confused with an XML tag. When the
27   * resulting XML document is parsed, the parser API will return this text as the original literal
28   * string {@code "Foo<Bar>"}.
29   *
30   * <p>An {@code Escaper} instance is required to be stateless, and safe when used concurrently by
31   * multiple threads.
32   *
33   * <p>Because, in general, escaping operates on the code points of a string and not on its
34   * individual {@code char} values, it is not safe to assume that {@code escape(s)} is equivalent to
35   * {@code escape(s.substring(0, n)) + escape(s.substring(n))} for arbitrary {@code n}. This is
36   * because of the possibility of splitting a surrogate pair. The only case in which it is safe to
37   * escape strings and concatenate the results is if you can rule out this possibility, either by
38   * splitting an existing long string into short strings adaptively around
39   * {@linkplain Character#isHighSurrogate surrogate} {@linkplain Character#isLowSurrogate pairs}, or
40   * by starting with short strings already known to be free of unpaired surrogates.
41   *
42   * <p>The two primary implementations of this interface are {@link CharEscaper} and
43   * {@link UnicodeEscaper}. They are heavily optimized for performance and greatly simplify the task
44   * of implementing new escapers. It is strongly recommended that when implementing a new escaper you
45   * extend one of these classes. If you find that you are unable to achieve the desired behavior
46   * using either of these classes, please contact the Java libraries team for advice.
47   *
48   * <p>Popular escapers are defined as constants in classes like
49   * {@link com.google.common.html.HtmlEscapers} and {@link com.google.common.xml.XmlEscapers}. To
50   * create your own escapers, use {@link CharEscaperBuilder}, or extend {@code CharEscaper} or
51   * {@code UnicodeEscaper}.
52   *
53   * @author David Beaumont
54   * @since 15.0
55   */
56  @GwtCompatible
57  public abstract class Escaper {
58    // TODO(user): evaluate custom implementations, considering package private constructor.
59    /** Constructor for use by subclasses. */
60    protected Escaper() {}
61  
62    /**
63     * Returns the escaped form of a given literal string.
64     *
65     * <p>Note that this method may treat input characters differently depending on the specific
66     * escaper implementation.
67     *
68     * <ul>
69     * <li>{@link UnicodeEscaper} handles <a href="http://en.wikipedia.org/wiki/UTF-16">UTF-16</a>
70     * correctly, including surrogate character pairs. If the input is badly formed the escaper should
71     * throw {@link IllegalArgumentException}.
72     * <li>{@link CharEscaper} handles Java characters independently and does not verify the input for
73     * well formed characters. A {@code CharEscaper} should not be used in situations where input is
74     * not guaranteed to be restricted to the Basic Multilingual Plane (BMP).
75     * </ul>
76     *
77     * @param string the literal string to be escaped
78     * @return the escaped form of {@code string}
79     * @throws NullPointerException if {@code string} is null
80     * @throws IllegalArgumentException if {@code string} contains badly formed UTF-16 or cannot be
81     *     escaped for any other reason
82     */
83    public abstract String escape(String string);
84  
85    private final Function<String, String> asFunction =
86        new Function<String, String>() {
87          @Override
88          public String apply(String from) {
89            return escape(from);
90          }
91        };
92  
93    /**
94     * Returns a {@link Function} that invokes {@link #escape(String)} on this escaper.
95     */
96    public final Function<String, String> asFunction() {
97      return asFunction;
98    }
99  }