View Javadoc
1   /*
2    * Copyright (C) 2009 The Guava Authors
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
5    * in compliance with the License. You may obtain a copy of the License at
6    *
7    * http://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software distributed under the License
10   * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
11   * or implied. See the License for the specific language governing permissions and limitations under
12   * the License.
13   */
14  
15  package com.google.common.escape;
16  
17  import static com.google.common.base.Preconditions.checkNotNull;
18  
19  import com.google.common.annotations.Beta;
20  import com.google.common.annotations.GwtCompatible;
21  import com.google.errorprone.annotations.CanIgnoreReturnValue;
22  import java.util.HashMap;
23  import java.util.Map;
24  import javax.annotation.Nullable;
25  
26  /**
27   * Static utility methods pertaining to {@link Escaper} instances.
28   *
29   * @author Sven Mawson
30   * @author David Beaumont
31   * @since 15.0
32   */
33  @Beta
34  @GwtCompatible
35  public final class Escapers {
36    private Escapers() {}
37  
38    /**
39     * Returns an {@link Escaper} that does no escaping, passing all character data through unchanged.
40     */
41    public static Escaper nullEscaper() {
42      return NULL_ESCAPER;
43    }
44  
45    // An Escaper that efficiently performs no escaping.
46    // Extending CharEscaper (instead of Escaper) makes Escapers.compose() easier.
47    private static final Escaper NULL_ESCAPER =
48        new CharEscaper() {
49          @Override
50          public String escape(String string) {
51            return checkNotNull(string);
52          }
53  
54          @Override
55          protected char[] escape(char c) {
56            // TODO: Fix tests not to call this directly and make it throw an error.
57            return null;
58          }
59        };
60  
61    /**
62     * Returns a builder for creating simple, fast escapers. A builder instance can be reused and each
63     * escaper that is created will be a snapshot of the current builder state. Builders are not
64     * thread safe.
65     *
66     * <p>The initial state of the builder is such that:
67     * <ul>
68     * <li>There are no replacement mappings
69     * <li>{@code safeMin == Character.MIN_VALUE}
70     * <li>{@code safeMax == Character.MAX_VALUE}
71     * <li>{@code unsafeReplacement == null}
72     * </ul>
73     * <p>For performance reasons escapers created by this builder are not Unicode aware and will not
74     * validate the well-formedness of their input.
75     */
76    public static Builder builder() {
77      return new Builder();
78    }
79  
80    /**
81     * A builder for simple, fast escapers.
82     *
83     * <p>Typically an escaper needs to deal with the escaping of high valued characters or code
84     * points. In these cases it is necessary to extend either {@link ArrayBasedCharEscaper} or
85     * {@link ArrayBasedUnicodeEscaper} to provide the desired behavior. However this builder is
86     * suitable for creating escapers that replace a relative small set of characters.
87     *
88     * @author David Beaumont
89     * @since 15.0
90     */
91    @Beta
92    public static final class Builder {
93      private final Map<Character, String> replacementMap = new HashMap<>();
94      private char safeMin = Character.MIN_VALUE;
95      private char safeMax = Character.MAX_VALUE;
96      private String unsafeReplacement = null;
97  
98      // The constructor is exposed via the builder() method above.
99      private Builder() {}
100 
101     /**
102      * Sets the safe range of characters for the escaper. Characters in this range that have no
103      * explicit replacement are considered 'safe' and remain unescaped in the output. If
104      * {@code safeMax < safeMin} then the safe range is empty.
105      *
106      * @param safeMin the lowest 'safe' character
107      * @param safeMax the highest 'safe' character
108      * @return the builder instance
109      */
110     @CanIgnoreReturnValue
111     public Builder setSafeRange(char safeMin, char safeMax) {
112       this.safeMin = safeMin;
113       this.safeMax = safeMax;
114       return this;
115     }
116 
117     /**
118      * Sets the replacement string for any characters outside the 'safe' range that have no explicit
119      * replacement. If {@code unsafeReplacement} is {@code null} then no replacement will occur, if
120      * it is {@code ""} then the unsafe characters are removed from the output.
121      *
122      * @param unsafeReplacement the string to replace unsafe characters
123      * @return the builder instance
124      */
125     @CanIgnoreReturnValue
126     public Builder setUnsafeReplacement(@Nullable String unsafeReplacement) {
127       this.unsafeReplacement = unsafeReplacement;
128       return this;
129     }
130 
131     /**
132      * Adds a replacement string for the given input character. The specified character will be
133      * replaced by the given string whenever it occurs in the input, irrespective of whether it lies
134      * inside or outside the 'safe' range.
135      *
136      * @param c the character to be replaced
137      * @param replacement the string to replace the given character
138      * @return the builder instance
139      * @throws NullPointerException if {@code replacement} is null
140      */
141     @CanIgnoreReturnValue
142     public Builder addEscape(char c, String replacement) {
143       checkNotNull(replacement);
144       // This can replace an existing character (the builder is re-usable).
145       replacementMap.put(c, replacement);
146       return this;
147     }
148 
149     /**
150      * Returns a new escaper based on the current state of the builder.
151      */
152     public Escaper build() {
153       return new ArrayBasedCharEscaper(replacementMap, safeMin, safeMax) {
154         private final char[] replacementChars =
155             unsafeReplacement != null ? unsafeReplacement.toCharArray() : null;
156 
157         @Override
158         protected char[] escapeUnsafe(char c) {
159           return replacementChars;
160         }
161       };
162     }
163   }
164 
165   /**
166    * Returns a {@link UnicodeEscaper} equivalent to the given escaper instance. If the escaper is
167    * already a UnicodeEscaper then it is simply returned, otherwise it is wrapped in a
168    * UnicodeEscaper.
169    *
170    * <p>When a {@link CharEscaper} escaper is wrapped by this method it acquires extra behavior with
171    * respect to the well-formedness of Unicode character sequences and will throw
172    * {@link IllegalArgumentException} when given bad input.
173    *
174    * @param escaper the instance to be wrapped
175    * @return a UnicodeEscaper with the same behavior as the given instance
176    * @throws NullPointerException if escaper is null
177    * @throws IllegalArgumentException if escaper is not a UnicodeEscaper or a CharEscaper
178    */
179   static UnicodeEscaper asUnicodeEscaper(Escaper escaper) {
180     checkNotNull(escaper);
181     if (escaper instanceof UnicodeEscaper) {
182       return (UnicodeEscaper) escaper;
183     } else if (escaper instanceof CharEscaper) {
184       return wrap((CharEscaper) escaper);
185     }
186     // In practice this shouldn't happen because it would be very odd not to
187     // extend either CharEscaper or UnicodeEscaper for non trivial cases.
188     throw new IllegalArgumentException(
189         "Cannot create a UnicodeEscaper from: " + escaper.getClass().getName());
190   }
191 
192   /**
193    * Returns a string that would replace the given character in the specified escaper, or
194    * {@code null} if no replacement should be made. This method is intended for use in tests through
195    * the {@code EscaperAsserts} class; production users of {@link CharEscaper} should limit
196    * themselves to its public interface.
197    *
198    * @param c the character to escape if necessary
199    * @return the replacement string, or {@code null} if no escaping was needed
200    */
201   public static String computeReplacement(CharEscaper escaper, char c) {
202     return stringOrNull(escaper.escape(c));
203   }
204 
205   /**
206    * Returns a string that would replace the given character in the specified escaper, or
207    * {@code null} if no replacement should be made. This method is intended for use in tests through
208    * the {@code EscaperAsserts} class; production users of {@link UnicodeEscaper} should limit
209    * themselves to its public interface.
210    *
211    * @param cp the Unicode code point to escape if necessary
212    * @return the replacement string, or {@code null} if no escaping was needed
213    */
214   public static String computeReplacement(UnicodeEscaper escaper, int cp) {
215     return stringOrNull(escaper.escape(cp));
216   }
217 
218   private static String stringOrNull(char[] in) {
219     return (in == null) ? null : new String(in);
220   }
221 
222   /** Private helper to wrap a CharEscaper as a UnicodeEscaper. */
223   private static UnicodeEscaper wrap(final CharEscaper escaper) {
224     return new UnicodeEscaper() {
225       @Override
226       protected char[] escape(int cp) {
227         // If a code point maps to a single character, just escape that.
228         if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
229           return escaper.escape((char) cp);
230         }
231         // Convert the code point to a surrogate pair and escape them both.
232         // Note: This code path is horribly slow and typically allocates 4 new
233         // char[] each time it is invoked. However this avoids any
234         // synchronization issues and makes the escaper thread safe.
235         char[] surrogateChars = new char[2];
236         Character.toChars(cp, surrogateChars, 0);
237         char[] hiChars = escaper.escape(surrogateChars[0]);
238         char[] loChars = escaper.escape(surrogateChars[1]);
239 
240         // If either hiChars or lowChars are non-null, the CharEscaper is trying
241         // to escape the characters of a surrogate pair separately. This is
242         // uncommon and applies only to escapers that assume UCS-2 rather than
243         // UTF-16. See: http://en.wikipedia.org/wiki/UTF-16/UCS-2
244         if (hiChars == null && loChars == null) {
245           // We expect this to be the common code path for most escapers.
246           return null;
247         }
248         // Combine the characters and/or escaped sequences into a single array.
249         int hiCount = hiChars != null ? hiChars.length : 1;
250         int loCount = loChars != null ? loChars.length : 1;
251         char[] output = new char[hiCount + loCount];
252         if (hiChars != null) {
253           // TODO: Is this faster than System.arraycopy() for small arrays?
254           for (int n = 0; n < hiChars.length; ++n) {
255             output[n] = hiChars[n];
256           }
257         } else {
258           output[0] = surrogateChars[0];
259         }
260         if (loChars != null) {
261           for (int n = 0; n < loChars.length; ++n) {
262             output[hiCount + n] = loChars[n];
263           }
264         } else {
265           output[hiCount] = surrogateChars[1];
266         }
267         return output;
268       }
269     };
270   }
271 }