View Javadoc
1   /*
2    * Copyright (C) 2009 The Guava Authors
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
5    * in compliance with the License. You may obtain a copy of the License at
6    *
7    * http://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software distributed under the License
10   * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
11   * or implied. See the License for the specific language governing permissions and limitations under
12   * the License.
13   */
14  
15  package com.google.common.net;
16  
17  import static com.google.common.base.Preconditions.checkArgument;
18  import static com.google.common.base.Preconditions.checkNotNull;
19  import static com.google.common.base.Preconditions.checkState;
20  
21  import com.google.common.annotations.Beta;
22  import com.google.common.annotations.GwtCompatible;
23  import com.google.common.base.Ascii;
24  import com.google.common.base.CharMatcher;
25  import com.google.common.base.Joiner;
26  import com.google.common.base.Splitter;
27  import com.google.common.collect.ImmutableList;
28  import com.google.thirdparty.publicsuffix.PublicSuffixPatterns;
29  import java.util.List;
30  import javax.annotation.Nullable;
31  
32  /**
33   * An immutable well-formed internet domain name, such as {@code com} or {@code
34   * foo.co.uk}. Only syntactic analysis is performed; no DNS lookups or other network interactions
35   * take place. Thus there is no guarantee that the domain actually exists on the internet.
36   *
37   * <p>One common use of this class is to determine whether a given string is likely to represent an
38   * addressable domain on the web -- that is, for a candidate string {@code "xxx"}, might browsing to
39   * {@code "http://xxx/"} result in a webpage being displayed? In the past, this test was frequently
40   * done by determining whether the domain ended with a {@linkplain #isPublicSuffix() public suffix}
41   * but was not itself a public suffix. However, this test is no longer accurate. There are many
42   * domains which are both public suffixes and addressable as hosts; {@code "uk.com"} is one example.
43   * As a result, the only useful test to determine if a domain is a plausible web host is
44   * {@link #hasPublicSuffix()}. This will return {@code true} for many domains which (currently) are
45   * not hosts, such as {@code "com"}, but given that any public suffix may become a host without
46   * warning, it is better to err on the side of permissiveness and thus avoid spurious rejection of
47   * valid sites.
48   *
49   * <p>During construction, names are normalized in two ways:
50   *
51   * <ol>
52   * <li>ASCII uppercase characters are converted to lowercase.
53   * <li>Unicode dot separators other than the ASCII period ({@code '.'}) are converted to the ASCII
54   *     period.
55   * </ol>
56   *
57   * <p>The normalized values will be returned from {@link #toString()} and {@link #parts()}, and will
58   * be reflected in the result of {@link #equals(Object)}.
59   *
60   * <p><a href="http://en.wikipedia.org/wiki/Internationalized_domain_name">Internationalized domain
61   * names</a> such as {@code 网络.cn} are supported, as are the equivalent
62   * <a href="http://en.wikipedia.org/wiki/Internationalized_domain_name">IDNA Punycode-encoded</a>
63   * versions.
64   *
65   * @author Craig Berry
66   * @since 5.0
67   */
68  @Beta
69  @GwtCompatible
70  public final class InternetDomainName {
71  
72    private static final CharMatcher DOTS_MATCHER = CharMatcher.anyOf(".\u3002\uFF0E\uFF61");
73    private static final Splitter DOT_SPLITTER = Splitter.on('.');
74    private static final Joiner DOT_JOINER = Joiner.on('.');
75  
76    /**
77     * Value of {@link #publicSuffixIndex} which indicates that no public suffix was found.
78     */
79    private static final int NO_PUBLIC_SUFFIX_FOUND = -1;
80  
81    private static final String DOT_REGEX = "\\.";
82  
83    /**
84     * Maximum parts (labels) in a domain name. This value arises from the 255-octet limit described
85     * in <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11 with the fact that the
86     * encoding of each part occupies at least two bytes (dot plus label externally, length byte plus
87     * label internally). Thus, if all labels have the minimum size of one byte, 127 of them will fit.
88     */
89    private static final int MAX_PARTS = 127;
90  
91    /**
92     * Maximum length of a full domain name, including separators, and leaving room for the root
93     * label. See <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11.
94     */
95    private static final int MAX_LENGTH = 253;
96  
97    /**
98     * Maximum size of a single part of a domain name. See
99     * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11.
100    */
101   private static final int MAX_DOMAIN_PART_LENGTH = 63;
102 
103   /**
104    * The full domain name, converted to lower case.
105    */
106   private final String name;
107 
108   /**
109    * The parts of the domain name, converted to lower case.
110    */
111   private final ImmutableList<String> parts;
112 
113   /**
114    * The index in the {@link #parts()} list at which the public suffix begins. For example, for the
115    * domain name {@code www.google.co.uk}, the value would be 2 (the index of the {@code co} part).
116    * The value is negative (specifically, {@link #NO_PUBLIC_SUFFIX_FOUND}) if no public suffix was
117    * found.
118    */
119   private final int publicSuffixIndex;
120 
121   /**
122    * Constructor used to implement {@link #from(String)}, and from subclasses.
123    */
124   InternetDomainName(String name) {
125     // Normalize:
126     // * ASCII characters to lowercase
127     // * All dot-like characters to '.'
128     // * Strip trailing '.'
129 
130     name = Ascii.toLowerCase(DOTS_MATCHER.replaceFrom(name, '.'));
131 
132     if (name.endsWith(".")) {
133       name = name.substring(0, name.length() - 1);
134     }
135 
136     checkArgument(name.length() <= MAX_LENGTH, "Domain name too long: '%s':", name);
137     this.name = name;
138 
139     this.parts = ImmutableList.copyOf(DOT_SPLITTER.split(name));
140     checkArgument(parts.size() <= MAX_PARTS, "Domain has too many parts: '%s'", name);
141     checkArgument(validateSyntax(parts), "Not a valid domain name: '%s'", name);
142 
143     this.publicSuffixIndex = findPublicSuffix();
144   }
145 
146   /**
147    * Returns the index of the leftmost part of the public suffix, or -1 if not found. Note that the
148    * value defined as the "public suffix" may not be a public suffix according to
149    * {@link #isPublicSuffix()} if the domain ends with an excluded domain pattern such as
150    * {@code "nhs.uk"}.
151    */
152   private int findPublicSuffix() {
153     final int partsSize = parts.size();
154 
155     for (int i = 0; i < partsSize; i++) {
156       String ancestorName = DOT_JOINER.join(parts.subList(i, partsSize));
157 
158       if (PublicSuffixPatterns.EXACT.containsKey(ancestorName)) {
159         return i;
160       }
161 
162       // Excluded domains (e.g. !nhs.uk) use the next highest
163       // domain as the effective public suffix (e.g. uk).
164 
165       if (PublicSuffixPatterns.EXCLUDED.containsKey(ancestorName)) {
166         return i + 1;
167       }
168 
169       if (matchesWildcardPublicSuffix(ancestorName)) {
170         return i;
171       }
172     }
173 
174     return NO_PUBLIC_SUFFIX_FOUND;
175   }
176 
177   /**
178    * Returns an instance of {@link InternetDomainName} after lenient validation. Specifically,
179    * validation against <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>
180    * ("Internationalizing Domain Names in Applications") is skipped, while validation against
181    * <a href="http://www.ietf.org/rfc/rfc1035.txt">RFC 1035</a> is relaxed in the following ways:
182    * <ul>
183    * <li>Any part containing non-ASCII characters is considered valid.
184    * <li>Underscores ('_') are permitted wherever dashes ('-') are permitted.
185    * <li>Parts other than the final part may start with a digit, as mandated by
186    * <a href="https://tools.ietf.org/html/rfc1123#section-2">RFC 1123</a>.
187    * </ul>
188    *
189    *
190    * @param domain A domain name (not IP address)
191    * @throws IllegalArgumentException if {@code name} is not syntactically valid according to
192    *     {@link #isValid}
193    * @since 10.0 (previously named {@code fromLenient})
194    */
195   public static InternetDomainName from(String domain) {
196     return new InternetDomainName(checkNotNull(domain));
197   }
198 
199   /**
200    * Validation method used by {@code from} to ensure that the domain name is syntactically valid
201    * according to RFC 1035.
202    *
203    * @return Is the domain name syntactically valid?
204    */
205   private static boolean validateSyntax(List<String> parts) {
206     final int lastIndex = parts.size() - 1;
207 
208     // Validate the last part specially, as it has different syntax rules.
209 
210     if (!validatePart(parts.get(lastIndex), true)) {
211       return false;
212     }
213 
214     for (int i = 0; i < lastIndex; i++) {
215       String part = parts.get(i);
216       if (!validatePart(part, false)) {
217         return false;
218       }
219     }
220 
221     return true;
222   }
223 
224   private static final CharMatcher DASH_MATCHER = CharMatcher.anyOf("-_");
225 
226   private static final CharMatcher PART_CHAR_MATCHER =
227       CharMatcher.javaLetterOrDigit().or(DASH_MATCHER);
228 
229   /**
230    * Helper method for {@link #validateSyntax(List)}. Validates that one part of a domain name is
231    * valid.
232    *
233    * @param part The domain name part to be validated
234    * @param isFinalPart Is this the final (rightmost) domain part?
235    * @return Whether the part is valid
236    */
237   private static boolean validatePart(String part, boolean isFinalPart) {
238 
239     // These tests could be collapsed into one big boolean expression, but
240     // they have been left as independent tests for clarity.
241 
242     if (part.length() < 1 || part.length() > MAX_DOMAIN_PART_LENGTH) {
243       return false;
244     }
245 
246     /*
247      * GWT claims to support java.lang.Character's char-classification methods, but it actually only
248      * works for ASCII. So for now, assume any non-ASCII characters are valid. The only place this
249      * seems to be documented is here:
250      * http://osdir.com/ml/GoogleWebToolkitContributors/2010-03/msg00178.html
251      *
252      * <p>ASCII characters in the part are expected to be valid per RFC 1035, with underscore also
253      * being allowed due to widespread practice.
254      */
255 
256     String asciiChars = CharMatcher.ascii().retainFrom(part);
257 
258     if (!PART_CHAR_MATCHER.matchesAllOf(asciiChars)) {
259       return false;
260     }
261 
262     // No initial or final dashes or underscores.
263 
264     if (DASH_MATCHER.matches(part.charAt(0))
265         || DASH_MATCHER.matches(part.charAt(part.length() - 1))) {
266       return false;
267     }
268 
269     /*
270      * Note that we allow (in contravention of a strict interpretation of the relevant RFCs) domain
271      * parts other than the last may begin with a digit (for example, "3com.com"). It's important to
272      * disallow an initial digit in the last part; it's the only thing that stops an IPv4 numeric
273      * address like 127.0.0.1 from looking like a valid domain name.
274      */
275 
276     if (isFinalPart && CharMatcher.digit().matches(part.charAt(0))) {
277       return false;
278     }
279 
280     return true;
281   }
282 
283   /**
284    * Returns the individual components of this domain name, normalized to all lower case. For
285    * example, for the domain name {@code mail.google.com}, this method returns the list
286    * {@code ["mail", "google", "com"]}.
287    */
288   public ImmutableList<String> parts() {
289     return parts;
290   }
291 
292   /**
293    * Indicates whether this domain name represents a <i>public suffix</i>, as defined by the Mozilla
294    * Foundation's <a href="http://publicsuffix.org/">Public Suffix List</a> (PSL). A public suffix
295    * is one under which Internet users can directly register names, such as {@code com},
296    * {@code co.uk} or {@code pvt.k12.wy.us}. Examples of domain names that are <i>not</i> public
297    * suffixes include {@code google}, {@code google.com} and {@code foo.co.uk}.
298    *
299    * @return {@code true} if this domain name appears exactly on the public suffix list
300    * @since 6.0
301    */
302   public boolean isPublicSuffix() {
303     return publicSuffixIndex == 0;
304   }
305 
306   /**
307    * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix() public suffix},
308    * including if it is a public suffix itself. For example, returns {@code true} for
309    * {@code www.google.com}, {@code foo.co.uk} and {@code com}, but not for {@code google} or
310    * {@code google.foo}. This is the recommended method for determining whether a domain is
311    * potentially an addressable host.
312    *
313    * @since 6.0
314    */
315   public boolean hasPublicSuffix() {
316     return publicSuffixIndex != NO_PUBLIC_SUFFIX_FOUND;
317   }
318 
319   /**
320    * Returns the {@linkplain #isPublicSuffix() public suffix} portion of the domain name, or
321    * {@code null} if no public suffix is present.
322    *
323    * @since 6.0
324    */
325   public InternetDomainName publicSuffix() {
326     return hasPublicSuffix() ? ancestor(publicSuffixIndex) : null;
327   }
328 
329   /**
330    * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix() public suffix},
331    * while not being a public suffix itself. For example, returns {@code true} for
332    * {@code www.google.com}, {@code foo.co.uk} and {@code bar.ca.us}, but not for {@code google},
333    * {@code com}, or {@code
334    * google.foo}.
335    *
336    * <p><b>Warning:</b> a {@code false} result from this method does not imply that the domain does
337    * not represent an addressable host, as many public suffixes are also addressable hosts. Use
338    * {@link #hasPublicSuffix()} for that test.
339    *
340    * <p>This method can be used to determine whether it will probably be possible to set cookies on
341    * the domain, though even that depends on individual browsers' implementations of cookie
342    * controls. See <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details.
343    *
344    * @since 6.0
345    */
346   public boolean isUnderPublicSuffix() {
347     return publicSuffixIndex > 0;
348   }
349 
350   /**
351    * Indicates whether this domain name is composed of exactly one subdomain component followed by a
352    * {@linkplain #isPublicSuffix() public suffix}. For example, returns {@code true} for
353    * {@code google.com} and {@code foo.co.uk}, but not for {@code www.google.com} or {@code co.uk}.
354    *
355    * <p><b>Warning:</b> A {@code true} result from this method does not imply that the domain is at
356    * the highest level which is addressable as a host, as many public suffixes are also addressable
357    * hosts. For example, the domain {@code bar.uk.com} has a public suffix of {@code uk.com}, so it
358    * would return {@code true} from this method. But {@code uk.com} is itself an addressable host.
359    *
360    * <p>This method can be used to determine whether a domain is probably the highest level for
361    * which cookies may be set, though even that depends on individual browsers' implementations of
362    * cookie controls. See <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details.
363    *
364    * @since 6.0
365    */
366   public boolean isTopPrivateDomain() {
367     return publicSuffixIndex == 1;
368   }
369 
370   /**
371    * Returns the portion of this domain name that is one level beneath the public suffix. For
372    * example, for {@code x.adwords.google.co.uk} it returns {@code google.co.uk}, since
373    * {@code co.uk} is a public suffix.
374    *
375    * <p>If {@link #isTopPrivateDomain()} is true, the current domain name instance is returned.
376    *
377    * <p>This method should not be used to determine the topmost parent domain which is addressable
378    * as a host, as many public suffixes are also addressable hosts. For example, the domain
379    * {@code foo.bar.uk.com} has a public suffix of {@code uk.com}, so it would return
380    * {@code bar.uk.com} from this method. But {@code uk.com} is itself an addressable host.
381    *
382    * <p>This method can be used to determine the probable highest level parent domain for which
383    * cookies may be set, though even that depends on individual browsers' implementations of cookie
384    * controls.
385    *
386    * @throws IllegalStateException if this domain does not end with a public suffix
387    * @since 6.0
388    */
389   public InternetDomainName topPrivateDomain() {
390     if (isTopPrivateDomain()) {
391       return this;
392     }
393     checkState(isUnderPublicSuffix(), "Not under a public suffix: %s", name);
394     return ancestor(publicSuffixIndex - 1);
395   }
396 
397   /**
398    * Indicates whether this domain is composed of two or more parts.
399    */
400   public boolean hasParent() {
401     return parts.size() > 1;
402   }
403 
404   /**
405    * Returns an {@code InternetDomainName} that is the immediate ancestor of this one; that is, the
406    * current domain with the leftmost part removed. For example, the parent of
407    * {@code www.google.com} is {@code google.com}.
408    *
409    * @throws IllegalStateException if the domain has no parent, as determined by {@link #hasParent}
410    */
411   public InternetDomainName parent() {
412     checkState(hasParent(), "Domain '%s' has no parent", name);
413     return ancestor(1);
414   }
415 
416   /**
417    * Returns the ancestor of the current domain at the given number of levels "higher" (rightward)
418    * in the subdomain list. The number of levels must be non-negative, and less than {@code N-1},
419    * where {@code N} is the number of parts in the domain.
420    *
421    * <p>TODO: Reasonable candidate for addition to public API.
422    */
423   private InternetDomainName ancestor(int levels) {
424     return from(DOT_JOINER.join(parts.subList(levels, parts.size())));
425   }
426 
427   /**
428    * Creates and returns a new {@code InternetDomainName} by prepending the argument and a dot to
429    * the current name. For example, {@code
430    * InternetDomainName.from("foo.com").child("www.bar")} returns a new {@code InternetDomainName}
431    * with the value {@code www.bar.foo.com}. Only lenient validation is performed, as described
432    * {@link #from(String) here}.
433    *
434    * @throws NullPointerException if leftParts is null
435    * @throws IllegalArgumentException if the resulting name is not valid
436    */
437   public InternetDomainName child(String leftParts) {
438     return from(checkNotNull(leftParts) + "." + name);
439   }
440 
441   /**
442    * Indicates whether the argument is a syntactically valid domain name using lenient validation.
443    * Specifically, validation against <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>
444    * ("Internationalizing Domain Names in Applications") is skipped.
445    *
446    * <p>The following two code snippets are equivalent:
447    *
448    * <pre>   {@code
449    *   domainName = InternetDomainName.isValid(name)
450    *       ? InternetDomainName.from(name)
451    *       : DEFAULT_DOMAIN;}</pre>
452    *
453    * <pre>   {@code
454    *   try {
455    *     domainName = InternetDomainName.from(name);
456    *   } catch (IllegalArgumentException e) {
457    *     domainName = DEFAULT_DOMAIN;
458    *   }}</pre>
459    *
460    * @since 8.0 (previously named {@code isValidLenient})
461    */
462   public static boolean isValid(String name) {
463     try {
464       from(name);
465       return true;
466     } catch (IllegalArgumentException e) {
467       return false;
468     }
469   }
470 
471   /**
472    * Does the domain name match one of the "wildcard" patterns (e.g. {@code "*.ar"})?
473    */
474   private static boolean matchesWildcardPublicSuffix(String domain) {
475     final String[] pieces = domain.split(DOT_REGEX, 2);
476     return pieces.length == 2 && PublicSuffixPatterns.UNDER.containsKey(pieces[1]);
477   }
478 
479   /**
480    * Returns the domain name, normalized to all lower case.
481    */
482   @Override
483   public String toString() {
484     return name;
485   }
486 
487   /**
488    * Equality testing is based on the text supplied by the caller, after normalization as described
489    * in the class documentation. For example, a non-ASCII Unicode domain name and the Punycode
490    * version of the same domain name would not be considered equal.
491    *
492    */
493   @Override
494   public boolean equals(@Nullable Object object) {
495     if (object == this) {
496       return true;
497     }
498 
499     if (object instanceof InternetDomainName) {
500       InternetDomainName that = (InternetDomainName) object;
501       return this.name.equals(that.name);
502     }
503 
504     return false;
505   }
506 
507   @Override
508   public int hashCode() {
509     return name.hashCode();
510   }
511 }