All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.google.common.net.InternetDomainName Maven / Gradle / Ivy

/*
 * Copyright (C) 2009 The Guava Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */

package com.google.common.net;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;

import com.google.common.annotations.Beta;
import com.google.common.annotations.GwtCompatible;
import com.google.common.base.Ascii;
import com.google.common.base.CharMatcher;
import com.google.common.base.Joiner;
import com.google.common.base.Optional;
import com.google.common.base.Splitter;
import com.google.common.collect.ImmutableList;
import com.google.errorprone.annotations.Immutable;
import com.google.thirdparty.publicsuffix.PublicSuffixPatterns;
import com.google.thirdparty.publicsuffix.PublicSuffixType;
import java.util.List;
import org.checkerframework.checker.nullness.compatqual.NullableDecl;

/**
 * An immutable well-formed internet domain name, such as {@code com} or {@code foo.co.uk}. Only
 * syntactic analysis is performed; no DNS lookups or other network interactions take place. Thus
 * there is no guarantee that the domain actually exists on the internet.
 *
 * 

One common use of this class is to determine whether a given string is likely to represent an * addressable domain on the web -- that is, for a candidate string {@code "xxx"}, might browsing to * {@code "http://xxx/"} result in a webpage being displayed? In the past, this test was frequently * done by determining whether the domain ended with a {@linkplain #isPublicSuffix() public suffix} * but was not itself a public suffix. However, this test is no longer accurate. There are many * domains which are both public suffixes and addressable as hosts; {@code "uk.com"} is one example. * Using the subset of public suffixes that are {@linkplain #isRegistrySuffix() registry suffixes}, * one can get a better result, as only a few registry suffixes are addressable. However, the most * useful test to determine if a domain is a plausible web host is {@link #hasPublicSuffix()}. This * will return {@code true} for many domains which (currently) are not hosts, such as {@code "com"}, * but given that any public suffix may become a host without warning, it is better to err on the * side of permissiveness and thus avoid spurious rejection of valid sites. Of course, to actually * determine addressability of any host, clients of this class will need to perform their own DNS * lookups. * *

During construction, names are normalized in two ways: * *

    *
  1. ASCII uppercase characters are converted to lowercase. *
  2. Unicode dot separators other than the ASCII period ({@code '.'}) are converted to the ASCII * period. *
* *

The normalized values will be returned from {@link #toString()} and {@link #parts()}, and will * be reflected in the result of {@link #equals(Object)}. * *

Internationalized domain * names such as {@code 网络.cn} are supported, as are the equivalent IDNA Punycode-encoded * versions. * * @author Catherine Berry * @since 5.0 */ @Beta @GwtCompatible @Immutable public final class InternetDomainName { private static final CharMatcher DOTS_MATCHER = CharMatcher.anyOf(".\u3002\uFF0E\uFF61"); private static final Splitter DOT_SPLITTER = Splitter.on('.'); private static final Joiner DOT_JOINER = Joiner.on('.'); /** * Value of {@link #publicSuffixIndex} or {@link #registrySuffixIndex} which indicates that no * relevant suffix was found. */ private static final int NO_SUFFIX_FOUND = -1; /** * Maximum parts (labels) in a domain name. This value arises from the 255-octet limit described * in RFC 2181 part 11 with the fact that the * encoding of each part occupies at least two bytes (dot plus label externally, length byte plus * label internally). Thus, if all labels have the minimum size of one byte, 127 of them will fit. */ private static final int MAX_PARTS = 127; /** * Maximum length of a full domain name, including separators, and leaving room for the root * label. See RFC 2181 part 11. */ private static final int MAX_LENGTH = 253; /** * Maximum size of a single part of a domain name. See RFC 2181 part 11. */ private static final int MAX_DOMAIN_PART_LENGTH = 63; /** The full domain name, converted to lower case. */ private final String name; /** The parts of the domain name, converted to lower case. */ private final ImmutableList parts; /** * The index in the {@link #parts()} list at which the public suffix begins. For example, for the * domain name {@code myblog.blogspot.co.uk}, the value would be 1 (the index of the {@code * blogspot} part). The value is negative (specifically, {@link #NO_SUFFIX_FOUND}) if no public * suffix was found. */ private final int publicSuffixIndex; /** * The index in the {@link #parts()} list at which the registry suffix begins. For example, for * the domain name {@code myblog.blogspot.co.uk}, the value would be 2 (the index of the {@code * co} part). The value is negative (specifically, {@link #NO_SUFFIX_FOUND}) if no registry suffix * was found. */ private final int registrySuffixIndex; /** Constructor used to implement {@link #from(String)}, and from subclasses. */ InternetDomainName(String name) { // Normalize: // * ASCII characters to lowercase // * All dot-like characters to '.' // * Strip trailing '.' name = Ascii.toLowerCase(DOTS_MATCHER.replaceFrom(name, '.')); if (name.endsWith(".")) { name = name.substring(0, name.length() - 1); } checkArgument(name.length() <= MAX_LENGTH, "Domain name too long: '%s':", name); this.name = name; this.parts = ImmutableList.copyOf(DOT_SPLITTER.split(name)); checkArgument(parts.size() <= MAX_PARTS, "Domain has too many parts: '%s'", name); checkArgument(validateSyntax(parts), "Not a valid domain name: '%s'", name); this.publicSuffixIndex = findSuffixOfType(Optional.absent()); this.registrySuffixIndex = findSuffixOfType(Optional.of(PublicSuffixType.REGISTRY)); } /** * Returns the index of the leftmost part of the suffix, or -1 if not found. Note that the value * defined as a suffix may not produce {@code true} results from {@link #isPublicSuffix()} or * {@link #isRegistrySuffix()} if the domain ends with an excluded domain pattern such as {@code * "nhs.uk"}. * *

If a {@code desiredType} is specified, this method only finds suffixes of the given type. * Otherwise, it finds the first suffix of any type. */ private int findSuffixOfType(Optional desiredType) { final int partsSize = parts.size(); for (int i = 0; i < partsSize; i++) { String ancestorName = DOT_JOINER.join(parts.subList(i, partsSize)); if (matchesType( desiredType, Optional.fromNullable(PublicSuffixPatterns.EXACT.get(ancestorName)))) { return i; } // Excluded domains (e.g. !nhs.uk) use the next highest // domain as the effective public suffix (e.g. uk). if (PublicSuffixPatterns.EXCLUDED.containsKey(ancestorName)) { return i + 1; } if (matchesWildcardSuffixType(desiredType, ancestorName)) { return i; } } return NO_SUFFIX_FOUND; } /** * Returns an instance of {@link InternetDomainName} after lenient validation. Specifically, * validation against RFC 3490 * ("Internationalizing Domain Names in Applications") is skipped, while validation against RFC 1035 is relaxed in the following ways: * *

    *
  • Any part containing non-ASCII characters is considered valid. *
  • Underscores ('_') are permitted wherever dashes ('-') are permitted. *
  • Parts other than the final part may start with a digit, as mandated by RFC 1123. *
* * * @param domain A domain name (not IP address) * @throws IllegalArgumentException if {@code domain} is not syntactically valid according to * {@link #isValid} * @since 10.0 (previously named {@code fromLenient}) */ public static InternetDomainName from(String domain) { return new InternetDomainName(checkNotNull(domain)); } /** * Validation method used by {@code from} to ensure that the domain name is syntactically valid * according to RFC 1035. * * @return Is the domain name syntactically valid? */ private static boolean validateSyntax(List parts) { final int lastIndex = parts.size() - 1; // Validate the last part specially, as it has different syntax rules. if (!validatePart(parts.get(lastIndex), true)) { return false; } for (int i = 0; i < lastIndex; i++) { String part = parts.get(i); if (!validatePart(part, false)) { return false; } } return true; } private static final CharMatcher DASH_MATCHER = CharMatcher.anyOf("-_"); private static final CharMatcher PART_CHAR_MATCHER = CharMatcher.javaLetterOrDigit().or(DASH_MATCHER); /** * Helper method for {@link #validateSyntax(List)}. Validates that one part of a domain name is * valid. * * @param part The domain name part to be validated * @param isFinalPart Is this the final (rightmost) domain part? * @return Whether the part is valid */ private static boolean validatePart(String part, boolean isFinalPart) { // These tests could be collapsed into one big boolean expression, but // they have been left as independent tests for clarity. if (part.length() < 1 || part.length() > MAX_DOMAIN_PART_LENGTH) { return false; } /* * GWT claims to support java.lang.Character's char-classification methods, but it actually only * works for ASCII. So for now, assume any non-ASCII characters are valid. The only place this * seems to be documented is here: * https://groups.google.com/d/topic/google-web-toolkit-contributors/1UEzsryq1XI * *

ASCII characters in the part are expected to be valid per RFC 1035, with underscore also * being allowed due to widespread practice. */ String asciiChars = CharMatcher.ascii().retainFrom(part); if (!PART_CHAR_MATCHER.matchesAllOf(asciiChars)) { return false; } // No initial or final dashes or underscores. if (DASH_MATCHER.matches(part.charAt(0)) || DASH_MATCHER.matches(part.charAt(part.length() - 1))) { return false; } /* * Note that we allow (in contravention of a strict interpretation of the relevant RFCs) domain * parts other than the last may begin with a digit (for example, "3com.com"). It's important to * disallow an initial digit in the last part; it's the only thing that stops an IPv4 numeric * address like 127.0.0.1 from looking like a valid domain name. */ if (isFinalPart && CharMatcher.digit().matches(part.charAt(0))) { return false; } return true; } /** * Returns the individual components of this domain name, normalized to all lower case. For * example, for the domain name {@code mail.google.com}, this method returns the list {@code * ["mail", "google", "com"]}. */ public ImmutableList parts() { return parts; } /** * Indicates whether this domain name represents a public suffix, as defined by the Mozilla * Foundation's Public Suffix List (PSL). A public suffix * is one under which Internet users can directly register names, such as {@code com}, {@code * co.uk} or {@code pvt.k12.wy.us}. Examples of domain names that are not public suffixes * include {@code google.com}, {@code foo.co.uk}, and {@code myblog.blogspot.com}. * *

Public suffixes are a proper superset of {@linkplain #isRegistrySuffix() registry suffixes}. * The list of public suffixes additionally contains privately owned domain names under which * Internet users can register subdomains. An example of a public suffix that is not a registry * suffix is {@code blogspot.com}. Note that it is true that all public suffixes have * registry suffixes, since domain name registries collectively control all internet domain names. * *

For considerations on whether the public suffix or registry suffix designation is more * suitable for your application, see this article. * * @return {@code true} if this domain name appears exactly on the public suffix list * @since 6.0 */ public boolean isPublicSuffix() { return publicSuffixIndex == 0; } /** * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix() public suffix}, * including if it is a public suffix itself. For example, returns {@code true} for {@code * www.google.com}, {@code foo.co.uk} and {@code com}, but not for {@code invalid} or {@code * google.invalid}. This is the recommended method for determining whether a domain is potentially * an addressable host. * *

Note that this method is equivalent to {@link #hasRegistrySuffix()} because all registry * suffixes are public suffixes and all public suffixes have registry suffixes. * * @since 6.0 */ public boolean hasPublicSuffix() { return publicSuffixIndex != NO_SUFFIX_FOUND; } /** * Returns the {@linkplain #isPublicSuffix() public suffix} portion of the domain name, or {@code * null} if no public suffix is present. * * @since 6.0 */ public InternetDomainName publicSuffix() { return hasPublicSuffix() ? ancestor(publicSuffixIndex) : null; } /** * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix() public suffix}, * while not being a public suffix itself. For example, returns {@code true} for {@code * www.google.com}, {@code foo.co.uk} and {@code myblog.blogspot.com}, but not for {@code com}, * {@code co.uk}, {@code google.invalid}, or {@code blogspot.com}. * *

This method can be used to determine whether it will probably be possible to set cookies on * the domain, though even that depends on individual browsers' implementations of cookie * controls. See RFC 2109 for details. * * @since 6.0 */ public boolean isUnderPublicSuffix() { return publicSuffixIndex > 0; } /** * Indicates whether this domain name is composed of exactly one subdomain component followed by a * {@linkplain #isPublicSuffix() public suffix}. For example, returns {@code true} for {@code * google.com} {@code foo.co.uk}, and {@code myblog.blogspot.com}, but not for {@code * www.google.com}, {@code co.uk}, or {@code blogspot.com}. * *

This method can be used to determine whether a domain is probably the highest level for * which cookies may be set, though even that depends on individual browsers' implementations of * cookie controls. See RFC 2109 for details. * * @since 6.0 */ public boolean isTopPrivateDomain() { return publicSuffixIndex == 1; } /** * Returns the portion of this domain name that is one level beneath the {@linkplain * #isPublicSuffix() public suffix}. For example, for {@code x.adwords.google.co.uk} it returns * {@code google.co.uk}, since {@code co.uk} is a public suffix. Similarly, for {@code * myblog.blogspot.com} it returns the same domain, {@code myblog.blogspot.com}, since {@code * blogspot.com} is a public suffix. * *

If {@link #isTopPrivateDomain()} is true, the current domain name instance is returned. * *

This method can be used to determine the probable highest level parent domain for which * cookies may be set, though even that depends on individual browsers' implementations of cookie * controls. * * @throws IllegalStateException if this domain does not end with a public suffix * @since 6.0 */ public InternetDomainName topPrivateDomain() { if (isTopPrivateDomain()) { return this; } checkState(isUnderPublicSuffix(), "Not under a public suffix: %s", name); return ancestor(publicSuffixIndex - 1); } /** * Indicates whether this domain name represents a registry suffix, as defined by a subset * of the Mozilla Foundation's Public Suffix List (PSL). A * registry suffix is one under which Internet users can directly register names via a domain name * registrar, and have such registrations lawfully protected by internet-governing bodies such as * ICANN. Examples of registry suffixes include {@code com}, {@code co.uk}, and {@code * pvt.k12.wy.us}. Examples of domain names that are not registry suffixes include {@code * google.com} and {@code foo.co.uk}. * *

Registry suffixes are a proper subset of {@linkplain #isPublicSuffix() public suffixes}. The * list of public suffixes additionally contains privately owned domain names under which Internet * users can register subdomains. An example of a public suffix that is not a registry suffix is * {@code blogspot.com}. Note that it is true that all public suffixes have registry * suffixes, since domain name registries collectively control all internet domain names. * *

For considerations on whether the public suffix or registry suffix designation is more * suitable for your application, see this article. * * @return {@code true} if this domain name appears exactly on the public suffix list as part of * the registry suffix section (labelled "ICANN"). * @since 23.3 */ public boolean isRegistrySuffix() { return registrySuffixIndex == 0; } /** * Indicates whether this domain name ends in a {@linkplain #isRegistrySuffix() registry suffix}, * including if it is a registry suffix itself. For example, returns {@code true} for {@code * www.google.com}, {@code foo.co.uk} and {@code com}, but not for {@code invalid} or {@code * google.invalid}. * *

Note that this method is equivalent to {@link #hasPublicSuffix()} because all registry * suffixes are public suffixes and all public suffixes have registry suffixes. * * @since 23.3 */ public boolean hasRegistrySuffix() { return registrySuffixIndex != NO_SUFFIX_FOUND; } /** * Returns the {@linkplain #isRegistrySuffix() registry suffix} portion of the domain name, or * {@code null} if no registry suffix is present. * * @since 23.3 */ public InternetDomainName registrySuffix() { return hasRegistrySuffix() ? ancestor(registrySuffixIndex) : null; } /** * Indicates whether this domain name ends in a {@linkplain #isRegistrySuffix() registry suffix}, * while not being a registry suffix itself. For example, returns {@code true} for {@code * www.google.com}, {@code foo.co.uk} and {@code blogspot.com}, but not for {@code com}, {@code * co.uk}, or {@code google.invalid}. * * @since 23.3 */ public boolean isUnderRegistrySuffix() { return registrySuffixIndex > 0; } /** * Indicates whether this domain name is composed of exactly one subdomain component followed by a * {@linkplain #isRegistrySuffix() registry suffix}. For example, returns {@code true} for {@code * google.com}, {@code foo.co.uk}, and {@code blogspot.com}, but not for {@code www.google.com}, * {@code co.uk}, or {@code myblog.blogspot.com}. * *

Warning: This method should not be used to determine the probable highest level * parent domain for which cookies may be set. Use {@link #topPrivateDomain()} for that purpose. * * @since 23.3 */ public boolean isTopDomainUnderRegistrySuffix() { return registrySuffixIndex == 1; } /** * Returns the portion of this domain name that is one level beneath the {@linkplain * #isRegistrySuffix() registry suffix}. For example, for {@code x.adwords.google.co.uk} it * returns {@code google.co.uk}, since {@code co.uk} is a registry suffix. Similarly, for {@code * myblog.blogspot.com} it returns {@code blogspot.com}, since {@code com} is a registry suffix. * *

If {@link #isTopDomainUnderRegistrySuffix()} is true, the current domain name instance is * returned. * *

Warning: This method should not be used to determine whether a domain is probably the * highest level for which cookies may be set. Use {@link #isTopPrivateDomain()} for that purpose. * * @throws IllegalStateException if this domain does not end with a registry suffix * @since 23.3 */ public InternetDomainName topDomainUnderRegistrySuffix() { if (isTopDomainUnderRegistrySuffix()) { return this; } checkState(isUnderRegistrySuffix(), "Not under a registry suffix: %s", name); return ancestor(registrySuffixIndex - 1); } /** Indicates whether this domain is composed of two or more parts. */ public boolean hasParent() { return parts.size() > 1; } /** * Returns an {@code InternetDomainName} that is the immediate ancestor of this one; that is, the * current domain with the leftmost part removed. For example, the parent of {@code * www.google.com} is {@code google.com}. * * @throws IllegalStateException if the domain has no parent, as determined by {@link #hasParent} */ public InternetDomainName parent() { checkState(hasParent(), "Domain '%s' has no parent", name); return ancestor(1); } /** * Returns the ancestor of the current domain at the given number of levels "higher" (rightward) * in the subdomain list. The number of levels must be non-negative, and less than {@code N-1}, * where {@code N} is the number of parts in the domain. * *

TODO: Reasonable candidate for addition to public API. */ private InternetDomainName ancestor(int levels) { return from(DOT_JOINER.join(parts.subList(levels, parts.size()))); } /** * Creates and returns a new {@code InternetDomainName} by prepending the argument and a dot to * the current name. For example, {@code InternetDomainName.from("foo.com").child("www.bar")} * returns a new {@code InternetDomainName} with the value {@code www.bar.foo.com}. Only lenient * validation is performed, as described {@link #from(String) here}. * * @throws NullPointerException if leftParts is null * @throws IllegalArgumentException if the resulting name is not valid */ public InternetDomainName child(String leftParts) { return from(checkNotNull(leftParts) + "." + name); } /** * Indicates whether the argument is a syntactically valid domain name using lenient validation. * Specifically, validation against RFC 3490 * ("Internationalizing Domain Names in Applications") is skipped. * *

The following two code snippets are equivalent: * *

{@code
   * domainName = InternetDomainName.isValid(name)
   *     ? InternetDomainName.from(name)
   *     : DEFAULT_DOMAIN;
   * }
* *
{@code
   * try {
   *   domainName = InternetDomainName.from(name);
   * } catch (IllegalArgumentException e) {
   *   domainName = DEFAULT_DOMAIN;
   * }
   * }
* * @since 8.0 (previously named {@code isValidLenient}) */ public static boolean isValid(String name) { try { from(name); return true; } catch (IllegalArgumentException e) { return false; } } /** * Does the domain name match one of the "wildcard" patterns (e.g. {@code "*.ar"})? If a {@code * desiredType} is specified, the wildcard pattern must also match that type. */ private static boolean matchesWildcardSuffixType( Optional desiredType, String domain) { List pieces = DOT_SPLITTER.limit(2).splitToList(domain); return pieces.size() == 2 && matchesType( desiredType, Optional.fromNullable(PublicSuffixPatterns.UNDER.get(pieces.get(1)))); } /** * If a {@code desiredType} is specified, returns true only if the {@code actualType} is * identical. Otherwise, returns true as long as {@code actualType} is present. */ private static boolean matchesType( Optional desiredType, Optional actualType) { return desiredType.isPresent() ? desiredType.equals(actualType) : actualType.isPresent(); } /** Returns the domain name, normalized to all lower case. */ @Override public String toString() { return name; } /** * Equality testing is based on the text supplied by the caller, after normalization as described * in the class documentation. For example, a non-ASCII Unicode domain name and the Punycode * version of the same domain name would not be considered equal. */ @Override public boolean equals(@NullableDecl Object object) { if (object == this) { return true; } if (object instanceof InternetDomainName) { InternetDomainName that = (InternetDomainName) object; return this.name.equals(that.name); } return false; } @Override public int hashCode() { return name.hashCode(); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy