// All downloads are free. Search and download functionality uses the official Maven repository.
//
// com.gitee.l0km.aocache.guava.net.InternetDomainName Maven / Gradle / Ivy
//
// There is a newer version: 0.4.5
// Show newest version
/*
 * Copyright (C) 2009 The Guava Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */

package com.gitee.l0km.aocache.guava.net;

import static com.gitee.l0km.aocache.guava.base.Preconditions.checkArgument;
import static com.gitee.l0km.aocache.guava.base.Preconditions.checkNotNull;
import static com.gitee.l0km.aocache.guava.base.Preconditions.checkState;

import com.gitee.l0km.aocache.guava.annotations.Beta;
import com.gitee.l0km.aocache.guava.annotations.GwtCompatible;
import com.gitee.l0km.aocache.guava.base.Ascii;
import com.gitee.l0km.aocache.guava.base.CharMatcher;
import com.gitee.l0km.aocache.guava.base.Joiner;
import com.gitee.l0km.aocache.guava.base.Splitter;
import com.gitee.l0km.aocache.guava.collect.ImmutableList;
import com.google.thirdparty.publicsuffix.PublicSuffixPatterns;
import java.util.List;
import javax.annotation.Nullable;

/**
 * An immutable well-formed internet domain name, such as {@code com} or {@code
 * foo.co.uk}. Only syntactic analysis is performed; no DNS lookups or other network interactions
 * take place. Thus there is no guarantee that the domain actually exists on the internet.
 *
 * 

One common use of this class is to determine whether a given string is likely to represent an * addressable domain on the web -- that is, for a candidate string {@code "xxx"}, might browsing to * {@code "http://xxx/"} result in a webpage being displayed? In the past, this test was frequently * done by determining whether the domain ended with a {@linkplain #isPublicSuffix() public suffix} * but was not itself a public suffix. However, this test is no longer accurate. There are many * domains which are both public suffixes and addressable as hosts; {@code "uk.com"} is one example. * As a result, the only useful test to determine if a domain is a plausible web host is * {@link #hasPublicSuffix()}. This will return {@code true} for many domains which (currently) are * not hosts, such as {@code "com"}, but given that any public suffix may become a host without * warning, it is better to err on the side of permissiveness and thus avoid spurious rejection of * valid sites. * *

During construction, names are normalized in two ways: * *

    *
  1. ASCII uppercase characters are converted to lowercase. *
  2. Unicode dot separators other than the ASCII period ({@code '.'}) are converted to the ASCII * period. *
* *

The normalized values will be returned from {@link #toString()} and {@link #parts()}, and will * be reflected in the result of {@link #equals(Object)}. * *

Internationalized domain * names such as {@code 网络.cn} are supported, as are the equivalent * IDNA Punycode-encoded * versions. * * @author Craig Berry * @since 5.0 */ @Beta @GwtCompatible public final class InternetDomainName { private static final CharMatcher DOTS_MATCHER = CharMatcher.anyOf(".\u3002\uFF0E\uFF61"); private static final Splitter DOT_SPLITTER = Splitter.on('.'); private static final Joiner DOT_JOINER = Joiner.on('.'); /** * Value of {@link #publicSuffixIndex} which indicates that no public suffix was found. */ private static final int NO_PUBLIC_SUFFIX_FOUND = -1; private static final String DOT_REGEX = "\\."; /** * Maximum parts (labels) in a domain name. This value arises from the 255-octet limit described * in RFC 2181 part 11 with the fact that the * encoding of each part occupies at least two bytes (dot plus label externally, length byte plus * label internally). Thus, if all labels have the minimum size of one byte, 127 of them will fit. */ private static final int MAX_PARTS = 127; /** * Maximum length of a full domain name, including separators, and leaving room for the root * label. See RFC 2181 part 11. */ private static final int MAX_LENGTH = 253; /** * Maximum size of a single part of a domain name. See * RFC 2181 part 11. */ private static final int MAX_DOMAIN_PART_LENGTH = 63; /** * The full domain name, converted to lower case. */ private final String name; /** * The parts of the domain name, converted to lower case. */ private final ImmutableList parts; /** * The index in the {@link #parts()} list at which the public suffix begins. For example, for the * domain name {@code www.google.co.uk}, the value would be 2 (the index of the {@code co} part). * The value is negative (specifically, {@link #NO_PUBLIC_SUFFIX_FOUND}) if no public suffix was * found. */ private final int publicSuffixIndex; /** * Constructor used to implement {@link #from(String)}, and from subclasses. 
*/ InternetDomainName(String name) { // Normalize: // * ASCII characters to lowercase // * All dot-like characters to '.' // * Strip trailing '.' name = Ascii.toLowerCase(DOTS_MATCHER.replaceFrom(name, '.')); if (name.endsWith(".")) { name = name.substring(0, name.length() - 1); } checkArgument(name.length() <= MAX_LENGTH, "Domain name too long: '%s':", name); this.name = name; this.parts = ImmutableList.copyOf(DOT_SPLITTER.split(name)); checkArgument(parts.size() <= MAX_PARTS, "Domain has too many parts: '%s'", name); checkArgument(validateSyntax(parts), "Not a valid domain name: '%s'", name); this.publicSuffixIndex = findPublicSuffix(); } /** * Returns the index of the leftmost part of the public suffix, or -1 if not found. Note that the * value defined as the "public suffix" may not be a public suffix according to * {@link #isPublicSuffix()} if the domain ends with an excluded domain pattern such as * {@code "nhs.uk"}. */ private int findPublicSuffix() { final int partsSize = parts.size(); for (int i = 0; i < partsSize; i++) { String ancestorName = DOT_JOINER.join(parts.subList(i, partsSize)); if (PublicSuffixPatterns.EXACT.containsKey(ancestorName)) { return i; } // Excluded domains (e.g. !nhs.uk) use the next highest // domain as the effective public suffix (e.g. uk). if (PublicSuffixPatterns.EXCLUDED.containsKey(ancestorName)) { return i + 1; } if (matchesWildcardPublicSuffix(ancestorName)) { return i; } } return NO_PUBLIC_SUFFIX_FOUND; } /** * Returns an instance of {@link InternetDomainName} after lenient validation. Specifically, * validation against RFC 3490 * ("Internationalizing Domain Names in Applications") is skipped, while validation against * RFC 1035 is relaxed in the following ways: *

    *
  • Any part containing non-ASCII characters is considered valid. *
  • Underscores ('_') are permitted wherever dashes ('-') are permitted. *
  • Parts other than the final part may start with a digit, as mandated by * RFC 1123. *
* * * @param domain A domain name (not IP address) * @throws IllegalArgumentException if {@code name} is not syntactically valid according to * {@link #isValid} * @since 10.0 (previously named {@code fromLenient}) */ public static InternetDomainName from(String domain) { return new InternetDomainName(checkNotNull(domain)); } /** * Validation method used by {@from} to ensure that the domain name is syntactically valid * according to RFC 1035. * * @return Is the domain name syntactically valid? */ private static boolean validateSyntax(List parts) { final int lastIndex = parts.size() - 1; // Validate the last part specially, as it has different syntax rules. if (!validatePart(parts.get(lastIndex), true)) { return false; } for (int i = 0; i < lastIndex; i++) { String part = parts.get(i); if (!validatePart(part, false)) { return false; } } return true; } private static final CharMatcher DASH_MATCHER = CharMatcher.anyOf("-_"); private static final CharMatcher PART_CHAR_MATCHER = CharMatcher.javaLetterOrDigit().or(DASH_MATCHER); /** * Helper method for {@link #validateSyntax(List)}. Validates that one part of a domain name is * valid. * * @param part The domain name part to be validated * @param isFinalPart Is this the final (rightmost) domain part? * @return Whether the part is valid */ private static boolean validatePart(String part, boolean isFinalPart) { // These tests could be collapsed into one big boolean expression, but // they have been left as independent tests for clarity. if (part.length() < 1 || part.length() > MAX_DOMAIN_PART_LENGTH) { return false; } /* * GWT claims to support java.lang.Character's char-classification methods, but it actually only * works for ASCII. So for now, assume any non-ASCII characters are valid. The only place this * seems to be documented is here: * http://osdir.com/ml/GoogleWebToolkitContributors/2010-03/msg00178.html * *

ASCII characters in the part are expected to be valid per RFC 1035, with underscore also * being allowed due to widespread practice. */ String asciiChars = CharMatcher.ascii().retainFrom(part); if (!PART_CHAR_MATCHER.matchesAllOf(asciiChars)) { return false; } // No initial or final dashes or underscores. if (DASH_MATCHER.matches(part.charAt(0)) || DASH_MATCHER.matches(part.charAt(part.length() - 1))) { return false; } /* * Note that we allow (in contravention of a strict interpretation of the relevant RFCs) domain * parts other than the last may begin with a digit (for example, "3com.com"). It's important to * disallow an initial digit in the last part; it's the only thing that stops an IPv4 numeric * address like 127.0.0.1 from looking like a valid domain name. */ if (isFinalPart && CharMatcher.digit().matches(part.charAt(0))) { return false; } return true; } /** * Returns the individual components of this domain name, normalized to all lower case. For * example, for the domain name {@code mail.google.com}, this method returns the list * {@code ["mail", "google", "com"]}. */ public ImmutableList parts() { return parts; } /** * Indicates whether this domain name represents a public suffix, as defined by the Mozilla * Foundation's Public Suffix List (PSL). A public suffix * is one under which Internet users can directly register names, such as {@code com}, * {@code co.uk} or {@code pvt.k12.wy.us}. Examples of domain names that are not public * suffixes include {@code google}, {@code google.com} and {@code foo.co.uk}. * * @return {@code true} if this domain name appears exactly on the public suffix list * @since 6.0 */ public boolean isPublicSuffix() { return publicSuffixIndex == 0; } /** * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix() public suffix}, * including if it is a public suffix itself. 
For example, returns {@code true} for * {@code www.google.com}, {@code foo.co.uk} and {@code com}, but not for {@code google} or * {@code google.foo}. This is the recommended method for determining whether a domain is * potentially an addressable host. * * @since 6.0 */ public boolean hasPublicSuffix() { return publicSuffixIndex != NO_PUBLIC_SUFFIX_FOUND; } /** * Returns the {@linkplain #isPublicSuffix() public suffix} portion of the domain name, or * {@code null} if no public suffix is present. * * @since 6.0 */ public InternetDomainName publicSuffix() { return hasPublicSuffix() ? ancestor(publicSuffixIndex) : null; } /** * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix() public suffix}, * while not being a public suffix itself. For example, returns {@code true} for * {@code www.google.com}, {@code foo.co.uk} and {@code bar.ca.us}, but not for {@code google}, * {@code com}, or {@code * google.foo}. * *

Warning: a {@code false} result from this method does not imply that the domain does * not represent an addressable host, as many public suffixes are also addressable hosts. Use * {@link #hasPublicSuffix()} for that test. * *

This method can be used to determine whether it will probably be possible to set cookies on * the domain, though even that depends on individual browsers' implementations of cookie * controls. See RFC 2109 for details. * * @since 6.0 */ public boolean isUnderPublicSuffix() { return publicSuffixIndex > 0; } /** * Indicates whether this domain name is composed of exactly one subdomain component followed by a * {@linkplain #isPublicSuffix() public suffix}. For example, returns {@code true} for * {@code google.com} and {@code foo.co.uk}, but not for {@code www.google.com} or {@code co.uk}. * *

Warning: A {@code true} result from this method does not imply that the domain is at * the highest level which is addressable as a host, as many public suffixes are also addressable * hosts. For example, the domain {@code bar.uk.com} has a public suffix of {@code uk.com}, so it * would return {@code true} from this method. But {@code uk.com} is itself an addressable host. * *

This method can be used to determine whether a domain is probably the highest level for * which cookies may be set, though even that depends on individual browsers' implementations of * cookie controls. See RFC 2109 for details. * * @since 6.0 */ public boolean isTopPrivateDomain() { return publicSuffixIndex == 1; } /** * Returns the portion of this domain name that is one level beneath the public suffix. For * example, for {@code x.adwords.google.co.uk} it returns {@code google.co.uk}, since * {@code co.uk} is a public suffix. * *

If {@link #isTopPrivateDomain()} is true, the current domain name instance is returned. * *

This method should not be used to determine the topmost parent domain which is addressable * as a host, as many public suffixes are also addressable hosts. For example, the domain * {@code foo.bar.uk.com} has a public suffix of {@code uk.com}, so it would return * {@code bar.uk.com} from this method. But {@code uk.com} is itself an addressable host. * *

This method can be used to determine the probable highest level parent domain for which * cookies may be set, though even that depends on individual browsers' implementations of cookie * controls. * * @throws IllegalStateException if this domain does not end with a public suffix * @since 6.0 */ public InternetDomainName topPrivateDomain() { if (isTopPrivateDomain()) { return this; } checkState(isUnderPublicSuffix(), "Not under a public suffix: %s", name); return ancestor(publicSuffixIndex - 1); } /** * Indicates whether this domain is composed of two or more parts. */ public boolean hasParent() { return parts.size() > 1; } /** * Returns an {@code InternetDomainName} that is the immediate ancestor of this one; that is, the * current domain with the leftmost part removed. For example, the parent of * {@code www.google.com} is {@code google.com}. * * @throws IllegalStateException if the domain has no parent, as determined by {@link #hasParent} */ public InternetDomainName parent() { checkState(hasParent(), "Domain '%s' has no parent", name); return ancestor(1); } /** * Returns the ancestor of the current domain at the given number of levels "higher" (rightward) * in the subdomain list. The number of levels must be non-negative, and less than {@code N-1}, * where {@code N} is the number of parts in the domain. * *

TODO: Reasonable candidate for addition to public API. */ private InternetDomainName ancestor(int levels) { return from(DOT_JOINER.join(parts.subList(levels, parts.size()))); } /** * Creates and returns a new {@code InternetDomainName} by prepending the argument and a dot to * the current name. For example, {@code * InternetDomainName.from("foo.com").child("www.bar")} returns a new {@code InternetDomainName} * with the value {@code www.bar.foo.com}. Only lenient validation is performed, as described * {@link #from(String) here}. * * @throws NullPointerException if leftParts is null * @throws IllegalArgumentException if the resulting name is not valid */ public InternetDomainName child(String leftParts) { return from(checkNotNull(leftParts) + "." + name); } /** * Indicates whether the argument is a syntactically valid domain name using lenient validation. * Specifically, validation against RFC 3490 * ("Internationalizing Domain Names in Applications") is skipped. * *

The following two code snippets are equivalent: * *

   {@code
   *   domainName = InternetDomainName.isValid(name)
   *       ? InternetDomainName.from(name)
   *       : DEFAULT_DOMAIN;}
* *
   {@code
   *   try {
   *     domainName = InternetDomainName.from(name);
   *   } catch (IllegalArgumentException e) {
   *     domainName = DEFAULT_DOMAIN;
   *   }}
* * @since 8.0 (previously named {@code isValidLenient}) */ public static boolean isValid(String name) { try { from(name); return true; } catch (IllegalArgumentException e) { return false; } } /** * Does the domain name match one of the "wildcard" patterns (e.g. {@code "*.ar"})? */ private static boolean matchesWildcardPublicSuffix(String domain) { final String[] pieces = domain.split(DOT_REGEX, 2); return pieces.length == 2 && PublicSuffixPatterns.UNDER.containsKey(pieces[1]); } /** * Returns the domain name, normalized to all lower case. */ @Override public String toString() { return name; } /** * Equality testing is based on the text supplied by the caller, after normalization as described * in the class documentation. For example, a non-ASCII Unicode domain name and the Punycode * version of the same domain name would not be considered equal. * */ @Override public boolean equals(@Nullable Object object) { if (object == this) { return true; } if (object instanceof InternetDomainName) { InternetDomainName that = (InternetDomainName) object; return this.name.equals(that.name); } return false; } @Override public int hashCode() { return name.hashCode(); } }




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy