All downloads are free. The search and download functionality uses the official Maven repository.

crawlercommons.domains.EffectiveTldFinder Maven / Gradle / Ivy

Go to download

crawler-commons is a set of reusable Java components that implement functionality common to any web crawler.

The newest version!
/**
 * Copyright 2016 Crawler-Commons
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package crawlercommons.domains;

import static java.nio.charset.StandardCharsets.UTF_8;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.IDN;
import java.net.URL;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Locale;
import java.nio.charset.StandardCharsets;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * To determine the actual domain name of a host name or URL requires knowledge
 * of the various domain registrars and their assignment policies. The best
 * publicly available knowledge base is the public suffix list maintained and
 * available at publicsuffix.org. This
 * class implements the
 * publicsuffix.org ruleset and
 * uses a copy of the public suffix list.
 * 
 * For more information, see
 * 
 * 
 * EffectiveTldFinder loads the public suffix list as file
 * "effective_tld_names.dat" from the Java classpath. Make sure your classpath
 * does not contain any other file with the same name, eg. an outdated list
 * shipped with a third party library. To force EffectiveTldFinder to load an
 * updated or modified public suffix list, call
 * {@link EffectiveTldFinder#getInstance()
 * EffectiveTldFinder.getInstance()}{@link EffectiveTldFinder#initialize(InputStream)
 * .initialize(InputStream)}. Updates to the public suffix list can be found
 * here:
 * 
 * 
 * 

ICANN vs. Private Domains

* * The public suffix list (see section * "divisions") is subdivided into "ICANN" and * "PRIVATE" domains. To restrict the EffectiveTldFinder to * "ICANN" domains only, pass "true" as flag * excludePrivate to * {@link EffectiveTldFinder#getAssignedDomain(String, boolean, boolean)} resp. * {@link EffectiveTldFinder#getEffectiveTLD(String, boolean)}. This will * exclude the eTLDs from the PRIVATE domain section of the public suffix list * while a domain or eTLD is matched. * */ public class EffectiveTldFinder { private static final Logger LOGGER = LoggerFactory.getLogger(EffectiveTldFinder.class); public static final String ETLD_DATA = "/effective_tld_names.dat"; public static final String COMMENT = "//"; public static final String DOT_REGEX = "\\."; public static final String EXCEPTION = "!"; public static final String WILD_CARD = "*."; public static final char DOT = '.'; /** * Max. length in ASCII characters of a dot-separated segment in host names * (applies to domain names as well), cf. * https://tools.ietf.org/html/rfc1034#section-3.1 and * https://en.wikipedia.org/wiki/Hostname#Restrictions_on_valid_hostnames * * Note: We only have to validate domain names and not the host names passed * as input. For domain names a verification of the segment length also * implies that the entire domain names stays in the limit of 253 * characters. Wildcard suffixes only allow two additional segments (2*63+1 * = 127 chars) and all wildcard suffixes are far away from reaching the * critical length of 126 characters. */ public static final int MAX_DOMAIN_LENGTH_PART = 63; private static EffectiveTldFinder instance = null; private Map domains = null; private SuffixTrie domainTrie = new SuffixTrie<>(); private boolean configured = false; /** * A singleton loading the public suffix list from the Java class path. 
*/ private EffectiveTldFinder() { URL publicSuffixList = this.getClass().getResource(ETLD_DATA); LOGGER.info("Loading public suffix list from class path: {}", publicSuffixList); try (InputStream is = publicSuffixList.openStream()) { initialize(is); } catch (IOException e) { LOGGER.error("Failed to load public suffix list {} from class path: {}", publicSuffixList, e); } } /** * Get singleton instance of EffectiveTldFinder with default configuration. * * @return singleton instance of EffectiveTldFinder */ public static EffectiveTldFinder getInstance() { if (null == instance) { instance = new EffectiveTldFinder(); } return instance; } /** * (Re)initialize EffectiveTldFinder with custom public suffix list. * * @param effectiveTldDataStream * content of public suffix list as input stream * @return true if (re)initialization was successful */ public boolean initialize(InputStream effectiveTldDataStream) { domains = new HashMap<>(); domainTrie = new SuffixTrie<>(); boolean inPrivateDomainSection = false; try { BufferedReader input = new BufferedReader(new InputStreamReader(effectiveTldDataStream, StandardCharsets.UTF_8)); String line = null; while (null != (line = input.readLine())) { if (line.trim().isEmpty()) { continue; } else if (line.startsWith(COMMENT)) { if (line.contains("===BEGIN PRIVATE DOMAINS===")) { inPrivateDomainSection = true; } else if (line.contains("===END PRIVATE DOMAINS===")) { inPrivateDomainSection = false; } continue; } else { EffectiveTLD entry = new EffectiveTLD(line, inPrivateDomainSection); for (String var : entry.getNameVariants()) { domains.put(var, entry); domainTrie.put(var, entry); } } } configured = true; } catch (IOException e) { LOGGER.error("EffectiveTldFinder configuration failed: ", e); configured = false; } return configured; } @SuppressWarnings("unchecked") public static Map getEffectiveTLDs() { // The keys and values are immutables but we don't want the caller // changing the repertoire of our internal Map entries, so we clone 
return (Map) ((HashMap) getInstance().domains).clone(); } /** * Get EffectiveTLD for host name using the singleton instance of * EffectiveTldFinder. * * @param hostname * the hostname for which to find the {@link EffectiveTLD} * @return the {@link EffectiveTLD} */ public static EffectiveTLD getEffectiveTLD(String hostname) { return getEffectiveTLD(hostname, false); } /** * Get EffectiveTLD for host name using the singleton instance of * EffectiveTldFinder. * * @param hostname * the hostname for which to find the {@link EffectiveTLD} * @param excludePrivate * do not return an effective TLD from the PRIVATE section, * instead return the shorter eTLD not in the PRIVATE section * @return the {@link EffectiveTLD} */ public static EffectiveTLD getEffectiveTLD(String hostname, boolean excludePrivate) { SuffixTrie.LookupResult res = findEffectiveTLD(hostname, excludePrivate); if (res == null) { return null; } return res.value; } /** * Find EffectiveTLD and offset in host name using the singleton instance of * EffectiveTldFinder. 
* * @param hostname * the hostname for which to find the {@link EffectiveTLD} * @param excludePrivate * do not return an effective TLD from the PRIVATE section, * instead return the shorter eTLD not in the PRIVATE section * @return the {@link EffectiveTLD} or null if none is found */ private static SuffixTrie.LookupResult findEffectiveTLD(String hostname, boolean excludePrivate) { List> suffixes = getInstance().domainTrie.getSuffixes(hostname); for (int i = suffixes.size() - 1; i >= 0; i--) { SuffixTrie.LookupResult res = suffixes.get(i); int offset = res.offset; if (offset == 0 || DOT == hostname.charAt(offset - 1)) { EffectiveTLD foundTld = res.value; if (excludePrivate && foundTld.isPrivate) { continue; } if (offset == 0 || foundTld.isException() || !foundTld.isWild()) { return res; } // wildcards create an open ETLD namespace int wildcardOffset = hostname.lastIndexOf(DOT, offset - 2); String retryTld; if (wildcardOffset == -1) { // no further dot-separated element found, take full host // name retryTld = hostname; } else { retryTld = hostname.substring(wildcardOffset + 1); } try { foundTld = new EffectiveTLD(retryTld, foundTld.isPrivate); } catch (IllegalArgumentException e) { // retryTld contains forbidden characters return null; } return new SuffixTrie.LookupResult(wildcardOffset + 1, foundTld); } } return null; } /** * This method uses the effective TLD to determine which component of a FQDN * is the NIC-assigned domain name (aka "Paid Level Domain"). * * @param hostname * a string for which to obtain a NIC-assigned domain name * @return the NIC-assigned domain name or as fall-back the hostname if no * FQDN with valid TLD is found */ public static String getAssignedDomain(String hostname) { return getAssignedDomain(hostname, false, false); } /** * This method uses the effective TLD to determine which component of a FQDN * is the NIC-assigned domain name (aka "Paid Level Domain"). 
* * @param hostname * a string for which to obtain a NIC-assigned domain name * @param strict * do not return the hostname as fall-back if a FQDN with valid * TLD cannot be determined * @return the NIC-assigned domain name, null if strict and no FQDN with * valid TLD is found */ public static String getAssignedDomain(String hostname, boolean strict) { return getAssignedDomain(hostname, strict, false); } /** * This method uses the effective TLD to determine which component of a FQDN * is the NIC-assigned domain name. * * @param hostname * a string for which to obtain a NIC-assigned domain name * @param strict * do not return the hostname as fall-back if a FQDN with valid * TLD cannot be determined * @param excludePrivate * do not return a domain which is below an eTLD from the PRIVATE * section, return the shorter domain which is below the * "ICANN" registry suffix * @return the NIC-assigned domain name, null if strict and no FQDN with * valid TLD is found */ public static String getAssignedDomain(String hostname, boolean strict, boolean excludePrivate) { hostname = hostname.toLowerCase(Locale.ROOT); SuffixTrie.LookupResult res = findEffectiveTLD(hostname, excludePrivate); if (res == null) { return (strict ? null : hostname); } EffectiveTLD etld = res.value; if (etld.isException()) { return etld.domain; } if (res.offset == 0) { // found eTLD covering entire hostname: // if strict: hostname cannot be an eTLD (except if it's an // exception which is already checked) return (strict ? null : hostname); } // clip hostname one dot-separated element before eTLD int etldStartPos = res.offset - 1; if (hostname.charAt(etldStartPos) != DOT) { // should not happen: no dot before TLD LOGGER.debug("No dot before eTLD {} in {}", hostname.substring(res.offset), hostname); return (strict ? 
null : hostname); } int start = 0; int pos; while ((pos = hostname.indexOf(DOT, start)) != -1) { if (pos == start) { // there must be at least one character between two dots LOGGER.debug("Two immediately consecutive dots in hostname: {}", hostname); return (strict ? null : hostname); } if (pos >= etldStartPos) break; start = pos + 1; } String domainSegment = hostname.substring(start, etldStartPos); if (!EffectiveTLD.isAscii(domainSegment)) { try { IDN.toASCII(domainSegment); } catch (IllegalArgumentException e) { // not a valid IDN segment, // includes check for max. length (63 chars) return (strict ? null : hostname); } } else if (strict) { // (strict mode) check for max. length of segment (63 chars) if (domainSegment.length() > MAX_DOMAIN_LENGTH_PART) { return null; } } return hostname.substring(start); } public boolean isConfigured() { return configured; } private static String join(String[] ary) { StringBuffer sb = new StringBuffer(); for (int i = 0; i < ary.length; i++) { sb.append(ary[i]).append(DOT); } sb.deleteCharAt(sb.length() - 1); return sb.toString(); } public static void help() { LOGGER.error("EffectiveTldFinder [-etld] [-strict] [-excludePrivate]"); LOGGER.error(" get domains or public suffixes for host names"); LOGGER.error("Options:"); LOGGER.error(" -etld"); LOGGER.error(" change mode: return public suffix (eTLD)"); LOGGER.error(" -strict"); LOGGER.error(" return null if no valid suffix/TLD is found"); LOGGER.error(" -excludePrivate"); LOGGER.error(" do not match suffixes from the private section of the public suffix list"); LOGGER.error("Input is read from stdin, output on stdout: host \\t domain/eTLD"); } public static void main(String[] args) throws IOException { boolean modeEtld = false; boolean strict = false; boolean excludePrivate = false; for (String arg : args) { switch (arg) { case "-etld": modeEtld = true; break; case "-strict": strict = true; break; case "-excludePrivate": excludePrivate = true; break; case "-h": case "-?": case 
"-help": case "--help": help(); System.exit(0); default: LOGGER.error("Unknown argument: {}", arg); help(); System.exit(1); } } String line, domain; EffectiveTLD etld; BufferedReader in = new BufferedReader(new InputStreamReader(System.in, UTF_8)); while ((line = in.readLine()) != null) { if (modeEtld) { etld = EffectiveTldFinder.getEffectiveTLD(line, excludePrivate); System.out.println(line + '\t' + etld); } else { domain = EffectiveTldFinder.getAssignedDomain(line, strict, excludePrivate); System.out.println(line + '\t' + domain); } } } /** * EffectiveTLD objects hold one line of the public suffix list: *
    *
  • the suffix (com, co.uk, etc.)
  • *
  • for IDN suffixes: both the ASCII and IDN variant * (xn--p1ai and рф)
  • *
  • and the properties required to parse host/domain names given in the * public suffix list (wildcard suffix, exception, in private domain * section)
  • *
*/ public static class EffectiveTLD { private boolean exception = false; private boolean wild = false; private boolean isPrivate = false; private String domain = null; private String idn = null; /** * Parse one non-empty, non-comment line in the public suffix list and * hold the public suffix and its properties in the created object. * * @param line * non-empty, non-comment line in the public suffix list * @param isPrivateDomain * whether line is in the section of "PRIVATE * DOMAINS" of the public suffix list * @throws IllegalArgumentException * if the input line contains non-ASCII Unicode characters * prohibited in IDNs, cf. {@link IDN#toASCII(String)} */ public EffectiveTLD(String line, boolean isPrivateDomain) throws IllegalArgumentException { if (line.startsWith(EXCEPTION)) { exception = true; domain = line.substring(EXCEPTION.length(), line.length()); } else if (line.startsWith(WILD_CARD)) { wild = true; domain = line.substring(WILD_CARD.length(), line.length()); } else { domain = line; } String norm = normalizeName(domain); if (!norm.equals(domain)) { idn = domain; domain = norm; } isPrivate = isPrivateDomain; } /** * Normalize a domain name: convert characters into to lowercase and * encode dot-separated segments containing non-ASCII characters. Cf. * {@link #asciiConvert(String)} and {@link IDN#toASCII(String)} * * @param str * domain name segment * @return normalized domain name containing only ASCII characters * @throws IllegalArgumentException * if the input contains prohibited characters */ private String normalizeName(String name) throws IllegalArgumentException { String[] parts = name.split(DOT_REGEX); String[] ary = new String[parts.length]; for (int i = 0; i < parts.length; i++) { ary[i] = asciiConvert(parts[i]); } return join(ary); } /** * Generate name variants caused by Internationalized Domain Names: * every IDN part of a eTLD can be replaced by its punycoded ASCII * variant. For two-part IDN eTLDs this will generate 4 variants. 
* * @return set of variant names */ public Set getNameVariants() { Set res = new HashSet<>(); if (idn == null) { res.add(domain); return res; } String[] parts = idn.split(DOT_REGEX); String[] var = new String[parts.length]; for (int i = 0; i < parts.length; i++) { if (!isAscii(parts[i])) { var[i] = IDN.toASCII(parts[i]); } } for (int i = 0; i < parts.length; i++) { Set r = new HashSet<>(); if (res.size() > 0) { for (String p : res) { r.add(p + DOT + parts[i]); } } else { r.add(parts[i]); } if (var[i] != null && !var[i].equals(parts[i])) { if (res.size() > 0) { for (String p : res) { r.add(p + DOT + var[i]); } } else { r.add(var[i]); } } res = r; } return res; } /** * Converts a single domain name segment (separated by dots) to ASCII if * it contains non-ASCII character, cf. {@link IDN#toASCII(String)}. * * @param str * domain name segment * @return ASCII "Punycode" representation of the domain name segment * @throws IllegalArgumentException * if the input contains prohibited characters */ private static String asciiConvert(String str) throws IllegalArgumentException { if (isAscii(str)) { return str.toLowerCase(Locale.ROOT); } return IDN.toASCII(str); } private static boolean isAscii(String str) { char[] chars = str.toCharArray(); for (char c : chars) { if (c > 127) { return false; } } return true; } public String getDomain() { return domain; } public boolean isWild() { return wild; } public boolean isException() { return exception; } @Override public String toString() { StringBuffer sb = new StringBuffer("["); sb.append("domain=").append(domain).append(","); sb.append("wild=").append(wild).append(","); sb.append("exception=").append(exception).append(","); sb.append("private=").append(isPrivate).append("]"); return sb.toString(); } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy