All Downloads are FREE. Search and download functionalities are using the official Maven repository.

ai.platon.pulsar.skeleton.crawl.common.URLUtil.kt Maven / Gradle / Ivy

The newest version!
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package ai.platon.pulsar.skeleton.crawl.common

import ai.platon.pulsar.skeleton.common.domain.DomainSuffix
import ai.platon.pulsar.skeleton.common.domain.DomainSuffixes
import ai.platon.pulsar.common.urls.UrlUtils.getURLOrNull
import org.slf4j.LoggerFactory
import java.net.*
import java.util.*

/**
 * Utility class for URL analysis.
 *
 * protocol://username:password@hostname:port/pathname?search#hash
 * -----------------------------href------------------------------
 *                              -----host----
 * -----------      origin      -------------
 *
 * protocol - protocol scheme of the URL, including the final ':'
 * hostname - domain name
 * port - port number
 * pathname - /pathname
 * search - ?parameters
 * hash - #fragment_identifier
 * username - username specified before the domain name
 * password - password specified before the domain name
 * href - the entire URL
 * origin - protocol://hostname:port
 * host - hostname:port
 *
 * TODO: merge with ai.platon.pulsar.common.url.Urls
 */
object URLUtil {
    private val logger = LoggerFactory.getLogger(URLUtil::class.java)
    private val IP_REGEX = Regex("(\\d{1,3}\\.){3}(\\d{1,3})")

    fun getHost(url: String): String? {
        val u = getURLOrNull(url) ?: return null
        return getHost(u, GroupMode.BY_HOST)
    }

    fun getHost(url: String, groupMode: GroupMode): String? {
        val u = getURLOrNull(url) ?: return null
        return getHost(u, groupMode)
    }

    fun getHost(url: String, defaultHost: String, groupMode: GroupMode): String {
        val host = getHost(url, groupMode) ?: return defaultHost
        return host.ifEmpty { defaultHost }
    }

    fun getHost(url: URL, defaultHost: String, groupMode: GroupMode): String {
        val host = getHost(url, groupMode) ?: return defaultHost
        return host.ifEmpty { defaultHost }
    }

    fun getHost(url: URL, groupMode: GroupMode): String? {
        var host: String?
        if (groupMode == GroupMode.BY_IP) {
            host = try {
                val addr = InetAddress.getByName(url.host)
                addr.hostAddress
            } catch (e: UnknownHostException) { // unable to resolve it, so don't fall back to host name
                logger.warn("Unable to resolve: " + url.host + ", skipping.")
                return ""
            }
        } else if (groupMode == GroupMode.BY_DOMAIN) {
            host = getDomainName(url)
            if (host == null) {
                logger.warn("Unknown domain for url: $url, using URL string as key")
                host = url.toExternalForm()
            }
        } else {
            host = url.host
            if (host == null) {
                logger.warn("Unknown host for url: $url, using URL string as key")
                host = url.toExternalForm()
            }
        }
        return host
    }

    /**
     * Returns the domain name of the url. The domain name of a url is the
     * substring of the url's hostname, w/o subdomain names. As an example 

* ` * getDomainName(conf, new URL(http://lucene.apache.org/)) ` *

* will return

* ` apache.org` * * @see com.google.common.net.InternetDomainName.topPrivateDomain */ fun getDomainName(url: URL): String { val tlds = DomainSuffixes.getInstance() var host = url.host // it seems that java returns hostnames ending with . if (host.endsWith(".")) host = host.substring(0, host.length - 1) if (IP_REGEX.matches(host)) return host var index = 0 var candidate = host while (index >= 0) { index = candidate.indexOf('.') val subCandidate = candidate.substring(index + 1) if (tlds.isDomainSuffix(subCandidate)) { return candidate } candidate = subCandidate } return candidate } /** * Returns the domain name of the url. The domain name of a url is the * substring of the url's hostname, w/o subdomain names. As an example

* ` * getDomainName(conf, new http://lucene.apache.org/) ` *

* will return

* ` apache.org` * * @see com.google.common.net.InternetDomainName.topPrivateDomain * * @throws MalformedURLException */ @Throws(MalformedURLException::class) fun getDomainName(url: String): String? { return getDomainName(URL(url)) } fun getDomainName(url: String, defaultDomain: String): String { return kotlin.runCatching { getDomainName(url) }.getOrNull()?:defaultDomain } /** * Returns whether the given urls have the same domain name. As an example,

* ` isSameDomain(new URL("http://lucene.apache.org") * , new URL("http://people.apache.org/")) *

will return true. ` * * @return true if the domain names are equal */ fun isSameDomainName(url1: URL, url2: URL): Boolean { return getDomainName(url1).equals(getDomainName(url2), ignoreCase = true) } /** * Returns whether the given urls have the same domain name. As an example,

* ` isSameDomain("http://lucene.apache.org" * ,"http://people.apache.org/") *

will return true. ` * * @return true if the domain names are equal * @throws MalformedURLException */ @Throws(MalformedURLException::class) fun isSameDomainName(url1: String?, url2: String?): Boolean { return isSameDomainName(URL(url1), URL(url2)) } /** * Returns the [DomainSuffix] corresponding to the last public part of * the hostname * * @see com.google.common.net.InternetDomainName.publicSuffix */ fun getDomainSuffix(url: String): DomainSuffix? { val u = getURLOrNull(url) ?: return null return getDomainSuffix(u) } /** * Returns the [DomainSuffix] corresponding to the last public part of * the hostname * * @see com.google.common.net.InternetDomainName.publicSuffix */ fun getDomainSuffix(url: URL): DomainSuffix? { return getDomainSuffix(DomainSuffixes.getInstance(), url) } fun getDomainSuffix(tlds: DomainSuffixes, url: URL): DomainSuffix? { // DomainSuffixes tlds = DomainSuffixes.getInstance(); val host = url.host if (IP_REGEX.matches(host)) { return null } var index = 0 var candidate = host while (index >= 0) { index = candidate.indexOf('.') val subCandidate = candidate.substring(index + 1) val d = tlds[subCandidate] if (d != null) { return d } candidate = subCandidate } return null } /** * Returns the [DomainSuffix] corresponding to the last public part of * the hostname */ @Throws(MalformedURLException::class) fun getDomainSuffix(tlds: DomainSuffixes, url: String?): DomainSuffix? { return getDomainSuffix(tlds, URL(url)) } /** Partitions of the hostname of the url by "." */ fun getHostBatches(url: URL): List { val host = url.host // return whole hostname, if it is an ipv4 // TODO : handle ipv6 return if (IP_REGEX.matches(host)) listOf(host) else host.split(".") } /** * Partitions of the hostname of the url by "." * * @throws MalformedURLException */ @Throws(MalformedURLException::class) fun getHostBatches(url: String): List { return getHostBatches(URL(url)) } /** * Given two urls, a src and a destination of a redirect, it returns the * representative url. * * This method implements an extended version of the algorithm used by the * Yahoo! Slurp crawler described here:

* [ How * does the Yahoo! webcrawler handle redirects?](http://help.yahoo.com/l/nz/yahooxtra/search/webcrawler/slurp-11.html)

*

* * 1. Choose target url if either url is malformed. * 1. If different domains the keep the destination whether or not the * redirect is temp or perm * * * a.com -> b.com* * * 1. If the redirect is permanent and the source is root, keep the source. * * * *a.com -> a.com?y=1 || *a.com -> a.com/xyz/index.html * * 1. If the redirect is permanent and the source is not root and the * destination is root, keep the destination * * * a.com/xyz/index.html -> a.com* * * 1. If the redirect is permanent and neither the source nor the destination * is root, then keep the destination * * * a.com/xyz/index.html -> a.com/abc/page.html* * * 1. If the redirect is temporary and source is root and destination is not * root, then keep the source * * * *a.com -> a.com/xyz/index.html * * 1. If the redirect is temporary and source is not root and destination is * root, then keep the destination * * * a.com/xyz/index.html -> a.com* * * 1. If the redirect is temporary and neither the source or the destination * is root, then keep the shortest url. First check for the shortest host, and * if both are equal then check by path. Path is first by length then by the * number of / path separators. * * * a.com/xyz/index.html -> a.com/abc/page.html* * * *www.a.com/xyz/index.html -> www.news.a.com/xyz/index.html * * 1. If the redirect is temporary and both the source and the destination * are root, then keep the shortest sub-domain * * * *www.a.com -> www.news.a.com * *

* While not in this logic there is a further piece of representative url * logic that occurs during indexing and after scoring. During creation of the * basic fields before indexing, if a url has a representative url stored we * check both the url and its representative url (which should never be the * same) against their linkrank scores and the highest scoring one is kept as * the url and the lower scoring one is held as the orig url inside of the * index. * * @param src * The source url. * @param dst * The destination url. * @param temp * Is the redirect a temporary redirect. * * @return String The representative url. */ @JvmStatic fun chooseRepr(src: String, dst: String, temp: Boolean): String { // validate both are well formed urls val srcUrl: URL val dstUrl: URL try { srcUrl = URL(src) dstUrl = URL(dst) } catch (e: MalformedURLException) { return dst } // get the source and destination domain, host, and page val srcDomain = getDomainName(srcUrl) val dstDomain = getDomainName(dstUrl) val srcHost = srcUrl.host val dstHost = dstUrl.host val srcFile = srcUrl.file val dstFile = dstUrl.file // are the source and destination the root path url.com/ or url.com val srcRoot = srcFile == "/" || srcFile.length == 0 val destRoot = dstFile == "/" || dstFile.length == 0 // 1) different domain them keep dest, temp or perm // a.com -> b.com* // // 2) permanent and root, keep src // *a.com -> a.com?y=1 || *a.com -> a.com/xyz/index.html // // 3) permanent and not root and dest root, keep dest // a.com/xyz/index.html -> a.com* // // 4) permanent and neither root keep dest // a.com/xyz/index.html -> a.com/abc/page.html* // // 5) temp and root and dest not root keep src // *a.com -> a.com/xyz/index.html // // 7) temp and not root and dest root keep dest // a.com/xyz/index.html -> a.com* // // 8) temp and neither root, keep shortest, if hosts equal by path else by // hosts. paths are first by length then by number of / separators // a.com/xyz/index.html -> a.com/abc/page.html* // *www.a.com/xyz/index.html -> www.news.a.com/xyz/index.html // // 9) temp and both root keep shortest sub domain // *www.a.com -> www.news.a.com // if we are dealing with a redirect from one domain to another keep the // destination if (srcDomain != dstDomain) { return dst } // if it is a permanent redirect return if (!temp) { // if source is root return source, otherwise destination if (srcRoot) { src } else { dst } } else { // temporary redirect // source root and destination not root if (srcRoot && !destRoot) { src } else if (!srcRoot && destRoot) { // destination root and source not dst } else if (!srcRoot && !destRoot && srcHost == dstHost) { // source and destination hosts are the same, check paths, host length val numSrcPaths = srcFile.split("/").toTypedArray().size val numDstPaths = dstFile.split("/").toTypedArray().size if (numSrcPaths != numDstPaths) { if (numDstPaths < numSrcPaths) dst else src } else { val srcPathLength = srcFile.length val dstPathLength = dstFile.length if (dstPathLength < srcPathLength) dst else src } } else { // different host names and both root take the shortest val numSrcSubs = srcHost.split("\\.").toTypedArray().size val numDstSubs = dstHost.split("\\.").toTypedArray().size if (numDstSubs < numSrcSubs) dst else src } } } /** * Returns the lowercase origin for the url. * * @param url The url to check. * @return String The hostname for the url. */ @Throws(MalformedURLException::class) fun getOrigin(url: String): String { val u = URL(url) return u.protocol + "://" + u.host } /** * Returns the lowercase origin for the url or null if the url is not well-formed. * * @param url The url to check. * @return String The hostname for the url. */ fun getOriginOrNull(url: String?): String? { if (url == null) { return null } return try { val u = URL(url) u.protocol + "://" + u.host } catch (t: Throwable) { null } } /** * Returns the lowercase hostname for the url. * * @param url The url to check. * @return String The hostname for the url. */ @Throws(MalformedURLException::class) fun getHostName(url: String) = URL(url).host.lowercase(Locale.getDefault()) /** * Returns the lowercase hostname for the url or null if the url is not well-formed. * * @param url The url to check. * @return String The hostname for the url. */ fun getHostNameOrNull(url: String?): String? { if (url == null) { return null } return try { URL(url).host.lowercase(Locale.getDefault()) } catch (e: MalformedURLException) { null } } fun getHostName(url: String?, defaultValue: String): String { return try { URL(url).host.lowercase(Locale.getDefault()) } catch (e: MalformedURLException) { defaultValue } } /** * Returns the path for the url. The path consists of the protocol, host, and * path, but does not include the query string. The host is lowercased but the * path is not. * * @param url * The url to check. * @return String The path for the url. */ fun getQuery(url: String): String? { var url = url return try { // get the full url, and replace the query string with and empty string url = url.lowercase(Locale.getDefault()) val queryStr = URL(url).query if (queryStr != null) url.replace("?$queryStr", "") else url } catch (e: MalformedURLException) { null } } @JvmStatic fun toASCII(url: String?): String? { return try { val u = URL(url) val host = u.host if (host == null || host.isEmpty()) { // no host name => no punycoded domain name // also do not add additional slashes for file: URLs (PULSAR-1880) return url } val p = URI(u.protocol, u.userInfo, IDN.toASCII(host), u.port, u.path, u.query, u.ref) p.toString() } catch (e: Exception) { null } } @JvmStatic fun toUNICODE(url: String?): String? { return try { val u = URL(url) val host = u.host if (host == null || host.isEmpty()) { // no host name => no punycoded domain name // also do not add additional slashes for file: URLs (PULSAR-1880) return url } val sb = StringBuilder() sb.append(u.protocol) sb.append("://") if (u.userInfo != null) { sb.append(u.userInfo) sb.append('@') } sb.append(IDN.toUnicode(host)) if (u.port != -1) { sb.append(':') sb.append(u.port) } sb.append(u.file) // includes query if (u.ref != null) { sb.append('#') sb.append(u.ref) } sb.toString() } catch (e: Exception) { null } } /** * @see URLUtil */ enum class GroupMode { BY_IP, BY_DOMAIN, BY_HOST } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy