All downloads are free. Search and download functionality uses the official Maven repository.

ai.platon.pulsar.skeleton.crawl.protocol.http.ProtocolStatusTranslator.kt Maven / Gradle / Ivy

There is a newer version: 2.1.0
Show newest version
package ai.platon.pulsar.skeleton.crawl.protocol.http

import ai.platon.pulsar.common.config.AppConstants
import ai.platon.pulsar.skeleton.crawl.common.URLUtil
import ai.platon.pulsar.persist.CrawlStatus
import ai.platon.pulsar.persist.ProtocolStatus
import ai.platon.pulsar.persist.ProtocolStatus.ARG_HTTP_CODE
import ai.platon.pulsar.persist.WebPage
import ai.platon.pulsar.persist.metadata.ProtocolStatusCodes
import ai.platon.pulsar.persist.metadata.ProtocolStatusCodes.REQUEST_TIMEOUT
import org.apache.http.HttpStatus

/**
 * Stateless translator between HTTP response codes, [ProtocolStatus] values and [CrawlStatus] values.
 */
object ProtocolStatusTranslator {

    /**
     * Converts a raw HTTP response code into a [ProtocolStatus].
     *
     * - `200` yields [ProtocolStatus.STATUS_SUCCESS] and `304` yields [ProtocolStatus.STATUS_NOTMODIFIED].
     * - Any other code in `300..399` yields a failed status whose minor code is
     *   [ProtocolStatus.MOVED_TEMPORARILY] for `SC_MOVED_TEMPORARILY`, `SC_SEE_OTHER` and
     *   `SC_TEMPORARY_REDIRECT`, and [ProtocolStatus.MOVED_PERMANENTLY] for every other redirect code.
     * - `SC_BAD_REQUEST` and `SC_GONE` are both reported as `GONE`.
     * - `SC_UNAUTHORIZED`, `SC_NOT_FOUND` and `SC_REQUEST_TIMEOUT` map to their same-named minor codes.
     * - Everything else is reported as [ProtocolStatus.EXCEPTION].
     *
     * Every failed status carries the original code under [ARG_HTTP_CODE].
     *
     * @param httpCode the HTTP response code to translate
     * @return the matching protocol status
     */
    fun translateHttpCode(httpCode: Int): ProtocolStatus = when (httpCode) {
        200 -> ProtocolStatus.STATUS_SUCCESS
        304 -> ProtocolStatus.STATUS_NOTMODIFIED

        in 300..399 -> {
            // Only these three codes count as temporary; all remaining 3xx codes are treated as permanent.
            val redirectKind = when (httpCode) {
                HttpStatus.SC_MOVED_TEMPORARILY,
                HttpStatus.SC_SEE_OTHER,
                HttpStatus.SC_TEMPORARY_REDIRECT -> ProtocolStatus.MOVED_TEMPORARILY

                else -> ProtocolStatus.MOVED_PERMANENTLY
            }
            // The redirect target is not attached here; only the HTTP code is recorded.
            ProtocolStatus.failed(redirectKind, ARG_HTTP_CODE, httpCode)
        }

        // A bad request is reported the same way as a permanently gone resource.
        HttpStatus.SC_BAD_REQUEST,
        HttpStatus.SC_GONE -> ProtocolStatus.failed(ProtocolStatusCodes.GONE, ARG_HTTP_CODE, httpCode)

        // Authorization is required, but no valid credentials were provided.
        HttpStatus.SC_UNAUTHORIZED ->
            ProtocolStatus.failed(ProtocolStatusCodes.UNAUTHORIZED, ARG_HTTP_CODE, httpCode)

        HttpStatus.SC_NOT_FOUND ->
            ProtocolStatus.failed(ProtocolStatusCodes.NOT_FOUND, ARG_HTTP_CODE, httpCode)

        HttpStatus.SC_REQUEST_TIMEOUT ->
            ProtocolStatus.failed(REQUEST_TIMEOUT, ARG_HTTP_CODE, httpCode)

        else -> ProtocolStatus.failed(ProtocolStatus.EXCEPTION, ARG_HTTP_CODE, httpCode)
    }

    /**
     * Derives the [CrawlStatus] for [page] from the minor code of [protocolStatus].
     *
     * For the two "moved" minor codes, [page]'s `reprUrl` may be updated as a side effect
     * (see [handleMoved]). Any minor code without an explicit mapping results in
     * [CrawlStatus.STATUS_RETRY].
     *
     * @param protocolStatus the protocol status to translate
     * @param page the page the status belongs to; may be modified for redirects
     * @return the resulting crawl status
     */
    fun translateToCrawlStatus(protocolStatus: ProtocolStatus, page: WebPage): CrawlStatus =
        when (protocolStatus.minorCode) {
            ProtocolStatus.SUCCESS_OK -> CrawlStatus.STATUS_FETCHED
            ProtocolStatus.NOT_MODIFIED -> CrawlStatus.STATUS_NOTMODIFIED
            ProtocolStatus.CANCELED -> CrawlStatus.STATUS_UNFETCHED

            // Redirects
            ProtocolStatus.MOVED_PERMANENTLY,
            ProtocolStatus.MOVED_TEMPORARILY -> handleMoved(page, protocolStatus)

            // Failures reported as gone
            ProtocolStatus.UNAUTHORIZED,
            ProtocolStatus.ROBOTS_DENIED,
            ProtocolStatus.UNKNOWN_HOST,
            ProtocolStatus.GONE,
            ProtocolStatus.NOT_FOUND -> CrawlStatus.STATUS_GONE

            // Generic failures and all kinds of timeouts are retried
            ProtocolStatus.EXCEPTION,
            ProtocolStatus.RETRY,
            ProtocolStatus.BLOCKED,
            ProtocolStatus.REQUEST_TIMEOUT,
            ProtocolStatus.THREAD_TIMEOUT,
            ProtocolStatus.WEB_DRIVER_TIMEOUT,
            ProtocolStatus.SCRIPT_TIMEOUT -> CrawlStatus.STATUS_RETRY

            // Unmapped minor codes are retried as well
            else -> CrawlStatus.STATUS_RETRY
        }

    /**
     * Handles a "moved" protocol status.
     *
     * Returns [CrawlStatus.STATUS_REDIR_PERM] when the minor code is [ProtocolStatus.MOVED_PERMANENTLY]
     * and [CrawlStatus.STATUS_REDIR_TEMP] otherwise. If the status carries a non-empty
     * [ProtocolStatus.ARG_REDIRECT_TO_URL] argument, a representative URL is chosen via
     * [URLUtil.chooseRepr] and stored in [page]'s `reprUrl`, provided it is at least
     * [AppConstants.SHORTEST_VALID_URL_LENGTH] characters long.
     *
     * @param page the page that was redirected; its `reprUrl` may be updated
     * @param protocolStatus the protocol status describing the redirect
     * @return the crawl status for the redirect
     */
    private fun handleMoved(page: WebPage, protocolStatus: ProtocolStatus): CrawlStatus {
        val sourceUrl = page.url
        val isTemporary = protocolStatus.minorCode != ProtocolStatus.MOVED_PERMANENTLY
        val redirectStatus = if (isTemporary) CrawlStatus.STATUS_REDIR_TEMP else CrawlStatus.STATUS_REDIR_PERM

        val targetUrl = protocolStatus.getArgOrElse(ProtocolStatus.ARG_REDIRECT_TO_URL, "")
        if (targetUrl.isEmpty()) {
            return redirectStatus
        }

        val representative = URLUtil.chooseRepr(sourceUrl, targetUrl, isTemporary)
        if (representative.length >= AppConstants.SHORTEST_VALID_URL_LENGTH) {
            page.reprUrl = representative
        }
        return redirectStatus
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy