All Downloads are FREE. Search and download functionalities are using the official Maven repository.

ai.platon.pulsar.skeleton.crawl.schedule.AbstractFetchSchedule.kt Maven / Gradle / Ivy

The newest version!
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package ai.platon.pulsar.skeleton.crawl.schedule

import ai.platon.pulsar.skeleton.common.message.MiscMessageWriter
import ai.platon.pulsar.common.config.CapabilityTypes
import ai.platon.pulsar.common.config.ImmutableConfig
import ai.platon.pulsar.common.config.Params
import ai.platon.pulsar.skeleton.common.persist.ext.options
import ai.platon.pulsar.skeleton.common.persist.ext.updateFetchTime
import ai.platon.pulsar.persist.CrawlStatus
import ai.platon.pulsar.persist.WebPage
import ai.platon.pulsar.persist.metadata.CrawlStatusCodes
import ai.platon.pulsar.persist.metadata.Mark
import java.time.Duration
import java.time.Instant
import java.time.temporal.ChronoUnit

/**
 * Base class with the scheduling behaviour shared by [FetchSchedule] implementations.
 *
 * The default fetch interval is read from [CapabilityTypes.FETCH_DEFAULT_INTERVAL]
 * (30 days when not configured) and the maximum fetch interval from
 * [CapabilityTypes.FETCH_MAX_INTERVAL] (one decade when not configured).
 *
 * @property conf the configuration the intervals are read from
 * @property messageWriter optional message writer; not used by this base class itself
 * @author Andrzej Bialecki
 */
abstract class AbstractFetchSchedule(
    val conf: ImmutableConfig,
    val messageWriter: MiscMessageWriter? = null,
) : FetchSchedule {
    /** Interval assigned to pages by [initializeSchedule]. */
    protected var defaultInterval = conf.getDuration(CapabilityTypes.FETCH_DEFAULT_INTERVAL, Duration.ofDays(30))

    /** The instant at which this schedule instance was constructed. */
    protected val impreciseNow = Instant.now()

    /** Upper bound applied to a page's fetch interval by [forceRefetch]. */
    override val maxFetchInterval: Duration =
        conf.getDuration(CapabilityTypes.FETCH_MAX_INTERVAL, ChronoUnit.DECADES.duration)

    /**
     * Reports the configured intervals.
     *
     * `maxInterval` and `maxFetchInterval` both carry the value of [maxFetchInterval].
     */
    override fun getParams(): Params {
        return Params.of(
            "defaultInterval", defaultInterval,
            "maxInterval", maxFetchInterval,
            "maxFetchInterval", maxFetchInterval,
        )
    }

    /**
     * Initializes the schedule related fields of a page: the fetch interval is set to
     * [defaultInterval], the retry counter is reset to 0 and the crawl status becomes
     * [CrawlStatus.STATUS_UNFETCHED].
     *
     * This implementation does not assign a fetch time.
     *
     * @param page the page to initialize
     */
    override fun initializeSchedule(page: WebPage) {
        page.fetchInterval = defaultInterval
        page.fetchRetries = 0
        page.crawlStatus = CrawlStatus.STATUS_UNFETCHED
    }

    /**
     * Updates the schedule of a page after a fetch.
     *
     * The retry counter is reset only when the page's protocol status is a success.
     * The fetch interval is taken from the page's load options (`expires`) and the
     * next fetch is scheduled one interval from now. The modification times are copied
     * from [m].
     *
     * Extending classes should call `super.setFetchSchedule()` to preserve this behavior.
     *
     * @param page the page that was fetched
     * @param m the modification info
     */
    override fun setFetchSchedule(page: WebPage, m: ModifyInfo) {
        if (page.protocolStatus.isSuccess) {
            page.fetchRetries = 0
        }

        page.fetchInterval = page.options.expires

        // note: page.fetchTime might not be the same as the actual fetch time.
        // The interval is read back from the page, so whatever the page stored is what gets scheduled.
        scheduleFromNow(page, page.fetchInterval)

        page.modifiedTime = m.modifiedTime
        page.prevModifiedTime = m.prevModifiedTime
    }

    /**
     * Adjusts the fetch schedule when a fetch has to be retried.
     *
     * The retry counter is incremented, the fetch interval is set to zero and the next
     * fetch is scheduled for now, i.e. the page is retried immediately. The crawl status
     * becomes `UNFETCHED` while the retry counter does not exceed `page.maxRetries`,
     * and `GONE` afterwards.
     *
     * @param page             WebPage to retry
     * @param prevFetchTime    previous fetch time (not used by this implementation)
     * @param prevModifiedTime previous modified time (not used by this implementation)
     * @param fetchTime        current fetch time (not used by this implementation)
     */
    override fun setPageRetrySchedule(
        page: WebPage,
        prevFetchTime: Instant,
        prevModifiedTime: Instant,
        fetchTime: Instant,
    ) {
        page.fetchRetries++

        // retry immediately, this is the default behaviour
        page.fetchInterval = Duration.ZERO
        scheduleFromNow(page, Duration.ZERO)

        val crawlStatus = if (page.fetchRetries <= page.maxRetries) CrawlStatusCodes.UNFETCHED else CrawlStatusCodes.GONE
        page.setCrawlStatus(crawlStatus.toInt())
    }

    /**
     * Schedules refetching of a page that is gone.
     *
     * The fetch interval is set to one decade and the next fetch is scheduled one
     * interval from now. The crawl status is not modified here.
     *
     * @param page             the page that is gone
     * @param prevFetchTime    previous fetch time (not used by this implementation)
     * @param prevModifiedTime previous modified time (not used by this implementation)
     * @param fetchTime        current fetch time (not used by this implementation)
     */
    override fun setPageGoneSchedule(
        page: WebPage, prevFetchTime: Instant, prevModifiedTime: Instant, fetchTime: Instant,
    ) {
        page.fetchInterval = ChronoUnit.DECADES.duration
        scheduleFromNow(page, page.fetchInterval)
    }

    /**
     * Estimates when the page was fetched the last time.
     *
     * @param page the page to inspect
     * @return the page's `fetchTime` minus its `fetchInterval`
     */
    override fun estimatePrevFetchTime(page: WebPage): Instant {
        return page.fetchTime - page.fetchInterval
    }

    /**
     * Tells whether the page is suitable for selection in the current fetch list.
     * NOTE: a true return value does not guarantee that the page will be fetched,
     * it just allows it to be included in the further selection process.
     *
     * Pages marked [Mark.INACTIVE] are never selected. Otherwise a page is selected
     * when its load options report it as expired at [now], or when its `fetchTime`
     * lies before [now].
     *
     * @param page Web page to fetch
     * @param now  reference time (usually set to the time when the fetch list
     * generation process was started).
     * @return true, if the page should be considered for inclusion in the current
     * fetch list, otherwise false.
     */
    override fun shouldFetch(page: WebPage, now: Instant): Boolean {
        if (page.hasMark(Mark.INACTIVE)) {
            return false
        }

        if (page.options.isExpired(now)) {
            return true
        }

        // NOTE: this method does not pull a far-future fetchTime back into range and
        // does not modify the page; an over-long interval is only clamped by forceRefetch().
        return page.fetchTime < now
    }

    /**
     * Resets the crawl status, the retry counter and the fetch time of a page so that
     * it gets fetched again. Pages marked [Mark.INACTIVE] are left untouched.
     *
     * A fetch interval larger than [maxFetchInterval] is reduced to 90% of
     * [maxFetchInterval] first.
     *
     * @param page the page to refetch
     * @param prevFetchTime previous fetch time (not used by this implementation)
     * @param asap if true, the next fetch is scheduled for now; if false, it is scheduled
     * one fetch interval from now.
     */
    override fun forceRefetch(page: WebPage, prevFetchTime: Instant, asap: Boolean) {
        if (page.hasMark(Mark.INACTIVE)) {
            return
        }

        // Reduce fetchInterval so that it fits within the max value. Exact Duration
        // arithmetic is used on purpose: a 32-bit float cannot represent a decade-sized
        // number of seconds exactly. Dividing first keeps the intermediate value small.
        if (page.fetchInterval > maxFetchInterval) {
            page.fetchInterval = maxFetchInterval.dividedBy(10).multipliedBy(9)
        }
        page.crawlStatus = CrawlStatus.STATUS_UNFETCHED
        page.fetchRetries = 0

        val delay = if (asap) Duration.ZERO else page.fetchInterval
        scheduleFromNow(page, delay)
    }

    /**
     * Assigns [fetchInterval] to the page, schedules the next fetch one interval from
     * now and copies the modification times from [m].
     *
     * @param page the page to update
     * @param fetchInterval the new fetch interval
     * @param m the modification info
     */
    protected fun updateRefetchTime(page: WebPage, fetchInterval: Duration, m: ModifyInfo) {
        page.fetchInterval = fetchInterval
        scheduleFromNow(page, page.fetchInterval)
        page.prevModifiedTime = m.prevModifiedTime
        page.modifiedTime = m.modifiedTime
    }

    /** Calls `updateFetchTime` on [page] with the current instant and the current instant plus [delay]. */
    private fun scheduleFromNow(page: WebPage, delay: Duration) {
        val now = Instant.now()
        page.updateFetchTime(now, now + delay)
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy