All Downloads are FREE. Search and download functionalities are using the official Maven repository.

ai.platon.pulsar.skeleton.crawl.schedule.FetchSchedule.kt Maven / Gradle / Ivy

The newest version!
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package ai.platon.pulsar.skeleton.crawl.schedule

import ai.platon.pulsar.common.config.Parameterized
import ai.platon.pulsar.persist.WebPage
import java.time.Duration
import java.time.Instant

/**
 * Mutable holder for the timestamps and the modification flag describing one fetch of a page.
 *
 * @property fetchTime the actual latest fetch time; `WebPage.fetchTime` will be updated by this value
 * @property prevFetchTime the previous actual latest fetch time
 * @property prevModifiedTime NOTE(review): undocumented in the original; presumably the modified
 * time recorded before this fetch — confirm against callers
 * @property modifiedTime NOTE(review): undocumented in the original; presumably the modified
 * time observed by this fetch — confirm against callers
 * @property modified NOTE(review): undocumented in the original; presumably one of the
 * `FetchSchedule.STATUS_*` constants — confirm against callers
 */
data class ModifyInfo(
    var fetchTime: Instant,
    var prevFetchTime: Instant,
    var prevModifiedTime: Instant,
    var modifiedTime: Instant,
    var modified: Int,
)

/**
 * Contract for implementations that manipulate fetch times and re-fetch intervals
 * of a [WebPage].
 *
 * All members are abstract here; wherever the documentation below mentions a
 * "default implementation", it describes behavior provided outside this interface.
 */
interface FetchSchedule : Parameterized {

    /**
     * The maximum fetch interval.
     *
     * NOTE(review): presumably the upper bound that the method documentation below
     * calls `maxInterval` — confirm against the implementations.
     */
    val maxFetchInterval: Duration

    /**
     * Initializes fetch-schedule related data. Implementations should at least set
     * the `fetchTime` and `fetchInterval`. The default implementation sets the
     * `fetchTime` to now, using the default `fetchInterval`.
     *
     * @param page the web page to initialize
     */
    fun initializeSchedule(page: WebPage)

    /**
     * Sets the `fetchInterval` and `fetchTime` on a successfully fetched page.
     * Implementations may use the supplied values to support different
     * re-fetching schedules.
     *
     * @param page the web page
     * @param m the fetch and modification details of this fetch:
     * - [ModifyInfo.prevFetchTime]: the previous fetch time, i.e. the latest actual
     *   fetch time before this one.
     * - [ModifyInfo.prevModifiedTime]: the previous value of the modified time. The
     *   original documentation says "0 if not available"; NOTE(review): for an
     *   [Instant] this is presumably the epoch — confirm.
     * - [ModifyInfo.fetchTime]: the current fetch time, which is almost now.
     * - [ModifyInfo.modifiedTime]: the last time the content was modified. This
     *   information comes from the protocol implementations. The original
     *   documentation says it is "set to < 0 if not available"; NOTE(review):
     *   confirm how that is represented with [Instant].
     * - [ModifyInfo.modified]: NOTE(review): presumably the modification state
     *   described next (the original documentation named it `state`) — confirm.
     *   If [STATUS_MODIFIED], the content is considered to be "changed" before the
     *   `fetchTime`; if [STATUS_NOTMODIFIED], the content is known to be unchanged.
     *   This information may be obtained by comparing page signatures before and
     *   after fetching. If [STATUS_UNKNOWN], it is unknown whether the page was
     *   changed; implementations are free to follow a sensible default behavior.
     */
    fun setFetchSchedule(page: WebPage, m: ModifyInfo)

    /**
     * Specifies how to schedule re-fetching of pages marked as GONE.
     * The default implementation increases `fetchInterval` by 50%, and if it
     * exceeds the `maxInterval` it calls [forceRefetch].
     *
     * @param page the page
     * @param prevFetchTime previous fetch time
     * @param prevModifiedTime previous modified time
     * @param fetchTime current fetch time
     */
    fun setPageGoneSchedule(
        page: WebPage,
        prevFetchTime: Instant, prevModifiedTime: Instant, fetchTime: Instant,
    )

    /**
     * Adjusts the fetch schedule if fetching needs to be re-tried due to transient
     * errors. The default implementation sets the next fetch time 1 day in the
     * future and increases the retry counter.
     *
     * @param page the page
     * @param prevFetchTime previous fetch time
     * @param prevModifiedTime previous modified time
     * @param fetchTime current fetch time
     */
    fun setPageRetrySchedule(
        page: WebPage,
        prevFetchTime: Instant, prevModifiedTime: Instant, fetchTime: Instant,
    )

    /**
     * Calculates the last fetch time of the given page.
     *
     * @param page the web page
     * @return the estimated previous fetch time
     */
    fun estimatePrevFetchTime(page: WebPage): Instant

    /**
     * Tells whether the page is suitable for selection in the current fetch list.
     *
     * NOTE: a `true` return value does not guarantee that the page will be fetched;
     * it just allows the page to be included in the further selection process based
     * on scores.
     *
     * The default implementation checks `fetchTime`: if it is later than [now] it
     * returns `false`, and `true` otherwise. It will also check that `fetchTime` is
     * not too remote (more than `maxInterval`), in which case it lowers the interval
     * and returns `true`.
     *
     * @param page the web page
     * @param now reference time (usually set to the time when the fetch list
     * generation process was started)
     * @return `true` if the page should be considered for inclusion in the current
     * fetch list, otherwise `false`
     */
    fun shouldFetch(page: WebPage, now: Instant): Boolean

    /**
     * Resets `fetchTime`, `fetchInterval`, `modifiedTime` and the page text, so that
     * it forces re-fetching.
     *
     * @param page the web page
     * @param prevFetchTime NOTE(review): undocumented in the original; presumably
     * the previous fetch time, as in the sibling methods — confirm
     * @param asap if `true`, force re-fetch as soon as possible - this sets the
     * `fetchTime` to now. If `false`, force re-fetch whenever the next fetch time
     * is set.
     */
    fun forceRefetch(page: WebPage, prevFetchTime: Instant, asap: Boolean)

    // Kept last in the body, as the Kotlin coding conventions recommend.
    companion object {
        /**
         * It is unknown whether the page was changed since our last visit.
         */
        const val STATUS_UNKNOWN = 0

        /**
         * The page is known to have been modified since our last visit.
         */
        const val STATUS_MODIFIED = 1

        /**
         * The page is known to remain unmodified since our last visit.
         */
        const val STATUS_NOTMODIFIED = 2
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy