All Downloads are FREE. Search and download functionalities are using the official Maven repository.

ai.platon.pulsar.skeleton.crawl.schedule.AdaptiveFetchSchedule.kt Maven / Gradle / Ivy

There is a newer version: 2.1.0
Show newest version

package ai.platon.pulsar.skeleton.crawl.schedule

import ai.platon.pulsar.common.config.AppConstants
import ai.platon.pulsar.common.config.CapabilityTypes
import ai.platon.pulsar.common.config.ImmutableConfig
import ai.platon.pulsar.common.config.Params
import ai.platon.pulsar.skeleton.common.message.MiscMessageWriter
import ai.platon.pulsar.persist.WebPage
import java.time.Duration
import java.time.Instant
import java.time.temporal.ChronoUnit

/**
 * A fetch schedule that adapts the re-fetch interval of a page to how often the page changes.
 *
 * The algorithm works as follows:
 *
 *  * for pages that have changed since the last fetch, the fetch interval is decreased
 *    by a factor of [DEC_RATE] (default value is 0.2f);
 *  * for pages that have not changed since the last fetch, the fetch interval is increased
 *    by a factor of [INC_RATE] (default value is 0.2f).
 *
 * If [SYNC_DELTA] is true, then:
 *
 *  * `delta = fetchTime - modifiedTime` is calculated;
 *  * if `delta` is bigger than the adjusted fetch interval, then `fetchInterval = delta`.
 *
 * Finally:
 *
 *  * the fetch interval is never smaller than [MIN_INTERVAL] (default is 10 minutes);
 *  * the fetch interval is never bigger than [MAX_INTERVAL] (default is 365 days).
 *
 * NOTE: values of [DEC_RATE] and [INC_RATE] higher than 0.4f may destabilize
 * the algorithm, so that the fetch interval either increases or decreases
 * infinitely, with little relevance to the page changes. Please test the values
 * before applying them in a production system.
 *
 * NOTE: the original algorithm also shifts the next fetch time by
 * `delta * SYNC_DELTA_RATE` to synchronize with the time of change. This class only
 * computes the interval; [SYNC_DELTA_RATE] is read from the configuration but no shift
 * is applied in [getFetchInterval].
 *
 * @author Andrzej Bialecki
 */
open class AdaptiveFetchSchedule(
    conf: ImmutableConfig,
    messageWriter: MiscMessageWriter? = null,
) : AbstractFetchSchedule(conf, messageWriter) {
    /** Fraction by which the interval grows when a page was not modified. */
    protected var INC_RATE = conf.getFloat(CapabilityTypes.SCHEDULE_INC_RATE, 0.2f)

    /** Fraction by which the interval shrinks when a page was modified. */
    protected var DEC_RATE = conf.getFloat(CapabilityTypes.SCHEDULE_DEC_RATE, 0.2f)

    /** Lower bound applied to every interval returned by [getFetchInterval]. */
    protected var MIN_INTERVAL = conf.getDuration(CapabilityTypes.SCHEDULE_MIN_INTERVAL, Duration.ofMinutes(10))

    /** Upper bound applied to every interval returned by [getFetchInterval]. */
    protected var MAX_INTERVAL = conf.getDuration(CapabilityTypes.SCHEDULE_MAX_INTERVAL, Duration.ofDays(365))

    /** Configured maximum interval for seeds; only reported through [getParams] in this class. */
    protected var SEED_MAX_INTERVAL = conf.getDuration(CapabilityTypes.SCHEDULE_SEED_MAX_INTERVAL, Duration.ofDays(1))

    /** Whether the interval is stretched to the gap between fetch time and modification time. */
    protected var SYNC_DELTA = conf.getBoolean(CapabilityTypes.SCHEDULE_SYNC_DELTA, true)

    /** Configured synchronization rate; not used by the interval computation of this class. */
    protected var SYNC_DELTA_RATE = conf.getFloat(CapabilityTypes.SCHEDULE_SYNC_DELTA_RATE, 0.2f).toDouble()

    /** Configured maximum crawl distance; not used by the methods of this class. */
    protected var maxDistance = conf.getInt(CapabilityTypes.CRAWL_MAX_DISTANCE, AppConstants.DISTANCE_INFINITE)

    /**
     * Reports the interval bounds of this schedule, merged with the parameters of the parent schedule.
     */
    override fun getParams(): Params {
        return Params.of(
            "className", javaClass.simpleName,
            "MIN_INTERVAL", MIN_INTERVAL,
            "MAX_INTERVAL", MAX_INTERVAL,
            "SEED_MAX_INTERVAL", SEED_MAX_INTERVAL
        ).merge(super.getParams())
    }

    /**
     * Applies the parent schedule, then computes the adaptive interval and updates the re-fetch time.
     *
     * A modification time earlier than [AppConstants.TCP_IP_STANDARDIZED_TIME] is treated as
     * unknown and replaced by the fetch time, so that it does not produce an enormous
     * `fetchTime - modifiedTime` gap in [getFetchInterval].
     *
     * @param page the page being scheduled
     * @param m the fetch/modification information of the current fetch; its `modifiedTime`
     * is overwritten with `fetchTime` when it is unknown
     */
    override fun setFetchSchedule(page: WebPage, m: ModifyInfo) {
        // Captured before delegating, exactly as before, in case the parent touches `m`.
        val reportedModifiedTime = m.modifiedTime

        super.setFetchSchedule(page, m)

        if (m.modifiedTime < AppConstants.TCP_IP_STANDARDIZED_TIME) {
            m.modifiedTime = m.fetchTime
        }

        // Previously the un-normalized value was always passed on, which made the guard above
        // ineffective for the interval computation. Fall back to the normalized value whenever
        // the reported one is unknown.
        val effectiveModifiedTime = if (reportedModifiedTime < AppConstants.TCP_IP_STANDARDIZED_TIME) {
            m.modifiedTime
        } else {
            reportedModifiedTime
        }

        val newInterval = getFetchInterval(page, m.fetchTime, effectiveModifiedTime, m.modified)
        updateRefetchTime(page, newInterval, m)
    }

    /**
     * Specifies how to schedule re-fetching of pages marked as GONE.
     *
     * The fetch interval is increased by 50%; if the increased value would reach
     * `maxFetchInterval`, 90% of `maxFetchInterval` is used instead, so the result
     * never exceeds `maxFetchInterval`. The page's previous fetch time is set to the
     * current time and its next fetch time to the current time plus the new interval.
     *
     * @param page the page to reschedule; it is updated in place
     * @param prevFetchTime not used by this implementation
     * @param prevModifiedTime not used by this implementation
     * @param fetchTime not used by this implementation
     */
    override fun setPageGoneSchedule(
        page: WebPage,
        prevFetchTime: Instant,
        prevModifiedTime: Instant,
        fetchTime: Instant,
    ) {
        val prevInterval = page.fetchInterval.seconds.toFloat()
        val maxSeconds = maxFetchInterval.seconds

        // No page is truly GONE ... just increase the interval by 50% and try much later.
        // The bound is checked on the increased value, otherwise an interval slightly
        // below the maximum would be pushed beyond it.
        val increased = prevInterval * 1.5f
        val newInterval = if (increased < maxSeconds) increased else maxSeconds * 0.9f

        val now = Instant.now()
        page.setFetchInterval(newInterval)
        page.prevFetchTime = now
        page.fetchTime = now.plusSeconds(newInterval.toLong())
    }

    /**
     * Computes the next fetch interval of a page.
     *
     * @param page the page whose current fetch interval is the starting point
     * @param fetchTime_ the time of the current fetch
     * @param modifiedTime the time the page content was last modified
     * @param state one of [FetchSchedule.STATUS_MODIFIED], [FetchSchedule.STATUS_NOTMODIFIED]
     * or [FetchSchedule.STATUS_UNKNOWN]; any other value leaves the interval unscaled
     * @return the adjusted interval, clamped to `[MIN_INTERVAL, MAX_INTERVAL]`
     */
    protected fun getFetchInterval(page: WebPage, fetchTime_: Instant, modifiedTime: Instant, state: Int): Duration {
        var interval = page.fetchInterval.seconds

        // Scale in floating point and convert the product back to seconds. Converting the
        // factor itself to Long truncates 0.8 to 0 and 1.2 to 1, which disables the adaptation.
        when (state) {
            FetchSchedule.STATUS_MODIFIED -> interval = Math.round(interval * (1.0 - DEC_RATE))
            FetchSchedule.STATUS_NOTMODIFIED -> interval = Math.round(interval * (1.0 + INC_RATE))
            FetchSchedule.STATUS_UNKNOWN -> {
            }
        }

        if (SYNC_DELTA) {
            // Never re-fetch sooner than the observed distance between change and fetch.
            val gap = fetchTime_.epochSecond - modifiedTime.epochSecond
            if (gap > interval) {
                interval = gap
            }
            // TODO: the fetch time shift by `gap * SYNC_DELTA_RATE` is not applied here;
            // the previously computed shifted value was never used.
        }

        var newInterval = Duration.ofSeconds(interval)
        if (newInterval < MIN_INTERVAL) newInterval = MIN_INTERVAL
        if (newInterval > MAX_INTERVAL) newInterval = MAX_INTERVAL

        return newInterval
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy