ai.platon.pulsar.schedule.NewsMonitorFetchSchedule.kt Maven / Gradle / Ivy
The newest version!
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ai.platon.pulsar.schedule
import ai.platon.pulsar.common.DateTimes.HOURS_PER_DAY
import ai.platon.pulsar.common.DateTimes.HOURS_PER_MONTH
import ai.platon.pulsar.common.DateTimes.HOURS_PER_YEAR
import ai.platon.pulsar.common.config.AppConstants.TCP_IP_STANDARDIZED_TIME
import ai.platon.pulsar.common.config.AppConstants.YES_STRING
import ai.platon.pulsar.common.config.ImmutableConfig
import ai.platon.pulsar.skeleton.common.message.MiscMessageWriter
import ai.platon.pulsar.skeleton.crawl.filter.CrawlFilter
import ai.platon.pulsar.skeleton.crawl.schedule.AdaptiveFetchSchedule
import ai.platon.pulsar.skeleton.crawl.schedule.ModifyInfo
import ai.platon.pulsar.persist.PageCounters
import ai.platon.pulsar.persist.WebPage
import ai.platon.pulsar.persist.metadata.Mark
import java.time.Duration
import java.time.Instant
import java.time.LocalDateTime
import java.time.temporal.ChronoUnit
/**
 * A fetch schedule for news monitoring, built on top of [AdaptiveFetchSchedule].
 *
 * The re-fetch interval is chosen by the role of the page:
 * - seed pages and index-like pages within `maxDistance` are re-checked frequently,
 *   see [adjustSeedFetchInterval];
 * - detail-like pages are marked [Mark.INACTIVE] and scheduled ten years ahead;
 * - pages modified today are delegated to the parent schedule;
 * - all other pages are marked [Mark.SEMI_INACTIVE] and re-checked around 1 o'clock at night,
 *   see [adjustSemiInactivePageFetchInterval].
 *
 * NOTE: values of DEC_FACTOR and INC_FACTOR higher than 0.4f may destabilize
 * the algorithm, so that the fetch interval either increases or decreases
 * infinitely, with little relevance to the page changes.
 *
 * @author Vincent Zhang
 */
class NewsMonitorFetchSchedule(
    conf: ImmutableConfig,
    messageWriter: MiscMessageWriter
) : AdaptiveFetchSchedule(conf, messageWriter) {

    // The three time anchors below are computed properties on purpose: a schedule instance may live
    // for many days, and anchors captured once at construction time would become stale, making
    // Duration.between(now, semiInactivePageCheckTime) negative after the first night.

    /** Midnight at the start of the current day, in the system default time zone. */
    private val middleNight: LocalDateTime
        get() = LocalDateTime.now().truncatedTo(ChronoUnit.DAYS)

    /**
     * Start of the current day as an [Instant]; [Instant.truncatedTo] truncates on the UTC time-line.
     *
     * NOTE(review): this is UTC midnight while [middleNight] is local midnight — confirm whether
     * the "modified today" check is meant to follow the local day instead.
     */
    private val middleNightInstant: Instant
        get() = Instant.now().truncatedTo(ChronoUnit.DAYS)

    /** Check semi-inactive pages at 1 o'clock at night: today's midnight plus 25 hours. */
    private val semiInactivePageCheckTime: LocalDateTime
        get() = middleNight.plusHours(25)

    /**
     * Chooses a fetch interval for [page] according to its role and applies it with `updateRefetchTime`.
     *
     * For pages modified today the decision is delegated to the parent schedule, or skipped entirely
     * when the page has too few referred items compared to its fetch count.
     *
     * @param page the page being scheduled
     * @param m the fetch time, modified times and modification state of the current fetch
     */
    override fun setFetchSchedule(page: WebPage, m: ModifyInfo) {
        // Fall back to the fetch time when the reported modified time is earlier than TCP_IP_STANDARDIZED_TIME
        val newModifiedTime = if (m.modifiedTime.isBefore(TCP_IP_STANDARDIZED_TIME)) m.fetchTime else m.modifiedTime

        val interval: Duration = when {
            page.isSeed -> {
                adjustSeedFetchInterval(page, m.fetchTime, newModifiedTime, m.modified)
            }
            page.distance < maxDistance && veryLikeIndexPage(page) -> {
                // TODO : search for new seed pages from navigator(non-leaf) pages.
                // We should monitor seed pages at day, and search for new seed pages at night
                // We should have two mode : monitor mode and explore mode
                // Under monitor mode, we just check seed again and again to detect the changes
                // And under explore mode, indexing pages are detected automatically
                adjustSeedFetchInterval(page, m.fetchTime, newModifiedTime, m.modified)
            }
            veryLikeDetailPage(page) -> {
                // Detail pages are fetched only once: mark them inactive and schedule them ten years ahead
                page.marks.put(Mark.INACTIVE, YES_STRING)
                Duration.ofDays(365 * 10L)
            }
            newModifiedTime.isAfter(middleNightInstant) && newModifiedTime.isAfter(m.prevModifiedTime) -> {
                val refArticles = page.pageCounters.get(PageCounters.Ref.item)
                val fetchCount = page.fetchCount
                if (refArticles > fetchCount / 10 - 1) {
                    // There are still bugs for modify time calculation
                    super.setFetchSchedule(page, m)
                }
                // Otherwise the schedule of the page is deliberately left untouched
                return
            }
            else -> {
                val semiInactiveInterval =
                    adjustSemiInactivePageFetchInterval(page, m.fetchTime, newModifiedTime, m.modified)
                page.marks.put(Mark.SEMI_INACTIVE, YES_STRING)
                semiInactiveInterval
            }
        }

        updateRefetchTime(page, interval, m)
    }

    /** Returns true if the stored page category or the category guessed from the url is an index. */
    private fun veryLikeIndexPage(page: WebPage): Boolean =
        page.pageCategory.isIndex || CrawlFilter.guessPageCategory(page.url).isIndex

    /** Returns true if the stored page category or the category guessed from the url is a detail page. */
    private fun veryLikeDetailPage(page: WebPage): Boolean =
        page.pageCategory.isDetail || CrawlFilter.guessPageCategory(page.url).isDetail

    /**
     * Adjust fetch interval for seed pages and index-like pages.
     *
     * - Pages fetched at most once get `MIN_INTERVAL`, since their ref-counters are not initialized yet.
     * - Pages fetched more than 5 times that refer to no items are postponed to the next
     *   1 o'clock check, plus a penalty that grows with the fetch count.
     * - Otherwise the interval depends on the hours elapsed between [modifiedTime] and [fetchTime].
     *
     * @param page the page being scheduled
     * @param fetchTime the time of the current fetch
     * @param modifiedTime the (possibly corrected) modified time of the page
     * @param state the modification state, only forwarded to the parent schedule
     * @return the interval to wait before the next fetch
     */
    private fun adjustSeedFetchInterval(page: WebPage, fetchTime: Instant, modifiedTime: Instant, state: Int): Duration {
        val fetchCount = page.fetchCount
        if (fetchCount <= 1) {
            // Ref-parameters are not initialized yet
            return MIN_INTERVAL
        }

        var interval = page.fetchInterval
        val pageCounters = page.pageCounters
        val refArticles = pageCounters.get(PageCounters.Ref.item)
        if (fetchCount > 5 && refArticles == 0) {
            pageCounters.increase(PageCounters.Self.noItem)
            messageWriter?.reportFetchSchedule(page, false)

            // Check it at 1 o'clock next night, decrease fetch frequency if no articles
            return Duration.between(LocalDateTime.now(), semiInactivePageCheckTime)
                .plusDays(fetchCount / 10L)
                .plusHours(fetchCount.toLong())
        }

        val hours = ChronoUnit.HOURS.between(modifiedTime, fetchTime)
        when {
            hours <= 1 * HOURS_PER_DAY -> {
                // There are updates today, keep re-fetching the page in every crawl loop
                interval = MIN_INTERVAL
            }
            hours <= 3 * HOURS_PER_DAY -> {
                // No updates in 24 hours but there are updates in 72 hours, re-fetch the page an hour later
                interval = Duration.ofHours(1)
            }
            hours <= 3 * HOURS_PER_MONTH -> {
                // No updates in 72 hours but there are updates in 3 months:
                // check the page at least 1 hour later and increase the fetch interval time by time
                val inc = (interval.seconds * INC_RATE).toLong()
                interval = interval.plusSeconds(inc)
                if (interval.toHours() < 1) {
                    interval = Duration.ofHours(1)
                }

                if (hours < 10 * HOURS_PER_DAY) {
                    // No longer than SEED_MAX_INTERVAL
                    if (interval > SEED_MAX_INTERVAL) {
                        interval = SEED_MAX_INTERVAL
                    }
                } else {
                    // No updates for 10 days or more: report the page, the interval is not capped here
                    messageWriter?.reportFetchSchedule(page, false)
                }
            }
            hours > 10 * HOURS_PER_YEAR -> {
                // Longer than 10 years, it's very likely the publishTime/modifiedTime is wrong
                messageWriter?.reportFetchSchedule(page, false)
                return super.getFetchInterval(page, fetchTime, modifiedTime, state)
            }
            // Between 3 months and 10 years no branch applies: the stored fetch interval is kept as is
        }

        return interval
    }

    /**
     * Computes the fetch interval for pages that are neither seed/index-like nor detail pages.
     *
     * The base interval lasts until the next 1 o'clock check plus one hour per previous fetch.
     * Extra days are added when the page refers to no items, and when the modified time
     * is more than 30 days before [fetchTime].
     *
     * @param page the page being scheduled
     * @param fetchTime the time of the current fetch
     * @param modifiedTime the (possibly corrected) modified time of the page
     * @param state the modification state, currently unused
     * @return the interval to wait before the next fetch
     */
    private fun adjustSemiInactivePageFetchInterval(page: WebPage, fetchTime: Instant, modifiedTime: Instant, state: Int): Duration {
        val distance = page.distance
        val fetchCount = page.fetchCount

        var interval = Duration.between(LocalDateTime.now(), semiInactivePageCheckTime)
            .plusHours(fetchCount.toLong())

        val refArticles = page.pageCounters.get(PageCounters.Ref.item)
        if (fetchCount > 3 && distance < maxDistance && refArticles == 0) {
            interval = interval.plusDays(fetchCount.toLong())
        }

        // it seems that modified time is extracted correctly
        if (modifiedTime.isAfter(TCP_IP_STANDARDIZED_TIME)) {
            val days = ChronoUnit.DAYS.between(modifiedTime, fetchTime)
            if (days > 30) {
                interval = interval.plusDays(days)
            }
        }

        return interval
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy