// ai.platon.pulsar.skeleton.crawl.component.UpdateComponent.kt Maven / Gradle / Ivy
package ai.platon.pulsar.skeleton.crawl.component
import ai.platon.pulsar.common.config.AppConstants
import ai.platon.pulsar.common.config.ImmutableConfig
import ai.platon.pulsar.common.config.Parameterized
import ai.platon.pulsar.common.config.Params
import ai.platon.pulsar.skeleton.common.message.MiscMessageWriter
import ai.platon.pulsar.skeleton.common.metrics.MetricsSystem
import ai.platon.pulsar.skeleton.crawl.filter.CrawlFilter
import ai.platon.pulsar.skeleton.crawl.schedule.DefaultFetchSchedule
import ai.platon.pulsar.skeleton.crawl.schedule.FetchSchedule
import ai.platon.pulsar.skeleton.crawl.schedule.ModifyInfo
import ai.platon.pulsar.skeleton.crawl.scoring.ScoringFilters
import ai.platon.pulsar.skeleton.signature.SignatureComparator
import ai.platon.pulsar.persist.*
import ai.platon.pulsar.persist.PageCounters.Self
import ai.platon.pulsar.persist.metadata.CrawlStatusCodes
import org.slf4j.LoggerFactory
import java.time.Duration
import java.time.Instant
/**
 * The update component.
 *
 * Applies link-derived statistics and fetch-schedule changes to [WebPage] instances:
 * reference counters computed from a page's outgoing pages, referrer/distance computed
 * from its incoming pages, and the next fetch schedule computed from its crawl status.
 *
 * @property webDb the page database; it is held here but not referenced by the code in this class.
 * @property fetchSchedule the schedule that receives the scheduling decisions made by [updateFetchSchedule].
 * @property scoringFilters optional filters asked to update the content score after an outgoing page is applied.
 * @property messageWriter optional sink for dead-link, broken-entity and bad-modified-time reports.
 * @property conf the configuration; the secondary constructor uses it to build a [DefaultFetchSchedule].
 */
class UpdateComponent(
    val webDb: WebDb,
    val fetchSchedule: FetchSchedule,
    val scoringFilters: ScoringFilters? = null,
    val messageWriter: MiscMessageWriter? = null,
    val conf: ImmutableConfig,
) : Parameterized {
    val LOG = LoggerFactory.getLogger(UpdateComponent::class.java)

    companion object {
        enum class Counter { rCreated, rNewDetail, rPassed, rLoaded, rNotExist, rDepthUp, rUpdated, rTotalUpdates, rBadModTime }

        init {
            MetricsSystem.reg.register(Counter::class.java)
        }
    }

    private val enumCounters = MetricsSystem.reg.enumCounterRegistry

    /**
     * Creates a component that uses a [DefaultFetchSchedule] built from [conf] and has neither
     * scoring filters nor a message writer.
     */
    constructor(webDb: WebDb, conf: ImmutableConfig) : this(webDb, DefaultFetchSchedule(conf), null, null, conf)

    /**
     * Returns the simple class names of this component and of its fetch schedule.
     */
    override fun getParams(): Params = Params.of(
        "className", this.javaClass.simpleName,
        "fetchSchedule", fetchSchedule.javaClass.simpleName
    )

    /**
     * Folds the statistics of a single outgoing page into the reference counters of [page].
     *
     * If [outgoingPage] failed at the protocol level, its URL is also added to the dead links
     * of [page] and reported to the message writer. Finally the scoring filters, if any, are
     * asked to update the content score of [page].
     *
     * @param page the page whose counters are updated.
     * @param outgoingPage a page that [page] links to.
     */
    fun updateByOutgoingPage(page: WebPage, outgoingPage: WebPage) {
        val pageExt = WebPageExt(page)
        val pageCounters = page.pageCounters

        pageCounters.increase(PageCounters.Ref.page)
        pageExt.updateRefContentPublishTime(outgoingPage.contentPublishTime)

        // A page counts as a detail page if it is already categorized so, or if its URL looks like one.
        val isDetail = outgoingPage.pageCategory.isDetail ||
            CrawlFilter.guessPageCategory(outgoingPage.url).isDetail
        if (isDetail) {
            pageCounters.increase(PageCounters.Ref.ch, outgoingPage.contentTextLen)
            pageCounters.increase(PageCounters.Ref.item)
        }

        val outgoingPageCounters = outgoingPage.pageCounters
        val missingFields = outgoingPageCounters.get(Self.missingFields)
        val brokenSubEntity = outgoingPageCounters.get(Self.brokenSubEntity)

        pageCounters.increase(PageCounters.Ref.missingFields, missingFields)
        // An outgoing page with at least one missing field counts as exactly one broken entity.
        pageCounters.increase(PageCounters.Ref.brokenEntity, if (missingFields > 0) 1 else 0)
        pageCounters.increase(PageCounters.Ref.brokenSubEntity, brokenSubEntity)

        if (outgoingPage.protocolStatus.isFailed) {
            page.deadLinks.add(outgoingPage.url)
            messageWriter?.debugDeadOutgoingPage(outgoingPage.url, page)
        }

        scoringFilters?.updateContentScore(page)
    }

    /**
     * Applies [updateByOutgoingPage] for every page in [outgoingPages] and then records how much
     * the broken-entity counters of [page] changed during this call.
     *
     * @param page the page whose counters are updated.
     * @param outgoingPages the pages that [page] links to.
     */
    fun updateByOutgoingPages(page: WebPage, outgoingPages: Collection<WebPage>) {
        // Snapshot the counters first so that the per-round delta can be computed afterwards.
        val lastPageCounters = page.pageCounters.clone()
        outgoingPages.forEach { updateByOutgoingPage(page, it) }
        updatePageCounters(lastPageCounters, page.pageCounters, page)
    }

    /**
     * Stores the difference between [pageCounters] and [lastPageCounters] in the `*LastRound`
     * reference counters, and reports the difference if any of the three values is non-zero.
     *
     * @param lastPageCounters the counters as they were before the current round.
     * @param pageCounters the current counters; the `*LastRound` entries are written to it.
     * @param page the page the counters belong to; only its URL is used, for the report.
     */
    fun updatePageCounters(lastPageCounters: PageCounters, pageCounters: PageCounters, page: WebPage) {
        val lastMissingFields = lastPageCounters.get(PageCounters.Ref.missingFields)
        val lastBrokenEntity = lastPageCounters.get(PageCounters.Ref.brokenEntity)
        val lastBrokenSubEntity = lastPageCounters.get(PageCounters.Ref.brokenSubEntity)

        val missingFieldsLastRound = pageCounters.get(PageCounters.Ref.missingFields) - lastMissingFields
        val brokenEntityLastRound = pageCounters.get(PageCounters.Ref.brokenEntity) - lastBrokenEntity
        val brokenSubEntityLastRound = pageCounters.get(PageCounters.Ref.brokenSubEntity) - lastBrokenSubEntity

        pageCounters.set(PageCounters.Ref.missingFieldsLastRound, missingFieldsLastRound)
        pageCounters.set(PageCounters.Ref.brokenEntityLastRound, brokenEntityLastRound)
        pageCounters.set(PageCounters.Ref.brokenSubEntityLastRound, brokenSubEntityLastRound)

        if (missingFieldsLastRound != 0 || brokenEntityLastRound != 0 || brokenSubEntityLastRound != 0) {
            val message = Params.of(
                "missingFields", missingFieldsLastRound,
                "brokenEntity", brokenEntityLastRound,
                "brokenSubEntity", brokenSubEntityLastRound
            ).formatAsLine()

            messageWriter?.reportBrokenEntity(page.url, message)
            LOG.warn(message)
        }
    }

    /**
     * A simple update procedure.
     *
     * Looks for the incoming page that gives [page] the smallest distance (its own distance plus
     * one). If that distance is strictly smaller than the current distance of [page], the
     * referrer, args and distance of [page] are taken from that incoming page; otherwise [page]
     * is left untouched. When several incoming pages tie, the first one in iteration order wins.
     *
     * @param incomingPages the pages that link to [page].
     * @param page the page to update.
     */
    fun updateByIncomingPages(incomingPages: Collection<WebPage>, page: WebPage) {
        val shallowestPage = incomingPages
            .minByOrNull { it.distance + 1 }
            ?.takeIf { it.distance + 1 < page.distance }
            ?: return

        page.referrer = shallowestPage.url
        // TODO: Not the best options
        page.args = shallowestPage.args
        page.distance = shallowestPage.distance + 1
    }

    /**
     * Hands the page over to [fetchSchedule] according to its crawl status.
     *
     * Pages marked inactive are skipped. Fetched, redirected and not-modified pages get a regular
     * schedule, retried pages a retry schedule and gone pages a gone schedule. Any other status
     * code leaves the schedule untouched.
     *
     * @param page the page to schedule.
     * @throws IllegalArgumentException if, for a fetched/redirected/not-modified page, one second
     * or more has elapsed since the fetch time was captured.
     */
    fun updateFetchSchedule(page: WebPage) {
        if (page.marks.isInactive) {
            return
        }

        val crawlStatus = page.crawlStatus
        val m = handleModifiedTime(page, crawlStatus)

        when (crawlStatus.code.toByte()) {
            CrawlStatusCodes.FETCHED,
            CrawlStatusCodes.REDIR_TEMP,
            CrawlStatusCodes.REDIR_PERM,
            CrawlStatusCodes.NOTMODIFIED,
            -> {
                val now = Instant.now()
                // NOTE(review): m.fetchTime is captured with Instant.now() inside handleModifiedTime,
                // so this only fails when a second or more passes between the two calls (e.g. a long
                // pause) - confirm that throwing is the intended reaction.
                require(Duration.between(m.fetchTime, now).seconds < 1) {
                    "The actual fetch time should be very close to now. Now: $now FetchTime: ${m.fetchTime}"
                }
                fetchSchedule.setFetchSchedule(page, m)
                // The force re-fetch feature (fetchSchedule.forceRefetch for pages whose fetch interval
                // exceeds fetchSchedule.maxFetchInterval) is intentionally not enabled.
            }

            CrawlStatusCodes.RETRY ->
                fetchSchedule.setPageRetrySchedule(page, m.prevFetchTime, m.prevModifiedTime, m.fetchTime)

            CrawlStatusCodes.GONE ->
                fetchSchedule.setPageGoneSchedule(page, m.prevFetchTime, m.prevModifiedTime, m.fetchTime)
        }
    }

    /**
     * Collects the fetch/modification times and the modification status of [page].
     *
     * The modification status is decided by comparing the previous and current signatures when
     * both are present; otherwise it is "not modified" if [crawlStatus] says so, and unknown in
     * all other cases. If the sniffed modified time is newer than the stored one, the stored one
     * becomes the previous modified time. A resulting modified time earlier than
     * [AppConstants.TCP_IP_STANDARDIZED_TIME] is reported as bad, but is still returned.
     *
     * @param page the page to inspect; it is not modified here.
     * @param crawlStatus the crawl status of [page].
     * @return the collected times and modification status.
     */
    private fun handleModifiedTime(page: WebPage, crawlStatus: CrawlStatus): ModifyInfo {
        val pageExt = WebPageExt(page)

        // page.fetchTime is not the actual fetch time!
        val prevFetchTime = page.fetchTime
        val fetchTime = Instant.now()

        var prevModifiedTime = page.prevModifiedTime
        var modifiedTime = page.modifiedTime
        val newModifiedTime = pageExt.sniffModifiedTime()

        val prevSig = page.prevSignature
        val signature = page.signature
        val modified = when {
            // A signature comparison, when possible, takes precedence over the crawl status.
            prevSig != null && signature != null ->
                if (SignatureComparator.compare(prevSig, signature) != 0) {
                    FetchSchedule.STATUS_MODIFIED
                } else {
                    FetchSchedule.STATUS_NOTMODIFIED
                }

            crawlStatus.code == CrawlStatusCodes.NOTMODIFIED.toInt() -> FetchSchedule.STATUS_NOTMODIFIED
            else -> FetchSchedule.STATUS_UNKNOWN
        }

        if (newModifiedTime.isAfter(modifiedTime)) {
            prevModifiedTime = modifiedTime
            modifiedTime = newModifiedTime
        }

        if (modifiedTime.isBefore(AppConstants.TCP_IP_STANDARDIZED_TIME)) {
            handleBadModified(page)
        }

        return ModifyInfo(fetchTime, prevFetchTime, prevModifiedTime, modifiedTime, modified)
    }

    /**
     * Counts a bad modified time and reports the time-related fields of [page] together with
     * its URL to the message writer, if there is one.
     */
    private fun handleBadModified(page: WebPage) {
        enumCounters.inc(Counter.rBadModTime)

        messageWriter?.reportBadModifiedTime(Params.of(
            "PFT", page.prevFetchTime, "FT", page.fetchTime,
            "PMT", page.prevModifiedTime, "MT", page.modifiedTime,
            "HMT", page.headers.lastModified,
            "U", page.url
        ).formatAsLine())
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy