package ai.platon.pulsar.skeleton.common.collect
import ai.platon.pulsar.skeleton.session.PulsarSession
import ai.platon.pulsar.common.ObjectConverter
import ai.platon.pulsar.common.Strings
import ai.platon.pulsar.skeleton.common.message.PageLoadStatusFormatter
import ai.platon.pulsar.skeleton.common.metrics.MetricsSystem
import ai.platon.pulsar.skeleton.common.options.LoadOptions
import ai.platon.pulsar.common.readable
import ai.platon.pulsar.common.urls.*
import ai.platon.pulsar.common.urls.preprocess.UrlNormalizerPipeline
import ai.platon.pulsar.dom.FeaturedDocument
import ai.platon.pulsar.persist.HyperlinkPersistable
import ai.platon.pulsar.persist.WebPage
import ai.platon.pulsar.skeleton.common.urls.NormURL
import ai.platon.pulsar.skeleton.session.AbstractPulsarSession
import com.codahale.metrics.Gauge
import org.slf4j.LoggerFactory
import java.time.Duration
import java.time.Instant
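/**
 * A portal page together with the fat link extracted from it.
 * */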
data class PageFatLink(
val page: WebPage,
val fatLink: CrawlableFatLink
)
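/**
 * Extracts a "fat link" from a portal page: the portal URL itself together with the out links
 * (tail links) collected by the seed's out link selector.
 *
 * A minimal usage sketch (illustrative only; the portal URL is a placeholder, the session is
 * assumed to be created elsewhere, and an out link selector would normally be supplied through
 * the seed's load options):
 *
 * ```kotlin
 * val extractor = FatLinkExtractor(session)
 * val seed = session.normalize("https://example.com/portal")
 * val pageFatLink = extractor.createFatLink(seed)
 * pageFatLink?.fatLink?.tailLinks?.forEach { println(it.url) }
 * ```
 * */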
class FatLinkExtractor(
val session: PulsarSession,
val normalizer: UrlNormalizerPipeline = UrlNormalizerPipeline()
) {
private val log = LoggerFactory.getLogger(FatLinkExtractor::class.java)
companion object {
data class Counters(
var unfilteredLinks: Int = 0,
var regexMatchedLinks: Int = 0,
var allowLinks: Int = 0,
var freshLinks: Int = 0,
var lastFailedLinks: Int = 0,
var expiredLinks: Int = 0,
var fetchLinks: Int = 0,
var badSeeds: Int = 0,
var loadedSeeds: Int = 0
)
val globalCounters = Counters()
private val gauges = mapOf(
"unfilteredLinks" to Gauge { globalCounters.unfilteredLinks },
"regexMatchedLinks" to Gauge { globalCounters.regexMatchedLinks },
"freshLinks" to Gauge { globalCounters.freshLinks },
"lastFailedLinks" to Gauge { globalCounters.lastFailedLinks },
"expiredLinks" to Gauge { globalCounters.expiredLinks },
"fetchLinks" to Gauge { globalCounters.fetchLinks },
"loadedSeeds" to Gauge { globalCounters.loadedSeeds }
)
init {
MetricsSystem.reg.registerAll(this, gauges)
}
}
private val abstractSession get() = session as AbstractPulsarSession
private val webDb = abstractSession.context.webDb
val counters = Counters()
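/**
 * Parse out links from [document] for [page]; the results are recorded in the counters and,
 * when any links are found, in the page's vivid links.
 * */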
fun parse(page: WebPage, document: FeaturedDocument, options: LoadOptions) {
createFatLink(page, document, options)
}
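/**
 * Load the seed page and create a fat link from it, with an empty deny list.
 * */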
fun createFatLink(seed: NormURL): PageFatLink? = createFatLink(seed, listOf())
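/**
 * Create a fat link from an already loaded [page], parsing its content if available.
 * */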
fun createFatLink(seed: NormURL, page: WebPage, denyList: Collection<Hyperlink>): PageFatLink? {
++counters.loadedSeeds
++globalCounters.loadedSeeds
if (page.content == null) {
return null
}
val document = session.parse(page)
return createFatLink(seed, page, document, denyList)
}
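/**
 * Load the seed page and create a fat link from it, skipping links in [denyList].
 * */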
fun createFatLink(seed: NormURL, denyList: Collection<Hyperlink>): PageFatLink? {
// TODO: we can use an event handler to extract links
// val handler = object: HtmlDocumentHandler() {
// override val name = CapabilityTypes.FETCH_AFTER_EXTRACT_HANDLER
// override fun invoke(page: WebPage, document: FeaturedDocument) {
//
// }
// }
// seed.options.volatileConfig?.putBean(handler.name, handler)
val page = session.load(seed)
if (!page.protocolStatus.isSuccess) {
++counters.badSeeds
++globalCounters.badSeeds
PageLoadStatusFormatter(page, prefix = "Bad seed", withOptions = true).also { log.warn(it.toString()) }
return null
}
return createFatLink(seed, page, denyList)
}
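/**
 * Create a fat link from a parsed [page] and its [document], building the seed from the
 * page's URL and [options].
 * */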
fun createFatLink(page: WebPage, document: FeaturedDocument, options: LoadOptions): PageFatLink? {
return createFatLink(NormURL(page.url, options), page, document)
}
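/**
 * Create a fat link from a parsed [page] and its [document], with an empty deny list.
 * */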
fun createFatLink(seed: NormURL, page: WebPage, document: FeaturedDocument): PageFatLink? {
return createFatLink(seed, page, document, listOf())
}
/**
 * Create a fat link.
 * If the document is not null, parse out links from the document; otherwise fall back to the
 * page's vivid links, which may have been parsed and saved by a recent fetch.
 * */
fun createFatLink(
seed: NormURL, page: WebPage, document: FeaturedDocument? = null, denyList: Collection<Hyperlink>
): PageFatLink? {
val fatLinkSpec = seed.spec
val normalizedFatLink = normalizer.invoke(fatLinkSpec) ?: fatLinkSpec
val options = seed.options
val args = if (options.label.isNotBlank()) "-label ${options.label}" else ""
val selector = options.outLinkSelector
val now = Instant.now()
val vividLinks = if (document != null) {
parseVividLinks(seed, page, document, denyList).also { page.fetchedLinkCount = 0 }
} else {
loadVividLinks(page, options, denyList)
}
counters.fetchLinks = vividLinks.size
globalCounters.fetchLinks += vividLinks.size
if (vividLinks.isEmpty()) {
log.info(
"{}. No new link in portal page({}), latest fetch at: {} | <{}> | {}",
page.id,
Strings.compactFormat(page.contentLength),
Duration.between(page.prevFetchTime, now).readable(),
selector,
seed
)
log.info("{}. {}", page.id, ObjectConverter.asMap(counters).entries.joinToString())
if (document != null && counters.unfilteredLinks == 0) {
val path = session.export(page)
log.info("{}. No any link in the page, exported to {}", page.id, path)
}
val fatLink = CrawlableFatLink(normalizedFatLink, href = fatLinkSpec)
return PageFatLink(page, fatLink)
}
// update vivid links
if (document != null) {
val hyperlinks = vividLinks.map { HyperlinkPersistable(it.url, it.text, it.order) }
page.vividLinks = hyperlinks.associate { it.url to "${it.text} createdAt: $now" }
}
val fatLink = CrawlableFatLink(normalizedFatLink, href = fatLinkSpec, args = args, tailLinks = vividLinks)
return PageFatLink(page, fatLink)
}
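/**
 * Extract hyperlinks from [document] using the seed's out link selector, then filter them by
 * the out link pattern, the deny list and the expiry rules, counting each stage.
 * */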
private fun parseVividLinks(
seed: NormURL, page: WebPage, document: FeaturedDocument, denyList: Collection<Hyperlink>
): List<StatefulHyperlink> {
val now = Instant.now()
val fatLinkSpec = seed.spec
val options = seed.options
val selector = options.outLinkSelector
val urlRegex = options.outLinkPattern.toRegex()
if (selector.isBlank()) {
return listOf()
}
return HyperlinkExtractor(page, document, selector, normalizer).extract()
.asSequence()
.onEach { ++counters.unfilteredLinks; ++globalCounters.unfilteredLinks }
.filter { it.url.matches(urlRegex) }
.onEach { ++counters.regexMatchedLinks; ++globalCounters.regexMatchedLinks }
.filter { it !in denyList }
.onEach { ++counters.allowLinks; ++globalCounters.allowLinks }
.mapNotNull { normalizeOrNull(it) }
.filter { shouldFetchVividPage(it.url, options.itemExpires, now) }
.map { StatefulHyperlink(it.url, it.text, it.order, referrer = fatLinkSpec) }
.onEach { it.args = "-i 0s" }
.toList()
}
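/**
 * Normalize the hyperlink's URL, or return null if the normalizer rejects it.
 * */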
private fun normalizeOrNull(hyperlink: Hyperlink): Hyperlink? {
val normURL = normalizer(hyperlink.url)
return if (normURL != null) {
hyperlink.also { it.url = normURL }
} else null
}
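/**
 * Rebuild candidate links from the page's previously saved vivid links instead of re-parsing
 * the document.
 * */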
private fun loadVividLinks(
page: WebPage, options: LoadOptions, denyList: Collection<Hyperlink>
): List<StatefulHyperlink> {
val now = Instant.now()
val urlRegex = options.outLinkPattern.toRegex()
return page.vividLinks
.asSequence()
.mapNotNull { normalizer(it.key.toString())?.let { u -> u to it.value.toString() } }
.map { StatefulHyperlink(it.first, it.second, 0, referrer = page.url) }
.filterNot { it in denyList }
.filter { it.url.matches(urlRegex) }
.filter { shouldFetchVividPage(it.url, options.itemExpires, now) }
.onEach { it.args = "-i 0s" }
.toList()
}
/**
 * Check whether a vivid page should be fetched: fetch it if it has never been stored,
 * if its last fetch failed, or if the stored copy has expired.
 *
 * TODO: the logic is different from the one in LoadComponent
 * */
fun shouldFetchVividPage(url: String, expires: Duration, now: Instant): Boolean {
// if (text != null) {
// val createdAt = DateTimes.parseInstant(text.substringAfter(" createdAt: "), Instant.EPOCH)
// if (Duration.between(createdAt, now).toHours() <= 24) {
// return true
// }
// }
val p = webDb.getOrNull(url)
return when {
p == null -> {
++counters.freshLinks
++globalCounters.freshLinks
true
}
!p.protocolStatus.isSuccess -> {
++counters.lastFailedLinks
++globalCounters.lastFailedLinks
true
}
p.prevFetchTime + expires < now -> {
++counters.expiredLinks
++globalCounters.expiredLinks
true
}
else -> false
}
}
}