// ai.platon.pulsar.persist.WebPageExt.kt (Maven / Gradle / Ivy artifact listing; the newest version)
package ai.platon.pulsar.persist
import ai.platon.pulsar.common.DateTimes.constructTimeHistory
import ai.platon.pulsar.common.DateTimes.parseInstant
import ai.platon.pulsar.common.config.AppConstants
import ai.platon.pulsar.common.config.VolatileConfig
import ai.platon.pulsar.persist.gora.generated.GWebPage
import ai.platon.pulsar.persist.metadata.Name
import ai.platon.pulsar.persist.model.ActiveDOMStat
import ai.platon.pulsar.persist.model.ActiveDOMStatus
import java.time.Instant
import java.time.temporal.ChronoUnit
class WebPageExt(private val page: WebPage) {
companion object {
    /**
     * Creates a [WebPage] for [url] that is pre-populated with sample data.
     *
     * The returned page carries two vivid links derived from [url], an active DOM
     * status, a two-entry active DOM stat trace and a single page-model entry.
     *
     * @param url the url the page is created for
     * @return the newly created and populated page
     */
    fun newTestWebPage(url: String): WebPage =
        WebPage.newWebPage(url, VolatileConfig(), null).also { created ->
            created.vividLinks = mapOf("$url?t=a" to "a", "$url?t=b" to "b")
            created.activeDOMStatus = ActiveDOMStatus(1, 1, "1", "1", "1")
            created.activeDOMStatTrace = mapOf("a" to ActiveDOMStat(), "b" to ActiveDOMStat())
            created.ensurePageModel().emplace(1, "g", mapOf("a" to "b"))
        }
}
/**
 * Lowers the page's distance to [newDistance] when it is smaller than the current value.
 *
 * Despite the name, this method never raises the stored distance: a [newDistance]
 * that is greater than or equal to the current distance is ignored.
 *
 * @param newDistance the candidate distance
 */
fun increaseDistance(newDistance: Int) {
    if (newDistance >= page.distance) {
        return
    }
    page.distance = newDistance
}
/**
 * Computes a fetch priority for the page without modifying it.
 *
 * When the page's distance is below [AppConstants.FETCH_PRIORITY_DEPTH_BASE], the result is
 * the larger of the stored fetch priority and `FETCH_PRIORITY_DEPTH_BASE - distance`;
 * otherwise the stored fetch priority is returned unchanged.
 *
 * @return the computed priority
 */
fun sniffFetchPriority(): Int {
    val storedPriority = page.fetchPriority
    val distance = page.distance
    val depthBase = AppConstants.FETCH_PRIORITY_DEPTH_BASE
    return if (distance < depthBase) maxOf(storedPriority, depthBase - distance) else storedPriority
}
/**
 * Picks a title for the page from the first non-empty candidate.
 *
 * Candidates are tried in this order: content title, anchor text, page title,
 * location, url. The url is returned as the last resort even if it is empty.
 * A later candidate is only read when every earlier one was empty.
 *
 * @return the chosen title
 */
fun sniffTitle(): String {
    return page.contentTitle
        .ifEmpty { page.anchor.toString() }
        .ifEmpty { page.pageTitle }
        .ifEmpty { page.location }
        .ifEmpty { page.url }
}
/**
 * Writes the same [text] to the page's content, content text and page text, in that order.
 *
 * @param text the text to store in all three fields; passed through as is
 */
fun setTextCascaded(text: String?) {
    with(page) {
        setContent(text)
        setContentText(text)
        setPageText(text)
    }
}
/**
 * Records the hyperlinks that appeared in a page.
 *
 * The links are kept in FIFO order: each time a page is fetched and parsed, the newly
 * discovered links are appended to the end of the list. When the list already holds more
 * than [AppConstants.MAX_LINK_PER_PAGE] entries, only its last `MAX_LINK_PER_PAGE / 3`
 * entries are kept; the older ones usually no longer appear in the page.
 *
 * A link is appended only if the list does not already contain it. Afterwards the list is
 * written back to the page and the page's imprecise link count is set to the list size.
 *
 * TODO: compress links
 * TODO: HBase seems not to modify any nested array
 *
 * NOTE(review): the parameter is declared as a bare `Iterable` with no type argument, which
 * Kotlin does not accept. The element type must expose a `url` property; confirm the
 * intended type and restore it.
 *
 * @param hypeLinks the hyperlinks to record; the `url` of each element is stored
 */
fun addHyperlinks(hypeLinks: Iterable) {
var links = page.links
// If there are too many links, keep only the last MAX_LINK_PER_PAGE / 3 of them.
// Note that subList returns a view of the original list, not a copy.
if (links.size > AppConstants.MAX_LINK_PER_PAGE) {
links = links.subList(links.size - AppConstants.MAX_LINK_PER_PAGE / 3, links.size)
}
for (l in hypeLinks) {
val url = WebPage.u8(l.url)
// Linear scan of the list for every candidate link
if (!links.contains(url)) {
links.add(url)
}
}
page.links = links
page.impreciseLinkCount = links.size
}
/**
 * Appends links to the list recorded on the page, skipping the ones already present.
 *
 * When the recorded list already holds more than [AppConstants.MAX_LINK_PER_PAGE] entries,
 * only its last `MAX_LINK_PER_PAGE / 3` entries are kept before the new links are appended.
 * Each element of [hypeLinks] is converted with `toString()` and [WebPage.u8] and is appended
 * only if the list does not already contain the converted value. Afterwards the list is
 * written back to the page and the page's imprecise link count is set to the list size.
 *
 * @param hypeLinks the links to record, in the order they should be appended
 */
fun addLinks(hypeLinks: Iterable<CharSequence>) {
    var links = page.links
    // If there are too many links, keep only the last MAX_LINK_PER_PAGE / 3 of them.
    // Note that subList returns a view of the original list, not a copy.
    if (links.size > AppConstants.MAX_LINK_PER_PAGE) {
        links = links.subList(links.size - AppConstants.MAX_LINK_PER_PAGE / 3, links.size)
    }
    for (link in hypeLinks) {
        val url = WebPage.u8(link.toString())
        // Linear scan of the list for every candidate link. Use a set?
        if (!links.contains(url)) {
            links.add(url)
        }
    }
    page.links = links
    page.impreciseLinkCount = links.size
}
/**
 * Offers a new content publish time to the page.
 *
 * The time is first checked with the page's `isValidContentModifyTime`; a rejected time
 * leaves the page untouched. An accepted time replaces the stored publish time only when
 * it is later, in which case the old value is saved as the previous publish time.
 *
 * @param newPublishTime the candidate publish time
 * @return `true` when the time passed validation (even if it was not later than the stored
 * one and nothing was changed), `false` when it was rejected
 */
fun updateContentPublishTime(newPublishTime: Instant): Boolean {
    val accepted = page.isValidContentModifyTime(newPublishTime)
    if (accepted) {
        val storedPublishTime = page.contentPublishTime
        if (newPublishTime.isAfter(storedPublishTime)) {
            page.prevContentPublishTime = storedPublishTime
            page.contentPublishTime = newPublishTime
        }
    }
    return accepted
}
/**
 * Copies the content carried by [pageDatum] into the page.
 *
 * The original content length and the content are copied first, then the content held by
 * [pageDatum] is set to `null`. The content type is resolved as [contentTypeHint] when given,
 * otherwise as the content type of [pageDatum]; a non-null result is stored on the page.
 *
 * Side effects on [pageDatum]: its content is cleared, and its content type is overwritten
 * with [contentTypeHint] when the hint is not `null`.
 *
 * @param pageDatum the source of the content
 * @param contentTypeHint an optional content type that takes precedence over the one in [pageDatum]
 */
fun updateContent(pageDatum: PageDatum, contentTypeHint: String? = null) {
    page.setOriginalContentLength(pageDatum.originalContentLength)
    page.setContent(pageDatum.content)
    // Drop the reference right away so the resource can be released as soon as possible
    pageDatum.content = null

    if (contentTypeHint != null) {
        pageDatum.contentType = contentTypeHint
    }
    val resolvedContentType = contentTypeHint ?: pageDatum.contentType
    if (resolvedContentType != null) {
        page.contentType = resolvedContentType
    }
}
/**
 * Offers a new content modified time to the page.
 *
 * The time is first checked with the page's `isValidContentModifyTime`; a rejected time
 * leaves the page untouched. An accepted time replaces the stored modified time only when
 * it is later, in which case the old value is saved as the previous modified time.
 *
 * @param newModifiedTime the candidate modified time
 * @return `true` when the time passed validation (even if nothing was changed),
 * `false` when it was rejected
 */
fun updateContentModifiedTime(newModifiedTime: Instant): Boolean {
    val accepted = page.isValidContentModifyTime(newModifiedTime)
    if (accepted) {
        val storedModifiedTime = page.contentModifiedTime
        if (newModifiedTime.isAfter(storedModifiedTime)) {
            page.prevContentModifiedTime = storedModifiedTime
            page.contentModifiedTime = newModifiedTime
        }
    }
    return accepted
}
/**
 * Offers a new referenced-content publish time to the page.
 *
 * The page is updated only when the time passes the page's `isValidContentModifyTime`
 * check and is later than the stored value; the old value is then saved as the previous
 * referenced-content publish time.
 *
 * @param newRefPublishTime the candidate publish time
 * @return `true` only when the page was actually updated, `false` otherwise
 */
fun updateRefContentPublishTime(newRefPublishTime: Instant): Boolean {
    if (page.isValidContentModifyTime(newRefPublishTime)) {
        val storedRefPublishTime = page.refContentPublishTime
        if (newRefPublishTime.isAfter(storedRefPublishTime)) {
            page.prevRefContentPublishTime = storedRefPublishTime
            page.refContentPublishTime = newRefPublishTime
            return true
        }
    }
    return false
}
/**
 * Returns the first entry of the index time history.
 *
 * The history is a comma separated string; its first item is parsed with `parseInstant`,
 * using [Instant.EPOCH] as the fallback value.
 *
 * @param defaultValue returned when the history is empty or its first item does not
 * resolve to a time after [Instant.EPOCH]
 * @return the first index time, or [defaultValue]
 */
fun getFirstIndexTime(defaultValue: Instant): Instant {
    val history = getIndexTimeHistory("")
    if (history.isEmpty()) {
        return defaultValue
    }
    val earliest = parseInstant(history.substringBefore(","), Instant.EPOCH)
    return earliest.takeIf { it.isAfter(Instant.EPOCH) } ?: defaultValue
}
/*
 * *****************************************************************************
 * Parsing
 * ******************************************************************************
 */

/**
 * Adds [fetchTime] to the fetch time history kept in the page metadata.
 *
 * The value currently stored under [Name.FETCH_TIME_HISTORY] is passed to
 * `constructTimeHistory` together with [fetchTime] and the constant `10`
 * (presumably the maximum number of entries to keep; confirm against `constructTimeHistory`),
 * and the result is stored back under the same key.
 *
 * @param fetchTime the fetch time to record
 */
fun updateFetchTimeHistory(fetchTime: Instant) {
    val updatedHistory = constructTimeHistory(page.metadata[Name.FETCH_TIME_HISTORY], fetchTime, 10)
    page.metadata[Name.FETCH_TIME_HISTORY] = updatedHistory
}
/**
 * The first entry of the page's fetch time history.
 *
 * The history is a comma separated string; its first item is parsed with `parseInstant`,
 * using [Instant.EPOCH] as the fallback value. The property is `null` when the history is
 * empty or the first item does not resolve to a time after [Instant.EPOCH].
 */
val firstFetchTime: Instant?
    get() {
        val history = page.getFetchTimeHistory("")
        if (history.isEmpty()) {
            return null
        }
        val earliest = parseInstant(history.substringBefore(","), Instant.EPOCH)
        return earliest.takeIf { it.isAfter(Instant.EPOCH) }
    }
/**
 * Works out the most recent modification time known for the page, without modifying the page.
 *
 * Starting from the page's modified time, each of the following candidates replaces the
 * running result when it passes the page's `isValidContentModifyTime` check and is later:
 * the `lastModified` header, the content modified time, the content publish time.
 * A result that lies more than one day in the future is replaced by the current time.
 *
 * @return the latest plausible modification time
 */
fun sniffModifiedTime(): Instant {
    var latest = page.modifiedTime
    val fromHeaders = page.headers.lastModified
    val fromContent = page.contentModifiedTime

    if (page.isValidContentModifyTime(fromHeaders) && fromHeaders.isAfter(latest)) {
        latest = fromHeaders
    }
    if (page.isValidContentModifyTime(fromContent) && fromContent.isAfter(latest)) {
        latest = fromContent
    }

    val fromPublish = page.contentPublishTime
    if (page.isValidContentModifyTime(fromPublish) && fromPublish.isAfter(latest)) {
        latest = fromPublish
    }

    // Guard against times that lie implausibly far (more than a day) in the future
    val tooFarInFuture = latest.isAfter(Instant.now().plus(1, ChronoUnit.DAYS))
    return if (tooFarInFuture) Instant.now() else latest
}
/**
 * Reads the index time history stored in the page metadata under [Name.INDEX_TIME_HISTORY].
 *
 * @param defaultValue returned when the metadata holds no value for that key
 * @return the stored history, or [defaultValue]
 */
fun getIndexTimeHistory(defaultValue: String): String =
    page.metadata[Name.INDEX_TIME_HISTORY] ?: defaultValue
/**
 * Adds [indexTime] to the index time history kept in the page metadata.
 *
 * The value currently stored under [Name.INDEX_TIME_HISTORY] is passed to
 * `constructTimeHistory` together with [indexTime] and the constant `10`
 * (presumably the maximum number of entries to keep; confirm against `constructTimeHistory`),
 * and the result is stored back under the same key.
 *
 * @param indexTime the index time to record
 */
fun putIndexTimeHistory(indexTime: Instant) {
    val updatedHistory = constructTimeHistory(page.metadata[Name.INDEX_TIME_HISTORY], indexTime, 10)
    page.metadata[Name.INDEX_TIME_HISTORY] = updatedHistory
}
/**
 * Tells whether [publishTime] is acceptable as a content time.
 *
 * @param publishTime the time to check
 * @return `true` when [publishTime] is strictly after [AppConstants.MIN_ARTICLE_PUBLISH_TIME]
 */
fun isValidContentModifyTime(publishTime: Instant): Boolean {
    return publishTime.isAfter(AppConstants.MIN_ARTICLE_PUBLISH_TIME)
}
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy