// ai.platon.pulsar.persist.experimental.MutableWebPage.kt.txt Maven / Gradle / Ivy
/*******************************************************************************
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ai.platon.pulsar.persist.experimental
import ai.platon.pulsar.common.DateTimes
import ai.platon.pulsar.common.HtmlIntegrity
import ai.platon.pulsar.common.PulsarParams
import ai.platon.pulsar.common.Strings
import ai.platon.pulsar.common.browser.BrowserType
import ai.platon.pulsar.common.config.AppConstants
import ai.platon.pulsar.common.config.VolatileConfig
import ai.platon.pulsar.common.config.VolatileConfig.Companion.UNSAFE
import ai.platon.pulsar.common.urls.UrlUtils.isStandard
import ai.platon.pulsar.common.urls.UrlUtils.reverseUrlOrEmpty
import ai.platon.pulsar.common.urls.UrlUtils.unreverseUrl
import ai.platon.pulsar.persist.*
import ai.platon.pulsar.persist.gora.generated.*
import ai.platon.pulsar.persist.metadata.*
import ai.platon.pulsar.persist.metadata.OpenPageCategory.Companion.parse
import ai.platon.pulsar.persist.model.ActiveDOMStat
import ai.platon.pulsar.persist.model.ActiveDOMStatus
import ai.platon.pulsar.persist.model.Converters.convert
import ai.platon.pulsar.persist.model.PageModel
import ai.platon.pulsar.persist.model.PageModel.Companion.box
import ai.platon.pulsar.persist.model.WebPageFormatter
import org.apache.avro.util.Utf8
import org.apache.commons.collections4.CollectionUtils
import org.apache.commons.lang3.StringUtils
import org.apache.commons.lang3.math.NumberUtils
import org.apache.gora.util.ByteUtils
import org.xml.sax.InputSource
import java.io.ByteArrayInputStream
import java.nio.ByteBuffer
import java.time.Duration
import java.time.Instant
import java.time.ZoneId
import java.time.temporal.ChronoUnit
import java.util.*
import java.util.concurrent.atomic.AtomicInteger
import java.util.function.Consumer
import java.util.function.Function
import java.util.stream.Collectors
/**
* The core web page structure
*/
class MutableWebPage(
page: GWebPage
): KWebPage(page) {
/**
 * Web page scope configuration.
 *
 * NOTE(review): no initializer or constructor assignment is visible in this file;
 * confirm where this property gets its first value.
 */
override var conf: VolatileConfig
/**
 * Presumably whether this page was served from a cache.
 *
 * NOTE(review): the previous comment was a copy of the one on [isFetched];
 * confirm the intended meaning of this flag.
 */
override var isCached = false
/**
 * True when this page was loaded from the database, or was created and fetched from the web.
 */
override var isLoaded: Boolean = false
/**
 * True when this page was fetched from the internet.
 */
override var isFetched: Boolean = false
/**
 * True when this page is canceled. A canceled page remains unchanged.
 */
override var isCanceled: Boolean = false
/**
 * True when this page was fetched and its content was updated.
 * The setter is private to this class.
 */
@Volatile
override var isContentUpdated: Boolean = false
    private set
/**
 * The cached content. Assigning it keeps the persisted page content unmodified.
 */
@Volatile
override var tmpContent: ByteBuffer? = null
/**
 * How long to wait before retrying, when a retry is needed. Starts at zero.
 */
override var retryDelay: Duration = Duration.ZERO
/**
 * The hypertext reference of this page: the address of the document this page was linked from.
 * The value is kept in the metadata under [Name.HREF].
 *
 * TODO: use a separate field for href
 */
override var href: String?
    get() {
        return metadata[Name.HREF]
    }
    set(value) {
        metadata[Name.HREF] = value
    }
/**
 * Remove a page scope temporary variable.
 *
 * @param name The variable name.
 * @return Whatever `variables.remove` yields for [name].
 */
fun removeVar(name: String): Any = variables.remove(name)
/**
 * Set a page scope temporary variable.
 *
 * @param name The variable name.
 * @param value The variable value.
 */
fun setVar(name: String, value: Any) {
    variables.set(name, value)
}
/**
 * Check whether the underlying page's markers hold a non-null entry for [mark].
 */
fun hasMark(mark: Mark): Boolean {
    val key = wrapKey(mark)
    val marker = page.markers[key]
    return marker != null
}
/**
 * The load arguments.
 *
 * The load arguments vary task by task, so the local version is the first choice,
 * while the persisted version is used for historical check only.
 *
 * The underlying field should not use name 'args' since it exists already
 * with another gora type, see GProtocolStatus.args and GParseStatus.args.
 */
override var args: String
    // The property previously had neither a getter nor an initializer, which does not compile.
    // Read back what the setter persists; an absent value is reported as an empty string.
    get() = page.params?.toString() ?: ""
    set(args) {
        // Clear the cached load options, then persist the new arguments.
        variables.remove(PulsarParams.VAR_LOAD_OPTIONS)
        page.params = args
    }
/**
 * The maximum number of fetch retries, kept in the metadata under [Name.FETCH_MAX_RETRY].
 */
override var maxRetries: Int
    // The property previously had neither a getter nor an initializer, which does not compile.
    // NOTE(review): the fallback of 3 is an assumption — confirm the intended default.
    get() = metadata.getInt(Name.FETCH_MAX_RETRY, 3)
    set(maxRetries) {
        metadata[Name.FETCH_MAX_RETRY] = maxRetries
    }
/**
 * The fetched link count, kept in the metadata under [Name.FETCHED_LINK_COUNT]; 0 when absent.
 */
override var fetchedLinkCount: Int
    get() {
        return metadata.getInt(Name.FETCHED_LINK_COUNT, 0)
    }
    set(value) {
        metadata[Name.FETCHED_LINK_COUNT] = value
    }
/**
 * The zone id of this page; falls back to [DateTimes.zoneId] when none is stored.
 */
override var zoneId: ZoneId
    get() {
        val stored = page.zoneId ?: return DateTimes.zoneId
        return ZoneId.of(stored.toString())
    }
    set(value) {
        page.zoneId = value.id
    }
/**
 * The batch id; the getter returns an empty string when none is stored.
 */
override var batchId: String?
    get() = page.batchId?.toString() ?: ""
    set(id) {
        page.batchId = id
    }
/** Record [Name.IS_SEED] in the metadata. */
fun markSeed() {
    metadata.set(Name.IS_SEED, AppConstants.YES_STRING)
}
/** Remove [Name.IS_SEED] from the metadata. */
fun unmarkSeed() {
    metadata.remove(Name.IS_SEED)
}
/** True when the metadata contains [Name.IS_SEED]. */
val isSeed: Boolean
    get() {
        return metadata.contains(Name.IS_SEED)
    }
/**
 * The distance of this page; a negative stored value is reported as
 * [AppConstants.DISTANCE_INFINITE].
 */
override var distance: Int
    get() = page.distance.takeIf { it >= 0 } ?: AppConstants.DISTANCE_INFINITE
    set(value) {
        page.distance = value
    }
/**
 * Fetch mode is used to determine the protocol before fetch, so it shall be set before fetch.
 * It is kept in the metadata under [Name.FETCH_MODE] as the mode's name.
 */
override var fetchMode: FetchMode
    get() {
        val stored = metadata[Name.FETCH_MODE]
        return FetchMode.fromString(stored)
    }
    set(value) {
        metadata[Name.FETCH_MODE] = value.name
    }
/**
 * The browser stored on the page, parsed with [BrowserType.fromString];
 * an empty string is parsed when nothing is stored.
 */
override var lastBrowser: BrowserType
    get() = BrowserType.fromString(page.browser?.toString() ?: "")
    set(value) {
        page.browser = value.name
    }
/**
 * True when the underlying page has a non-null resource marker.
 *
 * NOTE(review): assigning `false` leaves a previously stored marker untouched — confirm
 * that a page is never meant to be unmarked.
 */
override var isResource: Boolean
    get() {
        return page.resource != null
    }
    set(resource) {
        if (!resource) {
            return
        }
        page.resource = 1
    }
/**
 * The html integrity stored on the page, parsed with [HtmlIntegrity.fromString];
 * an empty string is parsed when nothing is stored.
 */
override var htmlIntegrity: HtmlIntegrity
    get() = HtmlIntegrity.fromString(page.htmlIntegrity?.toString() ?: "")
    set(value) {
        page.htmlIntegrity = value.name
    }
/**
 * The fetch priority; a stored value that is not positive is reported as
 * [AppConstants.FETCH_PRIORITY_DEFAULT].
 */
override var fetchPriority: Int
    get() = page.fetchPriority.takeIf { it > 0 } ?: AppConstants.FETCH_PRIORITY_DEFAULT
    set(value) {
        page.fetchPriority = value
    }
/**
 * Compute a fetch priority that takes the page distance into account.
 *
 * When the distance is below [AppConstants.FETCH_PRIORITY_DEPTH_BASE], the result is the larger
 * of the stored priority and `FETCH_PRIORITY_DEPTH_BASE - distance`; otherwise it is the stored
 * priority. Nothing is written back to the page.
 *
 * @return the computed priority
 */
fun sniffFetchPriority(): Int {
    // A local cannot carry the `override` modifier, and no reassignment is needed here.
    val priority = fetchPriority
    val depth = distance
    return if (depth < AppConstants.FETCH_PRIORITY_DEPTH_BASE) {
        maxOf(priority, AppConstants.FETCH_PRIORITY_DEPTH_BASE - depth)
    } else {
        priority
    }
}
/**
 * The create time, stored on the page as epoch milliseconds.
 */
override var createTime: Instant
    get() {
        val millis = page.createTime
        return Instant.ofEpochMilli(millis)
    }
    set(value) {
        page.createTime = value.toEpochMilli()
    }
/**
 * The generate time, kept in the metadata under [Name.GENERATE_TIME] as the instant's string
 * form; [Instant.EPOCH] when absent.
 */
override var generateTime: Instant
    get() = metadata[Name.GENERATE_TIME]?.let { Instant.parse(it) } ?: Instant.EPOCH
    set(value) {
        metadata[Name.GENERATE_TIME] = value.toString()
    }
/**
 * The fetch count stored on the page.
 */
override var fetchCount: Int
    get() {
        return page.fetchCount
    }
    set(value) {
        page.fetchCount = value
    }
/**
 * Increase [fetchCount] by one.
 */
fun updateFetchCount() {
    fetchCount = fetchCount + 1
}
/**
 * The crawl status; the page stores its code.
 */
override var crawlStatus: CrawlStatus
    get() {
        val code = page.crawlStatus.toByte()
        return CrawlStatus(code)
    }
    set(status) {
        page.crawlStatus = status.code
    }
/**
 * Store a raw crawl status code on the page.
 */
fun setCrawlStatus(value: Int) {
    val code = value
    page.crawlStatus = code
}
/**
 * The baseUrl is the same as the location; an empty string when nothing is stored.
 *
 * A baseUrl has the same semantic with Jsoup.parse:
 *
 * @link {https://jsoup.org/apidocs/org/jsoup/Jsoup.html#parse-java.io.File-java.lang.String-java.lang.String-}
 * @see MutableWebPage.getLocation
 */
val baseUrl: String
    get() = page.baseUrl?.toString() ?: ""
/**
 * WebPage.url is the permanent internal address, it might not still be available to access the
 * target. WebPage.location or WebPage.baseUrl is the last working address, it might redirect to
 * url, or it might have additional random parameters.
 * WebPage.location may be different from url, it's generally normalized.
 */
val location: String
    get() {
        return baseUrl
    }
/**
 * Store the location, which is written to the same field [baseUrl] reads.
 *
 * @param location The location.
 */
fun setLocation(location: String) {
    val address = location
    page.baseUrl = address
}
/**
 * The latest fetch time, converted from the stored epoch milliseconds.
 *
 * @return The latest fetch time
 */
fun getFetchTime(): Instant = Instant.ofEpochMilli(page.fetchTime)
/**
 * Store the latest fetch time as epoch milliseconds.
 *
 * @param time The latest fetch time
 */
fun setFetchTime(time: Instant) {
    val millis = time.toEpochMilli()
    page.fetchTime = millis
}
/**
 * The previous fetch time, updated at the fetch stage.
 *
 * @return The previous fetch time.
 */
fun getPrevFetchTime(): Instant = Instant.ofEpochMilli(page.prevFetchTime)
/**
 * Store the previous fetch time as epoch milliseconds.
 */
fun setPrevFetchTime(time: Instant) {
    val millis = time.toEpochMilli()
    page.prevFetchTime = millis
}
/**
 * The previous crawl time, used for fat link crawl, which means both the page itself and
 * out pages are fetched.
 */
fun getPrevCrawlTime1(): Instant = Instant.ofEpochMilli(page.prevCrawlTime1)
/**
 * Store the previous crawl time (see [getPrevCrawlTime1]) as epoch milliseconds.
 */
fun setPrevCrawlTime1(time: Instant) {
    val millis = time.toEpochMilli()
    page.prevCrawlTime1 = millis
}
/**
 * Get the fetch interval.
 *
 * The page stores the interval in seconds. A negative stored value is reported as the
 * duration of [ChronoUnit.CENTURIES].
 */
fun getFetchInterval(): Duration {
    // A local cannot carry the `override` modifier; a read-only value is enough here.
    val stored = page.fetchInterval.toLong()
    val seconds = if (stored < 0) ChronoUnit.CENTURIES.duration.seconds else stored
    return Duration.ofSeconds(seconds)
}
/**
 * Set fetch interval, using the whole seconds of [duration].
 */
fun setFetchInterval(duration: Duration) {
    setFetchInterval(duration.seconds)
}
/**
 * Set fetch interval in seconds.
 *
 * The underlying field is an Int. A value that does not fit is stored as -1 instead of being
 * truncated bit-wise (which could wrap a huge interval into a small positive one);
 * [getFetchInterval] reports any negative stored value as a century.
 */
fun setFetchInterval(seconds: Long) {
    val fitsInt = seconds >= Int.MIN_VALUE && seconds <= Int.MAX_VALUE
    page.fetchInterval = if (fitsInt) seconds.toInt() else -1
}
/**
 * Set fetch interval in seconds, rounded with [Math.round].
 */
fun setFetchInterval(seconds: Float) {
    page.fetchInterval = Math.round(seconds)
}
/**
 * Get protocol status.
 *
 * When the page has none, a freshly built empty [GProtocolStatus] is boxed instead;
 * it is not written back to the page.
 */
fun getProtocolStatus(): ProtocolStatus {
    // A local cannot carry the `override` modifier; the elvis operator removes the reassignment.
    val status = page.protocolStatus ?: GProtocolStatus.newBuilder().build()
    return ProtocolStatus.box(status)
}
/**
 * Set protocol status; the unboxed value is stored on the page.
 */
fun setProtocolStatus(protocolStatus: ProtocolStatus) {
    page.protocolStatus = protocolStatus.unbox()
}
/**
 * Header information returned from the web server used to serve the content which is
 * subsequently fetched from. This includes keys such as
 * TRANSFER_ENCODING,
 * CONTENT_ENCODING,
 * CONTENT_LANGUAGE,
 * CONTENT_LENGTH,
 * CONTENT_LOCATION,
 * CONTENT_DISPOSITION,
 * CONTENT_MD5,
 * CONTENT_TYPE,
 * LAST_MODIFIED
 * and LOCATION.
 */
fun getHeaders(): ProtocolHeaders = ProtocolHeaders.box(page.headers)
/** The repr url stored on the page; an empty string when absent. */
fun getReprUrl(): String = page.reprUrl?.toString() ?: ""
/** Store the repr url on the page. */
fun setReprUrl(value: String) {
    val reprUrl = value
    page.reprUrl = reprUrl
}
/** The fetch retries counter stored on the page. */
fun getFetchRetries(): Int = page.fetchRetries
/** Store the fetch retries counter on the page. */
fun setFetchRetries(value: Int) {
    val retries = value
    page.fetchRetries = retries
}
/** The modified time, converted from the stored epoch milliseconds. */
fun getModifiedTime(): Instant = Instant.ofEpochMilli(page.modifiedTime)
/** Store the modified time as epoch milliseconds. */
fun setModifiedTime(value: Instant) {
    val millis = value.toEpochMilli()
    page.modifiedTime = millis
}
/** The previous modified time, converted from the stored epoch milliseconds. */
fun getPrevModifiedTime(): Instant = Instant.ofEpochMilli(page.prevModifiedTime)
/** Store the previous modified time as epoch milliseconds. */
fun setPrevModifiedTime(value: Instant) {
    val millis = value.toEpochMilli()
    page.prevModifiedTime = millis
}
/**
 * The value kept in the metadata under [Name.FETCH_TIME_HISTORY], or [defaultValue] when absent.
 */
fun getFetchTimeHistory(defaultValue: String): String =
    metadata[Name.FETCH_TIME_HISTORY] ?: defaultValue
/**
 * The page category parsed from the stored string.
 *
 * Returns [PageCategory.UNKNOWN] when nothing is stored or the stored value cannot be parsed.
 * Only [Exception] is treated as a parse failure, so JVM errors are no longer swallowed.
 */
fun getPageCategory(): PageCategory {
    return try {
        val stored = page.pageCategory ?: return PageCategory.UNKNOWN
        PageCategory.parse(stored.toString())
    } catch (ignored: Exception) {
        // Best effort: an unparsable category is reported as unknown.
        PageCategory.UNKNOWN
    }
}
/**
 * The open page category parsed from the stored string.
 *
 * Returns `OpenPageCategory("", "")` when nothing is stored or the stored value cannot be
 * parsed. Only [Exception] is treated as a parse failure, so JVM errors are no longer swallowed.
 */
fun getOpenPageCategory(): OpenPageCategory {
    return try {
        val stored = page.pageCategory ?: return OpenPageCategory("", "")
        parse(stored.toString())
    } catch (ignored: Exception) {
        // Best effort: an unparsable category is reported as an empty one.
        OpenPageCategory("", "")
    }
}
/**
 * Store the formatted page category.
 * category : index, detail, review, media, search, etc
 *
 * @param pageCategory a [PageCategory] object.
 */
fun setPageCategory(pageCategory: PageCategory) {
    val formatted = pageCategory.format()
    page.pageCategory = formatted
}
/**
 * Store the formatted open page category.
 *
 * @param pageCategory an [OpenPageCategory] object.
 */
fun setPageCategory(pageCategory: OpenPageCategory) {
    val formatted = pageCategory.format()
    page.pageCategory = formatted
}
/**
 * Get the encoding of the content, or null when none is stored.
 * Content encoding is detected just before it's parsed.
 */
fun getEncoding(): String? = page.encoding?.toString()
/**
 * Set the encoding of the content.
 * Content encoding is detected just before it's parsed.
 */
fun setEncoding(encoding: String?) {
    val charsetName = encoding
    page.encoding = charsetName
}
/**
 * Get the encoding of the content, or [defaultEncoding] when none is stored.
 * Content encoding is detected just before it's parsed.
 */
fun getEncodingOrDefault(defaultEncoding: String): String =
    page.encoding?.toString() ?: defaultEncoding
/**
 * The clues used to determine the encoding of the page content; an empty string when absent.
 */
fun getEncodingClues(): String = metadata.getOrDefault(Name.ENCODING_CLUES, "")
/**
 * Store the clues used to determine the encoding of the page content.
 */
fun setEncodingClues(clues: String) {
    metadata.set(Name.ENCODING_CLUES, clues)
}
/**
 * The entire raw document content e.g. raw XHTML.
 *
 * The cached [tmpContent] wins over the persisted content. The volatile field is read exactly
 * once, so a concurrent reset between a null check and the read cannot yield null while
 * persisted content exists.
 *
 * @return The raw document content in [ByteBuffer], or null when there is none.
 */
fun getContent(): ByteBuffer? {
    return tmpContent ?: page.content
}
/**
 * Get the persistent page content, ignoring [tmpContent].
 */
fun getPersistContent(): ByteBuffer? {
    return page.content
}
/**
 * Get content as bytes.
 *
 * When there is no content, the result of `ByteUtils.toBytes('\u0000')` is returned;
 * otherwise the buffer is converted with `ByteUtils.toBytes`.
 * NOTE(review): the previous comment said the underlying buffer is duplicated — confirm
 * against ByteUtils.
 */
fun getContentAsBytes(): ByteArray {
    val buffer = getContent()
    return if (buffer == null) {
        ByteUtils.toBytes('\u0000')
    } else {
        ByteUtils.toBytes(buffer)
    }
}
/**
 * Get the page content as a string; an empty string when there is no content or nothing
 * remains in the buffer.
 *
 * Only the readable window of the buffer is decoded: it starts at
 * `arrayOffset() + position()` and spans `remaining()` bytes. The previous code started at
 * `arrayOffset()` and used `limit()` as the length, which decodes the wrong bytes (or runs past
 * the array) whenever the position is not zero.
 */
fun getContentAsString(): String {
    val buffer = getContent()
    if (buffer == null || buffer.remaining() == 0) {
        return ""
    }
    return String(buffer.array(), buffer.arrayOffset() + buffer.position(), buffer.remaining())
}
/**
 * Get the page content as input stream.
 *
 * The stream covers the readable window of the buffer. When there is no content, it wraps the
 * result of `ByteUtils.toBytes('\u0000')`.
 */
fun getContentAsInputStream(): ByteArrayInputStream {
    val contentInOctets = getContent() ?: return ByteArrayInputStream(ByteUtils.toBytes('\u0000'))
    // Use the one buffer fetched above for the array as well: fetching the content a second
    // time could observe a different (or null) buffer and mix two snapshots.
    return ByteArrayInputStream(
        contentInOctets.array(),
        contentInOctets.arrayOffset() + contentInOctets.position(),
        contentInOctets.remaining()
    )
}
/**
 * Get the page content as sax input source.
 * The source's encoding is set only when the page has one.
 */
fun getContentAsSaxInputSource(): InputSource {
    val source = InputSource(getContentAsInputStream())
    getEncoding()?.let { source.encoding = it }
    return source
}
/**
 * Set the page content from a string; null is forwarded as null content.
 */
fun setContent(value: String?) {
    val bytes: ByteArray? = value?.toByteArray()
    setContent(bytes)
}
/**
 * Set the page content from a byte array; null is forwarded as null content.
 */
fun setContent(value: ByteArray?) {
    val buffer: ByteBuffer? = value?.let { ByteBuffer.wrap(it) }
    setContent(buffer)
}
/**
 * Set the page content.
 *
 * A non-null buffer is stored on the page, the content is flagged as updated and the length
 * statistics are recomputed. A null value clears the persisted content instead.
 *
 * The length is the number of readable bytes in the buffer. The previous code used the size of
 * the backing array, which is larger than the content for sliced or offset buffers and throws
 * for buffers without an accessible array.
 *
 * @param value a ByteBuffer.
 */
fun setContent(value: ByteBuffer?) {
    if (value == null) {
        clearPersistContent()
        return
    }
    page.content = value
    isContentUpdated = true
    val length = value.remaining().toLong()
    computeContentLength(length)
    setPersistedContentLength(length)
}
/**
 * Move the persisted content into [tmpContent], clear it on the page and
 * reset the persisted content length to 0.
 */
fun clearPersistContent() {
    val persisted = page.content
    tmpContent = persisted
    page.content = null
    setPersistedContentLength(0L)
}
/**
 * Get the length of content in bytes; 0 when none is stored.
 *
 * TODO: check consistency with HttpHeaders.CONTENT_LENGTH
 *
 * @return The length of the content in bytes.
 */
fun getContentLength(): Long = page.contentLength ?: 0L
/**
 * Record a new content length: the current length becomes the last length,
 * [bytes] becomes the current one, and the average is updated.
 */
private fun computeContentLength(bytes: Long) {
    // Must run before contentLength is overwritten.
    page.lastContentLength = getContentLength()
    page.contentLength = bytes
    computeAveContentLength(bytes)
}
/**
 * Update the average content length with a new sample of [bytes].
 */
private fun computeAveContentLength(bytes: Long) {
    val fetches = fetchCount
    val previousAverage = page.aveContentLength
    // old version, average bytes is not calculated
    val noAverageYet = fetches > 0 && previousAverage == 0L
    page.aveContentLength = when {
        noAverageYet -> bytes
        else -> (previousAverage * fetches + bytes) / (fetches + 1)
    }
}
/** The persisted content length; 0 when none is stored. */
fun getPersistedContentLength(): Long = page.persistedContentLength ?: 0L
/** Store the persisted content length on the page. */
private fun setPersistedContentLength(bytes: Long) {
    val length = bytes
    page.persistedContentLength = length
}
/** The last content length; 0 when none is stored. */
fun getLastContentLength(): Long = page.lastContentLength ?: 0L
/** The average content length; 0 when none is stored. */
fun getAveContentLength(): Long = page.aveContentLength ?: 0L
/** The content type stored on the page; an empty string when absent. */
fun getContentType(): String {
    return if (page.contentType == null) "" else page.contentType.toString()
}
/**
 * Store the content type, trimmed and lower-cased.
 *
 * Lower-casing uses [Locale.ROOT] so the result does not depend on the default locale
 * (for example, under a Turkish locale an upper-case 'I' would not map to 'i').
 */
fun setContentType(value: String) {
    page.contentType = value.trim { it <= ' ' }.toLowerCase(Locale.ROOT)
}
/** The previous signature stored on the page, or null. */
fun getPrevSignature(): ByteBuffer? {
    return page.prevSignature
}
/** Store the previous signature on the page. */
fun setPrevSignature(value: ByteBuffer?) {
    page.prevSignature = value
}
/**
 * The previous signature formatted with `Strings.toHexString`;
 * an empty buffer is formatted when there is no signature.
 */
fun getPrevSignatureAsString(): String {
    // A local cannot carry the `override` modifier; the elvis operator removes the reassignment.
    val sig = getPrevSignature() ?: ByteBuffer.wrap("".toByteArray())
    return Strings.toHexString(sig)
}
/**
 * The last proxy used to fetch the page, or null when none is stored.
 */
fun getProxy(): String? = page.proxy?.toString()
/**
 * Store the last proxy used to fetch the page.
 */
fun setProxy(proxy: String?) {
    val lastProxy = proxy
    page.proxy = lastProxy
}
/**
 * Convert the stored active DOM status into an [ActiveDOMStatus]; null when none is stored.
 */
fun getActiveDOMStatus(): ActiveDOMStatus? {
    val stored = page.activeDOMStatus
    if (stored == null) {
        return null
    }
    val st = stored.st.toString()
    val r = stored.r.toString()
    val idl = stored.idl.toString()
    val ec = stored.ec.toString()
    return ActiveDOMStatus(stored.n, stored.scroll, st, r, idl, ec)
}
/**
 * Copy the fields of [s] into the active DOM status already stored on the page.
 *
 * Nothing happens when [s] is null or when the page has no stored status object.
 */
fun setActiveDOMStatus(s: ActiveDOMStatus?) {
    val source = s ?: return
    val target = page.activeDOMStatus ?: return
    target.n = source.n
    target.scroll = source.scroll
    target.st = source.st
    target.r = source.r
    target.idl = source.idl
    target.ec = source.ec
}
/**
 * The stored active DOM stat trace, with keys turned into strings and values converted
 * with `convert`.
 *
 * The generic arguments and lambdas of the previous body were garbled; this version rebuilds
 * the mapping with `associate`.
 * NOTE(review): the type arguments are reconstructed from the lambdas — confirm against callers.
 */
fun getActiveDOMStatTrace(): Map<String, ActiveDOMStat> {
    val stored = page.activeDOMStatTrace
    return stored.entries.associate { (key, stat) -> key.toString() to convert(stat) }
}
/**
 * Store an active DOM stat trace on the page, converting every value with `convert`.
 *
 * The generic arguments and lambdas of the previous body were garbled; this version rebuilds
 * the mapping with `associate`.
 * NOTE(review): the type arguments are reconstructed — confirm against GWebPage and callers.
 */
fun setActiveDOMStatTrace(trace: Map<String, ActiveDOMStat>) {
    val statTrace: Map<CharSequence, GActiveDOMStat> =
        trace.entries.associate { (key, stat) -> key to convert(stat) }
    page.activeDOMStatTrace = statTrace
}
/**
 * An implementation of a WebPage's signature from which it can be identified and referenced at
 * any point in time. This is essentially the WebPage's fingerprint representing its state for
 * any point in time.
 */
fun getSignature(): ByteBuffer? {
    return page.signature
}
/**
 * Store the signature on the page.
 *
 * A null array clears the signature; wrapping null would throw.
 */
fun setSignature(value: ByteArray?) {
    page.signature = value?.let { ByteBuffer.wrap(it) }
}
/**
 * The signature formatted with `Strings.toHexString`;
 * an empty buffer is formatted when there is no signature.
 */
fun getSignatureAsString(): String {
    // A local cannot carry the `override` modifier; the elvis operator removes the reassignment.
    val sig = getSignature() ?: ByteBuffer.wrap("".toByteArray())
    return Strings.toHexString(sig)
}
/** The page title; an empty string when absent. */
fun getPageTitle(): String = page.pageTitle?.toString() ?: ""
/** Store the page title; null is stored as is. */
fun setPageTitle(pageTitle: String?) {
    val title = pageTitle
    page.pageTitle = title
}
/** The content title; an empty string when absent. */
fun getContentTitle(): String = page.contentTitle?.toString() ?: ""
/** Store the content title; a null argument is ignored. */
fun setContentTitle(contentTitle: String?) {
    val title = contentTitle ?: return
    page.contentTitle = title
}
/** The page text; an empty string when absent. */
fun getPageText(): String = page.pageText?.toString() ?: ""
/** Store the page text; null or empty arguments are ignored. */
fun setPageText(value: String?) {
    val text = value?.takeIf { it.isNotEmpty() } ?: return
    page.pageText = text
}
/** The content text; an empty string when absent. */
fun getContentText(): String = page.contentText?.toString() ?: ""
/** Store the content text and its length; null or empty arguments are ignored. */
fun setContentText(textContent: String?) {
    val text = textContent?.takeIf { it.isNotEmpty() } ?: return
    page.contentText = text
    page.contentTextLen = text.length
}
/** The stored length of the content text. */
fun getContentTextLen(): Int = page.contentTextLen
/**
 * The parse status; when the page has none, a freshly built empty [GParseStatus] is boxed.
 */
fun getParseStatus(): ParseStatus {
    val stored = page.parseStatus
    val effective = if (stored != null) stored else GParseStatus.newBuilder().build()
    return ParseStatus.box(effective)
}
/** Store the unboxed parse status on the page. */
fun setParseStatus(parseStatus: ParseStatus) {
    val unboxed = parseStatus.unbox()
    page.parseStatus = unboxed
}
/**
 * The live links map stored on the underlying page, returned as is.
 *
 * NOTE(review): the return type has lost its type arguments (bare `Map`);
 * restore them from the GWebPage declaration.
 */
fun getLiveLinks(): Map {
return page.liveLinks
}
/**
 * The keys of the live links, each converted to a string.
 *
 * The return type had lost its type argument; the elements are strings because every key is
 * converted with `toString()`.
 */
fun getSimpleLiveLinks(): Collection<String> {
    return page.liveLinks.keys.map { it.toString() }
}
/**
 * Replace the live links: the stored map is cleared, then every link is added under its url
 * in unboxed form.
 *
 * The parameter had lost its type argument; the element type is [HyperlinkPersistable].
 */
fun setLiveLinks(liveLinks: Iterable<HyperlinkPersistable>) {
    page.liveLinks.clear()
    val links = page.liveLinks
    for (link in liveLinks) {
        links[link.url] = link.unbox()
    }
}
/**
 * Replace the live links map stored on the underlying page with [links].
 *
 * NOTE(review): the parameter type has lost its type arguments (bare `Map?`);
 * restore them from the GWebPage declaration.
 */
fun setLiveLinks(links: Map?) {
page.liveLinks = links
}
/**
 * Add one live link, keyed by its url and stored in unboxed form.
 */
fun addLiveLink(hyperLink: HyperlinkPersistable) {
    val links = page.liveLinks
    links[hyperLink.url] = hyperLink.unbox()
}
/**
 * The vivid links map stored on the underlying page, returned as is.
 *
 * NOTE(review): the return type has lost its type arguments (bare `Map`);
 * restore them from the GWebPage declaration.
 */
fun getVividLinks(): Map {
return page.vividLinks
}
/**
 * The keys of the vivid links, each converted to a string.
 *
 * The return type had lost its type argument; the elements are strings because every key is
 * converted with `toString()`.
 */
fun getSimpleVividLinks(): Collection<String> {
    return page.vividLinks.keys.map { it.toString() }
}
/**
 * Replace the vivid links map stored on the underlying page with [links].
 *
 * NOTE(review): `Map?`, `List` and `List?` in this group have lost their type arguments;
 * restore them from the GWebPage declaration.
 */
fun setVividLinks(links: Map?) {
page.vividLinks = links
}
/** The dead links list stored on the underlying page, returned as is. */
fun getDeadLinks(): List {
return page.deadLinks
}
/** Replace the dead links list stored on the underlying page. */
fun setDeadLinks(deadLinks: List?) {
page.deadLinks = deadLinks
}
/** The links list stored on the underlying page, returned as is. */
fun getLinks(): List {
return page.links
}
/** Replace the links list stored on the underlying page. */
fun setLinks(links: List?) {
page.links = links
}
/**
 * The value kept in the metadata under [Name.TOTAL_OUT_LINKS], parsed as an int;
 * 0 when absent or not a number.
 */
fun getImpreciseLinkCount(): Int =
    NumberUtils.toInt(metadata.getOrDefault(Name.TOTAL_OUT_LINKS, "0"), 0)
/**
 * Store [count] in the metadata under [Name.TOTAL_OUT_LINKS] as a string.
 */
fun setImpreciseLinkCount(count: Int) {
    val text = count.toString()
    metadata[Name.TOTAL_OUT_LINKS] = text
}
/**
 * The inlinks map stored on the underlying page, returned as is.
 *
 * NOTE(review): the return type has lost its type arguments (bare `Map`);
 * restore them from the GWebPage declaration.
 */
fun getInlinks(): Map {
return page.inlinks
}
/** The anchor stored on the page; an empty string when absent. */
fun getAnchor(): CharSequence = page.anchor ?: ""
/** Store the anchor on the page. */
fun setAnchor(anchor: CharSequence?) {
    val text = anchor
    page.anchor = text
}
/**
 * The inlink anchors kept in the metadata under [Name.ANCHORS], split on newlines.
 *
 * The return type had lost its type argument; `StringUtils.split` yields strings.
 */
fun getInlinkAnchors(): Array<String> {
    return StringUtils.split(metadata.getOrDefault(Name.ANCHORS, ""), "\n")
}
/**
 * Store the inlink anchors in the metadata under [Name.ANCHORS], joined with newlines.
 *
 * NOTE(review): the parameter had lost its type argument; `CharSequence` is assumed —
 * confirm against callers.
 */
fun setInlinkAnchors(anchors: Collection<CharSequence>?) {
    metadata[Name.ANCHORS] = StringUtils.join(anchors, "\n")
}
/**
 * The anchor order; a negative stored value is reported as
 * [AppConstants.MAX_LIVE_LINK_PER_PAGE].
 */
fun getAnchorOrder(): Int =
    page.anchorOrder.takeIf { it >= 0 } ?: AppConstants.MAX_LIVE_LINK_PER_PAGE
/** Store the anchor order on the page. */
fun setAnchorOrder(order: Int) {
    val anchorOrder = order
    page.anchorOrder = anchorOrder
}
/** The content publish time, converted from the stored epoch milliseconds. */
fun getContentPublishTime(): Instant = Instant.ofEpochMilli(page.contentPublishTime)
/** Store the content publish time as epoch milliseconds. */
fun setContentPublishTime(publishTime: Instant) {
    val millis = publishTime.toEpochMilli()
    page.contentPublishTime = millis
}
/** True when [publishTime] is after [AppConstants.MIN_ARTICLE_PUBLISH_TIME]. */
fun isValidContentModifyTime(publishTime: Instant): Boolean =
    publishTime.isAfter(AppConstants.MIN_ARTICLE_PUBLISH_TIME)
/** The previous content publish time, converted from the stored epoch milliseconds. */
fun getPrevContentPublishTime(): Instant = Instant.ofEpochMilli(page.prevContentPublishTime)
/** Store the previous content publish time as epoch milliseconds. */
fun setPrevContentPublishTime(publishTime: Instant) {
    val millis = publishTime.toEpochMilli()
    page.prevContentPublishTime = millis
}
/** The ref content publish time, converted from the stored epoch milliseconds. */
fun getRefContentPublishTime(): Instant = Instant.ofEpochMilli(page.refContentPublishTime)
/** Store the ref content publish time as epoch milliseconds. */
fun setRefContentPublishTime(publishTime: Instant) {
    val millis = publishTime.toEpochMilli()
    page.refContentPublishTime = millis
}
/** The content modified time, converted from the stored epoch milliseconds. */
fun getContentModifiedTime(): Instant = Instant.ofEpochMilli(page.contentModifiedTime)
/** Store the content modified time as epoch milliseconds. */
fun setContentModifiedTime(modifiedTime: Instant) {
    val millis = modifiedTime.toEpochMilli()
    page.contentModifiedTime = millis
}
/** The previous content modified time, converted from the stored epoch milliseconds. */
fun getPrevContentModifiedTime(): Instant = Instant.ofEpochMilli(page.prevContentModifiedTime)
/** Store the previous content modified time as epoch milliseconds. */
fun setPrevContentModifiedTime(modifiedTime: Instant) {
    val millis = modifiedTime.toEpochMilli()
    page.prevContentModifiedTime = millis
}
/** The previous ref content publish time, converted from the stored epoch milliseconds. */
fun getPrevRefContentPublishTime(): Instant = Instant.ofEpochMilli(page.prevRefContentPublishTime)
/** Store the previous ref content publish time as epoch milliseconds. */
fun setPrevRefContentPublishTime(publishTime: Instant) {
    val millis = publishTime.toEpochMilli()
    page.prevRefContentPublishTime = millis
}
/** The referrer stored on the page, or null when absent. */
fun getReferrer(): String? = page.referrer?.toString()
/** Store the referrer; values rejected by `isStandard` are ignored. */
fun setReferrer(referrer: String?) {
    if (!isStandard(referrer)) {
        return
    }
    page.referrer = referrer
}
/**
* *****************************************************************************
* Page Model
* ******************************************************************************
*/
/** The page model update time, converted from the stored epoch milliseconds. */
fun getPageModelUpdateTime(): Instant? = Instant.ofEpochMilli(page.pageModelUpdateTime)
/** Store the page model update time as epoch milliseconds; null is stored as 0. */
fun setPageModelUpdateTime(time: Instant?) {
    page.pageModelUpdateTime = if (time != null) time.toEpochMilli() else 0
}
/** The boxed page model, or null when the page has none. */
fun getPageModel(): PageModel? {
    if (page.pageModel == null) {
        return null
    }
    return box(page.pageModel)
}
/** The boxed page model; an empty [GPageModel] is created and stored first when missing. */
fun ensurePageModel(): PageModel {
    val missing = page.pageModel == null
    if (missing) {
        page.pageModel = GPageModel.newBuilder().build()
    }
    return getPageModel()!!
}
/**
* *****************************************************************************
* Scoring
* ******************************************************************************
*/
/** The score stored on the page. */
fun getScore(): Float = page.score
/** Store the score on the page. */
fun setScore(value: Float) {
    val score = value
    page.score = score
}
/** The content score stored on the page; 0 when absent. */
fun getContentScore(): Float = page.contentScore ?: 0.0f
/** Store the content score on the page. */
fun setContentScore(score: Float) {
    val contentScore = score
    page.contentScore = contentScore
}
/** The sort score stored on the page; an empty string when absent. */
fun getSortScore(): String = page.sortScore?.toString() ?: ""
/** Store the sort score on the page. */
fun setSortScore(score: String?) {
    val sortScore = score
    page.sortScore = sortScore
}
/** The value kept in the metadata under [Name.CASH_KEY]; 0 when absent. */
fun getCash(): Float = metadata.getFloat(Name.CASH_KEY, 0.0f)
/** Store [cash] in the metadata under [Name.CASH_KEY] as a string. */
fun setCash(cash: Float) {
    val text = cash.toString()
    metadata[Name.CASH_KEY] = text
}
/** The page counters stored on the page, boxed. */
fun getPageCounters(): PageCounters = PageCounters.box(page.pageCounters)
/**
* *****************************************************************************
* Index
* ******************************************************************************
*/
/** The hash code of the url. */
override fun hashCode(): Int = url.hashCode()
/** Two pages are equal when they are the same object or have equal urls. */
override fun equals(other: Any?): Boolean {
    if (this === other) {
        return true
    }
    return other is MutableWebPage && other.url == url
}
/**
 * A short description of the page made of its url.
 *
 * The previous body was `TODO(...)`, which throws whenever the page is logged, concatenated
 * into a message or inspected in a debugger.
 */
override fun toString(): String {
    return "MutableWebPage(url=$url)"
}
companion object {
/** A counter; it is not used anywhere in the visible part of this file. */
private val SEQUENCER: AtomicInteger = AtomicInteger()
/** The internal page created for [AppConstants.NIL_PAGE_URL], with id 0. */
val NIL: MutableWebPage = newInternalPage(AppConstants.NIL_PAGE_URL, 0, "nil", "nil")
/**
 * Create a new page for [url] with the given configuration and optional [href].
 */
@JvmOverloads
fun newWebPage(url: String, conf: VolatileConfig, href: String? = null): MutableWebPage =
    newWebPageInternal(url, conf, href)
/**
 * Build the page and initialize its location, configuration, href, crawl status,
 * create and modified times, score and fetch count.
 */
private fun newWebPageInternal(url: String, conf: VolatileConfig, href: String?): MutableWebPage {
    val created = MutableWebPage(url, GWebPage.newBuilder().build(), false, conf)
    created.setLocation(url)
    created.conf = conf
    created.href = href
    created.crawlStatus = CrawlStatus.STATUS_UNFETCHED
    created.createTime = Instant.now()
    created.setModifiedTime(Instant.now())
    created.setScore(0f)
    created.fetchCount = 0
    return created
}
/**
 * Create an internal page without an explicit id (-1 is passed on).
 */
@JvmOverloads
fun newInternalPage(url: String, title: String = "internal", content: String = "internal"): MutableWebPage =
    newInternalPage(url, -1, title, content)
/**
 * Create an internal page.
 *
 * The id is assigned only when it is not negative. The page is given epoch-based times,
 * a century-long fetch interval, the minimal fetch priority, an infinite distance and the
 * INTERNAL and INACTIVE marks.
 */
fun newInternalPage(url: String, id: Int, title: String, content: String): MutableWebPage {
    val internal = newWebPage(url, UNSAFE)
    if (id >= 0) {
        internal.id = id
    }
    val century = ChronoUnit.CENTURIES.duration
    internal.setLocation(url)
    internal.setModifiedTime(Instant.EPOCH)
    internal.setPrevFetchTime(Instant.EPOCH)
    internal.setFetchTime(Instant.EPOCH.plus(century))
    internal.setFetchInterval(century)
    internal.fetchPriority = AppConstants.FETCH_PRIORITY_MIN
    internal.crawlStatus = CrawlStatus.STATUS_UNFETCHED
    internal.distance = AppConstants.DISTANCE_INFINITE // or -1?
    internal.marks.put(Mark.INTERNAL, AppConstants.YES_STRING)
    internal.marks.put(Mark.INACTIVE, AppConstants.YES_STRING)
    internal.setPageTitle(title)
    internal.setContent(content)
    return internal
}
/**
 * Initialize a WebPage with the underlying GWebPage instance, given both the url and
 * the reversed url.
 */
fun box(
    url: String, reversedUrl: String, page: GWebPage, conf: VolatileConfig
): MutableWebPage = MutableWebPage(url, reversedUrl, page, conf)
/**
 * Initialize a WebPage with the underlying GWebPage instance; the url is passed on as
 * not reversed.
 */
fun box(url: String, page: GWebPage, conf: VolatileConfig): MutableWebPage =
    box(url, page, false, conf)
/**
 * Initialize a WebPage with the underlying GWebPage instance; [urlReversed] is passed to
 * the constructor unchanged.
 */
fun box(
    url: String, page: GWebPage, urlReversed: Boolean, conf: VolatileConfig
): MutableWebPage = MutableWebPage(url, page, urlReversed, conf)
/** The [Utf8] key for [mark], built from the mark's value. */
fun wrapKey(mark: Mark): Utf8 = u8(mark.value())!!
/** Wrap [value] in a [Utf8]; null stays null. */
fun u8(value: String?): Utf8? {
    // TODO: return new Utf8.EMPTY?
    return value?.let { Utf8(it) }
}
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy