All Downloads are FREE. Search and download functionalities are using the official Maven repository.

ai.platon.pulsar.persist.experimental.MutableWebPage.kt.txt Maven / Gradle / Ivy

/*******************************************************************************
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package ai.platon.pulsar.persist.experimental

import ai.platon.pulsar.common.DateTimes
import ai.platon.pulsar.common.HtmlIntegrity
import ai.platon.pulsar.common.PulsarParams
import ai.platon.pulsar.common.Strings
import ai.platon.pulsar.common.browser.BrowserType
import ai.platon.pulsar.common.config.AppConstants
import ai.platon.pulsar.common.config.VolatileConfig
import ai.platon.pulsar.common.config.VolatileConfig.Companion.UNSAFE
import ai.platon.pulsar.common.urls.UrlUtils.isStandard
import ai.platon.pulsar.common.urls.UrlUtils.reverseUrlOrEmpty
import ai.platon.pulsar.common.urls.UrlUtils.unreverseUrl
import ai.platon.pulsar.persist.*
import ai.platon.pulsar.persist.gora.generated.*
import ai.platon.pulsar.persist.metadata.*
import ai.platon.pulsar.persist.metadata.OpenPageCategory.Companion.parse
import ai.platon.pulsar.persist.model.ActiveDOMStat
import ai.platon.pulsar.persist.model.ActiveDOMStatus
import ai.platon.pulsar.persist.model.Converters.convert
import ai.platon.pulsar.persist.model.PageModel
import ai.platon.pulsar.persist.model.PageModel.Companion.box
import ai.platon.pulsar.persist.model.WebPageFormatter
import org.apache.avro.util.Utf8
import org.apache.commons.collections4.CollectionUtils
import org.apache.commons.lang3.StringUtils
import org.apache.commons.lang3.math.NumberUtils
import org.apache.gora.util.ByteUtils
import org.xml.sax.InputSource
import java.io.ByteArrayInputStream
import java.nio.ByteBuffer
import java.time.Duration
import java.time.Instant
import java.time.ZoneId
import java.time.temporal.ChronoUnit
import java.util.*
import java.util.concurrent.atomic.AtomicInteger
import java.util.function.Consumer
import java.util.function.Function
import java.util.stream.Collectors

/**
 * A mutable view over a persisted [GWebPage] record.
 *
 * Most accessors read from or write to the wrapped `page`; some values are kept in the
 * `metadata` and `variables` containers, which this class uses but does not declare.
 *
 * NOTE(review): several signatures below still use bare `Map` and `List` types. Their type
 * arguments appear to have been lost before this file was captured; they are restored only
 * where the code in this class establishes them. Fill in the rest from the `GWebPage` schema.
 */
class MutableWebPage(
    page: GWebPage
) : KWebPage(page) {

    /**
     * Web page scope configuration.
     *
     * NOTE(review): no initializer is visible for this property — confirm how [KWebPage]
     * expects it to be supplied (constructor argument, `lateinit`, ...).
     */
    override var conf: VolatileConfig

    /**
     * Cache flag of this page.
     *
     * NOTE(review): the original comment was a copy of the one on [isFetched]; presumably this
     * marks a page that was served from a cache — confirm.
     */
    override var isCached = false

    /**
     * If this page is loaded from database or is created and fetched from the web
     */
    override var isLoaded = false

    /**
     * If this page is fetched from internet
     */
    override var isFetched = false

    /**
     * If this page is canceled. If a page is canceled, it remains unchanged.
     */
    override var isCanceled = false

    /**
     * If this page is fetched and updated. Set to `true` by [setContent] when a non-null
     * buffer is stored.
     */
    @Volatile
    override var isContentUpdated = false
        private set

    /**
     * The cached content. Setting it keeps the persisted page content unmodified.
     */
    @Volatile
    override var tmpContent: ByteBuffer? = null

    /**
     * The delay time to retry if a retry is needed
     */
    override var retryDelay = Duration.ZERO

    /**
     * The hypertext reference of this page.
     * It defines the address of the document, which this time is linked from.
     *
     * TODO: use a separate field for href
     */
    override var href: String?
        get() = metadata[Name.HREF]
        set(href) {
            metadata[Name.HREF] = href
        }

    /**
     * Remove a page scope temporary variable.
     *
     * NOTE(review): the return type is non-null although a missing variable presumably yields
     * `null` from `variables.remove` — confirm and consider `Any?`.
     *
     * @param name The variable name.
     * @return whatever `variables.remove(name)` returns.
     */
    fun removeVar(name: String): Any {
        return variables.remove(name)
    }

    /**
     * Set a page scope temporary variable
     *
     * @param name  The variable name.
     * @param value The variable value.
     */
    fun setVar(name: String, value: Any) {
        variables[name] = value
    }

    /**
     * Check whether `page.markers` holds an entry for [mark].
     */
    fun hasMark(mark: Mark): Boolean = page.markers[wrapKey(mark)] != null

    /**
     * The load arguments.
     *
     * The load arguments vary task by task, so the local version is the first choice,
     * while the persisted version is used for historical checks only.
     *
     * Setting this property removes the cached load options variable and stores the value in
     * `page.params`. The underlying field does not use the name 'args' since it exists already
     * with another gora type, see GProtocolStatus.args and GParseStatus.args.
     *
     * NOTE(review): no getter or initializer is visible here; presumably reads should come
     * from `page.params` — confirm against [KWebPage].
     */
    override var args: String
        set(args) {
            variables.remove(PulsarParams.VAR_LOAD_OPTIONS)
            page.params = args
        }

    /**
     * The maximum number of fetch retries, stored in the metadata under [Name.FETCH_MAX_RETRY].
     *
     * NOTE(review): no getter or initializer is visible here — confirm the intended default.
     */
    override var maxRetries: Int
        set(maxRetries) {
            metadata[Name.FETCH_MAX_RETRY] = maxRetries
        }

    /** The fetched link count kept in the metadata; `0` when absent. */
    override var fetchedLinkCount: Int
        get() = metadata.getInt(Name.FETCHED_LINK_COUNT, 0)
        set(count) {
            metadata[Name.FETCHED_LINK_COUNT] = count
        }

    /** The zone id of the page; falls back to [DateTimes.zoneId] when none is stored. */
    override var zoneId: ZoneId
        get() = page.zoneId?.let { ZoneId.of(it.toString()) } ?: DateTimes.zoneId
        set(zoneId) {
            page.zoneId = zoneId.id
        }

    /** The batch id; the getter yields an empty string when none is stored. */
    override var batchId: String?
        get() = page.batchId?.toString() ?: ""
        set(value) {
            page.batchId = value
        }

    /** Flag this page as a seed in the metadata. */
    fun markSeed() {
        metadata[Name.IS_SEED] = AppConstants.YES_STRING
    }

    /** Remove the seed flag from the metadata. */
    fun unmarkSeed() {
        metadata.remove(Name.IS_SEED)
    }

    /** Whether the metadata contains the seed flag. */
    val isSeed: Boolean
        get() = metadata.contains(Name.IS_SEED)

    /** The stored distance; a negative stored value reads as [AppConstants.DISTANCE_INFINITE]. */
    override var distance: Int
        get() {
            val stored = page.distance
            return if (stored < 0) AppConstants.DISTANCE_INFINITE else stored
        }
        set(newDistance) {
            page.distance = newDistance
        }

    /**
     * Fetch mode is used to determine the protocol before fetch, so it shall be set before fetch
     */
    override var fetchMode: FetchMode
        get() = FetchMode.fromString(metadata[Name.FETCH_MODE])
        set(mode) {
            metadata[Name.FETCH_MODE] = mode.name
        }

    /** The browser recorded on the page; an unset value is parsed from an empty string. */
    override var lastBrowser: BrowserType
        get() = BrowserType.fromString(page.browser?.toString() ?: "")
        set(browser) {
            page.browser = browser.name
        }

    /**
     * Whether `page.resource` is set. The setter only ever sets the flag: assigning `false`
     * leaves the stored value untouched.
     */
    override var isResource: Boolean
        get() = page.resource != null
        set(resource) {
            if (resource) {
                page.resource = 1
            }
        }

    /** The html integrity recorded on the page; an unset value is parsed from an empty string. */
    override var htmlIntegrity: HtmlIntegrity
        get() = HtmlIntegrity.fromString(page.htmlIntegrity?.toString() ?: "")
        set(integrity) {
            page.htmlIntegrity = integrity.name
        }

    /** The fetch priority; non-positive stored values read as [AppConstants.FETCH_PRIORITY_DEFAULT]. */
    override var fetchPriority: Int
        get() = if (page.fetchPriority > 0) page.fetchPriority else AppConstants.FETCH_PRIORITY_DEFAULT
        set(priority) {
            page.fetchPriority = priority
        }

    /**
     * Compute a fetch priority from [fetchPriority] and [distance] without storing it.
     *
     * When the distance is below [AppConstants.FETCH_PRIORITY_DEPTH_BASE], the result is the
     * larger of the current priority and `FETCH_PRIORITY_DEPTH_BASE - distance`.
     */
    fun sniffFetchPriority(): Int {
        val priority = fetchPriority
        val depth = distance
        return if (depth < AppConstants.FETCH_PRIORITY_DEPTH_BASE) {
            maxOf(priority, AppConstants.FETCH_PRIORITY_DEPTH_BASE - depth)
        } else {
            priority
        }
    }

    /** The create time, persisted as epoch milliseconds. */
    override var createTime: Instant
        get() = Instant.ofEpochMilli(page.createTime)
        set(createTime) {
            page.createTime = createTime.toEpochMilli()
        }

    /** The generate time kept in the metadata as text; [Instant.EPOCH] when absent. */
    override var generateTime: Instant
        get() = metadata[Name.GENERATE_TIME]?.let { Instant.parse(it) } ?: Instant.EPOCH
        set(generateTime) {
            metadata[Name.GENERATE_TIME] = generateTime.toString()
        }

    /** The fetch count stored on the page. */
    override var fetchCount: Int
        get() = page.fetchCount
        set(count) {
            page.fetchCount = count
        }

    /** Increase [fetchCount] by one. */
    fun updateFetchCount() {
        fetchCount = fetchCount + 1
    }

    /** The crawl status, persisted as its numeric code. */
    override var crawlStatus: CrawlStatus
        get() = CrawlStatus(page.crawlStatus.toByte())
        set(crawlStatus) {
            page.crawlStatus = crawlStatus.code
        }

    /** Store a raw crawl status code. */
    fun setCrawlStatus(value: Int) {
        page.crawlStatus = value
    }

    /**
     * The baseUrl is as the same as Location
     *
     * A baseUrl has the same semantic with Jsoup.parse:
     *
     * @return The stored base url, or an empty string when none is stored.
     * @link {https://jsoup.org/apidocs/org/jsoup/Jsoup.html#parse-java.io.File-java.lang.String-java.lang.String-}
     * @see MutableWebPage.location
     */
    val baseUrl: String
        get() = page.baseUrl?.toString() ?: ""

    /**
     * WebPage.url is the permanent internal address, it might not still available to access the target.
     * And WebPage.location or WebPage.baseUrl is the last working address, it might redirect to url,
     * or it might have additional random parameters.
     * WebPage.location may be different from url, it's generally normalized.
     */
    val location: String get() = baseUrl

    /**
     * The url is the permanent internal address, it might not still available to access the target.
     *
     * Location is the last working address, it might redirect to url, or it might have additional random parameters.
     *
     * Location may be different from url, it's generally normalized.
     *
     * @param location The location.
     */
    fun setLocation(location: String) {
        page.baseUrl = location
    }

    /**
     * The latest fetch time
     *
     * @return The latest fetch time
     */
    fun getFetchTime(): Instant = Instant.ofEpochMilli(page.fetchTime)

    /**
     * The latest fetch time
     *
     * @param time The latest fetch time
     */
    fun setFetchTime(time: Instant) {
        page.fetchTime = time.toEpochMilli()
    }

    /**
     * The previous fetch time, updated at the fetch stage
     *
     * @return The previous fetch time.
     */
    fun getPrevFetchTime(): Instant = Instant.ofEpochMilli(page.prevFetchTime)

    /** Store the previous fetch time as epoch milliseconds. */
    fun setPrevFetchTime(time: Instant) {
        page.prevFetchTime = time.toEpochMilli()
    }

    /**
     * The previous crawl time, used for fat link crawl, which means both the page itself and out pages are fetched
     */
    fun getPrevCrawlTime1(): Instant = Instant.ofEpochMilli(page.prevCrawlTime1)

    /**
     * The previous crawl time, used for fat link crawl, which means both the page itself and out pages are fetched
     */
    fun setPrevCrawlTime1(time: Instant) {
        page.prevCrawlTime1 = time.toEpochMilli()
    }

    /**
     * Get fetch interval. A negative stored value reads as one century.
     */
    fun getFetchInterval(): Duration {
        val stored = page.fetchInterval.toLong()
        val seconds = if (stored < 0) ChronoUnit.CENTURIES.duration.seconds else stored
        return Duration.ofSeconds(seconds)
    }

    /**
     * Set fetch interval
     *
     * NOTE(review): the seconds are narrowed with `toInt()`, so durations above
     * `Int.MAX_VALUE` seconds wrap around; one century wraps to a negative value, which
     * [getFetchInterval] then reads back as one century. Confirm whether clamping is wanted.
     */
    fun setFetchInterval(duration: Duration) {
        page.fetchInterval = duration.seconds.toInt()
    }

    /**
     * Set fetch interval in seconds
     */
    fun setFetchInterval(seconds: Long) {
        page.fetchInterval = seconds.toInt()
    }

    /**
     * Set fetch interval in seconds, rounded to the nearest integer
     */
    fun setFetchInterval(seconds: Float) {
        page.fetchInterval = Math.round(seconds)
    }

    /**
     * Get protocol status. When none is stored, a freshly built default status is boxed;
     * it is not written back to the page.
     */
    fun getProtocolStatus(): ProtocolStatus {
        val stored = page.protocolStatus ?: GProtocolStatus.newBuilder().build()
        return ProtocolStatus.box(stored)
    }

    /**
     * Set protocol status
     */
    fun setProtocolStatus(protocolStatus: ProtocolStatus) {
        page.protocolStatus = protocolStatus.unbox()
    }

    /**
     * Header information returned from the web server used to server the content which is subsequently fetched from.
     * This includes keys such as
     * TRANSFER_ENCODING,
     * CONTENT_ENCODING,
     * CONTENT_LANGUAGE,
     * CONTENT_LENGTH,
     * CONTENT_LOCATION,
     * CONTENT_DISPOSITION,
     * CONTENT_MD5,
     * CONTENT_TYPE,
     * LAST_MODIFIED
     * and LOCATION.
     */
    fun getHeaders(): ProtocolHeaders = ProtocolHeaders.box(page.headers)

    /** The stored repr url, or an empty string when none is stored. */
    fun getReprUrl(): String = page.reprUrl?.toString() ?: ""

    /** Store the repr url. */
    fun setReprUrl(value: String) {
        page.reprUrl = value
    }

    /** The stored fetch retry counter. */
    fun getFetchRetries(): Int = page.fetchRetries

    /** Store the fetch retry counter. */
    fun setFetchRetries(value: Int) {
        page.fetchRetries = value
    }

    /** The modified time, persisted as epoch milliseconds. */
    fun getModifiedTime(): Instant = Instant.ofEpochMilli(page.modifiedTime)

    /** Store the modified time as epoch milliseconds. */
    fun setModifiedTime(value: Instant) {
        page.modifiedTime = value.toEpochMilli()
    }

    /** The previous modified time, persisted as epoch milliseconds. */
    fun getPrevModifiedTime(): Instant = Instant.ofEpochMilli(page.prevModifiedTime)

    /** Store the previous modified time as epoch milliseconds. */
    fun setPrevModifiedTime(value: Instant) {
        page.prevModifiedTime = value.toEpochMilli()
    }

    /** The fetch time history kept in the metadata, or [defaultValue] when absent. */
    fun getFetchTimeHistory(defaultValue: String): String {
        return metadata[Name.FETCH_TIME_HISTORY] ?: defaultValue
    }

    /**
     * The page category parsed from `page.pageCategory`.
     *
     * @return The parsed category, or [PageCategory.UNKNOWN] when nothing is stored or the
     * stored text cannot be parsed.
     */
    fun getPageCategory(): PageCategory {
        return try {
            page.pageCategory?.let { PageCategory.parse(it.toString()) } ?: PageCategory.UNKNOWN
        } catch (ignored: Exception) {
            // Best effort: an unparsable category is reported as unknown. JVM errors are no
            // longer swallowed here.
            PageCategory.UNKNOWN
        }
    }

    /**
     * The open page category parsed from `page.pageCategory`.
     *
     * @return The parsed category, or `OpenPageCategory("", "")` when nothing is stored or
     * the stored text cannot be parsed.
     */
    fun getOpenPageCategory(): OpenPageCategory {
        return try {
            page.pageCategory?.let { OpenPageCategory.parse(it.toString()) } ?: OpenPageCategory("", "")
        } catch (ignored: Exception) {
            // Best effort, same policy as getPageCategory().
            OpenPageCategory("", "")
        }
    }

    /**
     * category : index, detail, review, media, search, etc
     *
     * @param pageCategory a [PageCategory] object.
     */
    fun setPageCategory(pageCategory: PageCategory) {
        page.pageCategory = pageCategory.format()
    }

    /** Store the formatted text of an [OpenPageCategory]. */
    fun setPageCategory(pageCategory: OpenPageCategory) {
        page.pageCategory = pageCategory.format()
    }

    /**
     * Get the encoding of the content.
     * Content encoding is detected just before it's parsed.
     */
    fun getEncoding(): String? = page.encoding?.toString()

    /**
     * Set the encoding of the content.
     * Content encoding is detected just before it's parsed.
     */
    fun setEncoding(encoding: String?) {
        page.encoding = encoding
    }

    /**
     * Get the encoding of the content, or [defaultEncoding] when none is stored.
     * Content encoding is detected just before it's parsed.
     */
    fun getEncodingOrDefault(defaultEncoding: String): String {
        return page.encoding?.toString() ?: defaultEncoding
    }

    /**
     * The clues are used to determine the encoding of the page content
     */
    fun getEncodingClues(): String = metadata.getOrDefault(Name.ENCODING_CLUES, "")

    /**
     * The clues are used to determine the encoding of the page content
     */
    fun setEncodingClues(clues: String) {
        metadata[Name.ENCODING_CLUES] = clues
    }

    /**
     * The entire raw document content e.g. raw XHTML
     *
     * @return [tmpContent] when it is set, otherwise the persisted `page.content`.
     */
    fun getContent(): ByteBuffer? = tmpContent ?: page.content

    /**
     * Get the persistent page content
     */
    fun getPersistContent(): ByteBuffer? = page.content

    /**
     * Get content as bytes, the underling buffer is duplicated
     *
     * @return The result of `ByteUtils.toBytes` on the content, or on the NUL character when
     * there is no content.
     */
    fun getContentAsBytes(): ByteArray {
        val content = getContent() ?: return ByteUtils.toBytes('\u0000')
        return ByteUtils.toBytes(content)
    }

    /**
     * Get the page content as a string, if the underlying page content is null, return an empty string
     */
    fun getContentAsString(): String {
        val buffer = getContent()
        if (buffer == null || !buffer.hasRemaining()) {
            return ""
        }
        // Decode exactly the readable window of the buffer, the same window that
        // getContentAsInputStream() exposes.
        return String(buffer.array(), buffer.arrayOffset() + buffer.position(), buffer.remaining())
    }

    /**
     * Get the page content as input stream
     */
    fun getContentAsInputStream(): ByteArrayInputStream {
        // Read the content once: tmpContent is volatile and may change between two reads.
        val contentInOctets = getContent() ?: return ByteArrayInputStream(ByteUtils.toBytes('\u0000'))
        return ByteArrayInputStream(
            contentInOctets.array(),
            contentInOctets.arrayOffset() + contentInOctets.position(),
            contentInOctets.remaining()
        )
    }

    /**
     * Get the page content as sax input source
     */
    fun getContentAsSaxInputSource(): InputSource {
        val inputSource = InputSource(getContentAsInputStream())
        getEncoding()?.let { inputSource.encoding = it }
        return inputSource
    }

    /**
     * Set the page content
     */
    fun setContent(value: String?) {
        setContent(value?.toByteArray())
    }

    /**
     * Set the page content
     */
    fun setContent(value: ByteArray?) {
        setContent(value?.let { ByteBuffer.wrap(it) })
    }

    /**
     * Set the page content
     *
     * A non-null buffer is stored as the persisted content, marks the content as updated and
     * refreshes the content length statistics. `null` delegates to [clearPersistContent].
     *
     * @param value a ByteBuffer.
     */
    fun setContent(value: ByteBuffer?) {
        if (value == null) {
            clearPersistContent()
            return
        }

        page.content = value
        isContentUpdated = true
        // The content is the readable part of the buffer, not its whole backing array.
        val length = value.remaining().toLong()
        computeContentLength(length)
        setPersistedContentLength(length)
    }

    /**
     * Move the persisted content into [tmpContent], clear `page.content` and reset the
     * persisted content length to zero.
     */
    fun clearPersistContent() {
        tmpContent = page.content
        page.content = null
        setPersistedContentLength(0)
    }

    /**
     * Get the length of content in bytes.
     *
     * TODO: check consistency with HttpHeaders.CONTENT_LENGTH
     *
     * @return The length of the content in bytes.
     */
    fun getContentLength(): Long = page.contentLength ?: 0L

    /**
     * Compute the length of content in bytes: the current length becomes the last length,
     * then the new length and the average length are stored.
     */
    private fun computeContentLength(bytes: Long) {
        page.lastContentLength = getContentLength()
        page.contentLength = bytes
        computeAveContentLength(bytes)
    }

    /**
     * Update the running average of the content length, weighted by [fetchCount].
     */
    private fun computeAveContentLength(bytes: Long) {
        val count = fetchCount
        val lastAveBytes = page.aveContentLength
        page.aveContentLength = if (count > 0 && lastAveBytes == 0L) {
            // old version, average bytes is not calculated
            bytes
        } else {
            (lastAveBytes * count + bytes) / (count + 1)
        }
    }

    /** The persisted content length, `0` when absent. */
    fun getPersistedContentLength(): Long = page.persistedContentLength ?: 0L

    private fun setPersistedContentLength(bytes: Long) {
        page.persistedContentLength = bytes
    }

    /** The content length recorded before the latest update, `0` when absent. */
    fun getLastContentLength(): Long = page.lastContentLength ?: 0L

    /** The average content length, `0` when absent. */
    fun getAveContentLength(): Long = page.aveContentLength ?: 0L

    /** The stored content type, or an empty string when none is stored. */
    fun getContentType(): String = page.contentType?.toString() ?: ""

    /**
     * Store the content type, trimmed and lower-cased.
     */
    fun setContentType(value: String) {
        // Locale.ROOT keeps the result independent of the JVM default locale.
        page.contentType = value.trim { it <= ' ' }.toLowerCase(Locale.ROOT)
    }

    /** The previous signature, may be `null`. */
    fun getPrevSignature(): ByteBuffer? = page.prevSignature

    /** Store the previous signature. */
    fun setPrevSignature(value: ByteBuffer?) {
        page.prevSignature = value
    }

    /** The previous signature as a hex string; a missing signature is treated as empty. */
    fun getPrevSignatureAsString(): String {
        val sig = getPrevSignature() ?: ByteBuffer.wrap("".toByteArray())
        return Strings.toHexString(sig)
    }

    /**
     * The last proxy used to fetch the page
     */
    fun getProxy(): String? = page.proxy?.toString()

    /**
     * The last proxy used to fetch the page
     */
    fun setProxy(proxy: String?) {
        page.proxy = proxy
    }

    /** A copy of the stored active DOM status, or `null` when none is stored. */
    fun getActiveDOMStatus(): ActiveDOMStatus? {
        val s = page.activeDOMStatus ?: return null
        return ActiveDOMStatus(
            s.n,
            s.scroll,
            s.st.toString(),
            s.r.toString(),
            s.idl.toString(),
            s.ec.toString()
        )
    }

    /**
     * Copy the fields of [s] into the stored active DOM status.
     *
     * Nothing happens when [s] is `null` or when the page holds no active DOM status.
     */
    fun setActiveDOMStatus(s: ActiveDOMStatus?) {
        if (s == null) {
            return
        }
        val target = page.activeDOMStatus ?: return
        target.n = s.n
        target.scroll = s.scroll
        target.st = s.st
        target.r = s.r
        target.idl = s.idl
        target.ec = s.ec
    }

    /**
     * The stored active DOM stat trace, with keys as strings and values converted to
     * [ActiveDOMStat].
     */
    fun getActiveDOMStatTrace(): Map<String, ActiveDOMStat> {
        return page.activeDOMStatTrace.entries.associate { (key, stat) -> key.toString() to convert(stat) }
    }

    /**
     * Convert [trace] to its persisted form and store it on the page.
     *
     * NOTE(review): the parameter's type arguments were rebuilt from the fragments left in
     * the original body — confirm against callers.
     */
    fun setActiveDOMStatTrace(trace: Map<String, ActiveDOMStat>) {
        val statTrace = trace.entries.associate { (key, stat) ->
            // The persisted map is keyed by CharSequence.
            val persistKey: CharSequence = key
            persistKey to convert(stat)
        }
        page.activeDOMStatTrace = statTrace
    }

    /**
     * An implementation of a WebPage's signature from which it can be identified and referenced at any point in time.
     * This is essentially the WebPage's fingerprint representing its state for any point in time.
     */
    fun getSignature(): ByteBuffer? = page.signature

    /**
     * Store the signature; `null` clears it.
     */
    fun setSignature(value: ByteArray?) {
        // ByteBuffer.wrap rejects null, so only wrap real arrays.
        page.signature = value?.let { ByteBuffer.wrap(it) }
    }

    /** The signature as a hex string; a missing signature is treated as empty. */
    fun getSignatureAsString(): String {
        val sig = getSignature() ?: ByteBuffer.wrap("".toByteArray())
        return Strings.toHexString(sig)
    }

    /** The stored page title, or an empty string when none is stored. */
    fun getPageTitle(): String = page.pageTitle?.toString() ?: ""

    /** Store the page title. */
    fun setPageTitle(pageTitle: String?) {
        page.pageTitle = pageTitle
    }

    /** The stored content title, or an empty string when none is stored. */
    fun getContentTitle(): String = page.contentTitle?.toString() ?: ""

    /** Store the content title; `null` is ignored. */
    fun setContentTitle(contentTitle: String?) {
        if (contentTitle != null) {
            page.contentTitle = contentTitle
        }
    }

    /** The stored page text, or an empty string when none is stored. */
    fun getPageText(): String = page.pageText?.toString() ?: ""

    /** Store the page text; `null` and empty values are ignored. */
    fun setPageText(value: String?) {
        if (!value.isNullOrEmpty()) {
            page.pageText = value
        }
    }

    /** The stored content text, or an empty string when none is stored. */
    fun getContentText(): String = page.contentText?.toString() ?: ""

    /** Store the content text and its length; `null` and empty values are ignored. */
    fun setContentText(textContent: String?) {
        if (!textContent.isNullOrEmpty()) {
            page.contentText = textContent
            page.contentTextLen = textContent.length
        }
    }

    /** The stored content text length. */
    fun getContentTextLen(): Int = page.contentTextLen

    /**
     * The parse status. When none is stored, a freshly built default status is boxed;
     * it is not written back to the page.
     */
    fun getParseStatus(): ParseStatus {
        return ParseStatus.box(page.parseStatus ?: GParseStatus.newBuilder().build())
    }

    /** Store the parse status. */
    fun setParseStatus(parseStatus: ParseStatus) {
        page.parseStatus = parseStatus.unbox()
    }

    // NOTE(review): type arguments missing; keys are CharSequence (see getSimpleLiveLinks),
    // the value type must be taken from GWebPage.
    fun getLiveLinks(): Map {
        return page.liveLinks
    }

    /** The keys of the live links as strings. */
    fun getSimpleLiveLinks(): Collection<String> {
        return page.liveLinks.keys.map { it.toString() }
    }

    /** Replace the stored live links with [liveLinks], keyed by url. */
    fun setLiveLinks(liveLinks: Iterable<HyperlinkPersistable>) {
        val persisted = page.liveLinks
        persisted.clear()
        liveLinks.forEach { link -> persisted[link.url] = link.unbox() }
    }

    // NOTE(review): type arguments missing, see getLiveLinks().
    fun setLiveLinks(links: Map?) {
        page.liveLinks = links
    }

    /** Add or replace one live link, keyed by its url. */
    fun addLiveLink(hyperLink: HyperlinkPersistable) {
        page.liveLinks[hyperLink.url] = hyperLink.unbox()
    }

    // NOTE(review): type arguments missing; keys are CharSequence (see getSimpleVividLinks).
    fun getVividLinks(): Map {
        return page.vividLinks
    }

    /** The keys of the vivid links as strings. */
    fun getSimpleVividLinks(): Collection<String> {
        return page.vividLinks.keys.map { it.toString() }
    }

    // NOTE(review): type arguments missing, see getVividLinks().
    fun setVividLinks(links: Map?) {
        page.vividLinks = links
    }

    // NOTE(review): type argument missing; take it from GWebPage.deadLinks.
    fun getDeadLinks(): List {
        return page.deadLinks
    }

    // NOTE(review): type argument missing, see getDeadLinks().
    fun setDeadLinks(deadLinks: List?) {
        page.deadLinks = deadLinks
    }

    // NOTE(review): type argument missing; take it from GWebPage.links.
    fun getLinks(): List {
        return page.links
    }

    // NOTE(review): type argument missing, see getLinks().
    fun setLinks(links: List?) {
        page.links = links
    }

    /** The out link count kept in the metadata as text; `0` when absent or not a number. */
    fun getImpreciseLinkCount(): Int {
        return NumberUtils.toInt(metadata.getOrDefault(Name.TOTAL_OUT_LINKS, "0"), 0)
    }

    /** Store the out link count in the metadata as text. */
    fun setImpreciseLinkCount(count: Int) {
        metadata[Name.TOTAL_OUT_LINKS] = count.toString()
    }

    // NOTE(review): type arguments missing; take them from GWebPage.inlinks.
    fun getInlinks(): Map {
        return page.inlinks
    }

    /** The stored anchor, or an empty string when none is stored. */
    fun getAnchor(): CharSequence = page.anchor ?: ""

    /** Store the anchor. */
    fun setAnchor(anchor: CharSequence?) {
        page.anchor = anchor
    }

    /** The inlink anchors, kept in the metadata as newline separated text. */
    fun getInlinkAnchors(): Array<String> {
        return StringUtils.split(metadata.getOrDefault(Name.ANCHORS, ""), "\n")
    }

    /** Store the inlink anchors in the metadata, joined by newlines. */
    fun setInlinkAnchors(anchors: Collection<*>?) {
        metadata[Name.ANCHORS] = StringUtils.join(anchors, "\n")
    }

    /** The anchor order; a negative stored value reads as [AppConstants.MAX_LIVE_LINK_PER_PAGE]. */
    fun getAnchorOrder(): Int {
        val order = page.anchorOrder
        return if (order < 0) AppConstants.MAX_LIVE_LINK_PER_PAGE else order
    }

    /** Store the anchor order. */
    fun setAnchorOrder(order: Int) {
        page.anchorOrder = order
    }

    /** The content publish time, persisted as epoch milliseconds. */
    fun getContentPublishTime(): Instant = Instant.ofEpochMilli(page.contentPublishTime)

    /** Store the content publish time as epoch milliseconds. */
    fun setContentPublishTime(publishTime: Instant) {
        page.contentPublishTime = publishTime.toEpochMilli()
    }

    /** Whether [publishTime] is after [AppConstants.MIN_ARTICLE_PUBLISH_TIME]. */
    fun isValidContentModifyTime(publishTime: Instant): Boolean {
        return publishTime.isAfter(AppConstants.MIN_ARTICLE_PUBLISH_TIME)
    }

    /** The previous content publish time, persisted as epoch milliseconds. */
    fun getPrevContentPublishTime(): Instant = Instant.ofEpochMilli(page.prevContentPublishTime)

    /** Store the previous content publish time as epoch milliseconds. */
    fun setPrevContentPublishTime(publishTime: Instant) {
        page.prevContentPublishTime = publishTime.toEpochMilli()
    }

    /** The ref content publish time, persisted as epoch milliseconds. */
    fun getRefContentPublishTime(): Instant = Instant.ofEpochMilli(page.refContentPublishTime)

    /** Store the ref content publish time as epoch milliseconds. */
    fun setRefContentPublishTime(publishTime: Instant) {
        page.refContentPublishTime = publishTime.toEpochMilli()
    }

    /** The content modified time, persisted as epoch milliseconds. */
    fun getContentModifiedTime(): Instant = Instant.ofEpochMilli(page.contentModifiedTime)

    /** Store the content modified time as epoch milliseconds. */
    fun setContentModifiedTime(modifiedTime: Instant) {
        page.contentModifiedTime = modifiedTime.toEpochMilli()
    }

    /** The previous content modified time, persisted as epoch milliseconds. */
    fun getPrevContentModifiedTime(): Instant = Instant.ofEpochMilli(page.prevContentModifiedTime)

    /** Store the previous content modified time as epoch milliseconds. */
    fun setPrevContentModifiedTime(modifiedTime: Instant) {
        page.prevContentModifiedTime = modifiedTime.toEpochMilli()
    }

    /** The previous ref content publish time, persisted as epoch milliseconds. */
    fun getPrevRefContentPublishTime(): Instant = Instant.ofEpochMilli(page.prevRefContentPublishTime)

    /** Store the previous ref content publish time as epoch milliseconds. */
    fun setPrevRefContentPublishTime(publishTime: Instant) {
        page.prevRefContentPublishTime = publishTime.toEpochMilli()
    }

    /** The stored referrer, or `null` when none is stored. */
    fun getReferrer(): String? = page.referrer?.toString()

    /** Store the referrer, but only when `isStandard` accepts it. */
    fun setReferrer(referrer: String?) {
        if (isStandard(referrer)) {
            page.referrer = referrer
        }
    }

    /**
     * *****************************************************************************
     * Page Model
     * ******************************************************************************
     */
    fun getPageModelUpdateTime(): Instant? = Instant.ofEpochMilli(page.pageModelUpdateTime)

    /** Store the page model update time as epoch milliseconds; `null` is stored as `0`. */
    fun setPageModelUpdateTime(time: Instant?) {
        page.pageModelUpdateTime = time?.toEpochMilli() ?: 0
    }

    /** The boxed page model, or `null` when the page holds none. */
    fun getPageModel(): PageModel? = page.pageModel?.let { PageModel.box(it) }

    /** The boxed page model; an empty one is created and stored first when the page holds none. */
    fun ensurePageModel(): PageModel {
        if (page.pageModel == null) {
            page.pageModel = GPageModel.newBuilder().build()
        }
        return PageModel.box(page.pageModel)
    }

    /**
     * *****************************************************************************
     * Scoring
     * ******************************************************************************
     */
    fun getScore(): Float = page.score

    /** Store the score. */
    fun setScore(value: Float) {
        page.score = value
    }

    /** The content score, `0.0f` when absent. */
    fun getContentScore(): Float = page.contentScore ?: 0.0f

    /** Store the content score. */
    fun setContentScore(score: Float) {
        page.contentScore = score
    }

    /** The stored sort score, or an empty string when none is stored. */
    fun getSortScore(): String = page.sortScore?.toString() ?: ""

    /** Store the sort score. */
    fun setSortScore(score: String?) {
        page.sortScore = score
    }

    /** The cash value kept in the metadata, `0.0f` when absent. */
    fun getCash(): Float = metadata.getFloat(Name.CASH_KEY, 0.0f)

    /** Store the cash value in the metadata as text. */
    fun setCash(cash: Float) {
        metadata[Name.CASH_KEY] = cash.toString()
    }

    /** The boxed page counters. */
    fun getPageCounters(): PageCounters = PageCounters.box(page.pageCounters)

    /**
     * *****************************************************************************
     * Index
     * ******************************************************************************
     */
    override fun hashCode(): Int = url.hashCode()

    /** Two pages are equal when they are the same instance or have the same url. */
    override fun equals(other: Any?): Boolean {
        if (this === other) {
            return true
        }
        return other is MutableWebPage && other.url == url
    }

    /**
     * A short description carrying the url, the same identity used by [equals] and [hashCode].
     *
     * TODO: provide a richer page dump. The previous body threw NotImplementedError, which
     * broke every log statement and string template that touched a page.
     */
    override fun toString(): String = "MutableWebPage(url=$url)"

    /**
     * Factories and key helpers.
     *
     * NOTE(review): the factories below call constructors taking `(url, page, urlReversed, conf)`
     * and `(url, reversedUrl, page, conf)`, but the only visible constructor takes a single
     * [GWebPage]. Confirm which constructors are intended.
     */
    companion object {
        private val SEQUENCER = AtomicInteger()
        val NIL = newInternalPage(AppConstants.NIL_PAGE_URL, 0, "nil", "nil")

        /** Create a new, unfetched page for [url]. */
        @JvmOverloads
        fun newWebPage(url: String, conf: VolatileConfig, href: String? = null): MutableWebPage {
            return newWebPageInternal(url, conf, href)
        }

        private fun newWebPageInternal(url: String, conf: VolatileConfig, href: String?): MutableWebPage {
            val now = Instant.now()
            val page = MutableWebPage(url, GWebPage.newBuilder().build(), false, conf)
            page.setLocation(url)
            page.conf = conf
            page.href = href
            page.crawlStatus = CrawlStatus.STATUS_UNFETCHED
            page.createTime = now
            page.setModifiedTime(now)
            page.setScore(0f)
            page.fetchCount = 0
            return page
        }

        /** Create an internal page without an explicit id. */
        @JvmOverloads
        fun newInternalPage(url: String, title: String = "internal", content: String = "internal"): MutableWebPage {
            return newInternalPage(url, -1, title, content)
        }

        /**
         * Create an internal page: marked internal and inactive, with the minimum fetch
         * priority, infinite distance and a fetch time one century after the epoch.
         *
         * @param id The page id; only assigned when it is not negative.
         */
        fun newInternalPage(url: String, id: Int, title: String, content: String): MutableWebPage {
            val page = newWebPage(url, UNSAFE)
            if (id >= 0) {
                page.id = id
            }

            val century = ChronoUnit.CENTURIES.duration
            page.setLocation(url)
            page.setModifiedTime(Instant.EPOCH)
            page.setPrevFetchTime(Instant.EPOCH)
            page.setFetchTime(Instant.EPOCH.plus(century))
            page.setFetchInterval(century)
            page.fetchPriority = AppConstants.FETCH_PRIORITY_MIN
            page.crawlStatus = CrawlStatus.STATUS_UNFETCHED
            page.distance = AppConstants.DISTANCE_INFINITE // or -1?
            page.marks.put(Mark.INTERNAL, AppConstants.YES_STRING)
            page.marks.put(Mark.INACTIVE, AppConstants.YES_STRING)
            page.setPageTitle(title)
            page.setContent(content)
            return page
        }

        /**
         * Initialize a WebPage with the underlying GWebPage instance.
         */
        fun box(
            url: String, reversedUrl: String, page: GWebPage, conf: VolatileConfig
        ): MutableWebPage {
            return MutableWebPage(url, reversedUrl, page, conf)
        }

        /**
         * Initialize a WebPage with the underlying GWebPage instance.
         */
        fun box(url: String, page: GWebPage, conf: VolatileConfig): MutableWebPage {
            return box(url, page, false, conf)
        }

        /**
         * Initialize a WebPage with the underlying GWebPage instance.
         */
        fun box(
            url: String, page: GWebPage, urlReversed: Boolean, conf: VolatileConfig
        ): MutableWebPage {
            return MutableWebPage(url, page, urlReversed, conf)
        }

        /** Wrap the value of [mark] as a [Utf8] key. */
        fun wrapKey(mark: Mark): Utf8 = Utf8(mark.value())

        /** Wrap [value] as [Utf8]; `null` stays `null`. */
        fun u8(value: String?): Utf8? {
            // TODO: return new Utf8.EMPTY?
            return value?.let { Utf8(it) }
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy