// ai.platon.pulsar.skeleton.crawl.common.url.Hyperlinks.kt (Maven / Gradle / Ivy)
// The newest version!
package ai.platon.pulsar.skeleton.crawl.common.url
import ai.platon.pulsar.common.DateTimes
import ai.platon.pulsar.common.ResourceStatus
import ai.platon.pulsar.common.config.AppConstants
import ai.platon.pulsar.common.options.OptionUtils
import ai.platon.pulsar.common.urls.*
import ai.platon.pulsar.skeleton.crawl.PageEventHandlers
import ai.platon.pulsar.skeleton.crawl.event.WebPageHandler
import ai.platon.pulsar.skeleton.crawl.event.impl.DefaultPageEventHandlers
import ai.platon.pulsar.skeleton.crawl.event.impl.PageEventHandlersFactory
import ai.platon.pulsar.dom.FeaturedDocument
import ai.platon.pulsar.persist.WebPage
import ai.platon.pulsar.skeleton.common.urls.NormURL
import java.net.MalformedURLException
import java.net.URL
import java.time.Duration
import java.time.Instant
import java.util.concurrent.CompletableFuture
import java.util.concurrent.TimeUnit
import java.util.function.BiConsumer
/**
 * A [UrlAware] that exposes a [PageEventHandlers] instance used to handle page events.
 */
interface ListenableUrl : UrlAware {
    /** The page event handlers attached to this url. */
    val event: PageEventHandlers
}
/**
 * A [Hyperlink] that additionally carries a [PageEventHandlers], which makes it a [ListenableUrl].
 */
open class ListenableHyperlink(
    /** The url this hyperlink points to. */
    url: String,
    /** The anchor text of the hyperlink. */
    text: String = "",
    /** The position of the hyperlink on its referrer page. */
    order: Int = 0,
    /** The url of the referrer page, if there is one. */
    referrer: String? = null,
    /** The url arguments, if there are any. */
    args: String? = null,
    /**
     * The click url: a variant of the url as it appears raw in the html, without normalization,
     * for example a url with a timestamp query parameter added.
     */
    href: String? = null,
    /** The event handlers; obtained from a new [PageEventHandlersFactory] when not supplied. */
    override var event: PageEventHandlers = PageEventHandlersFactory().create(),
) : Hyperlink(url, text, order, referrer, args, href), ListenableUrl {
    /**
     * `false` for this class: a listenable url is not a persistence object
     * because its event handlers are not persistent.
     */
    override val isPersistable: Boolean = false
}
/**
 * A [StatefulHyperlink] that additionally carries a [PageEventHandlers],
 * i.e. a hyperlink with status that is also a [ListenableUrl].
 */
open class StatefulListenableHyperlink(
    /** The url this hyperlink points to. */
    url: String,
    /** The anchor text of the hyperlink. */
    text: String = "",
    /** The position of the hyperlink on its referrer page. */
    order: Int = 0,
    /** The url of the referrer page, if there is one. */
    referrer: String? = null,
    /** The url arguments, if there are any. */
    args: String? = null,
    /**
     * The click url: a variant of the url as it appears raw in the html, without normalization,
     * for example a url with a timestamp query parameter added.
     */
    href: String? = null,
    /** The event handlers; obtained from a new [PageEventHandlersFactory] when not supplied. */
    override var event: PageEventHandlers = PageEventHandlersFactory().create()
) : StatefulHyperlink(url, text, order, referrer, args, href), ListenableUrl {
    /** `false` for this class, as for the other listenable hyperlinks in this file. */
    override val isPersistable: Boolean = false

    /** The time elapsed between `modifiedAt` and the moment this property is read. */
    val idleTime: Duration
        get() = Duration.between(modifiedAt, Instant.now())
}
/**
 * A hyperlink that carries an `onParse` callback.
 *
 * The callback is registered as the last `onHTMLDocumentParsed` load event handler of [event],
 * and the hyperlink is created with the url arguments `-parse`.
 *
 * NOTE(review): unlike [ListenableHyperlink], this class does not override `isPersistable`,
 * so it keeps whatever [Hyperlink] declares. Confirm that this is intended.
 */
open class ParsableHyperlink(
    /** The url this hyperlink points to. */
    url: String,
    /** The callback invoked with the page and its parsed document. */
    val onParse: (WebPage, FeaturedDocument) -> Any?
) : Hyperlink(url, args = "-parse"), ListenableUrl {
    /**
     * Java compatible constructor: adapts a [BiConsumer] to the Kotlin function type
     * expected by the primary constructor.
     *
     * The type arguments are required; Kotlin has no raw types, so a bare `BiConsumer`
     * does not compile.
     */
    constructor(url: String, onParse: BiConsumer<WebPage, FeaturedDocument>) :
        this(url, { page, document -> onParse.accept(page, document) })

    /**
     * The page event handlers of this hyperlink, pre-populated with a handler
     * that forwards the parsed page and document to [onParse].
     */
    override var event: PageEventHandlers = DefaultPageEventHandlers().also {
        it.loadEventHandlers.onHTMLDocumentParsed.addLast { page, document ->
            onParse(page, document)
        }
    }
}
/**
 * A hyperlink that is also a [CompletableFuture], so the link itself can be completed
 * with a result and awaited.
 *
 * The supertypes carry explicit type arguments because Kotlin has no raw types:
 * - [Comparable] is typed to [UrlAware], matching [compareTo].
 * - [CompletableFuture] is typed to [WebPage], matching the code in this file that
 *   completes such links with a [WebPage].
 */
open class CompletableHyperlink(
    /** The url of this hyperlink. */
    override var url: String,
    /** The anchor text of this hyperlink. */
    var text: String = "",
    /** The order of this hyperlink in its referrer page. */
    var order: Int = 0,
    /** The url of the referrer page. */
    override var referrer: String? = null,
    /** The url arguments. */
    override var args: String? = null,
    /** The hypertext reference: the address of the linked document. */
    override var href: String? = null,
    /** The priority. */
    override var priority: Int = 0
) : UrlAware, Comparable<UrlAware>, StatefulUrl, CompletableFuture<WebPage>() {
    /** The url merged with its arguments by [UrlUtils.mergeUrlArgs]. */
    override val configuredUrl get() = UrlUtils.mergeUrlArgs(url, args)

    /** Whether [UrlUtils.isStandard] accepts the url. */
    override val isStandard get() = UrlUtils.isStandard(url)

    /** The url as a [URL]; the getter throws [MalformedURLException] for a malformed url. */
    @get:Throws(MalformedURLException::class)
    override val toURL get() = URL(url)

    /** The url as returned by [UrlUtils.getURLOrNull]. */
    override val toURLOrNull get() = UrlUtils.getURLOrNull(url)

    /** Whether the url equals [AppConstants.NIL_PAGE_URL]. */
    override val isNil: Boolean get() = url == AppConstants.NIL_PAGE_URL

    /** If this link is persistable; `false` for this class. */
    override val isPersistable: Boolean = false

    /** The value of the `-l`, `-label` or `--label` option in [args], or an empty string. */
    override val label: String get() = OptionUtils.findOption(args, listOf("-l", "-label", "--label")) ?: ""

    /** Required website language. */
    override var lang: String = "*"

    /** Required website country. */
    override var country: String = "*"

    /** Required website district. */
    override var district: String = "*"

    /** The maximum retry times. */
    override var nMaxRetry: Int = 3

    /**
     * The deadline parsed from the `-deadTime` or `--dead-time` option in [args];
     * falls back to [DateTimes.doomsday] when the option cannot be parsed to an instant.
     */
    override val deadline: Instant
        get() {
            val deadTime = OptionUtils.findOption(args, listOf("-deadTime", "--dead-time")) ?: ""
            return DateTimes.parseBestInstantOrNull(deadTime) ?: DateTimes.doomsday
        }

    override var authToken: String? = null
    override var remoteAddr: String? = null
    override var status: Int = ResourceStatus.SC_CREATED
    override var modifiedAt: Instant = Instant.now()
    override val createdAt: Instant = Instant.now()

    /**
     * Compares this link by url. The other object may be one of the following types:
     * 1. a [String], compared with the url directly
     * 2. a [URL], compared by its string form
     * 3. a [UrlAware], compared by its url
     *
     * Note that the comparison with [String] and [URL] is one-directional:
     * `String.equals` and `URL.equals` never return `true` for an instance of this class.
     */
    override fun equals(other: Any?): Boolean {
        if (this === other) {
            return true
        }

        return when (other) {
            is String -> url == other
            is URL -> url == other.toString()
            is UrlAware -> url == other.url
            else -> false
        }
    }

    /** The hash code of the url, consistent with the url-based [equals]. */
    override fun hashCode() = url.hashCode()

    /** Orders links by comparing their urls as strings. */
    override fun compareTo(other: UrlAware): Int {
        return url.compareTo(other.url)
    }

    /** Returns the url. */
    override fun toString() = url
}
/**
 * A [CompletableHyperlink] that additionally carries a [PageEventHandlers],
 * which makes it a [ListenableUrl].
 *
 * [Comparable] is declared with its [UrlAware] type argument because Kotlin has no raw types.
 */
open class CompletableListenableHyperlink(
    /** The url of this hyperlink. */
    url: String,
    /** The anchor text of this hyperlink. */
    text: String = "",
    /** The order of this hyperlink in its referrer page. */
    order: Int = 0,
    /** The url of the referrer page. */
    referrer: String? = null,
    /** The url arguments. */
    args: String? = null,
    /** The hypertext reference: the address of the linked document. */
    href: String? = null,
    /** The event handlers; obtained from a new [PageEventHandlersFactory] when not supplied. */
    override var event: PageEventHandlers = PageEventHandlersFactory().create(),
) : UrlAware, Comparable<UrlAware>, ListenableUrl,
    CompletableHyperlink(url, text, order, referrer, args, href)
/**
 * A [WebPageHandler] that completes [link] with the page it is invoked with,
 * and then removes itself from the link's `onLoaded` load event handlers.
 */
internal class CompleteWebPageHyperlinkHandler(val link: CompletableListenableHyperlink) : WebPageHandler() {
    override fun invoke(page: WebPage) {
        val target = link
        // Complete the link first, then unregister this handler.
        target.complete(page)
        target.event.loadEventHandlers.onLoaded.remove(this)
        // TODO: it might be better to complete the link only when the removal succeeds:
        // if (target.event.loadEventHandlers.onLoaded.remove(this)) {
        //     target.complete(page)
        // }
    }
}
/**
 * Builds a [CompletableListenableHyperlink] from this [NormURL].
 *
 * The link is created from `spec`, `args` and `hrefSpec`. A [CompleteWebPageHyperlinkHandler]
 * is appended to its `onLoaded` handlers, the raw event handlers of `options` (if any) are
 * chained afterwards, and the link is set to complete with [WebPage.NIL] if it has not
 * completed within the page load timeout plus one second.
 */
fun NormURL.toCompletableListenableHyperlink(): CompletableListenableHyperlink =
    CompletableListenableHyperlink(spec, args = args, href = hrefSpec).also { link ->
        link.event.loadEventHandlers.onLoaded.addLast(CompleteWebPageHyperlinkHandler(link))

        val extraHandlers = options.rawEvent
        if (extraHandlers != null) {
            link.event.chain(extraHandlers)
        }

        link.completeOnTimeout(WebPage.NIL, options.pageLoadTimeout.seconds + 1, TimeUnit.SECONDS)
    }