Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it how often you want.
com.simiacryptus.skyenet.util.Selenium2S3.kt Maven / Gradle / Ivy
package com.simiacryptus.skyenet.util
import com.simiacryptus.skyenet.core.platform.ApplicationServices.cloud
import com.simiacryptus.skyenet.core.util.Selenium
import org.apache.hc.client5.http.async.methods.SimpleHttpRequest
import org.apache.hc.client5.http.async.methods.SimpleHttpResponse
import org.apache.hc.client5.http.cookie.BasicCookieStore
import org.apache.hc.client5.http.impl.async.HttpAsyncClientBuilder
import org.apache.hc.client5.http.impl.cookie.BasicClientCookie
import org.apache.hc.core5.concurrent.FutureCallback
import org.apache.hc.core5.http.Method
import org.jsoup.Jsoup
import org.openqa.selenium.By
import org.openqa.selenium.Cookie
import org.openqa.selenium.WebDriver
import org.openqa.selenium.WebElement
import org.openqa.selenium.chrome.ChromeDriver
import org.openqa.selenium.chrome.ChromeDriverService
import org.openqa.selenium.chrome.ChromeOptions
import java.io.File
import java.net.URI
import java.net.URL
import java.time.Duration
import java.time.temporal.ChronoUnit
import java.util.*
import java.util.concurrent.Executors
import java.util.concurrent.Semaphore
import java.util.concurrent.ThreadPoolExecutor
open class Selenium2S3(
val pool: ThreadPoolExecutor = Executors.newCachedThreadPool() as ThreadPoolExecutor,
private val cookies: Array?,
) : Selenium {
var loadImages: Boolean = false
open val driver: WebDriver by lazy {
chromeDriver(loadImages = loadImages).apply {
setCookies(
this,
cookies
)
}
}
private val httpClient by lazy {
HttpAsyncClientBuilder.create()
.useSystemProperties()
.setDefaultCookieStore(BasicCookieStore().apply {
cookies?.forEach { cookie -> addCookie(BasicClientCookie(cookie.name, cookie.value)) }
})
.setThreadFactory(pool.threadFactory)
.build()
.also { it.start() }
}
private val linkReplacements = mutableMapOf()
private val htmlPages: MutableMap = mutableMapOf()
private val jsonPages = mutableMapOf()
private val links: MutableList = mutableListOf()
override fun save(
url: URL,
currentFilename: String?,
saveRoot: String
) {
log.info("Saving URL: $url")
log.info("Current filename: $currentFilename")
log.info("Save root: $saveRoot")
driver.navigate().to(url)
driver.navigate().refresh()
Thread.sleep(5000) // Wait for javascript to load
htmlPages += mutableMapOf((currentFilename ?: url.file.split("/").last()) to editPage(driver.pageSource))
val baseUrl = url.toString().split("#").first()
links += toAbsolute(baseUrl, *currentPageLinks(driver).map { link ->
val relative = toRelative(baseUrl, link) ?: return@map link
linkReplacements[link] = "${cloud!!.shareBase}/$saveRoot/${toArchivePath(relative)}"
linkReplacements[relative] = "${cloud!!.shareBase}/$saveRoot/${toArchivePath(relative)}"
link
}.toTypedArray()).toMutableList()
val completionSemaphores = mutableListOf()
log.info("Fetching page source")
log.info("Base URL: $baseUrl")
val coveredLinks = mutableSetOf()
log.info("Processing links")
while (links.isNotEmpty()) {
val href = links.removeFirst()
try {
if (coveredLinks.contains(href)) continue
coveredLinks += href
log.debug("Processing $href")
process(url, href, completionSemaphores, saveRoot)
} catch (e: Exception) {
log.warn("Error processing $href", e)
}
}
log.info("Fetching current page links")
log.debug("Waiting for completion")
completionSemaphores.forEach { it.acquire(); it.release() }
log.debug("Saving")
saveAll(saveRoot)
log.debug("Done")
}
protected open fun process(
url: URL,
href: String,
completionSemaphores: MutableList,
saveRoot: String
): Boolean {
val base = url.toString().split("/").dropLast(1).joinToString("/")
val relative = toArchivePath(toRelative(base, href) ?: return true)
when (val mimeType = mimeType(relative)) {
"text/html" -> {
if (htmlPages.containsKey(relative)) return true
log.info("Fetching $href")
val semaphore = Semaphore(0)
completionSemaphores += semaphore
getHtml(href, htmlPages, relative, links, saveRoot, semaphore)
}
"application/json" -> {
if (jsonPages.containsKey(relative)) return true
log.info("Fetching $href")
val semaphore = Semaphore(0)
completionSemaphores += semaphore
getJson(href, jsonPages, relative, semaphore)
}
else -> {
val semaphore = Semaphore(0)
completionSemaphores += semaphore
getMedia(href, mimeType, saveRoot, relative, semaphore)
}
}
return false
}
protected open fun getHtml(
href: String,
htmlPages: MutableMap,
relative: String,
links: MutableList,
saveRoot: String,
semaphore: Semaphore
) {
httpClient.execute(get(href), object : FutureCallback {
override fun completed(p0: SimpleHttpResponse?) {
log.debug("Fetched $href")
val html = p0?.body?.bodyText ?: ""
htmlPages[relative] = html
links += toAbsolute(href, *currentPageLinks(html).map { link ->
val relative = toArchivePath(toRelative(href, link) ?: return@map link)
linkReplacements[link] = "${cloud!!.shareBase}/$saveRoot/$relative"
link
}.toTypedArray())
semaphore.release()
}
override fun failed(p0: java.lang.Exception?) {
log.info("Error fetching $href", p0)
semaphore.release()
}
override fun cancelled() {
log.info("Cancelled fetching $href")
semaphore.release()
}
})
}
protected open fun getJson(
href: String,
jsonPages: MutableMap,
relative: String,
semaphore: Semaphore
) {
httpClient.execute(get(href), object : FutureCallback {
override fun completed(p0: SimpleHttpResponse?) {
log.debug("Fetched $href")
jsonPages[relative] = p0?.body?.bodyText ?: ""
semaphore.release()
}
override fun failed(p0: java.lang.Exception?) {
log.info("Error fetching $href", p0)
semaphore.release()
}
override fun cancelled() {
log.info("Cancelled fetching $href")
semaphore.release()
}
})
}
protected open fun getMedia(
href: String,
mimeType: String,
saveRoot: String,
relative: String,
semaphore: Semaphore
) {
val request = get(href)
httpClient.execute(request, object : FutureCallback {
override fun completed(p0: SimpleHttpResponse?) {
try {
log.debug("Fetched $request")
val bytes = p0?.body?.bodyBytes ?: return
if (validate(mimeType, p0.body.contentType.mimeType, bytes))
cloud!!.upload(
path = "/$saveRoot/$relative",
contentType = mimeType,
bytes = bytes
)
} finally {
semaphore.release()
}
}
override fun failed(p0: java.lang.Exception?) {
log.info("Error fetching $href", p0)
semaphore.release()
}
override fun cancelled() {
log.info("Cancelled fetching $href")
semaphore.release()
}
})
}
private fun saveAll(
saveRoot: String
) {
(htmlPages.map { (filename, html) ->
pool.submit {
try {
saveHTML(html, saveRoot, filename)
} catch (e: Exception) {
log.warn("Error processing $filename", e)
}
}
} + jsonPages.map { (filename, js) ->
pool.submit {
try {
saveJS(js, saveRoot, filename)
} catch (e: Exception) {
log.warn("Error processing $filename", e)
}
}
}).forEach {
try {
it.get()
} catch (e: Exception) {
log.warn("Error processing", e)
}
}
}
protected open fun saveJS(js: String, saveRoot: String, filename: String) {
val finalJs = linkReplacements.toList().sortedBy { it.first.length }
.fold(js) { acc, (href, relative) -> //language=RegExp
acc.replace("""(? acc.replace("""(?
request.addHeader("Cookie", "${cookie.name}=${cookie.value}")
}
return request
}
protected open fun currentPageLinks(driver: WebDriver) = listOf(
driver.findElements(By.xpath("//a[@href]")).map { it?.getAttribute("href") }.toSet(),
driver.findElements(By.xpath("//img[@src]")).map { it?.getAttribute("src") }.toSet(),
driver.findElements(By.xpath("//link[@href]")).map { it?.getAttribute("href") }.toSet(),
driver.findElements(By.xpath("//script[@src]")).map { it?.getAttribute("src") }.toSet(),
driver.findElements(By.xpath("//source[@src]")).map { it?.getAttribute("src") }.toSet(),
).flatten().filterNotNull()
private fun currentPageLinks(html: String) = listOf(
Jsoup.parse(html).select("a[href]").map { it.attr("href") }.toSet(),
Jsoup.parse(html).select("img[src]").map { it.attr("src") }.toSet(),
Jsoup.parse(html).select("link[href]").map { it.attr("href") }.toSet(),
Jsoup.parse(html).select("script[src]").map { it.attr("src") }.toSet(),
Jsoup.parse(html).select("source[src]").map { it.attr("src") }.toSet(),
).flatten().filterNotNull()
protected open fun toAbsolute(base: String, vararg links: String) = links
.map { it.split("#").first() }.filter { it.isNotBlank() }.distinct()
.map { link ->
val newLink = when {
link.startsWith("http") -> link
else -> URI.create(base).resolve(link).toString()
}
newLink
}
protected open fun toRelative(base: String, link: String): String? = when {
link.startsWith(base) -> toRelative(
base,
link.removePrefix(base).replace("/{2,}".toRegex(), "/").removePrefix("/")
) // relativize
link.startsWith("http") -> null // absolute
else -> link // relative
}
protected open fun toArchivePath(link: String): String = when {
link.startsWith("fileIndex") -> link.split("/").drop(2).joinToString("/") // rm file segment
else -> link
}
protected open fun validate(
expected: String,
actual: String,
bytes: ByteArray
): Boolean {
if (!actual.startsWith(expected)) {
log.warn("Content type mismatch: $actual != $expected")
if (actual.startsWith("text/html")) {
log.warn("Response Error: ${String(bytes)}", Exception())
}
return false
}
return true
}
protected open fun mimeType(relative: String): String {
val extension = relative.split(".").last().split("?").first()
val contentType = when (extension) {
"css" -> "text/css"
"js" -> "text/javascript"
"json" -> "application/json"
"pdf" -> "application/pdf"
"zip" -> "application/zip"
"tar" -> "application/x-tar"
"gz" -> "application/gzip"
"bz2" -> "application/bzip2"
"mp3" -> "audio/mpeg"
//"tsv" -> "text/tab-separated-values"
"csv" -> "text/csv"
"txt" -> "text/plain"
"xml" -> "text/xml"
"svg" -> "image/svg+xml"
"png" -> "image/png"
"jpg" -> "image/jpeg"
"jpeg" -> "image/jpeg"
"gif" -> "image/gif"
"ico" -> "image/x-icon"
"html" -> "text/html"
"htm" -> "text/html"
else -> "text/plain"
}
return contentType
}
protected open fun editPage(html: String): String {
val doc = Jsoup.parse(html)
doc.select("#toolbar").remove()
doc.select("#namebar").remove()
doc.select("#main-input").remove()
doc.select("#footer").remove()
return doc.toString()
}
override fun close() {
log.debug("Closing", Exception())
driver.quit()
httpClient.close()
//driver.close()
//Companion.chromeDriverService.close()
}
companion object {
private val log = org.slf4j.LoggerFactory.getLogger(Selenium2S3::class.java)
init {
Runtime.getRuntime().addShutdownHook(Thread {
try {
} catch (e: Exception) {
log.warn("Error closing com.simiacryptus.skyenet.webui.util.Selenium2S3", e)
}
})
}
fun chromeDriver(headless: Boolean = true, loadImages: Boolean = !headless): ChromeDriver {
val osname = System.getProperty("os.name")
val chromePath = when {
// Windows
osname.contains("Windows") -> listOf(
"C:\\Program Files\\Google\\Chrome\\Application\\chromedriver.exe",
"C:\\Program Files (x86)\\Google\\Chrome\\Application\\chromedriver.exe"
)
// Ubuntu
osname.contains("Linux") -> listOf("/usr/bin/chromedriver")
else -> throw RuntimeException("Not implemented for $osname")
}
System.setProperty("webdriver.chrome.driver",
chromePath.find { File(it).exists() } ?: throw RuntimeException("Chrome not found"))
val options = ChromeOptions()
val args = mutableListOf()
if (headless) args += "--headless"
if (loadImages) args += "--blink-settings=imagesEnabled=false"
options.addArguments(*args.toTypedArray())
options.setPageLoadTimeout(Duration.of(90, ChronoUnit.SECONDS))
return ChromeDriver(chromeDriverService, options)
}
private val chromeDriverService by lazy { ChromeDriverService.createDefaultService() }
fun setCookies(
driver: WebDriver,
cookies: Array?,
domain: String? = null
) {
cookies?.forEach { cookie ->
try {
driver.manage().addCookie(
Cookie(
/* name = */ cookie.name,
/* value = */ cookie.value,
/* domain = */ cookie.domain ?: domain,
/* path = */ cookie.path,
/* expiry = */ if (cookie.maxAge <= 0) null else Date(cookie.maxAge * 1000L),
/* isSecure = */ cookie.secure,
/* isHttpOnly = */ cookie.isHttpOnly
)
)
} catch (e: Exception) {
log.warn("Error setting cookie: $cookie", e)
}
}
}
}
}