All downloads are free. Search and download functionality uses the official Maven repository.

ai.platon.pulsar.examples.sites.food.dianping.RestaurantCrawlerSlim.kt Maven / Gradle / Ivy

The newest version!
package ai.platon.pulsar.examples.sites.food.dianping

import ai.platon.pulsar.common.AppFiles
import ai.platon.pulsar.skeleton.common.options.LoadOptions
import ai.platon.pulsar.skeleton.context.PulsarContexts
import ai.platon.pulsar.skeleton.session.PulsarSession
import com.google.gson.GsonBuilder
import kotlinx.coroutines.delay
import java.nio.file.Files
import java.util.*

/**
 * Minimal crawler example for restaurant pages.
 *
 * Holds the CSS selectors of the fields to extract and builds [LoadOptions]
 * whose browse-event handlers click the `.more` element of each comment item
 * and take a screenshot of every field element found on the page.
 *
 * @property session the session used to parse load arguments into [LoadOptions].
 */
class RestaurantCrawlerSlim(val session: PulsarSession) {
    /** Selectors for the text of comment items 1 to 10, keyed `comment-1` .. `comment-10`. */
    val commentSelectors = (1..10).associate { index ->
        "comment-$index" to "#reviewlist-wrapper li.comment-item:nth-child($index) p.desc.J-desc"
    }

    /** All field selectors: the fixed basic-info fields followed by every entry of [commentSelectors]. */
    val fieldSelectors = mutableMapOf(
        "title" to ".basic-info h2",
        "score" to ".basic-info .brief-info .mid-score",
        "reviewCount" to "#reviewCount",
        "avgPrice" to "#avgPriceTitle",
        "commentScores" to "#comment_score",
        "address" to "#address",
        "tel" to ".tel",
    ).apply { putAll(commentSelectors) }

    /**
     * Parses [args] into load options and appends two browse-event handlers to them.
     *
     * @param args the load arguments, passed unchanged to `session.options`.
     * @return the load options with both handlers registered.
     */
    fun options(args: String): LoadOptions {
        val loadOptions = session.options(args)
        val handlers = loadOptions.event.browseEventHandlers

        // Before features are computed: click the ".more" element of each comment item
        // that has one (presumably to expand truncated comments), pausing after each click.
        handlers.onWillComputeFeature.addLast { _, driver ->
            for (position in 1..commentSelectors.size) {
                val moreSelector = "#reviewlist-wrapper .comment-item:nth-child($position) .more"
                if (driver.exists(moreSelector)) {
                    driver.click(moreSelector)
                    delay(500)
                }
            }
        }

        // After features are computed: screenshot every field element that exists
        // and write the decoded bytes to a freshly created temp file.
        handlers.onFeatureComputed.addLast { _, driver ->
            for (selector in fieldSelectors.values) {
                if (!driver.exists(selector)) continue

                // The screenshot is treated as a Base64 string.
                // NOTE(review): if captureScreenshot can return null, decode() will throw — confirm.
                val encoded = driver.captureScreenshot(selector)

                // NOTE(review): the suffix has no leading dot — confirm AppFiles.createTempFile
                // produces the intended ".png" extension.
                val target = AppFiles.createTempFile("screenshot", "png")
                val decoded = Base64.getDecoder().decode(encoded)
                Files.write(target, decoded)

                delay(500)
            }
        }

        return loadOptions
    }
}

/**
 * Entry point: scrapes the pages reached from the portal URL using the
 * crawler's field selectors and prints the result as pretty-printed JSON.
 */
fun main() {
    val portalUrl = "https://www.dianping.com/beijing/ch10/g110"
    // NOTE(review): the meaning of the individual flags is defined by Pulsar's load options — not visible here.
    val args = "-i 1s -ii 5s -ol \"#shop-all-list .tit a[href~=shop]\" -ignoreFailure"

    val session = PulsarContexts.createSession()
    val crawler = RestaurantCrawlerSlim(session)

    val loadOptions = crawler.options(args)
    val fields = crawler.session.scrapeOutPages(portalUrl, loadOptions, crawler.fieldSelectors)

    val gson = GsonBuilder().setPrettyPrinting().create()
    println(gson.toJson(fields))
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy