All downloads are free. The search and download features use the official Maven repository.

ai.platon.pulsar.examples.rpa.RPACrawler Maven / Gradle / Ivy

The newest version!
package ai.platon.pulsar.examples.rpa;

import ai.platon.pulsar.skeleton.common.options.LoadOptions;
import ai.platon.pulsar.skeleton.context.PulsarContexts;
import ai.platon.pulsar.skeleton.crawl.event.JvmWebPageWebDriverEventHandler;
import ai.platon.pulsar.skeleton.crawl.fetch.driver.JvmWebDriver;
import ai.platon.pulsar.persist.WebPage;
import ai.platon.pulsar.skeleton.session.PulsarSession;
import kotlin.coroutines.Continuation;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.TimeUnit;

/**
 * Example crawler that attaches browser-interaction handlers to a page load.
 *
 * <p>For every configured field selector, the handler registered on
 * {@code onWillComputeFeature} checks whether the element exists, clicks it, reads its
 * first text and, after a short delay, types a few characters of that text into a search box.
 * The same selectors are then passed to {@link PulsarSession#scrape} to extract the fields.
 */
public class RPACrawler {

    /** CSS selector of the text input that receives the typed keyword. */
    private static final String SEARCH_BOX_SELECTOR = ".form input[type=text]";

    /** Delay, in seconds, between reading the element text and typing into the search box. */
    private static final long TYPE_DELAY_SECONDS = 2;

    /** Inclusive start index of the slice of element text that is typed. */
    private static final int KEYWORD_START = 1;

    /** Exclusive end index of the slice of element text that is typed. */
    private static final int KEYWORD_END = 4;

    private final Logger logger = LoggerFactory.getLogger(this.getClass());

    private final PulsarSession session;

    /** Field name to CSS selector; used both for interaction and for scraping. */
    public final Map<String, String> fieldSelectors;

    /**
     * Creates a crawler backed by a freshly created session.
     *
     * @throws Exception if the session cannot be created
     */
    public RPACrawler() throws Exception {
        this(PulsarContexts.createSession());
    }

    /**
     * Creates a crawler that uses the given session.
     *
     * @param session the session used to build load options
     */
    public RPACrawler(PulsarSession session) {
        this.session = session;

        fieldSelectors = new HashMap<>(Map.of(
                "sku-name", ".sku-name",
                "news", ".news",
                "summary", ".summary"
        ));
    }

    /**
     * Builds load options from {@code args} and registers the browse event handlers.
     *
     * @param args load arguments, passed to the session as-is
     * @return the options with the two handlers appended
     */
    public LoadOptions options(String args) {
        var options = session.options(args, null);
        var be = options.getEvent().getBrowseEventHandlers();

        be.getOnWillComputeFeature().addLast(new JvmWebPageWebDriverEventHandler() {
            @Override
            public Object invoke(WebPage page, JvmWebDriver driver, Continuation continuation) {
                // Interact with each configured element, one after another (interact blocks).
                fieldSelectors.values().forEach(selector -> interact(selector, driver));
                return null;
            }
        });

        be.getOnFeatureComputed().addLast(new JvmWebPageWebDriverEventHandler() {
            @Override
            public Object invoke(WebPage page, JvmWebDriver driver, Continuation $completion) {
                logger.info("Feature computed");
                return null;
            }
        });

        return options;
    }

    /**
     * Clicks the element matched by {@code selector} (if present), then types a short slice of
     * its text into the search box after a delay. Blocks until the whole chain has completed.
     *
     * @param selector CSS selector of the element to interact with
     * @param driver   the web driver to issue the actions on
     */
    private void interact(String selector, JvmWebDriver driver) {
        var delayedExecutor = CompletableFuture.delayedExecutor(TYPE_DELAY_SECONDS, TimeUnit.SECONDS);

        driver.existsAsync(selector).thenAccept(exists -> {
            if (exists) {
                driver.clickAsync(selector)
                        .thenCompose(ignored -> driver.selectFirstTextOptionalAsync(selector))
                        // Compose (not accept) so that the typing future is awaited: the log line
                        // and join() below only run once typing has actually finished, and any
                        // typing failure propagates instead of being dropped.
                        .thenComposeAsync(
                                text -> driver.typeAsync(SEARCH_BOX_SELECTOR, searchKeyword(text.orElse(""))),
                                delayedExecutor)
                        .thenRun(() -> logger.info("{} clicked", selector))
                        .join();
            }
        }).join();
    }

    /**
     * Returns the {@code [KEYWORD_START, KEYWORD_END)} slice of {@code text}, clamped to the
     * text length so that short or empty text yields a shorter (possibly empty) keyword instead
     * of throwing {@link StringIndexOutOfBoundsException}.
     */
    private static String searchKeyword(String text) {
        int end = Math.min(KEYWORD_END, text.length());
        int start = Math.min(KEYWORD_START, end);
        return text.substring(start, end);
    }

    /**
     * Scrapes a sample product page with the interaction handlers installed and prints the
     * extracted fields.
     */
    public static void main(String[] argv) throws Exception {
        var url = "https://item.jd.com/10023632209832.html";
        var args = "-refresh -parse";

        var session = PulsarContexts.createSession();
        var crawler = new RPACrawler(session);

        var fields = session.scrape(url, crawler.options(args), crawler.fieldSelectors);
        System.out.println(fields);
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy