ai.platon.pulsar.examples.BasicUsage Maven / Gradle / Ivy
The newest version!
package ai.platon.pulsar.examples;
import ai.platon.pulsar.skeleton.context.PulsarContexts;
import ai.platon.pulsar.dom.FeaturedDocument;
import ai.platon.pulsar.persist.WebPage;
import ai.platon.pulsar.skeleton.session.PulsarSession;
import com.google.gson.Gson;
import org.jsoup.nodes.Element;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
public class BasicUsage {
public static void main(String[] args) throws Exception {
// create a pulsar session
PulsarSession session = PulsarContexts.createSession();
// the main URL we are playing with
String url = "https://list.jd.com/list.html?cat=652,12345,12349";
// load a page, or fetch it from the Internet if it does not exist or has expired
WebPage page = session.load(url, "-expires 1d");
// submit a URL to the URL pool, and it will be processed in a crawl loop
session.submit(url, "-expires 1d");
// parse the page content into a Jsoup document
FeaturedDocument document = session.parse(page, false);
// do something with the document
// ...
// or, load and parse
FeaturedDocument document2 = session.loadDocument(url, "-expires 1d");
// do something with the document
// ...
// load all out pages whose links are specified by the `-outLink` option
List pages = session.loadOutPages(url, "-expires 1d -itemExpires 7d -outLink a[href~=item]");
// load the portal page and submit the out links specified by the `-outLink` option to the URL pool
session.submitForOutPages(url, "-expires 1d -itemExpires 7d -outLink a[href~=item]");
// load, parse and scrape fields
List
© 2015 - 2024 Weber Informatics LLC | Privacy Policy