All downloads are FREE. Search and download functionality uses the official Maven repository.

apoc.load.LoadHtml Maven / Gradle / Ivy

There is a newer version: 4.4.0.34
Show newest version
package apoc.load;

import apoc.result.MapResult;
import apoc.util.MapUtil;
import apoc.util.Util;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.neo4j.graphdb.GraphDatabaseService;
import org.neo4j.logging.Log;
import org.neo4j.procedure.Context;
import org.neo4j.procedure.Description;
import org.neo4j.procedure.Name;
import org.neo4j.procedure.Procedure;

import java.io.InputStream;
import java.util.*;
import java.util.stream.Stream;

public class LoadHtml {

    @Context
    public GraphDatabaseService db;

    @Context
    public Log log;


    @Procedure
    @Description("apoc.load.html('url',{name: jquery, name2: jquery}, config) YIELD value - Load Html page and return the result as a Map")
    public Stream html(@Name("url") String url, @Name(value = "query",defaultValue = "{}") Map query, @Name(value = "config",defaultValue = "{}") Map config) {
        return readHtmlPage(url, query, config);
    }

    private Stream readHtmlPage(String url, Map query, Map config){
        try {
            String charset = config.getOrDefault("charset", "UTF-8").toString();
            // baseUri is used to resolve relative paths
            String baseUri = config.getOrDefault("baseUri", "").toString();

            Document document = Jsoup.parse(Util.openInputStream(url, null, null), charset, baseUri);

            return query.keySet().stream().map(key -> {
                Elements elements = document.select(query.get(key));
                List> resultList = new ArrayList<>();
                getElements(elements, resultList);

                return new MapResult(MapUtil.map(key, resultList));
            });
        } catch(Exception e){
            throw new RuntimeException("Can't read the HTML from: "+ url);
        }
    }

    private void getElements(Elements elements, List> resultList) {
        for (Element element : elements) {
            Map result = new HashMap<>();
            if(element.attributes().size() > 0) result.put("attributes", getAttributes(element));
            if(!element.data().isEmpty())result.put("data", element.data());
            if(element.hasText()) result.put("text", element.text());
            if(!element.val().isEmpty()) result.put("value", element.val());
            if(!element.tagName().isEmpty()) result.put("tagName", element.tagName());

            resultList.add(result);
        }
    }

    private Map getAttributes(Element element) {
        Map attributes = new HashMap<>();
        for (Attribute attribute : element.attributes()) {
            if(!attribute.getValue().isEmpty()) attributes.put(attribute.getKey(), attribute.getValue());
        }

        return attributes;
    }


}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy