Please wait. This can take some minutes ...
Downloading a project requires a lot of server resources. Please understand that we have to cover our server costs. Thank you in advance.
Project price: only $1
You can buy this project and then download/modify it as often as you want.
apoc.load.LoadHtml Maven / Gradle / Ivy
package apoc.load;
import apoc.Extended;
import apoc.result.MapResult;
import apoc.util.MissingDependencyException;
import apoc.util.FileUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.neo4j.graphdb.GraphDatabaseService;
import org.neo4j.logging.Log;
import org.neo4j.procedure.Context;
import org.neo4j.procedure.Description;
import org.neo4j.procedure.Name;
import org.neo4j.procedure.Procedure;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Supplier;
import java.util.stream.Stream;
import static apoc.load.LoadHtmlBrowser.getChromeInputStream;
import static apoc.load.LoadHtmlBrowser.getFirefoxInputStream;
@Extended
public class LoadHtml {
// public for test purpose
public static final String KEY_ERROR = "errorList";
@Context
public GraphDatabaseService db;
@Context
public Log log;
@Procedure
@Description("apoc.load.htmlPlainText('urlOrHtml',{name: jquery, name2: jquery}, config) YIELD value - Load Html page and return the result as a Map")
public Stream htmlPlainText(@Name("urlOrHtml") String urlOrHtml, @Name(value = "query",defaultValue = "{}") Map query, @Name(value = "config",defaultValue = "{}") Map config) {
return readHtmlPage(urlOrHtml, query, config, HtmlResultInterface.Type.PLAIN_TEXT);
}
@Procedure
@Description("apoc.load.html('url',{name: jquery, name2: jquery}, config) YIELD value - Load Html page and return the result as a Map")
public Stream html(@Name("url") String url, @Name(value = "query",defaultValue = "{}") Map query, @Name(value = "config",defaultValue = "{}") Map config) {
return readHtmlPage(url, query, config, HtmlResultInterface.Type.DEFAULT);
}
private Stream readHtmlPage(String url, Map query, Map conf, HtmlResultInterface.Type type) {
LoadHtmlConfig config = new LoadHtmlConfig(conf);
try {
// baseUri is used to resolve relative paths
Document document = config.isHtmlString()
? Jsoup.parseBodyFragment(url)
: Jsoup.parse(getHtmlInputStream(url, query, config), config.getCharset(), config.getBaseUri());
Map output = new HashMap<>();
List errorList = new ArrayList<>();
query.keySet().forEach(key -> {
final Object value = type.get().getResult(document, query.get(key), config, errorList, log);
output.put(key, value);
});
if (!errorList.isEmpty()) {
output.put(KEY_ERROR, errorList);
}
return Stream.of(new MapResult(output));
} catch (IllegalArgumentException | ClassCastException e) {
throw new RuntimeException("Invalid config: " + config);
} catch (FileNotFoundException e) {
throw new RuntimeException("File not found from: " + url);
} catch(UnsupportedEncodingException e) {
throw new RuntimeException("Unsupported charset: " + config.getCharset());
} catch(Exception e) {
throw new RuntimeException("Can't read the HTML from: "+ url, e);
}
}
private InputStream getHtmlInputStream(String url, Map query, LoadHtmlConfig config) throws IOException {
final boolean isHeadless = config.isHeadless();
final boolean isAcceptInsecureCerts = config.isAcceptInsecureCerts();
switch (config.getBrowser()) {
case FIREFOX:
return withSeleniumBrowser(() -> getFirefoxInputStream(url, query, config, isHeadless, isAcceptInsecureCerts));
case CHROME:
return withSeleniumBrowser(() -> getChromeInputStream(url, query, config, isHeadless, isAcceptInsecureCerts));
default:
return FileUtils.inputStreamFor(url, null, null, null);
}
}
public static List> getElements(Elements elements, LoadHtmlConfig conf, List errorList, Log log) {
List> elementList = new ArrayList<>();
for (Element element : elements) {
withError(element, errorList, conf.getFailSilently(), log, () -> {
Map result = new HashMap<>();
if(element.attributes().size() > 0) result.put("attributes", getAttributes(element));
if(!element.data().isEmpty()) result.put("data", element.data());
if(!element.val().isEmpty()) result.put("value", element.val());
if(!element.tagName().isEmpty()) result.put("tagName", element.tagName());
if (conf.isChildren()) {
if(element.hasText()) result.put("text", element.ownText());
result.put("children", getElements(element.children(), conf, errorList, log));
}
else {
if(element.hasText()) result.put("text", element.text());
}
elementList.add(result);
return null;
});
}
return elementList;
}
private static Map getAttributes(Element element) {
Map attributes = new HashMap<>();
for (Attribute attribute : element.attributes()) {
if (!attribute.hasDeclaredValue() && !Attribute.isBooleanAttribute(attribute.getKey())) {
throw new RuntimeException("Invalid tag " + element);
}
if (!attribute.getValue().isBlank()) {
final String key = attribute.getKey();
// with href/src attribute we prepend baseUri path
final boolean attributeHasLink = key.equals("href") || key.equals("src");
attributes.put(key, attributeHasLink ? element.absUrl(key) : attribute.getValue());
}
}
return attributes;
}
public static T withError(Element element, List errorList, LoadHtmlConfig.FailSilently failConfig, Log log, Supplier fun) {
try {
return fun.get();
} catch (Exception e) {
final String parseError = "Error during parsing element: " + element;
switch (failConfig) {
case WITH_LOG:
log.warn(parseError);
break;
case WITH_LIST:
errorList.add(element.toString());
break;
default:
throw new RuntimeException(parseError);
}
}
return null;
}
private InputStream withSeleniumBrowser(Supplier action) {
try {
return action.get();
} catch (NoClassDefFoundError e) {
throw new MissingDependencyException("Cannot find jars into the plugins folder.\n" +
"See the documentation: https://neo4j.com/labs/apoc/4.1/overview/apoc.load/apoc.load.html/#selenium-depencencies");
}
}
}