
// apoc.load.LoadHtml Maven / Gradle / Ivy
package apoc.load;
import apoc.Extended;
import apoc.result.MapResult;
import apoc.util.MissingDependencyException;
import apoc.util.FileUtils;
import java.nio.charset.UnsupportedCharsetException;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.neo4j.graphdb.GraphDatabaseService;
import org.neo4j.graphdb.security.URLAccessChecker;
import org.neo4j.logging.Log;
import org.neo4j.procedure.Context;
import org.neo4j.procedure.Description;
import org.neo4j.procedure.Name;
import org.neo4j.procedure.Procedure;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Supplier;
import java.util.stream.Stream;
import static apoc.load.LoadHtmlBrowser.getChromeInputStream;
import static apoc.load.LoadHtmlBrowser.getFirefoxInputStream;
@Extended
public class LoadHtml {
// The constants below are public so that tests can reference them.

// Key under which readHtmlPage stores the collected error list in the result map
// (only added when that list is not empty).
public static final String KEY_ERROR = "errorList";
// Message prefix used by readHtmlPage when an IllegalArgumentException or
// ClassCastException is raised while loading the page.
public static final String INVALID_CONFIG_ERR = "Invalid config: ";
// Message prefix used by readHtmlPage when the configured charset is rejected
// with an UnsupportedCharsetException; the charset name is appended to it.
public static final String UNSUPPORTED_CHARSET_ERR = "Unsupported charset: ";
// Installation hint for the optional Selenium dependencies jar.
// NOTE(review): not referenced in the visible part of this class; presumably reported
// when the Selenium classes cannot be loaded - confirm against the rest of the file.
public static final String SELENIUM_MISSING_DEPS_ERROR = """
Cannot find the Selenium client jar.
Please put the apoc-selenium-dependencies-5.x.x-all.jar into plugin folder.
See the documentation: https://neo4j.com/labs/apoc/5/overview/apoc.load/apoc.load.html/#selenium-dependencies""";
// Fields annotated with @Context; they are never assigned in this class.
// NOTE(review): presumably injected by the Neo4j procedure framework before a
// procedure is invoked - confirm.

// Database handle. Not used by the methods visible in this part of the file.
@Context
public GraphDatabaseService db;
// Logger handed to the HTML result extractor in readHtmlPage.
@Context
public Log log;
// Passed to FileUtils.inputStreamFor when the page is loaded without a Selenium browser.
@Context
public URLAccessChecker urlAccessChecker;
/**
 * Procedure entry point that loads an HTML page (or HTML text, depending on the config)
 * and extracts the requested selectors using the {@code PLAIN_TEXT} result type.
 *
 * @param urlOrHtml location of the page, or the HTML itself
 * @param query     map of result name to selector; defaults to an empty map
 * @param config    loader configuration; defaults to an empty map
 * @return the stream produced by {@code readHtmlPage}
 */
@Procedure
@Description("apoc.load.htmlPlainText('urlOrHtml',{name: jquery, name2: jquery}, config) YIELD value - Load Html page and return the result as a Map")
public Stream htmlPlainText(
        @Name("urlOrHtml") String urlOrHtml,
        @Name(value = "query",defaultValue = "{}") Map query,
        @Name(value = "config",defaultValue = "{}") Map config) {
    final HtmlResultInterface.Type resultType = HtmlResultInterface.Type.PLAIN_TEXT;
    return readHtmlPage(urlOrHtml, query, config, resultType);
}
/**
 * Procedure entry point that loads an HTML page and extracts the requested selectors
 * using the {@code DEFAULT} result type.
 *
 * @param url    location of the page to load
 * @param query  map of result name to selector; defaults to an empty map
 * @param config loader configuration; defaults to an empty map
 * @return the stream produced by {@code readHtmlPage}
 */
@Procedure
@Description("apoc.load.html('url',{name: jquery, name2: jquery}, config) YIELD value - Load Html page and return the result as a Map")
public Stream html(
        @Name("url") String url,
        @Name(value = "query",defaultValue = "{}") Map query,
        @Name(value = "config",defaultValue = "{}") Map config) {
    final HtmlResultInterface.Type resultType = HtmlResultInterface.Type.DEFAULT;
    return readHtmlPage(url, query, config, resultType);
}
/**
 * Parses an HTML document and evaluates every entry of {@code query} against it.
 *
 * <p>When the config says the first argument is an HTML string, it is parsed directly as a
 * body fragment; otherwise the content is read from the stream opened by
 * {@code getHtmlInputStream}, using the configured charset and base URI.
 *
 * @param url   location of the page, or the HTML text itself when the config says so
 * @param query map of result name to selector; each value is handed to the result extractor
 * @param conf  raw configuration map, wrapped into a {@code LoadHtmlConfig}
 * @param type  supplies the extractor that turns a selector into a result value
 * @return a single-element stream whose {@code MapResult} holds one entry per query key,
 *         plus {@code KEY_ERROR} when the extractor reported errors
 * @throws RuntimeException for every failure: unsupported charset, invalid config,
 *                          missing file, or any other error while reading or parsing
 */
private Stream readHtmlPage(String url, Map query, Map conf, HtmlResultInterface.Type type) {
    LoadHtmlConfig config = new LoadHtmlConfig(conf);
    try {
        Document document;
        if (config.isHtmlString()) {
            document = Jsoup.parseBodyFragment(url);
        } else {
            // Close the stream ourselves: this guarantees it is released even when the
            // charset/baseUri lookup or the parser throws, independently of whether the
            // jsoup version in use closes it. A failure while closing is reported by the
            // generic handler below.
            try (InputStream htmlStream = getHtmlInputStream(url, query, config)) {
                // baseUri is used to resolve relative paths
                document = Jsoup.parse(htmlStream, config.getCharset(), config.getBaseUri());
            }
        }

        Map output = new HashMap<>();
        List errorList = new ArrayList<>();
        for (var key : query.keySet()) {
            // the extractor is obtained from the type once per key
            final Object value = type.get().getResult(document, query.get(key), config, errorList, log);
            output.put(key, value);
        }
        if (!errorList.isEmpty()) {
            output.put(KEY_ERROR, errorList);
        }
        return Stream.of(new MapResult(output));
    } catch (UnsupportedCharsetException e) {
        // Must precede the IllegalArgumentException handler (it is a subclass of it).
        // NOTE(review): this handler and the next two do not chain the cause; kept as is
        // because callers/tests may depend on the reported exception - confirm before changing.
        throw new RuntimeException(UNSUPPORTED_CHARSET_ERR + config.getCharset());
    } catch (IllegalArgumentException | ClassCastException e) {
        throw new RuntimeException(INVALID_CONFIG_ERR + e.getMessage());
    } catch (FileNotFoundException e) {
        throw new RuntimeException("File not found from: " + url);
    } catch (Exception e) {
        throw new RuntimeException("Can't read the HTML from: "+ url, e);
    }
}
/**
 * Opens the stream the HTML content is read from, chosen by the configured browser.
 *
 * <p>{@code FIREFOX} and {@code CHROME} go through the matching {@code LoadHtmlBrowser}
 * helper, wrapped in {@code withSeleniumBrowser}; any other setting reads the URL with
 * {@code FileUtils.inputStreamFor} using the injected {@code urlAccessChecker}.
 *
 * @param url    location of the page
 * @param query  selector map, forwarded to the browser helpers
 * @param config loader configuration
 * @return the stream to parse
 * @throws IOException declared for the non-browser path through {@code FileUtils.inputStreamFor}
 */
private InputStream getHtmlInputStream(String url, Map query, LoadHtmlConfig config) throws IOException {
    // Both flags are read up front, before looking at the browser, for every browser setting.
    final boolean headless = config.isHeadless();
    final boolean acceptInsecureCerts = config.isAcceptInsecureCerts();

    return switch (config.getBrowser()) {
        case FIREFOX -> withSeleniumBrowser(
                () -> getFirefoxInputStream(url, query, config, headless, acceptInsecureCerts));
        case CHROME -> withSeleniumBrowser(
                () -> getChromeInputStream(url, query, config, headless, acceptInsecureCerts));
        default -> FileUtils.inputStreamFor(url, null, null, null, urlAccessChecker);
    };
}
public static List
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy