com.metreeca.xml.actions.Crawl Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of metreeca-xml Show documentation
Connector kit for XML data.
The newest version!
/*
 * Copyright © 2013-2022 Metreeca srl
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.metreeca.xml.actions;

import com.metreeca.rest.*;
import com.metreeca.rest.actions.*;
import com.metreeca.xml.formats.HTMLFormat;

import org.w3c.dom.Document;
import org.w3c.dom.Node;

import java.net.URI;
import java.net.URISyntaxException;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.*;
import java.util.function.BiPredicate;
import java.util.function.Function;
import java.util.stream.Stream;

import static com.metreeca.rest.Request.HEAD;
import static com.metreeca.xml.formats.HTMLFormat.html;

import static java.lang.Runtime.getRuntime;

/**
 * Site crawling.
 *
 * Maps site root URLs to streams of URLs for HTML site pages.
 */
public final class Crawl implements Function> {

	// !!! inline after linking context to threads in the execution service

	private final Function> head=new Query(request -> request.method(HEAD));
	private final Function> get=new Query();

	private final Function, Optional> parse=new Parse<>(html()); // !!! support xhtml


	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

	// !!! honour robots.txt
	// !!! session state


	private int threads;

	private Fetch fetch=new Fetch();

	private Function> focus=Optional::of;
	private BiPredicate prune=(root, link) -> true;


	/**
	 * Configures the number of concurrent requests (defaults to the number of processors)
	 *
	 * @param threads the maximum number of concurrent resource fetches; equivalent to the number of system
	 *                processors if equal to zero
	 *
	 * @return this action
	 *
	 * @throws IllegalArgumentException if {@code threads} is negative
	 */
	public Crawl threads(final int threads) {

		if ( threads < 0 ) {
			throw new IllegalArgumentException("negative thread count");
		}

		this.threads=threads;

		return this;
	}

	/**
	 * Configures the fetch action (defaults to {@link Fetch}.
	 *
	 * @param fetch the action used to fetch pages
	 *
	 * @return this action
	 *
	 * @throws NullPointerException if {@code fetch} is null
	 */
	public Crawl fetch(final Fetch fetch) {

		if ( fetch == null ) {
			throw new NullPointerException("null fetch");
		}

		this.fetch=fetch;

		return this;
	}

	/**
	 * Configures the content focus action (defaults to the identity function).
	 *
	 * @param focus a function taking as argument an element and returning an optional partial/restructured focus
	 *              element, if one was identified, or an empty optional, otherwise
	 *
	 * @return this action
	 *
	 * @throws NullPointerException if {@code focus} is null
	 */
	public Crawl focus(final Function> focus) {

		if ( focus == null ) {
			throw new NullPointerException("null focus");
		}

		this.focus=focus;

		return this;
	}

	/**
	 * Configures the prune action (defaults to always pass).
	 *
	 * @param prune a bi-predicate taking as arguments the site root URL and a link URL and returning {@code true} if
	 *              the link targets a site page or {@code false} otherwise
	 *
	 * @return this action
	 *
	 * @throws NullPointerException if {@code prune} is null
	 */
	public Crawl prune(final BiPredicate prune) {

		if ( prune == null ) {
			throw new NullPointerException("null prune");
		}

		this.prune=prune;

		return this;
	}


	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

	/**
	 * Crawls a site.
	 *
	 * @param root the root URL of the site to be crawled
	 *
	 * @return a stream of links to nested HTML pages reachable from the root {@code root}; empty if {@code root} is
	 * null or empty
	 */
	@Override public Stream apply(final String root) {
		return root == null || root.isEmpty() ? Stream.empty() : new Crawler(root).crawl();
	}


	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

	private final class Crawler {

		private final String root;
		private final Map pages=new ConcurrentHashMap<>();

		private final Phaser phaser=new Phaser(); // !!! handle 65k limit with tiered phasers
		private final ExecutorService executor=Executors.newFixedThreadPool(
				threads > 0 ? threads : getRuntime().availableProcessors()
				// !!! custom thread factory for linking context
		);


		private Crawler(final String root) {
			this.root=root;
		}


		private Stream crawl() {
			try {

				phaser.register();

				crawl(root);

				phaser.arriveAndAwaitAdvance();

				return pages
						.entrySet().stream()
						.filter(Map.Entry::getValue)
						.map(Map.Entry::getKey);


			} finally {

				executor.shutdown();

			}
		}


		private void crawl(final String page) {
			if ( pages.putIfAbsent(page, false) == null ) { // mark as pending

				phaser.register();

				executor.execute(() -> {
					try {

						Xtream

								.of(page)

								.filter(link -> Xtream.of(link)

										.optMap(head)
										.optMap(fetch)

										.anyMatch(response -> response
												.header("Content-Type")
												.filter(HTMLFormat.MIMEPattern.asPredicate())
												.isPresent()
										)

								)

								.optMap(get)
								.optMap(fetch)

								.optMap(parse)
								.optMap(focus)

								.peek(node -> pages.put(page, true)) // successfully processed

								.flatMap(new XPath<>(p -> p.links("//html:a/@href")))

								.map(new Regex<>(r -> r.replace("#.*$", ""))) // remove anchor
								.map(new Regex<>(r1 -> r1.replace("\\?.*$", ""))) // remove query // !!! ?

								.filter(link -> { // keep only nested resources
									try {

										final URI origin=new URI(root).normalize();
										final URI target=new URI(link).normalize();

										return !origin.relativize(target).equals(target);

									} catch ( final URISyntaxException e ) {

										return false;

									}
								})

								.filter(link -> prune.test(root, link))

								.forEach(this::crawl);

					} finally {

						phaser.arrive();

					}
				});

			}
		}

	}

}