// io.earcam.utilitarian.site.search.offline.Crawler (source listing — Maven / Gradle / Ivy)
/*-
* #%L
* io.earcam.utilitarian.site.search.offline
* %%
* Copyright (C) 2017 earcam
* %%
* SPDX-License-Identifier: (BSD-3-Clause OR EPL-1.0 OR Apache-2.0 OR MIT)
*
* You must choose to accept, in full - any individual or combination of
* the following licenses:
*
* - BSD-3-Clause
* - EPL-1.0
* - Apache-2.0
* - MIT
*
* #L%
*/
package io.earcam.utilitarian.site.search.offline;
import java.net.URI;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Map;
import java.util.function.Predicate;
import java.util.stream.Stream;
import io.earcam.unexceptional.EmeticStream;
// @formatter:off
/**
* Pipeline needs to be built according to definition, with names driven via SPI, e.g.
*
* configuration is just a Map passed to each
*
*
*
*
* default-regex
*
* regex
* regex
*
*
*
* default-html
*
*
* default-pdf
*
*
*
* default-tokenizer
*
*
*
*
* Therefore Filter and Processor both need to extend 'Component'
*
* Component{ String id; void configure(Map) }
*
* Filter imps Predicate
*
* Processor{ process(Document); }
*
*
* HtmlContentParser{ }
*
*
*/
// @formatter:on
/**
 * Crawls one or more local directory trees, producing a lazy {@link Stream} of
 * {@code Document}s, each mapped from its on-disk path under a base directory to
 * the site URI that directory is published at.
 * <p>
 * The pipeline is built fluently ({@link #filter(Predicate)}, {@link #processor(Processor)})
 * and consumed via {@link #documents()}. Not thread-safe; intended for single-shot use.
 */
public class Crawler {

	// Lazily-evaluated pipeline; nothing is walked until a terminal operation runs
	private Stream<Document> documents;


	/**
	 * Creates a crawler spanning all regular files beneath every base directory.
	 *
	 * @param directories local base directory mapped to the base URI it is published under
	 * @return a new {@link Crawler} whose document stream covers all entries
	 */
	public static Crawler crawler(Map<Path, URI> directories)
	{
		Crawler crawler = new Crawler();
		crawler.documents = crawl(directories);
		return crawler;
	}


	private static Stream<Document> crawl(Map<Path, URI> directories)
	{
		return directories.entrySet().parallelStream().flatMap(Crawler::crawl);
	}


	private static Stream<Document> crawl(Map.Entry<Path, URI> pair)
	{
		return crawl(pair.getKey(), pair.getValue());
	}


	private static Stream<Document> crawl(Path baseDir, URI baseUri)
	{
		// EmeticStream wraps the checked IOException thrown by Files::walk.
		// NOTE(review): the Stream returned by Files.walk is never explicitly closed;
		// directory handles are only released on full exhaustion — consider
		// try-with-resources if a pipeline may be abandoned part-way. TODO confirm.
		return EmeticStream.emesis(Files::walk, baseDir)
				.mapToStream()
				.filter(Files::isRegularFile)
				.map(f -> Document.document(baseDir, baseUri, f));
	}


	/**
	 * Appends a filter stage to the pipeline.
	 *
	 * @param filter predicate deciding which documents continue downstream
	 * @return {@code this}, for fluent chaining
	 */
	public Crawler filter(Predicate<? super Document> filter)
	{
		documents = documents.filter(filter);
		return this;
	}


	/**
	 * Appends a processing stage to the pipeline.
	 * <p>
	 * {@code Processor} is passed to {@link Stream#map}, so it is expected to act as a
	 * {@code Function<Document, Document>} — presumably a {@code UnaryOperator<Document>};
	 * verify against the {@code Processor} declaration.
	 *
	 * @param processor transformation applied to each document
	 * @return {@code this}, for fluent chaining
	 */
	public Crawler processor(Processor processor)
	{
		documents = documents.map(processor);
		return this;
	}


	/**
	 * Terminal view of the pipeline: only documents that produced tokens survive.
	 *
	 * @return the crawled, filtered, processed documents for which {@code Document.hasTokens()} holds
	 */
	public Stream<Document> documents()
	{
		return documents.filter(Document::hasTokens);
	}
}