
/*-
 * #%L
 * io.earcam.utilitarian.site.search.offline
 * %%
 * Copyright (C) 2017 earcam
 * %%
 * SPDX-License-Identifier: (BSD-3-Clause OR EPL-1.0 OR Apache-2.0 OR MIT)
 * 
 * You must choose to accept, in full - any individual or combination of
 * the following licenses: BSD-3-Clause, EPL-1.0, Apache-2.0, MIT
 * 
 * #L%
 */
package io.earcam.utilitarian.site.search.offline;

import java.net.URI;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Map;
import java.util.function.Predicate;
import java.util.stream.Stream;

import io.earcam.unexceptional.EmeticStream;

// @formatter:off
/**
 * Pipeline needs to be built according to definition, with names driven via SPI, e.g.
 *
 * configuration is just a Map passed to each
 *
 * <pre>
 *    filter:
 *       id: default-regex
 *       configuration:
 *          regex
 *          regex
 *
 *    processor:
 *       id: default-html
 *
 *    processor:
 *       id: default-pdf
 *
 *    processor:
 *       id: default-tokenizer
 * </pre>
 *
 * Therefore Filter and Processor both need to extend 'Component'
 *
 * Component{ String id; void configure(Map) }
 *
 * Filter implements Predicate
 *
 * Processor{ process(Document); }
 *
 * HtmlContentParser{ }
 */
// @formatter:on
public class Crawler {

	private Stream<Document> documents;


	public static Crawler crawler(Map<Path, URI> directories)
	{
		Crawler crawler = new Crawler();
		crawler.documents = crawl(directories);
		return crawler;
	}


	private static Stream<Document> crawl(Map<Path, URI> directories)
	{
		return directories.entrySet().parallelStream().flatMap(Crawler::crawl);
	}


	private static Stream<Document> crawl(Map.Entry<Path, URI> pair)
	{
		return crawl(pair.getKey(), pair.getValue());
	}


	private static Stream<Document> crawl(Path baseDir, URI baseUri)
	{
		// EmeticStream (io.earcam.unexceptional) wraps Files.walk, converting the
		// checked IOException to unchecked, so the walk composes as a plain Stream
		return EmeticStream.emesis(Files::walk, baseDir)
				.mapToStream()
				.filter(Files::isRegularFile)
				.map(f -> Document.document(baseDir, baseUri, f));
	}


	public Crawler filter(Predicate<Document> filter)
	{
		documents = documents.filter(filter);
		return this;
	}


	public Crawler processor(Processor processor)
	{
		documents = documents.map(processor);
		return this;
	}


	public Stream<Document> documents()
	{
		// only documents that yielded tokens are worth emitting to the index
		return documents.filter(Document::hasTokens);
	}
}
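
To make the Javadoc's outline concrete, below is a minimal, self-contained sketch of the component shapes it describes: Component carrying an id and a configure(Map) hook, Filter as a Predicate over Document, and Processor as a Document-to-Document function (modelled as UnaryOperator so that Crawler.processor(..) can hand it straight to Stream.map(..)). The Document stand-in, RegexFilter and every name here are illustrative assumptions, not this module's published API.

import java.util.Collections;
import java.util.Map;
import java.util.function.Predicate;
import java.util.function.UnaryOperator;
import java.util.regex.Pattern;

// Minimal stand-in for the Document class in this package (illustrative only)
final class Document {
	final String path;
	Document(String path) { this.path = path; }
}

// Component: an SPI-addressable pipeline element;
// "configuration is just a Map passed to each"
interface Component {
	String id();
	void configure(Map<String, String> configuration);
}

// Filter: a Component deciding whether a Document continues down the pipeline
interface Filter extends Component, Predicate<Document> {}

// Processor: modelled as UnaryOperator<Document> so it slots into Stream.map(..),
// matching the Javadoc's "Processor{ process(Document); }"
interface Processor extends Component, UnaryOperator<Document> {}

// An illustrative "default-regex" filter: rejects documents whose path matches the configured regex
final class RegexFilter implements Filter {

	private Pattern pattern = Pattern.compile(".^");   // matches nothing until configured

	@Override
	public String id()
	{
		return "default-regex";
	}

	@Override
	public void configure(Map<String, String> configuration)
	{
		pattern = Pattern.compile(configuration.getOrDefault("regex", ".^"));
	}

	@Override
	public boolean test(Document document)
	{
		return !pattern.matcher(document.path).matches();
	}
}

public class ComponentSketch {

	public static void main(String[] args)
	{
		RegexFilter filter = new RegexFilter();
		filter.configure(Collections.singletonMap("regex", ".*\\.min\\.js"));

		System.out.println(filter.test(new Document("index.html")));      // true  - kept
		System.out.println(filter.test(new Document("scripts.min.js")));  // false - filtered out
	}
}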



