/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.goikosoft.crawler4j.crawler;
import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.goikosoft.crawler4j.fetcher.PageFetcherInterface;
import com.goikosoft.crawler4j.frontier.DocIDServer;
import com.goikosoft.crawler4j.frontier.DocIDServerInterface;
import com.goikosoft.crawler4j.frontier.Frontier;
import com.goikosoft.crawler4j.frontier.FrontierInterface;
import com.goikosoft.crawler4j.parser.Parser;
import com.goikosoft.crawler4j.parser.ParserInterface;
import com.goikosoft.crawler4j.robotstxt.RobotstxtServer;
import com.goikosoft.crawler4j.url.TLDList;
import com.goikosoft.crawler4j.url.URLCanonicalizer;
import com.goikosoft.crawler4j.url.WebURL;
import com.goikosoft.crawler4j.util.IO;
import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentConfig;
/**
* The controller that manages a crawling session. This class creates the
* crawler threads and monitors their progress.
*
* Modified by Dario Goikoetxea to be generic
*
* @author Yasser Ganjisaffar
*/
public class GenericCrawlController<CrawlerType extends GenericWebCrawler<ResultType>, ResultType> {
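/*
* Usage sketch, assuming the fork's PageFetcher and RobotstxtServer constructors mirror
* upstream crawler4j; MyCrawler, the List<String> result type and the storage path are
* illustrative placeholders:
*
*   CrawlConfig config = new CrawlConfig();
*   config.setCrawlStorageFolder("/tmp/crawler4j-storage");
*   PageFetcher pageFetcher = new PageFetcher(config);
*   RobotstxtServer robotstxtServer = new RobotstxtServer(new RobotstxtConfig(), pageFetcher);
*   GenericCrawlController<MyCrawler, List<String>> controller =
*       new GenericCrawlController<>(config, pageFetcher, robotstxtServer);
*   controller.addSeed("https://www.example.com/");
*   controller.start(MyCrawler.class, 4);
*   List<List<String>> crawlerData = controller.getCrawlersLocalData();
*/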
static final Logger logger = LoggerFactory.getLogger(GenericCrawlController.class);
private final CrawlConfig config;
/**
* The 'customData' object can be used for passing custom crawl-related
* configurations to different components of the crawler.
*/
protected Object customData;
/**
* Once the crawling session finishes the controller collects the local data
* of the crawler threads and stores them in this List.
*/
protected List<ResultType> crawlersLocalData = new ArrayList<>();
/**
* Is the crawling of this session finished?
*/
protected boolean finished;
private Throwable error;
/**
* Is the crawling session set to 'shutdown'? Crawler threads monitor this
* flag and when it is set they will no longer process new pages.
*/
protected boolean shuttingDown;
protected PageFetcherInterface pageFetcher;
protected RobotstxtServer robotstxtServer;
protected FrontierInterface frontier;
protected DocIDServerInterface docIdServer;
protected TLDList tldList;
protected final Object waitingLock = new Object();
protected final Environment env;
protected ParserInterface parser;
public GenericCrawlController(CrawlConfig config, PageFetcherInterface pageFetcher,
RobotstxtServer robotstxtServer) throws Exception {
this(config, pageFetcher, null, robotstxtServer, null, null, null, null);
}
public GenericCrawlController(CrawlConfig config, PageFetcherInterface pageFetcher,
RobotstxtServer robotstxtServer, TLDList tldList) throws Exception {
this(config, pageFetcher, null, robotstxtServer, tldList, null, null, null);
}
public GenericCrawlController(CrawlConfig config, PageFetcherInterface pageFetcher, ParserInterface parser,
RobotstxtServer robotstxtServer, TLDList tldList) throws Exception {
this(config, pageFetcher, parser, robotstxtServer, tldList, null, null, null);
}
public GenericCrawlController(CrawlConfig config, PageFetcherInterface pageFetcher, ParserInterface parser,
RobotstxtServer robotstxtServer, TLDList tldList,
String docIdDbName, String pendingDbName) throws Exception {
this(config, pageFetcher, parser, robotstxtServer, tldList, docIdDbName, pendingDbName, null);
}
public GenericCrawlController(CrawlConfig config, PageFetcherInterface pageFetcher, ParserInterface parser,
RobotstxtServer robotstxtServer, TLDList tldList,
String docIdDbName, String pendingDbName, String inProcessDbName) throws Exception {
config.validate();
this.config = config;
File folder = new File(config.getCrawlStorageFolder());
if (!folder.exists()) {
if (folder.mkdirs()) {
logger.debug("Created folder: " + folder.getAbsolutePath());
} else {
throw new IOException(
"couldn't create the storage folder: " + folder.getAbsolutePath() +
" does it already exist ?");
}
}
this.tldList = tldList == null ? new TLDList(config) : tldList;
URLCanonicalizer.setHaltOnError(config.isHaltOnError());
boolean resumable = config.isResumableCrawling();
EnvironmentConfig envConfig = new EnvironmentConfig();
envConfig.setAllowCreate(true);
envConfig.setTransactional(resumable);
envConfig.setLocking(resumable);
envConfig.setLockTimeout(config.getDbLockTimeout(), TimeUnit.MILLISECONDS);
File envHome = new File(config.getCrawlStorageFolder() + "/frontier");
if (!envHome.exists()) {
if (envHome.mkdir()) {
logger.debug("Created folder: " + envHome.getAbsolutePath());
} else {
throw new IOException(
"Failed creating the frontier folder: " + envHome.getAbsolutePath());
}
}
if (!resumable) {
IO.deleteFolderContents(envHome);
logger.info("Deleted contents of: " + envHome +
" ( as you have configured resumable crawling to false )");
}
env = new Environment(envHome, envConfig);
docIdServer = new DocIDServer(env, config, docIdDbName);
frontier = createFrontier(config, pendingDbName, inProcessDbName);
this.pageFetcher = pageFetcher;
this.parser = parser == null ? new Parser(config, this.tldList) : parser;
this.robotstxtServer = robotstxtServer;
finished = false;
shuttingDown = false;
robotstxtServer.setCrawlConfig(config);
}
/**
* Creates the Frontier for this instance. Subclasses can create custom Frontiers.
* @param config configuration provided to the CrawlController
* @param pendingDbName name of the database that holds pending URLs
* @param inProcessDbName name of the database that holds in-process URLs
* @return new instance of Frontier
*/
protected Frontier createFrontier(CrawlConfig config, String pendingDbName, String inProcessDbName) {
return new Frontier(env, config, pendingDbName, inProcessDbName);
}
/**
* Creates an empty WebURL. Subclasses can override this to create subclasses of WebURL instead.
* @param nonCanonicalString the URL before canonicalization. It is ignored in the default implementation.
* @return new empty instance of WebURL
*/
protected WebURL createEmptyWebURL(String nonCanonicalString) {
return new WebURL();
}
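/*
* Override sketch: a subclass could return a WebURL subtype here, for example one that
* pre-configures POST data for every seed. PostSeedWebURL is an illustrative name, not a
* class shipped with this library:
*
*   @Override
*   protected WebURL createEmptyWebURL(String nonCanonicalString) {
*       return new PostSeedWebURL();
*   }
*/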
public ParserInterface getParser() {
return parser;
}
public interface WebCrawlerFactory<T extends GenericWebCrawler<?>> {
T newInstance() throws Exception;
}
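/*
* Factory sketch: WebCrawlerFactory has a single abstract method, so a lambda can hand
* each crawler thread externally supplied state; this is the recommended replacement for
* the deprecated setCustomData below. MyCrawler and SharedState are illustrative names:
*
*   SharedState sharedState = new SharedState();
*   WebCrawlerFactory<MyCrawler> factory = () -> new MyCrawler(sharedState);
*   controller.start(factory, 8);
*/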
private static class SingleInstanceFactory<T extends GenericWebCrawler<?>>
implements WebCrawlerFactory<T> {
final T instance;
SingleInstanceFactory(T instance) {
this.instance = instance;
}
@Override
public T newInstance() throws Exception {
return this.instance;
}
}
private static class DefaultWebCrawlerFactory<T extends GenericWebCrawler<?>>
implements WebCrawlerFactory<T> {
final Class<T> clazz;
DefaultWebCrawlerFactory(Class<T> clazz) {
this.clazz = clazz;
}
@Override
public T newInstance() throws Exception {
try {
return clazz.newInstance();
} catch (ReflectiveOperationException e) {
throw e;
}
}
}
/**
* Start the crawling session and wait for it to finish.
* This method utilizes the default crawler factory, which creates new crawlers using Java reflection.
*
* @param clazz
* the class that implements the logic for crawler threads
* @param numberOfCrawlers
* the number of concurrent threads that will be contributing to
* this crawling session.
* @param <T> Your class extending WebCrawler
*/
public <T extends CrawlerType> void start(Class<T> clazz, int numberOfCrawlers) {
this.start(new DefaultWebCrawlerFactory<>(clazz), numberOfCrawlers, true);
}
/**
* Start the crawling session and wait for it to finish.
* This method depends on a single instance of a crawler. Only that instance will be used for crawling.
*
* @param instance
* the instance of a class that implements the logic for crawler threads
* @param <T> Your class extending WebCrawler
*/
public <T extends CrawlerType> void start(T instance) {
this.start(new SingleInstanceFactory<>(instance), 1, true);
}
/**
* Start the crawling session and wait for it to finish.
*
* @param crawlerFactory
* factory to create crawlers on demand for each thread
* @param numberOfCrawlers
* the number of concurrent threads that will be contributing to
* this crawling session.
* @param <T> Your class extending WebCrawler
*/
public <T extends CrawlerType> void start(WebCrawlerFactory<T> crawlerFactory,
int numberOfCrawlers) {
this.start(crawlerFactory, numberOfCrawlers, true);
}
/**
* Start the crawling session and return immediately.
*
* @param crawlerFactory
* factory to create crawlers on demand for each thread
* @param numberOfCrawlers
* the number of concurrent threads that will be contributing to
* this crawling session.
* @param <T> Your class extending WebCrawler
*/
public <T extends CrawlerType> void startNonBlocking(WebCrawlerFactory<T> crawlerFactory,
final int numberOfCrawlers) {
this.start(crawlerFactory, numberOfCrawlers, false);
}
/**
* Start the crawling session and return immediately.
* This method utilizes the default crawler factory, which creates new crawlers using Java reflection.
*
* @param clazz
* the class that implements the logic for crawler threads
* @param numberOfCrawlers
* the number of concurrent threads that will be contributing to
* this crawling session.
* @param <T> Your class extending WebCrawler
*/
public <T extends CrawlerType> void startNonBlocking(Class<T> clazz, int numberOfCrawlers) {
start(new DefaultWebCrawlerFactory<>(clazz), numberOfCrawlers, false);
}
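/*
* Non-blocking usage sketch (MyCrawler is an illustrative crawler class): start the
* crawl, continue with other work, then either request a shutdown or wait for the
* session to drain on its own.
*
*   controller.startNonBlocking(MyCrawler.class, 4);
*   // ... do other work while the crawl is running ...
*   controller.shutdown();         // crawler threads stop taking new pages
*   controller.waitUntilFinish();  // block until the monitor thread has cleaned up
*/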
protected <T extends CrawlerType> void start(final WebCrawlerFactory<T> crawlerFactory,
final int numberOfCrawlers, boolean isBlocking) {
try {
finished = false;
setError(null);
crawlersLocalData.clear();
final List<Thread> threads = new ArrayList<>();
final List<T> crawlers = new ArrayList<>();
for (int i = 1; i <= numberOfCrawlers; i++) {
T crawler = crawlerFactory.newInstance();
Thread thread = new Thread(crawler, "Crawler " + i);
crawler.setThread(thread);
crawler.init(i, this);
thread.start();
crawlers.add(crawler);
threads.add(thread);
logger.info("Crawler {} started", i);
}
final GenericCrawlController<CrawlerType, ResultType> controller = this;
Thread monitorThread = new Thread(new Runnable() {
@Override
public void run() {
try {
synchronized (waitingLock) {
while (true) {
sleep(config.getThreadMonitoringDelaySeconds());
boolean someoneIsWorking = false;
for (int i = 0; i < threads.size(); i++) {
Thread thread = threads.get(i);
if (!thread.isAlive()) {
if (!shuttingDown && !config.isHaltOnError()) {
logger.info("Thread {} was dead, I'll recreate it", i);
T crawler = crawlerFactory.newInstance();
thread = new Thread(crawler, "Crawler " + (i + 1));
threads.remove(i);
threads.add(i, thread);
crawler.setThread(thread);
crawler.init(i + 1, controller);
thread.start();
crawlers.remove(i);
crawlers.add(i, crawler);
}
} else if (crawlers.get(i).isNotWaitingForNewURLs()) {
someoneIsWorking = true;
}
Throwable t = crawlers.get(i).getError();
if (t != null && config.isHaltOnError()) {
throw new RuntimeException(
"error on thread [" + threads.get(i).getName() + "]", t);
}
}
boolean shutOnEmpty = config.isShutdownOnEmptyQueue();
if (!someoneIsWorking && shutOnEmpty) {
// Make sure again that none of the threads are alive.
logger.info(
"It looks like no thread is working, waiting for " +
config.getThreadShutdownDelaySeconds() +
" seconds to make sure...");
sleep(config.getThreadShutdownDelaySeconds());
someoneIsWorking = false;
for (int i = 0; i < threads.size(); i++) {
Thread thread = threads.get(i);
if (thread.isAlive() &&
crawlers.get(i).isNotWaitingForNewURLs()) {
someoneIsWorking = true;
}
}
if (!someoneIsWorking) {
if (!shuttingDown) {
long queueLength = frontier.getQueueLength();
if (queueLength > 0) {
continue;
}
logger.info(
"No thread is working and no more URLs are in " +
"queue waiting for another " +
config.getThreadShutdownDelaySeconds() +
" seconds to make sure...");
sleep(config.getThreadShutdownDelaySeconds());
queueLength = frontier.getQueueLength();
if (queueLength > 0) {
continue;
}
}
logger.info(
"All of the crawlers are stopped. Finishing the " +
"process...");
// At this step, frontier notifies the threads that were
// waiting for new URLs and they should stop
frontier.finish();
for (T crawler : crawlers) {
crawler.onBeforeExit();
crawlersLocalData.add(collectCrawlerData(crawler));
}
logger.info(
"Waiting for " + config.getCleanupDelaySeconds() +
" seconds before final clean up...");
sleep(config.getCleanupDelaySeconds());
frontier.close();
docIdServer.close();
pageFetcher.shutDown();
finished = true;
waitingLock.notifyAll();
env.close();
return;
}
}
}
}
} catch (Throwable e) {
if (config.isHaltOnError()) {
setError(e);
synchronized (waitingLock) {
frontier.finish();
frontier.close();
docIdServer.close();
pageFetcher.shutDown();
waitingLock.notifyAll();
env.close();
}
} else {
logger.error("Unexpected Error", e);
}
} finally {
onFinish();
}
}
});
monitorThread.start();
if (isBlocking) {
waitUntilFinish();
}
} catch (Exception e) {
if (config.isHaltOnError()) {
if (e instanceof RuntimeException) {
throw (RuntimeException)e;
} else {
throw new RuntimeException("error running the monitor thread", e);
}
} else {
logger.error("Error happened", e);
}
}
}
/**
* This function will be called when the monitorThread finishes. Everything has been closed by that moment.
*/
protected void onFinish() {
}
/**
* Wait until this crawling session finishes.
*/
public void waitUntilFinish() {
while (!finished) {
synchronized (waitingLock) {
if (config.isHaltOnError()) {
Throwable t = getError();
if (t != null && config.isHaltOnError()) {
if (t instanceof RuntimeException) {
throw (RuntimeException)t;
} else if (t instanceof Error) {
throw (Error)t;
} else {
throw new RuntimeException("error on monitor thread", t);
}
}
}
if (finished) {
return;
}
try {
waitingLock.wait();
} catch (InterruptedException e) {
logger.error("Error occurred", e);
}
}
}
}
/**
* Once the crawling session finishes, the controller collects the local data of the
* crawler threads and stores them in a List.
* This function returns a reference to this list.
*
* @return List of Objects which are your local data
*/
public List<ResultType> getCrawlersLocalData() {
return crawlersLocalData;
}
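/*
* Collection sketch, assuming each crawler's getMyLocalData() returns a List<String>
* (that is, ResultType is List<String>):
*
*   List<String> allVisitedUrls = new ArrayList<>();
*   for (List<String> localData : controller.getCrawlersLocalData()) {
*       allVisitedUrls.addAll(localData);
*   }
*/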
protected static void sleep(int seconds) {
try {
Thread.sleep(seconds * 1000);
} catch (InterruptedException ignored) {
// Do nothing
}
}
/**
* Extracts data from a crawler. Subclasses can override this in order to fine-tune
* result extraction with custom behaviour or processing.
*
* @param crawler the crawler to collect local data from
* @return the data extracted from the crawler
*/
protected ResultType collectCrawlerData(CrawlerType crawler) {
return crawler.getMyLocalData();
}
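/*
* Override sketch: a subclass declared as GenericCrawlController<MyCrawler, List<String>>
* could defensively copy or post-process each crawler's data before it is stored:
*
*   @Override
*   protected List<String> collectCrawlerData(MyCrawler crawler) {
*       return new ArrayList<>(crawler.getMyLocalData());
*   }
*/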
/**
* Adds a new seed URL. A seed URL is a URL that is fetched by the crawler
* to extract new URLs in it and follow them for crawling.
*
* @param pageUrl
* the URL of the seed
*
* @throws InterruptedException
* @throws IOException
*/
public void addSeed(String pageUrl) throws IOException, InterruptedException {
addSeed(pageUrl, -1);
}
/**
* Adds a new seed URL. A seed URL is a URL that is fetched by the crawler
* to extract new URLs in it and follow them for crawling. You can also
* specify a specific document id to be assigned to this seed URL. This
* document id needs to be unique. Also note that if you add three seeds
* with document ids 1, 2, and 7, then the next URL found during the
* crawl will get a doc id of 8. You also need to make sure that seeds are
* added in increasing order of document ids.
*
* Specifying doc ids is mainly useful when you have had a previous crawl
* and have stored the results and want to start a new crawl with seeds
* which get the same document ids as the previous crawl.
*
* @param pageUrl
* the URL of the seed
* @param docId
* the document id that you want to be assigned to this seed URL.
*
* @throws InterruptedException
* @throws IOException
*/
public void addSeed(String pageUrl, int docId) throws IOException, InterruptedException {
WebURL webUrl = createEmptyWebURL(pageUrl);
webUrl.setURL(pageUrl);
webUrl.setDocid(docId);
addSeed(webUrl);
}
/**
* Adds a new seed URL. A seed URL is a URL that is fetched by the crawler
* to extract new URLs in it and follow them for crawling. You can also
* specify a specific document id to be assigned to this seed URL. This
* document id needs to be unique. Also note that if you add three seeds
* with document ids 1, 2, and 7, then the next URL found during the
* crawl will get a doc id of 8. You also need to make sure that seeds are
* added in increasing order of document ids.
*
* Specifying doc ids is mainly useful when you have had a previous crawl
* and have stored the results and want to start a new crawl with seeds
* which get the same document ids as the previous crawl.
*
* NOTE: It will modify the provided URL to set it to a canonical form.
* It will also set depth 0 and add the tldList to the WebURL.
*
* @param pageUrl
* the URL of the seed
*
* @throws InterruptedException
* @throws IOException
*/
public void addSeed(WebURL pageUrl) throws IOException, InterruptedException {
String canonicalUrl = URLCanonicalizer.getCanonicalURL(pageUrl.getURL());
if (canonicalUrl == null) {
logger.error("Invalid seed URL: {}", pageUrl);
} else {
int docId = pageUrl.getDocid();
pageUrl.setURL(canonicalUrl);
if (docId < 0) {
docId = docIdServer.getDocId(pageUrl);
if (docId > 0) {
logger.trace("This URL is already seen.");
return;
}
docId = docIdServer.getNewDocID(pageUrl);
pageUrl.setDocid(docId);
} else {
try {
docIdServer.addUrlAndDocId(pageUrl);
} catch (RuntimeException e) {
if (config.isHaltOnError()) {
throw e;
} else {
logger.error("Could not add seed: {}", e.getMessage());
}
}
}
pageUrl.setTldList(tldList);
pageUrl.setDepth((short) 0);
if (robotstxtServer.allows(pageUrl)) {
frontier.schedule(pageUrl);
} else {
// using the WARN level here, as the user specifically asked to add this seed
logger.warn("Robots.txt does not allow this seed: {}", pageUrl);
}
}
}
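/*
* Seeding sketch: explicit doc ids must be unique and added in increasing order; without
* a doc id the next free id is assigned automatically (URLs and ids are illustrative):
*
*   controller.addSeed("https://www.example.com/news", 1);
*   controller.addSeed("https://www.example.com/blog", 2);
*   controller.addSeed("https://www.example.com/");      // doc id assigned automatically
*/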
/**
* This function can be called to assign a specific document id to a URL. This
* feature is useful when you have had a previous crawl and have stored the
* Urls and their associated document ids and want to have a new crawl which
* is aware of the previously seen Urls and won't re-crawl them.
*
* Note that if you add three seen URLs with document ids 1, 2, and 7, then
* the next URL found during the crawl will get a doc id of 8. You also need
* to make sure that seen URLs are added in increasing order of document ids.
*
* @param url
* the URL of the page
* @param docId
* the document id that you want to be assigned to this URL.
* @throws UnsupportedEncodingException
*
*
*/
public void addSeenUrl(String url, int docId) throws UnsupportedEncodingException {
WebURL webUrl = createEmptyWebURL(url);
webUrl.setURL(url);
webUrl.setDocid(docId);
addSeenUrl(webUrl);
}
/**
* This function can be called to assign a specific document id to a URL. This
* feature is useful when you have had a previous crawl and have stored the
* Urls and their associated document ids and want to have a new crawl which
* is aware of the previously seen Urls and won't re-crawl them.
*
* Note that if you add three seen URLs with document ids 1, 2, and 7, then
* the next URL found during the crawl will get a doc id of 8. You also need
* to make sure that seen URLs are added in increasing order of document ids.
*
* @param url
* the URL of the page
* @throws UnsupportedEncodingException
*
*/
public void addSeenUrl(WebURL url) throws UnsupportedEncodingException {
String canonicalUrl = URLCanonicalizer.getCanonicalURL(url.getURL());
if (canonicalUrl == null) {
logger.error("Invalid Url: {} (can't cannonicalize it!)", url);
} else {
url.setURL(canonicalUrl);
try {
docIdServer.addUrlAndDocId(url);
} catch (RuntimeException e) {
if (config.isHaltOnError()) {
throw e;
} else {
logger.error("Could not add seen url: {}", e.getMessage());
}
}
}
}
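/*
* Resume sketch: register URLs from a previous crawl so they will not be re-crawled,
* then seed the new crawl (URLs and doc ids are illustrative and must increase):
*
*   controller.addSeenUrl("https://www.example.com/already-crawled-1", 1);
*   controller.addSeenUrl("https://www.example.com/already-crawled-2", 2);
*   controller.addSeed("https://www.example.com/new-section", 3);
*/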
public PageFetcherInterface getPageFetcher() {
return pageFetcher;
}
public void setPageFetcher(PageFetcherInterface pageFetcher) {
this.pageFetcher = pageFetcher;
}
public RobotstxtServer getRobotstxtServer() {
return robotstxtServer;
}
public void setRobotstxtServer(RobotstxtServer robotstxtServer) {
this.robotstxtServer = robotstxtServer;
}
public FrontierInterface getFrontier() {
return frontier;
}
public void setFrontier(FrontierInterface frontier) {
this.frontier = frontier;
}
public DocIDServerInterface getDocIdServer() {
return docIdServer;
}
public void setDocIdServer(DocIDServerInterface docIdServer) {
this.docIdServer = docIdServer;
}
/**
* @deprecated implement a factory {@link WebCrawlerFactory} and inject your custom data as
* shown here.
*/
@Deprecated
public Object getCustomData() {
return customData;
}
/**
* @deprecated implement a factory {@link WebCrawlerFactory} and inject your custom data as
* shown here.
*/
@Deprecated
public void setCustomData(Object customData) {
this.customData = customData;
}
public boolean isFinished() {
return this.finished;
}
public boolean isShuttingDown() {
return shuttingDown;
}
/**
* Set the current crawling session to 'shutdown'. Crawler threads
* monitor the shutdown flag and when it is set to true, they will no longer
* process new pages.
*/
public void shutdown() {
logger.info("Shutting down...");
this.shuttingDown = true;
pageFetcher.shutDown();
frontier.finish();
}
public CrawlConfig getConfig() {
return config;
}
protected synchronized Throwable getError() {
return error;
}
private synchronized void setError(Throwable e) {
this.error = e;
}
public TLDList getTldList() {
return tldList;
}
}