
/**
* License Agreement for OpenSearchServer
*
* Copyright (C) 2008-2014 Emmanuel Keller / Jaeksoft
*
* http://www.open-search-server.com
*
* This file is part of OpenSearchServer.
*
* OpenSearchServer is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* OpenSearchServer is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with OpenSearchServer.
* If not, see <http://www.gnu.org/licenses/>.
**/
package com.jaeksoft.searchlib.crawler.web.process;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Iterator;
import java.util.List;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.config.Config;
import com.jaeksoft.searchlib.crawler.common.database.FetchStatus;
import com.jaeksoft.searchlib.crawler.common.database.IndexStatus;
import com.jaeksoft.searchlib.crawler.common.database.ParserStatus;
import com.jaeksoft.searchlib.crawler.common.process.CrawlStatistics;
import com.jaeksoft.searchlib.crawler.common.process.CrawlStatus;
import com.jaeksoft.searchlib.crawler.common.process.CrawlThreadAbstract;
import com.jaeksoft.searchlib.crawler.web.database.HostUrlList;
import com.jaeksoft.searchlib.crawler.web.database.HostUrlList.ListType;
import com.jaeksoft.searchlib.crawler.web.database.UrlCrawlQueue;
import com.jaeksoft.searchlib.crawler.web.database.UrlItem;
import com.jaeksoft.searchlib.crawler.web.database.WebPropertyManager;
import com.jaeksoft.searchlib.crawler.web.database.pattern.PatternListMatcher;
import com.jaeksoft.searchlib.crawler.web.script.WebScriptItem;
import com.jaeksoft.searchlib.crawler.web.script.WebScriptManager;
import com.jaeksoft.searchlib.crawler.web.spider.Crawl;
import com.jaeksoft.searchlib.crawler.web.spider.DownloadItem;
import com.jaeksoft.searchlib.crawler.web.spider.HttpDownloader;
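/**
 * A crawl thread dedicated to a single host: it fetches each URL of the
 * host list while honoring robots.txt, the inclusion/exclusion pattern
 * lists and the configured delay between accesses, then feeds the results
 * to the URL crawl queue for indexation.
 */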
public class WebCrawlThread extends
CrawlThreadAbstract<WebCrawlThread, WebCrawlMaster> {
private UrlItem currentUrlItem;
private long delayBetweenAccesses;
private HttpDownloader httpDownloader;
private HttpDownloader httpDownloaderRobotsTxt;
private long nextTimeTarget;
private HostUrlList hostUrlList;
private Crawl currentCrawl;
private PatternListMatcher exclusionMatcher;
private PatternListMatcher inclusionMatcher;
private UrlCrawlQueue crawlQueue;
private final WebScriptManager webScriptManager;
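/**
 * Builds a crawl thread bound to one host list. The politeness delay,
 * user agent, proxy settings and pattern matchers are read from the web
 * property configuration at construction time.
 */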
protected WebCrawlThread(Config config, WebCrawlMaster crawlMaster,
CrawlStatistics sessionStats, HostUrlList hostUrlList)
throws SearchLibException {
super(config, crawlMaster, null, null);
this.crawlQueue = (UrlCrawlQueue) crawlMaster.getCrawlQueue();
this.currentUrlItem = null;
this.currentCrawl = null;
currentStats = new CrawlStatistics(sessionStats);
WebPropertyManager propertyManager = config.getWebPropertyManager();
delayBetweenAccesses = propertyManager.getDelayBetweenAccesses()
.getValue();
nextTimeTarget = 0;
this.hostUrlList = hostUrlList;
httpDownloader = crawlMaster.getNewHttpDownloader(false);
httpDownloaderRobotsTxt = new HttpDownloader(propertyManager
.getUserAgent().getValue(), false,
propertyManager.getProxyHandler());
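// The matchers stay null when the corresponding feature is disabled,
// so the pattern checks in crawl() are skipped entirely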
exclusionMatcher = propertyManager.getExclusionEnabled().getValue() ? config
.getExclusionPatternManager().getPatternListMatcher() : null;
inclusionMatcher = propertyManager.getInclusionEnabled().getValue() ? config
.getInclusionPatternManager().getPatternListMatcher() : null;
webScriptManager = config.getWebScriptManager();
}
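/**
 * Sleeps until the next allowed access time, enforcing the politeness
 * delay between two consecutive requests to the same host.
 */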
private void sleepInterval() {
long ms = nextTimeTarget - System.currentTimeMillis();
if (ms < 0)
return;
sleepMs(ms);
}
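/**
 * Thread body: executes the host's web scripts once, crawls every URL of
 * the host list (queuing successful crawls, deleting rejected URLs), then
 * asks the crawl queue to index what was collected.
 */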
@Override
public void runner() throws Exception {
List<UrlItem> urlList = hostUrlList.getUrlList();
currentStats.addListSize(urlList.size());
Iterator<UrlItem> iterator = urlList.iterator();
WebCrawlMaster crawlMaster = (WebCrawlMaster) getThreadMaster();
List<WebScriptItem> scriptList = webScriptManager.getItems("http://"
+ hostUrlList.getNamedItem().getName());
if (scriptList != null)
for (WebScriptItem scriptItem : scriptList)
scriptItem.exec(httpDownloader);
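// Crawl each URL of the host, stopping early when the master aborts
// or when the session URL quota is exhausted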
while (iterator.hasNext()) {
ListType listType = hostUrlList.getListType();
if (listType == ListType.NEW_URL || listType == ListType.OLD_URL) {
if (crawlMaster.isAborted())
break;
if (crawlMaster.urlLeft() < 0)
break;
}
currentUrlItem = iterator.next();
currentCrawl = crawl();
if (currentCrawl != null)
crawlQueue.add(currentStats, currentCrawl);
else
crawlQueue.delete(currentStats, currentUrlItem.getUrl());
if (isAborted())
break;
}
setStatus(CrawlStatus.INDEXATION);
crawlQueue.index(!crawlMaster.isRunning());
urlList.clear();
}
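/**
 * Crawls the current URL. The URL is first checked against the inclusion
 * and exclusion pattern lists, then against robots.txt; the download only
 * counts toward the politeness delay when it was not served from cache.
 * Returns null when the URL is rejected by a pattern list.
 */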
private Crawl crawl() throws SearchLibException {
Config config = getConfig();
setStatus(CrawlStatus.CRAWL);
currentStats.incUrlCount();
Crawl crawl = ((WebCrawlMaster) getThreadMaster()).getNewCrawl(this);
try {
// Check the url
URL url = currentUrlItem.getURL();
// Check if url is allowed by pattern list
if (url != null)
if (inclusionMatcher != null
&& !inclusionMatcher.matchPattern(url, null)) {
currentUrlItem
.setFetchStatus(FetchStatus.NOT_IN_INCLUSION_LIST);
url = null;
}
if (url != null)
if (exclusionMatcher != null
&& exclusionMatcher.matchPattern(url, null)) {
currentUrlItem
.setFetchStatus(FetchStatus.BLOCKED_BY_EXCLUSION_LIST);
url = null;
}
if (url == null)
return null;
// Fetch started
currentStats.incFetchedCount();
sleepInterval();
setStatus(CrawlStatus.CRAWL);
// NextTimeTarget is immediate by default
nextTimeTarget = System.currentTimeMillis();
if (crawl.checkRobotTxtAllow(httpDownloaderRobotsTxt)) {
DownloadItem downloadItem = crawl.download(httpDownloader);
// If we really crawled the content we honor the pause
if (downloadItem == null || !downloadItem.isFromCache())
nextTimeTarget += delayBetweenAccesses * 1000;
else
currentStats.incFromCacheCount();
}
if (currentUrlItem.getFetchStatus() == FetchStatus.FETCHED
&& currentUrlItem.getParserStatus() == ParserStatus.PARSED
&& currentUrlItem.getIndexStatus() != IndexStatus.META_NOINDEX) {
currentUrlItem.setIndexStatus(IndexStatus.TO_INDEX);
currentStats.incParsedCount();
config.getScreenshotManager().capture(url,
crawl.getCredentialItem(), true, 120);
} else
currentStats.incIgnoredCount();
} catch (MalformedURLException | URISyntaxException
| ClassNotFoundException e) {
crawl.setError(e.getMessage());
currentUrlItem.setFetchStatus(FetchStatus.URL_ERROR);
}
return crawl;
}
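// Synchronized accessors allowing other threads to safely inspect the
// crawl in progress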
public UrlItem getCurrentUrlItem() {
synchronized (this) {
return currentUrlItem;
}
}
public Crawl getCurrentCrawl() {
synchronized (this) {
return currentCrawl;
}
}
public HostUrlList getHostUrlList() {
synchronized (this) {
return hostUrlList;
}
}
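/**
 * Releases both HTTP downloaders (the page fetcher and the robots.txt
 * fetcher) before delegating to the parent release.
 */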
@Override
public void release() {
if (httpDownloader != null)
httpDownloader.release();
if (httpDownloaderRobotsTxt != null)
httpDownloaderRobotsTxt.release();
super.release();
}
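/** Returns the URL currently being crawled, for monitoring display. */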
@Override
protected String getCurrentInfo() {
if (currentUrlItem == null)
return "";
return currentUrlItem.getUrl();
}
}