
com.jaeksoft.searchlib.crawler.web.process.WebCrawlMaster
OpenSearchServer is a powerful, enterprise-class search engine. Using the web user interface,
the crawlers (web, file, database, ...) and the RESTful API, you can quickly and easily
integrate advanced full-text search capabilities into your application. OpenSearchServer runs
on Windows and Linux/Unix/BSD.
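For orientation before the source itself, here is a minimal, hypothetical sketch of how an
embedding application might drive this class. The ClientCatalog lookup and the
getWebCrawlMaster() accessor are assumptions for illustration; only WebCrawlMaster itself is
defined in the listing below.

import com.jaeksoft.searchlib.Client;
import com.jaeksoft.searchlib.ClientCatalog;
import com.jaeksoft.searchlib.crawler.web.process.WebCrawlMaster;

public class CrawlOnceExample {
    public static void main(String[] args) throws Exception {
        // Assumed index lookup and accessor; not confirmed by this listing.
        Client client = ClientCatalog.getClient("my_index");
        WebCrawlMaster master = client.getWebCrawlMaster();
        // start(true) requests a single crawl session, matching the isOnce()
        // check at the end of runner() in the source below.
        master.start(true);
    }
}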
/**
 * License Agreement for OpenSearchServer
 *
 * Copyright (C) 2008-2016 Emmanuel Keller / Jaeksoft
 *
 * http://www.open-search-server.com
 *
 * This file is part of OpenSearchServer.
 *
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with OpenSearchServer.
 * If not, see <http://www.gnu.org/licenses/>.
 **/
package com.jaeksoft.searchlib.crawler.web.process;

import com.jaeksoft.searchlib.ClientFactory;
import com.jaeksoft.searchlib.Logging;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.config.Config;
import com.jaeksoft.searchlib.crawler.common.database.AbstractManager;
import com.jaeksoft.searchlib.crawler.common.database.FetchStatus;
import com.jaeksoft.searchlib.crawler.common.process.CrawlMasterAbstract;
import com.jaeksoft.searchlib.crawler.common.process.CrawlQueueAbstract;
import com.jaeksoft.searchlib.crawler.common.process.CrawlStatistics;
import com.jaeksoft.searchlib.crawler.common.process.CrawlStatus;
import com.jaeksoft.searchlib.crawler.web.database.*;
import com.jaeksoft.searchlib.crawler.web.database.HostUrlList.ListType;
import com.jaeksoft.searchlib.crawler.web.database.pattern.PatternListMatcher;
import com.jaeksoft.searchlib.crawler.web.sitemap.SiteMapItem;
import com.jaeksoft.searchlib.crawler.web.sitemap.SiteMapList;
import com.jaeksoft.searchlib.crawler.web.sitemap.SiteMapUrl;
import com.jaeksoft.searchlib.crawler.web.spider.Crawl;
import com.jaeksoft.searchlib.crawler.web.spider.HttpDownloader;
import com.jaeksoft.searchlib.function.expression.SyntaxError;
import com.jaeksoft.searchlib.query.ParseException;
import com.jaeksoft.searchlib.scheduler.TaskManager;
import org.apache.commons.lang3.RandomUtils;
import org.apache.commons.lang3.StringUtils;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.*;
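
/**
 * Master controller for the web crawler: runs crawl sessions, builds the
 * per-host URL lists (from sitemaps and the URL database) and dispatches
 * them to WebCrawlThread workers, feeding fetched documents into the
 * UrlCrawlQueue for indexing.
 */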
public class WebCrawlMaster extends CrawlMasterAbstract<WebCrawlMaster, WebCrawlThread> {

    private final LinkedList<NamedItem> hostList;

    private volatile int maxUrlPerSession = 0;

    private final UrlCrawlQueue urlCrawlQueue;

    public WebCrawlMaster(Config config) throws SearchLibException, IOException {
        super(config);
        urlCrawlQueue = new UrlCrawlQueue(config);
        hostList = new LinkedList<>();
        // Auto-start the crawler if it is enabled in the web crawl properties.
        if (config.getWebPropertyManager().getCrawlEnabled().getValue()) {
            Logging.info("Webcrawler is starting for " + config.getIndexName());
            start(false);
        }
    }
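
    // One iteration of the outer loop is one crawl session: snapshot the crawl
    // properties, build the host list (sitemaps first, then the URL database),
    // then hand per-host URL batches to WebCrawlThread workers until the
    // session budget is spent or the crawler is aborted.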
    @Override
    public void runner() throws Exception {
        Config config = getConfig();
        WebPropertyManager propertyManager = config.getWebPropertyManager();
        if (ClientFactory.INSTANCE.properties.isDisableWebCrawler()) {
            abort();
            propertyManager.getCrawlEnabled().setValue(false);
            throw new InterruptedException("The webcrawler is disabled.");
        }
        urlCrawlQueue.setMaxBufferSize(propertyManager.getIndexDocumentBufferSize().getValue());
        while (!isAborted()) {
            currentStats = new CrawlStatistics();
            addStatistics(currentStats);
            urlCrawlQueue.setStatistiques(currentStats);
            final int threadNumber = propertyManager.getMaxThreadNumber().getValue();
            maxUrlPerSession = propertyManager.getMaxUrlPerSession().getValue();
            final int maxUrlPerHost = propertyManager.getMaxUrlPerHost().getValue();
            final PatternListMatcher exclusionMatcher = propertyManager.getExclusionEnabled().getValue() ?
                    config.getExclusionPatternManager().getPatternListMatcher() :
                    null;
            final PatternListMatcher inclusionMatcher = propertyManager.getInclusionEnabled().getValue() ?
                    config.getInclusionPatternManager().getPatternListMatcher() :
                    null;
            final Integer maxDepth = propertyManager.getMaxDepth().getValue();
            String schedulerJobName = propertyManager.getSchedulerAfterSession().getValue();
            synchronized (hostList) {
                hostList.clear();
            }
            extractSiteMapList(inclusionMatcher, exclusionMatcher);
            extractHostList(maxUrlPerHost, maxDepth);
            while (!isAborted()) {
                int howMany = urlLeftPerHost(maxUrlPerHost);
                if (howMany <= 0)
                    break;
                NamedItem host = getNextHost();
                if (host == null)
                    break;
                HostUrlList hostUrlList = getNextUrlList(host, howMany, maxDepth);
                if (hostUrlList == null)
                    continue;
                WebCrawlThread crawlThread = new WebCrawlThread(config, this, currentStats, hostUrlList);
                add(crawlThread);
                while (getThreadsCount() >= threadNumber && !isAborted())
                    sleepSec(5);
            }
            setStatus(CrawlStatus.WAITING_CHILD);
            while (getThreadsCount() > 0) {
                waitForChild(1800);
                if (isAborted())
                    break;
            }
            setStatus(CrawlStatus.INDEXATION);
            urlCrawlQueue.index(true);
            if (schedulerJobName != null && schedulerJobName.length() > 0) {
                setStatus(CrawlStatus.EXECUTE_SCHEDULER_JOB);
                TaskManager.getInstance().executeJob(config.getIndexName(), schedulerJobName);
            }
            if (isOnce())
                break;
            sleepSec(5);
        }
        urlCrawlQueue.index(true);
        setStatus(CrawlStatus.NOT_RUNNING);
    }
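
    // Selects the hosts (and their URL counts) for this session in three passes:
    // FETCH_FIRST priority URLs, then already-fetched URLs whose fetch interval
    // has expired, then never-fetched URLs. Each pass consumes part of the
    // remaining per-session budget (urlLimit).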
    private void extractHostList(final int maxUrlPerHost, final Integer maxDepth)
            throws IOException, ParseException, SyntaxError, URISyntaxException, ClassNotFoundException,
            InterruptedException, SearchLibException, InstantiationException, IllegalAccessException {
        Config config = getConfig();
        UrlManager urlManager = config.getUrlManager();
        setStatus(CrawlStatus.EXTRACTING_HOSTLIST);
        Set<String> hostSet = new TreeSet<>();
        WebPropertyManager propertyManager = config.getWebPropertyManager();
        final Date fetchIntervalDate = AbstractManager.getPastDate(propertyManager.getFetchInterval().getValue(),
                propertyManager.getFetchIntervalUnit().getValue());
        int urlLimit = maxUrlPerSession;
        // First try fetch priority
        NamedItem.Selection selection =
                new NamedItem.Selection(ListType.PRIORITY_URL, FetchStatus.FETCH_FIRST, null, null);
        urlLimit = urlManager.getHostToFetch(selection, urlLimit, maxUrlPerHost, maxDepth, hostList, hostSet);
        // Second try old URLs
        selection = new NamedItem.Selection(ListType.OLD_URL, null, fetchIntervalDate, null);
        urlLimit = urlManager.getHostToFetch(selection, urlLimit, maxUrlPerHost, maxDepth, hostList, hostSet);
        // Finally try new unfetched URLs
        selection = new NamedItem.Selection(ListType.NEW_URL, FetchStatus.UN_FETCHED, null, fetchIntervalDate);
        urlLimit = urlManager.getHostToFetch(selection, urlLimit, maxUrlPerHost, maxDepth, hostList, hostSet);
        currentStats.addHostListSize(hostList.size());
    }
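
    // Loads every configured sitemap and registers the URLs it lists with the
    // UrlManager, applying the inclusion/exclusion pattern matchers first and
    // skipping URLs that are already known.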
    private void extractSiteMapList(final PatternListMatcher inclusionMatcher,
            final PatternListMatcher exclusionMatcher) throws SearchLibException, IOException {
        HttpDownloader httpDownloader = null;
        try {
            httpDownloader = getNewHttpDownloader(true);
            SiteMapList siteMapList = getConfig().getSiteMapList();
            if (siteMapList != null && siteMapList.getArray() != null) {
                setStatus(CrawlStatus.LOADING_SITEMAP);
                UrlManager urlManager = getConfig().getUrlManager();
                List<UrlItem> workInsertUrlList = new ArrayList<>();
                for (SiteMapItem siteMap : siteMapList.getArray()) {
                    // Reuse the downloader created above so it is released in the finally block.
                    Set<SiteMapUrl> siteMapUrlSet = siteMap.load(httpDownloader, null);
                    for (SiteMapUrl siteMapUrl : siteMapUrlSet) {
                        URI uri = siteMapUrl.getLoc();
                        String sUri = uri.toString();
                        URL url;
                        try {
                            url = uri.toURL();
                        } catch (MalformedURLException e) {
                            continue;
                        }
                        if (exclusionMatcher != null)
                            if (exclusionMatcher.matchPattern(url, sUri))
                                continue;
                        if (inclusionMatcher != null)
                            if (!inclusionMatcher.matchPattern(url, sUri))
                                continue;
                        if (!urlManager.exists(sUri)) {
                            workInsertUrlList.add(urlManager
                                    .getNewUrlItem(new LinkItem(sUri, LinkItem.Origin.sitemap, null, 0)));
                        }
                    }
                }
                if (workInsertUrlList.size() > 0)
                    urlManager.updateUrlItems(workInsertUrlList);
            }
        } finally {
            if (httpDownloader != null)
                httpDownloader.release();
        }
    }
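
    // Builds an HttpDownloader from the web crawl properties: user agent
    // (defaulted when none is given), redirect policy, optional proxy handler,
    // and a connection timeout converted from seconds to milliseconds.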
    public HttpDownloader getNewHttpDownloader(boolean followRedirect, String userAgent, boolean useProxies)
            throws SearchLibException, IOException {
        Config config = getConfig();
        WebPropertyManager propertyManager = config.getWebPropertyManager();
        if (StringUtils.isEmpty(userAgent))
            userAgent = propertyManager.getUserAgent().getValue();
        return new HttpDownloader(userAgent, followRedirect, useProxies ? propertyManager.getProxyHandler() : null,
                propertyManager.getConnectionTimeOut().getValue() * 1000);
    }

    final public HttpDownloader getNewHttpDownloader(final boolean followRedirect)
            throws SearchLibException, IOException {
        return getNewHttpDownloader(followRedirect, null, true);
    }
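
    // Removes and returns a random host from the session host list, so that the
    // crawl load is spread across hosts instead of draining them in order.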
    private NamedItem getNextHost() {
        synchronized (hostList) {
            int s = hostList.size();
            if (s > 0) {
                NamedItem host = hostList.remove(RandomUtils.nextInt(0, s));
                if (host != null) {
                    currentStats.incHostCount();
                    return host;
                }
            }
        }
        return null;
    }
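
    // Remaining URL budget for the current session, and the per-host slice of
    // that budget (capped by maxUrlPerHost).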
    protected int urlLeft() {
        return (int) (maxUrlPerSession - currentStats.getFetchedCount());
    }

    private int urlLeftPerHost(int maxUrlPerHost) {
        int leftCount = urlLeft();
        if (leftCount < 0)
            return leftCount;
        if (leftCount > maxUrlPerHost)
            leftCount = maxUrlPerHost;
        return leftCount;
    }
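
    // Asks the UrlManager for the next batch of URLs to fetch for the given
    // host, wrapped in a HostUrlList tagged with the selection list type.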
    private HostUrlList getNextUrlList(final NamedItem host, final int count, final Integer maxDepth)
            throws ParseException, IOException, SyntaxError, URISyntaxException, ClassNotFoundException,
            InterruptedException, SearchLibException, InstantiationException, IllegalAccessException {
        setStatus(CrawlStatus.EXTRACTING_URLLIST);
        setInfo(host.getName());
        UrlManager urlManager = getConfig().getUrlManager();
        List<UrlItem> urlList = new ArrayList<>();
        HostUrlList hostUrlList = new HostUrlList(urlList, host);
        hostUrlList.setListType(host.selection.listType);
        urlManager.getUrlToFetch(host, count, maxDepth, urlList);
        setInfo(null);
        return hostUrlList;
    }
    public boolean isFull() throws IOException {
        return currentStats.getFetchedCount() >= getConfig().getWebPropertyManager().getMaxUrlPerSession().getValue();
    }

    public Crawl getNewCrawl(WebCrawlThread crawlThread) throws SearchLibException, IOException {
        return new Crawl(crawlThread);
    }
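
    // Crawls a single URL synchronously (the "manual crawl" feature). An
    // unknown URL is first registered as a manual-origin UrlItem before the
    // crawl thread runs; the 180 passed to execute() is presumably a timeout
    // in seconds.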
    public WebCrawlThread manualCrawl(URL url, HostUrlList.ListType listType)
            throws SearchLibException, ParseException, IOException, SyntaxError, URISyntaxException,
            ClassNotFoundException, InterruptedException, InstantiationException, IllegalAccessException {
        Config config = getConfig();
        if (currentStats == null)
            currentStats = new CrawlStatistics();
        UrlManager urlManager = config.getUrlManager();
        List<UrlItem> urlItemList = new ArrayList<>();
        UrlItem urlItem = urlManager.getUrlToFetch(url);
        if (urlItem == null)
            urlItem = urlManager.getNewUrlItem(new LinkItem(url.toExternalForm(), LinkItem.Origin.manual, null, 0));
        urlItemList.add(urlItem);
        HostUrlList hostUrlList = new HostUrlList(urlItemList, new NamedItem(url.getHost()));
        hostUrlList.setListType(listType);
        WebCrawlThread crawlThread = new WebCrawlThread(config, this, new CrawlStatistics(), hostUrlList);
        crawlThread.execute(180);
        return crawlThread;
    }
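
    // Accessors required by the CrawlMasterAbstract contract: the shared crawl
    // queue and a typed thread array for the worker pool.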
    public CrawlQueueAbstract getCrawlQueue() {
        return urlCrawlQueue;
    }

    @Override
    protected WebCrawlThread[] getNewArray(int size) {
        return new WebCrawlThread[size];
    }
}