com.jaeksoft.searchlib.crawler.web.process.WebCrawlMaster

OpenSearchServer is a powerful, enterprise-class search engine program. Using the web user interface, the crawlers (web, file, database, ...) and the RESTful API, you can quickly and easily integrate advanced full-text search capabilities into your application. OpenSearchServer runs on Windows and Linux/Unix/BSD.

/**   
 * License Agreement for OpenSearchServer
 *
 * Copyright (C) 2008-2015 Emmanuel Keller / Jaeksoft
 * 
 * http://www.open-search-server.com
 * 
 * This file is part of OpenSearchServer.
 *
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with OpenSearchServer.
 * If not, see <http://www.gnu.org/licenses/>.
 **/

package com.jaeksoft.searchlib.crawler.web.process;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
import java.util.LinkedList;
import java.util.List;
import java.util.Random;
import java.util.Set;
import java.util.TreeSet;

import org.apache.commons.lang3.StringUtils;

import com.jaeksoft.searchlib.Logging;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.config.Config;
import com.jaeksoft.searchlib.crawler.common.database.AbstractManager;
import com.jaeksoft.searchlib.crawler.common.database.FetchStatus;
import com.jaeksoft.searchlib.crawler.common.process.CrawlMasterAbstract;
import com.jaeksoft.searchlib.crawler.common.process.CrawlQueueAbstract;
import com.jaeksoft.searchlib.crawler.common.process.CrawlStatistics;
import com.jaeksoft.searchlib.crawler.common.process.CrawlStatus;
import com.jaeksoft.searchlib.crawler.web.database.HostUrlList;
import com.jaeksoft.searchlib.crawler.web.database.HostUrlList.ListType;
import com.jaeksoft.searchlib.crawler.web.database.LinkItem;
import com.jaeksoft.searchlib.crawler.web.database.NamedItem;
import com.jaeksoft.searchlib.crawler.web.database.UrlCrawlQueue;
import com.jaeksoft.searchlib.crawler.web.database.UrlItem;
import com.jaeksoft.searchlib.crawler.web.database.UrlManager;
import com.jaeksoft.searchlib.crawler.web.database.WebPropertyManager;
import com.jaeksoft.searchlib.crawler.web.database.pattern.PatternListMatcher;
import com.jaeksoft.searchlib.crawler.web.sitemap.SiteMapItem;
import com.jaeksoft.searchlib.crawler.web.sitemap.SiteMapList;
import com.jaeksoft.searchlib.crawler.web.sitemap.SiteMapUrl;
import com.jaeksoft.searchlib.crawler.web.spider.Crawl;
import com.jaeksoft.searchlib.crawler.web.spider.HttpDownloader;
import com.jaeksoft.searchlib.function.expression.SyntaxError;
import com.jaeksoft.searchlib.query.ParseException;
import com.jaeksoft.searchlib.scheduler.TaskManager;

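/**
 * Master process of the web crawler. It runs crawl sessions in a loop: each
 * session loads the sitemaps and builds a host list from priority, old and
 * new URLs, spawns one {@link WebCrawlThread} per host batch up to the
 * configured thread count, waits for the children, indexes the crawl queue,
 * and optionally triggers a scheduler job at the end of the session.
 */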
public class WebCrawlMaster extends
		CrawlMasterAbstract<WebCrawlThread> {

	private final LinkedList<NamedItem> hostList;

	private Date fetchIntervalDate;

	private int maxUrlPerSession;

	private int maxUrlPerHost;

	private final PatternListMatcher exclusionMatcher;

	private final PatternListMatcher inclusionMatcher;

	private final UrlCrawlQueue urlCrawlQueue;

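	/**
	 * Builds the crawl master for the given index configuration: creates the
	 * URL crawl queue, loads the inclusion and exclusion pattern matchers
	 * when they are enabled, and starts the crawler immediately if crawling
	 * is enabled in the web property manager.
	 */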
	public WebCrawlMaster(Config config) throws SearchLibException {
		super(config);
		WebPropertyManager propertyManager = config.getWebPropertyManager();
		urlCrawlQueue = new UrlCrawlQueue(config);
		exclusionMatcher = propertyManager.getExclusionEnabled().getValue() ? config
				.getExclusionPatternManager().getPatternListMatcher() : null;
		inclusionMatcher = propertyManager.getInclusionEnabled().getValue() ? config
				.getInclusionPatternManager().getPatternListMatcher() : null;
		hostList = new LinkedList<NamedItem>();
		if (propertyManager.getCrawlEnabled().getValue()) {
			Logging.info("Webcrawler is starting for " + config.getIndexName());
			start(false);
		}
	}

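	/**
	 * Main session loop. Each iteration resets the statistics, reads the
	 * session limits (thread count, max URLs per session and per host),
	 * extracts the sitemaps and the host list, then hands URL batches to
	 * {@link WebCrawlThread} instances until the limits are reached or the
	 * crawler is aborted.
	 */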
	@Override
	public void runner() throws Exception {
		Config config = getConfig();
		WebPropertyManager propertyManager = config.getWebPropertyManager();
		urlCrawlQueue.setMaxBufferSize(propertyManager
				.getIndexDocumentBufferSize().getValue());
		while (!isAborted()) {

			currentStats = new CrawlStatistics();
			addStatistics(currentStats);
			urlCrawlQueue.setStatistiques(currentStats);

			int threadNumber = propertyManager.getMaxThreadNumber().getValue();
			maxUrlPerSession = propertyManager.getMaxUrlPerSession().getValue();
			maxUrlPerHost = propertyManager.getMaxUrlPerHost().getValue();
			String schedulerJobName = propertyManager
					.getSchedulerAfterSession().getValue();

			synchronized (hostList) {
				hostList.clear();
			}

			extractSiteMapList();
			extractHostList();

			while (!isAborted()) {

				int howMany = urlLeftPerHost();
				if (howMany <= 0)
					break;

				NamedItem host = getNextHost();
				if (host == null)
					break;

				HostUrlList hostUrlList = getNextUrlList(host, howMany);
				if (hostUrlList == null)
					continue;

				WebCrawlThread crawlThread = new WebCrawlThread(config, this,
						currentStats, hostUrlList);
				add(crawlThread);

				while (getThreadsCount() >= threadNumber && !isAborted())
					sleepSec(5);
			}

			setStatus(CrawlStatus.WAITING_CHILD);
			while (getThreadsCount() > 0) {
				waitForChild(1800);
				if (isAborted())
					break;
			}
			setStatus(CrawlStatus.INDEXATION);
			urlCrawlQueue.index(true);
			if (currentStats.getUrlCount() > 0)
				config.getUrlManager().reload(false, null);
			if (schedulerJobName != null && schedulerJobName.length() > 0) {
				setStatus(CrawlStatus.EXECUTE_SCHEDULER_JOB);
				TaskManager.getInstance().executeJob(config.getIndexName(),
						schedulerJobName);
			}

			if (isOnce())
				break;
			sleepSec(5);
		}
		urlCrawlQueue.index(true);
		setStatus(CrawlStatus.NOT_RUNNING);
	}

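	/**
	 * Builds the host list for the current session in three passes: URLs
	 * flagged FETCH_FIRST (priority list), URLs whose last fetch is older
	 * than the configured fetch interval (old list), and unfetched URLs
	 * (new list), within the per-session and per-host limits.
	 */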
	private void extractHostList() throws IOException, ParseException,
			SyntaxError, URISyntaxException, ClassNotFoundException,
			InterruptedException, SearchLibException, InstantiationException,
			IllegalAccessException {
		Config config = getConfig();
		UrlManager urlManager = config.getUrlManager();
		urlManager.reload(false, null);
		setStatus(CrawlStatus.EXTRACTING_HOSTLIST);

		Set<String> hostSet = new TreeSet<String>();

		WebPropertyManager propertyManager = config.getWebPropertyManager();
		fetchIntervalDate = AbstractManager.getPastDate(propertyManager
				.getFetchInterval().getValue(), propertyManager
				.getFetchIntervalUnit().getValue());

		int urlLimit = maxUrlPerSession;
		// First try fetch priority
		NamedItem.Selection selection = new NamedItem.Selection(
				ListType.PRIORITY_URL, FetchStatus.FETCH_FIRST, null, null);
		urlLimit = urlManager.getHostToFetch(selection, urlLimit,
				maxUrlPerHost, hostList, hostSet);

		// Second try old URLs
		selection = new NamedItem.Selection(ListType.OLD_URL, null,
				fetchIntervalDate, null);
		urlLimit = urlManager.getHostToFetch(selection, urlLimit,
				maxUrlPerHost, hostList, hostSet);

		// Finally try new unfetched URLs
		selection = new NamedItem.Selection(ListType.NEW_URL,
				FetchStatus.UN_FETCHED, null, fetchIntervalDate);
		urlLimit = urlManager.getHostToFetch(selection, urlLimit,
				maxUrlPerHost, hostList, hostSet);

		currentStats.addHostListSize(hostList.size());

	}

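	/**
	 * Loads every configured sitemap and inserts the URLs found there into
	 * the URL database, skipping locations rejected by the exclusion or
	 * inclusion pattern matchers and URLs that are already known.
	 */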
	private void extractSiteMapList() throws SearchLibException {
		HttpDownloader httpDownloader = null;
		try {
			httpDownloader = getNewHttpDownloader(true);
			SiteMapList siteMapList = getConfig().getSiteMapList();
			if (siteMapList != null && siteMapList.getArray() != null) {
				setStatus(CrawlStatus.LOADING_SITEMAP);
				UrlManager urlManager = getConfig().getUrlManager();
				List<UrlItem> workInsertUrlList = new ArrayList<UrlItem>();
				for (SiteMapItem siteMap : siteMapList.getArray()) {
					Set<SiteMapUrl> siteMapUrlSet = siteMap.load(
							getNewHttpDownloader(true), null);
					for (SiteMapUrl siteMapUrl : siteMapUrlSet) {

						URI uri = siteMapUrl.getLoc();
						String sUri = uri.toString();
						URL url;
						try {
							url = uri.toURL();
						} catch (MalformedURLException e) {
							continue;
						}

						if (exclusionMatcher != null)
							if (exclusionMatcher.matchPattern(url, sUri))
								continue;
						if (inclusionMatcher != null)
							if (!inclusionMatcher.matchPattern(url, sUri))
								continue;

						if (!urlManager.exists(sUri)) {
							workInsertUrlList.add(urlManager
									.getNewUrlItem(new LinkItem(sUri,
											LinkItem.Origin.sitemap, null)));
						}
					}
				}
				if (workInsertUrlList.size() > 0)
					urlManager.updateUrlItems(workInsertUrlList);
			}
		} finally {
			if (httpDownloader != null)
				httpDownloader.release();
		}
	}

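	/**
	 * Creates a new HTTP downloader, falling back to the user agent
	 * configured in the web property manager when none is given, and wiring
	 * in the proxy handler when useProxies is true.
	 */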
	public HttpDownloader getNewHttpDownloader(boolean followRedirect,
			String userAgent, boolean useProxies) throws SearchLibException {
		Config config = getConfig();
		WebPropertyManager propertyManager = config.getWebPropertyManager();
		if (StringUtils.isEmpty(userAgent))
			userAgent = propertyManager.getUserAgent().getValue();
		return new HttpDownloader(userAgent, followRedirect,
				useProxies ? propertyManager.getProxyHandler() : null);
	}

	final public HttpDownloader getNewHttpDownloader(
			final boolean followRedirect) throws SearchLibException {
		return getNewHttpDownloader(followRedirect, null, true);
	}

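	/**
	 * Removes and returns a randomly chosen host from the host list, which
	 * spreads the crawl load across hosts, or null when the list is empty.
	 */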
	private NamedItem getNextHost() {
		synchronized (hostList) {
			int s = hostList.size();
			if (s > 0) {
				NamedItem host = hostList.remove(new Random().nextInt(s));
				if (host != null) {
					currentStats.incHostCount();
					return host;
				}
			}
		}
		return null;
	}

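	/**
	 * Returns how many URLs may still be fetched in the current session.
	 */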
	protected int urlLeft() {
		return (int) (maxUrlPerSession - currentStats.getFetchedCount());
	}

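	/**
	 * Returns how many URLs may still be fetched for the next host: the
	 * remaining session budget capped by the per-host limit.
	 */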
	private int urlLeftPerHost() {
		int leftCount = urlLeft();
		if (leftCount < 0)
			return leftCount;
		if (leftCount > maxUrlPerHost)
			leftCount = maxUrlPerHost;
		return leftCount;
	}

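	/**
	 * Fetches up to count URLs for the given host from the URL database and
	 * wraps them in a {@link HostUrlList} tagged with the host's list type.
	 */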
	private HostUrlList getNextUrlList(NamedItem host, int count)
			throws ParseException, IOException, SyntaxError,
			URISyntaxException, ClassNotFoundException, InterruptedException,
			SearchLibException, InstantiationException, IllegalAccessException {

		setStatus(CrawlStatus.EXTRACTING_URLLIST);
		setInfo(host.getName());
		UrlManager urlManager = getConfig().getUrlManager();

		List<UrlItem> urlList = new ArrayList<UrlItem>();
		HostUrlList hostUrlList = new HostUrlList(urlList, host);
		hostUrlList.setListType(host.selection.listType);

		urlManager.getUrlToFetch(host, count, urlList);

		setInfo(null);
		return hostUrlList;
	}

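	/**
	 * Returns true when the number of fetched URLs has reached the
	 * per-session limit.
	 */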
	public boolean isFull() throws SearchLibException {
		return currentStats.getFetchedCount() >= getConfig()
				.getWebPropertyManager().getMaxUrlPerSession().getValue();
	}

	public Crawl getNewCrawl(WebCrawlThread crawlThread)
			throws SearchLibException {
		return new Crawl(crawlThread);

	}

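	/**
	 * Crawls a single URL on demand, outside the normal session loop. The
	 * URL is looked up in the URL database or created as a manual entry,
	 * then a dedicated {@link WebCrawlThread} is executed for it.
	 */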
	public WebCrawlThread manualCrawl(URL url, HostUrlList.ListType listType)
			throws SearchLibException, ParseException, IOException,
			SyntaxError, URISyntaxException, ClassNotFoundException,
			InterruptedException, InstantiationException,
			IllegalAccessException {
		Config config = getConfig();
		if (currentStats == null)
			currentStats = new CrawlStatistics();
		UrlManager urlManager = config.getUrlManager();
		List<UrlItem> urlItemList = new ArrayList<UrlItem>();
		UrlItem urlItem = urlManager.getUrlToFetch(url);
		if (urlItem == null)
			urlItem = urlManager.getNewUrlItem(new LinkItem(url
					.toExternalForm(), LinkItem.Origin.manual, null));
		urlItemList.add(urlItem);
		HostUrlList hostUrlList = new HostUrlList(urlItemList, new NamedItem(
				url.getHost()));
		hostUrlList.setListType(listType);
		WebCrawlThread crawlThread = new WebCrawlThread(config, this,
				new CrawlStatistics(), hostUrlList);
		crawlThread.execute(180);
		return crawlThread;

	}

	public CrawlQueueAbstract getCrawlQueue() {
		return urlCrawlQueue;
	}

	@Override
	protected WebCrawlThread[] getNewArray(int size) {
		return new WebCrawlThread[size];
	}

}
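
For context, here is a minimal usage sketch (not part of the original file) showing how a caller might trigger a one-off crawl of a single URL through manualCrawl(). Obtaining the Config instance and constructing the master directly are assumptions made for illustration, and NEW_URL is used only as an example list type.

import java.net.URL;

import com.jaeksoft.searchlib.config.Config;
import com.jaeksoft.searchlib.crawler.web.database.HostUrlList;
import com.jaeksoft.searchlib.crawler.web.process.WebCrawlMaster;
import com.jaeksoft.searchlib.crawler.web.process.WebCrawlThread;

public class ManualCrawlExample {

	// config is assumed to be an already-initialized index configuration
	public static WebCrawlThread crawlOnce(Config config) throws Exception {
		// Note: the constructor starts the session loop automatically when
		// crawling is enabled in the web property manager.
		WebCrawlMaster master = new WebCrawlMaster(config);
		URL url = new URL("http://www.example.com/");
		// NEW_URL is an illustrative choice; pick the list type that fits.
		return master.manualCrawl(url, HostUrlList.ListType.NEW_URL);
	}
}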