All downloads are free. The search and download functionality uses the official Maven repository.

com.jaeksoft.searchlib.crawler.web.spider.Crawl Maven / Gradle / Ivy

Go to download

OpenSearchServer is a powerful, enterprise-class search engine program. Using the web user interface, the crawlers (web, file, database, ...) and the REST/RESTful API, you will be able to quickly and easily integrate advanced full-text search capabilities into your application. OpenSearchServer runs on Windows and Linux/Unix/BSD.

There is a newer version: 1.5.14
Show newest version
/**   
 * License Agreement for OpenSearchServer
 *
 * Copyright (C) 2008-2014 Emmanuel Keller / Jaeksoft
 * 
 * http://www.open-search-server.com
 * 
 * This file is part of OpenSearchServer.
 *
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with OpenSearchServer. 
 *  If not, see <http://www.gnu.org/licenses/>.
 **/

package com.jaeksoft.searchlib.crawler.web.spider;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.List;

import org.apache.commons.io.FilenameUtils;

import com.jaeksoft.searchlib.Client;
import com.jaeksoft.searchlib.ClientCatalog;
import com.jaeksoft.searchlib.Logging;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.analysis.LanguageEnum;
import com.jaeksoft.searchlib.config.Config;
import com.jaeksoft.searchlib.crawler.FieldMap;
import com.jaeksoft.searchlib.crawler.common.database.FetchStatus;
import com.jaeksoft.searchlib.crawler.common.database.IndexStatus;
import com.jaeksoft.searchlib.crawler.common.database.ParserStatus;
import com.jaeksoft.searchlib.crawler.web.database.CookieItem;
import com.jaeksoft.searchlib.crawler.web.database.CookieManager;
import com.jaeksoft.searchlib.crawler.web.database.CredentialItem;
import com.jaeksoft.searchlib.crawler.web.database.CredentialManager;
import com.jaeksoft.searchlib.crawler.web.database.HostUrlList;
import com.jaeksoft.searchlib.crawler.web.database.LinkItem;
import com.jaeksoft.searchlib.crawler.web.database.LinkItem.Origin;
import com.jaeksoft.searchlib.crawler.web.database.PatternManager;
import com.jaeksoft.searchlib.crawler.web.database.RobotsTxtStatus;
import com.jaeksoft.searchlib.crawler.web.database.UrlFilterItem;
import com.jaeksoft.searchlib.crawler.web.database.UrlItem;
import com.jaeksoft.searchlib.crawler.web.database.UrlManager;
import com.jaeksoft.searchlib.crawler.web.database.WebPropertyManager;
import com.jaeksoft.searchlib.crawler.web.process.WebCrawlThread;
import com.jaeksoft.searchlib.crawler.web.robotstxt.RobotsTxt;
import com.jaeksoft.searchlib.index.FieldContent;
import com.jaeksoft.searchlib.index.IndexDocument;
import com.jaeksoft.searchlib.parser.HtmlParser;
import com.jaeksoft.searchlib.parser.Parser;
import com.jaeksoft.searchlib.parser.ParserFieldEnum;
import com.jaeksoft.searchlib.parser.ParserResultItem;
import com.jaeksoft.searchlib.parser.ParserSelector;
import com.jaeksoft.searchlib.plugin.IndexPluginList;
import com.jaeksoft.searchlib.schema.FieldValueItem;
import com.jaeksoft.searchlib.streamlimiter.LimitException;
import com.jaeksoft.searchlib.streamlimiter.StreamLimiter;
import com.jaeksoft.searchlib.util.IOUtils;
import com.jaeksoft.searchlib.util.LinkUtils;

/**
 * Holds the state of the crawl of a single {@link UrlItem}: the download
 * outcome, the parser that handled the content, the last error message, the
 * index documents built from the parser results and the links discovered
 * while crawling.
 * 
 * The methods that lazily build cached state ({@link #download},
 * {@link #getTargetIndexDocuments()} and {@link #getDiscoverLinks()})
 * synchronize on this instance.
 */
public class Crawl {

	/** Lazily built by {@link #getTargetIndexDocuments()}; null until then. */
	private List<IndexDocument> targetIndexDocuments;
	private HostUrlList hostUrlList;
	private final UrlItem urlItem;
	private CredentialManager credentialManager;
	private CookieManager cookieManager;
	/** Credential matched for the URL by {@link #download}; null before. */
	private CredentialItem credentialItem;
	private String userAgent;
	private ParserSelector parserSelector;
	private Config config;
	/** Parser returned by the parser selector; null if nothing was parsed. */
	private Parser parser;
	private String error;
	/** Lazily built by {@link #getDiscoverLinks()}; null until then. */
	private List<LinkItem> discoverLinks;
	private FieldMap urlFieldMap;
	/** Redirect location reported by the download, if any. */
	private URI redirectUrlLocation;
	private boolean inclusionEnabled;
	private boolean exclusionEnabled;
	private boolean robotsTxtEnabled;

	/**
	 * Creates a crawl for the given URL item. The managers, the field map,
	 * the user agent and the inclusion / exclusion / robots.txt switches are
	 * read from the given configuration. {@code urlItem.setWhenNow()} is
	 * called as part of the construction.
	 * 
	 * @param hostUrlList
	 *            the host URL list this URL item belongs to
	 * @param urlItem
	 *            the URL item to crawl
	 * @param config
	 *            the configuration providing managers and properties
	 * @param parserSelector
	 *            the parser selector; when null the content is not parsed
	 * @throws SearchLibException
	 *             if the configuration cannot be read
	 */
	public Crawl(HostUrlList hostUrlList, UrlItem urlItem, Config config,
			ParserSelector parserSelector) throws SearchLibException {
		this.credentialManager = config.getWebCredentialManager();
		this.cookieManager = config.getWebCookieManager();
		this.credentialItem = null;
		WebPropertyManager propertyManager = config.getWebPropertyManager();
		this.hostUrlList = hostUrlList;
		this.targetIndexDocuments = null;
		this.urlFieldMap = config.getWebCrawlerFieldMap();
		this.discoverLinks = null;
		this.urlItem = urlItem;
		this.urlItem.setWhenNow();
		this.userAgent = propertyManager.getUserAgent().getValue();
		this.parser = null;
		this.parserSelector = parserSelector;
		this.config = config;
		this.error = null;
		this.redirectUrlLocation = null;
		this.exclusionEnabled = propertyManager.getExclusionEnabled()
				.getValue();
		this.inclusionEnabled = propertyManager.getInclusionEnabled()
				.getValue();
		this.robotsTxtEnabled = propertyManager.getRobotsTxtEnabled()
				.getValue();
	}

	/**
	 * Creates a crawl using the host URL list, the current URL item, the
	 * configuration and the parser selector of the given crawl thread.
	 * 
	 * @param crawlThread
	 *            the thread that runs this crawl
	 * @throws SearchLibException
	 *             if the configuration cannot be read
	 */
	public Crawl(WebCrawlThread crawlThread) throws SearchLibException {
		this(crawlThread.getHostUrlList(), crawlThread.getCurrentUrlItem(),
				crawlThread.getConfig(), crawlThread.getConfig()
						.getParserSelector());
	}

	/**
	 * Parses the downloaded content and reports the outcome on the URL item:
	 * parser status, in/out links, language, charset, MD5/size signature,
	 * content update date and, when a "noindex" meta robots value is found,
	 * the index status.
	 * 
	 * @param inputStream
	 *            the content to parse
	 */
	protected void parseContent(InputStream inputStream)
			throws InstantiationException, IllegalAccessException,
			ClassNotFoundException, IOException, SearchLibException,
			NoSuchAlgorithmException, URISyntaxException {
		if (parserSelector == null) {
			urlItem.setParserStatus(ParserStatus.NOPARSER);
			return;
		}
		// Prefer the Content-Disposition file name, else the one in the URL
		String fileName = urlItem.getContentDispositionFilename();
		if (fileName == null) {
			URL url = urlItem.getURL();
			if (url != null)
				fileName = FilenameUtils.getName(url.getFile());
		}
		IndexDocument sourceDocument = new IndexDocument();
		urlItem.populate(sourceDocument);
		Date parserStartDate = new Date();
		// TODO Which language for OCR ?
		parser = parserSelector.parseStream(sourceDocument, fileName,
				urlItem.getContentBaseType(), urlItem.getUrl(), inputStream,
				null, parserSelector.getWebCrawlerDefaultParser());
		if (parser == null) {
			urlItem.setParserStatus(ParserStatus.NOPARSER);
			return;
		}

		if (parser.getError() != null) {
			urlItem.setParserStatus(ParserStatus.PARSER_ERROR);
			return;
		}
		urlItem.clearInLinks();
		urlItem.clearOutLinks();

		for (ParserResultItem result : parser.getParserResults()) {
			urlItem.addInLinks(result
					.getFieldContent(ParserFieldEnum.internal_link));
			urlItem.addInLinks(result
					.getFieldContent(ParserFieldEnum.internal_link_nofollow));
			urlItem.addOutLinks(result
					.getFieldContent(ParserFieldEnum.external_link));
			urlItem.addOutLinks(result
					.getFieldContent(ParserFieldEnum.external_link_nofollow));
			urlItem.setLang(result.getFieldValue(ParserFieldEnum.lang, 0));
			urlItem.setLangMethod(result.getFieldValue(
					ParserFieldEnum.lang_method, 0));
			urlItem.setContentTypeCharset(result.getFieldValue(
					ParserFieldEnum.charset, 0));
		}
		ParserStatus parsedStatus = ParserStatus.PARSED;
		if (parser instanceof HtmlParser)
			if (!((HtmlParser) parser).isCanonical())
				parsedStatus = ParserStatus.PARSED_NON_CANONICAL;
		urlItem.setParserStatus(parsedStatus);
		String oldMd5size = urlItem.getMd5size();
		String newMd5size = parser.getMd5size();
		urlItem.setMd5size(newMd5size);
		// The content update date is set on the first parse, or when both
		// signatures are known and differ
		Date oldContentUpdateDate = urlItem.getContentUpdateDate();
		Date newContentUpdateDate = null;
		if (oldContentUpdateDate == null)
			newContentUpdateDate = parserStartDate;
		else {
			if (oldMd5size != null && newMd5size != null)
				if (!oldMd5size.equals(newMd5size))
					newContentUpdateDate = parserStartDate;
		}
		if (newContentUpdateDate != null)
			urlItem.setContentUpdateDate(newContentUpdateDate);

		for (ParserResultItem result : parser.getParserResults()) {
			FieldContent fieldContent = result
					.getFieldContent(ParserFieldEnum.meta_robots);
			if (fieldContent != null) {
				List<FieldValueItem> fieldValues = fieldContent.getValues();
				if (fieldValues != null) {
					for (FieldValueItem item : fieldValues)
						if ("noindex".equalsIgnoreCase(item.getValue())) {
							urlItem.setIndexStatus(IndexStatus.META_NOINDEX);
							break;
						}
				}
			}
		}
	}

	/**
	 * Checks the robots.txt rules for the URL item and records the resulting
	 * status on it. When robots.txt support is disabled the status is
	 * {@link RobotsTxtStatus#DISABLED}.
	 * 
	 * @param httpDownloader
	 *            the downloader handed to the robots.txt cache
	 * @return true if the status is DISABLED, ALLOW or NO_ROBOTSTXT; false
	 *         otherwise, in which case the fetch status is set to
	 *         {@link FetchStatus#NOT_ALLOWED}
	 */
	public boolean checkRobotTxtAllow(HttpDownloader httpDownloader)
			throws MalformedURLException, SearchLibException,
			URISyntaxException, ClassNotFoundException {
		RobotsTxtStatus robotsTxtStatus;
		if (robotsTxtEnabled) {
			RobotsTxt robotsTxt = config.getRobotsTxtCache().getRobotsTxt(
					httpDownloader, config, urlItem.getURL(), false);
			robotsTxtStatus = robotsTxt.getStatus(userAgent, urlItem);
		} else
			robotsTxtStatus = RobotsTxtStatus.DISABLED;
		urlItem.setRobotsTxtStatus(robotsTxtStatus);
		if (robotsTxtStatus == RobotsTxtStatus.DISABLED
				|| robotsTxtStatus == RobotsTxtStatus.ALLOW)
			return true;
		if (robotsTxtStatus == RobotsTxtStatus.NO_ROBOTSTXT)
			return true;
		urlItem.setFetchStatus(FetchStatus.NOT_ALLOWED);
		return false;
	}

	/**
	 * Downloads the URL (or loads it from the crawl cache), copies the
	 * response metadata to the URL item and, for a 2xx response, parses the
	 * content. Exceptions are not propagated: they are logged, translated to
	 * a fetch or parser status on the URL item and recorded with
	 * {@link #setError(String)}.
	 * 
	 * @param httpDownloader
	 *            the downloader used when the content is not in the cache
	 * @return the download item, or null if the failure occurred before one
	 *         was obtained
	 */
	public DownloadItem download(HttpDownloader httpDownloader) {
		synchronized (this) {
			InputStream is = null;
			DownloadItem downloadItem = null;
			try {
				URL url = urlItem.getURL();
				if (url == null)
					throw new MalformedURLException("Malformed URL: "
							+ urlItem.getUrl());
				// URL normalisation
				URI uri = url.toURI();
				url = uri.toURL();

				credentialItem = credentialManager == null ? null
						: credentialManager.matchCredential(url);

				List<CookieItem> cookieList = cookieManager.getCookies(url
						.toExternalForm());
				downloadItem = ClientCatalog.getCrawlCacheManager().loadCache(
						uri);

				boolean fromCache = (downloadItem != null);

				if (!fromCache)
					downloadItem = httpDownloader.get(uri, credentialItem,
							null, cookieList);
				else if (Logging.isDebug)
					Logging.debug("Crawl cache deliver: " + uri);

				urlItem.setContentDispositionFilename(downloadItem
						.getContentDispositionFilename());

				urlItem.setContentBaseType(downloadItem.getContentBaseType());

				urlItem.setContentTypeCharset(downloadItem
						.getContentTypeCharset());

				urlItem.setContentEncoding(downloadItem.getContentEncoding());

				urlItem.setContentLength(downloadItem.getContentLength());

				urlItem.setLastModifiedDate(downloadItem.getLastModified());

				// Default status; refined below for non 2xx response codes
				urlItem.setFetchStatus(FetchStatus.FETCHED);

				urlItem.setHeaders(downloadItem.getHeaders());

				Integer code = downloadItem.getStatusCode();
				if (code == null)
					throw new IOException("Http status is null");

				urlItem.setResponseCode(code);
				redirectUrlLocation = downloadItem.getRedirectLocation();
				if (redirectUrlLocation != null)
					urlItem.setRedirectionUrl(redirectUrlLocation.toURL()
							.toExternalForm());

				urlItem.setBacklinkCount(config.getUrlManager().countBackLinks(
						urlItem.getUrl()));

				if (code >= 200 && code < 300) {
					// A fresh download is stored in the crawl cache and read
					// back from the stream returned by the cache manager
					if (!fromCache)
						is = ClientCatalog.getCrawlCacheManager().storeCache(
								downloadItem);
					else
						is = downloadItem.getContentInputStream();
					parseContent(is);
				} else if (code == 301) {
					urlItem.setFetchStatus(FetchStatus.REDIR_PERM);
				} else if (code > 301 && code < 400) {
					urlItem.setFetchStatus(FetchStatus.REDIR_TEMP);
				} else if (code >= 400 && code < 500) {
					urlItem.setFetchStatus(FetchStatus.GONE);
				} else if (code >= 500 && code < 600) {
					urlItem.setFetchStatus(FetchStatus.HTTP_ERROR);
				}
			} catch (FileNotFoundException e) {
				Logging.info("FileNotFound: " + urlItem.getUrl());
				urlItem.setFetchStatus(FetchStatus.GONE);
				setError("FileNotFound: " + urlItem.getUrl());
			} catch (LimitException e) {
				Logging.warn(e.toString() + " (" + urlItem.getUrl() + ")");
				urlItem.setFetchStatus(FetchStatus.SIZE_EXCEED);
				setError(e.getMessage());
			} catch (InstantiationException e) {
				Logging.error(e.getMessage(), e);
				urlItem.setParserStatus(ParserStatus.PARSER_ERROR);
				setError(e.getMessage());
			} catch (IllegalAccessException e) {
				Logging.error(e.getMessage(), e);
				urlItem.setParserStatus(ParserStatus.PARSER_ERROR);
				setError(e.getMessage());
			} catch (ClassNotFoundException e) {
				Logging.error(e.getMessage(), e);
				urlItem.setParserStatus(ParserStatus.PARSER_ERROR);
				setError(e.getMessage());
			} catch (URISyntaxException e) {
				Logging.warn(e.getMessage(), e);
				urlItem.setFetchStatus(FetchStatus.URL_ERROR);
				setError(e.getMessage());
			} catch (MalformedURLException e) {
				Logging.warn(e.getMessage(), e);
				urlItem.setFetchStatus(FetchStatus.URL_ERROR);
				setError(e.getMessage());
			} catch (IOException e) {
				Logging.error(e.getMessage(), e);
				urlItem.setFetchStatus(FetchStatus.ERROR);
				setError(e.getMessage());
			} catch (IllegalArgumentException e) {
				Logging.error(e.getMessage(), e);
				urlItem.setFetchStatus(FetchStatus.ERROR);
				setError(e.getMessage());
			} catch (Exception e) {
				Logging.error(e.getMessage(), e);
				urlItem.setFetchStatus(FetchStatus.ERROR);
				setError(e.getMessage());
			} finally {
				IOUtils.close(is);
			}
			return downloadItem;
		}
	}

	/**
	 * @param error
	 *            the error message to record for this crawl
	 */
	public void setError(String error) {
		this.error = error;
	}

	/**
	 * @return the user agent read from the web property manager
	 */
	public String getUserAgent() {
		return userAgent;
	}

	/**
	 * @return the parser that handled the content, or null if none did
	 */
	public Parser getParser() {
		return parser;
	}

	/**
	 * @return the content base type of the URL item, or null if there is no
	 *         URL item
	 */
	public String getContentType() {
		if (urlItem == null)
			return null;
		return urlItem.getContentBaseType();
	}

	/**
	 * @return the stream limiter of the parser, or null if there is no parser
	 */
	public StreamLimiter getStreamLimiter() {
		if (parser == null)
			return null;
		return parser.getStreamLimiter();
	}

	/**
	 * @return the last error message recorded, or null
	 */
	public String getError() {
		return error;
	}

	/**
	 * @return the URL item being crawled
	 */
	public UrlItem getUrlItem() {
		return urlItem;
	}

	/**
	 * @return the host URL list given at construction
	 */
	public HostUrlList getHostUrlList() {
		return hostUrlList;
	}

	/**
	 * @return the credential matched during the download, or null
	 */
	public CredentialItem getCredentialItem() {
		return credentialItem;
	}

	/**
	 * Returns one of the target index documents, building the list first if
	 * needed.
	 * 
	 * @param documentPos
	 *            the zero-based position of the document
	 * @return the document at that position, or null if the position is out
	 *         of range
	 */
	public IndexDocument getTargetIndexDocument(int documentPos)
			throws SearchLibException, IOException, URISyntaxException {
		if (targetIndexDocuments == null)
			getTargetIndexDocuments();
		if (targetIndexDocuments == null)
			return null;
		// Out-of-range positions (including negative ones) yield null
		if (documentPos < 0 || documentPos >= targetIndexDocuments.size())
			return null;
		return targetIndexDocuments.get(documentPos);
	}

	/**
	 * Builds (once) and returns the index documents for this crawl: one per
	 * parser result, made of the URL item fields mapped through the web
	 * crawler field map plus the parser result fields. A document rejected
	 * by the index plugin list is left out and the URL item index status is
	 * set to {@link IndexStatus#PLUGIN_REJECTED}.
	 * 
	 * @return the list of documents; empty when there is no parser or no
	 *         parser result
	 */
	public List<IndexDocument> getTargetIndexDocuments()
			throws SearchLibException, IOException, URISyntaxException {
		synchronized (this) {
			if (targetIndexDocuments != null)
				return targetIndexDocuments;

			targetIndexDocuments = new ArrayList<IndexDocument>(0);

			if (parser == null)
				return targetIndexDocuments;

			List<ParserResultItem> results = parser.getParserResults();
			if (results == null)
				return targetIndexDocuments;

			for (ParserResultItem result : results) {
				IndexDocument targetIndexDocument = new IndexDocument(
						LanguageEnum.findByCode(urlItem.getLang()));

				IndexDocument urlIndexDocument = new IndexDocument();
				urlItem.populate(urlIndexDocument);
				urlFieldMap.mapIndexDocument(urlIndexDocument,
						targetIndexDocument);

				if (result != null)
					result.populate(targetIndexDocument);

				IndexPluginList indexPluginList = config.getWebCrawlMaster()
						.getIndexPluginList();

				if (indexPluginList != null) {
					if (!indexPluginList.run((Client) config, getContentType(),
							getStreamLimiter(), targetIndexDocument)) {
						urlItem.setIndexStatus(IndexStatus.PLUGIN_REJECTED);
						urlItem.populate(urlIndexDocument);
						continue;
					}
				}

				targetIndexDocuments.add(targetIndexDocument);
			}
			return targetIndexDocuments;
		}
	}

	/**
	 * Resolves one link and, if it passes the exclusion and inclusion
	 * patterns, appends it to the given list. A null link is ignored; a link
	 * that cannot be turned into a URL is logged and skipped.
	 * 
	 * @param urlManager
	 *            not used by this method
	 * @param inclusionManager
	 *            inclusion patterns, or null to skip the inclusion check
	 * @param exclusionManager
	 *            exclusion patterns, or null to skip the exclusion check
	 * @param href
	 *            the link to resolve
	 * @param origin
	 *            where the link was found
	 * @param parentUrl
	 *            the URL of the page holding the link
	 * @param currentURL
	 *            the URL used to resolve the link; when null the link is
	 *            taken as is
	 * @param urlFilterList
	 *            the URL filters passed to the link resolution
	 * @param newUrlList
	 *            the list receiving the accepted link
	 */
	private static final void addDiscoverLink(UrlManager urlManager,
			PatternManager inclusionManager, PatternManager exclusionManager,
			String href, Origin origin, String parentUrl, URL currentURL,
			UrlFilterItem[] urlFilterList, List<LinkItem> newUrlList) {
		if (href == null)
			return;
		try {
			URL url = currentURL != null ? LinkUtils.getLink(currentURL, href,
					urlFilterList, false) : LinkUtils.newEncodedURL(href);

			if (exclusionManager != null)
				if (exclusionManager.matchPattern(url))
					return;
			if (inclusionManager != null)
				if (!inclusionManager.matchPattern(url))
					return;
			newUrlList
					.add(new LinkItem(url.toExternalForm(), origin, parentUrl));
		} catch (MalformedURLException e) {
			Logging.warn(href + " " + e.getMessage(), e);
		} catch (URISyntaxException e) {
			Logging.warn(href + " " + e.getMessage(), e);
		}
	}

	/**
	 * Applies {@link #addDiscoverLink} to every link of the collection. A
	 * null collection is ignored.
	 * 
	 * @param linkSet
	 *            the links to resolve, may be null
	 */
	private static final void addDiscoverLinks(UrlManager urlManager,
			PatternManager inclusionManager, PatternManager exclusionManager,
			Collection<String> linkSet, Origin origin, String parentUrl,
			URL currentURL, UrlFilterItem[] urlFilterList,
			List<LinkItem> newUrlList) throws NoSuchAlgorithmException,
			IOException, SearchLibException {
		if (linkSet == null)
			return;
		for (String link : linkSet)
			addDiscoverLink(urlManager, inclusionManager, exclusionManager,
					link, origin, parentUrl, currentURL, urlFilterList,
					newUrlList);
	}

	/**
	 * Builds (once) and returns the links discovered by this crawl: the
	 * redirect location, if any, and, when a parser is set and the fetch
	 * status is {@link FetchStatus#FETCHED}, the links detected by the
	 * parser. Links are filtered by the exclusion and inclusion patterns when
	 * those are enabled, then the list is handed to
	 * {@code UrlManager.removeExisting}.
	 * 
	 * @return the discovered links, or null if the URL item has no valid URL
	 */
	public List<LinkItem> getDiscoverLinks() throws NoSuchAlgorithmException,
			IOException, SearchLibException, URISyntaxException {
		synchronized (this) {
			if (discoverLinks != null)
				return discoverLinks;
			UrlManager urlManager = config.getUrlManager();
			PatternManager inclusionManager = inclusionEnabled ? config
					.getInclusionPatternManager() : null;
			PatternManager exclusionManager = exclusionEnabled ? config
					.getExclusionPatternManager() : null;
			UrlFilterItem[] urlFilterList = config.getUrlFilterList()
					.getArray();
			String parentUrl = urlItem.getUrl();
			URL currentURL = urlItem.getURL();
			// Nothing is cached in this case: discoverLinks is still null
			if (currentURL == null)
				return discoverLinks;
			discoverLinks = new ArrayList<LinkItem>();
			if (redirectUrlLocation != null) {
				addDiscoverLink(urlManager, inclusionManager, exclusionManager,
						redirectUrlLocation.toString(), Origin.redirect,
						parentUrl, currentURL, urlFilterList, discoverLinks);
			}
			if (parser != null
					&& urlItem.getFetchStatus() == FetchStatus.FETCHED)
				addDiscoverLinks(urlManager, inclusionManager,
						exclusionManager, parser.getDetectedLinks(),
						Origin.content, parentUrl, currentURL, urlFilterList,
						discoverLinks);
			urlManager.removeExisting(discoverLinks);
			return discoverLinks;
		}
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy