com.jaeksoft.searchlib.crawler.web.database.UrlManager Maven / Gradle / Ivy

OpenSearchServer is a powerful, enterprise-class search engine program. Using the web user interface, the crawlers (web, file, database, ...) and the REST/RESTful API, you can quickly and easily integrate advanced full-text search capabilities into your application. OpenSearchServer runs on Windows and Linux/Unix/BSD.

/**   
 * License Agreement for OpenSearchServer
 *
 * Copyright (C) 2008-2014 Emmanuel Keller / Jaeksoft
 * 
 * http://www.open-search-server.com
 * 
 * This file is part of OpenSearchServer.
 *
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with OpenSearchServer. 
 *  If not, see <http://www.gnu.org/licenses/>.
 **/

package com.jaeksoft.searchlib.crawler.web.database;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import javax.xml.transform.TransformerConfigurationException;

import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream;
import org.apache.commons.io.IOUtils;
import org.json.JSONException;
import org.xml.sax.SAXException;

import com.jaeksoft.searchlib.Client;
import com.jaeksoft.searchlib.ClientCatalog;
import com.jaeksoft.searchlib.Logging;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.crawler.ItemField;
import com.jaeksoft.searchlib.crawler.TargetStatus;
import com.jaeksoft.searchlib.crawler.cache.CrawlCacheManager;
import com.jaeksoft.searchlib.crawler.common.database.AbstractManager;
import com.jaeksoft.searchlib.crawler.common.database.FetchStatus;
import com.jaeksoft.searchlib.crawler.common.database.IndexStatus;
import com.jaeksoft.searchlib.crawler.common.database.ParserStatus;
import com.jaeksoft.searchlib.crawler.web.database.HostUrlList.ListType;
import com.jaeksoft.searchlib.crawler.web.database.LinkItem.Origin;
import com.jaeksoft.searchlib.crawler.web.sitemap.SiteMapItem;
import com.jaeksoft.searchlib.crawler.web.sitemap.SiteMapUrl;
import com.jaeksoft.searchlib.crawler.web.spider.Crawl;
import com.jaeksoft.searchlib.crawler.web.spider.DownloadItem;
import com.jaeksoft.searchlib.crawler.web.spider.HttpDownloader;
import com.jaeksoft.searchlib.facet.Facet;
import com.jaeksoft.searchlib.facet.FacetField;
import com.jaeksoft.searchlib.facet.FacetItem;
import com.jaeksoft.searchlib.filter.FilterAbstract;
import com.jaeksoft.searchlib.filter.QueryFilter;
import com.jaeksoft.searchlib.function.expression.SyntaxError;
import com.jaeksoft.searchlib.index.IndexDocument;
import com.jaeksoft.searchlib.query.ParseException;
import com.jaeksoft.searchlib.query.QueryUtils;
import com.jaeksoft.searchlib.request.AbstractSearchRequest;
import com.jaeksoft.searchlib.request.SearchPatternRequest;
import com.jaeksoft.searchlib.result.AbstractResultSearch;
import com.jaeksoft.searchlib.result.ResultDocument;
import com.jaeksoft.searchlib.scheduler.TaskLog;
import com.jaeksoft.searchlib.util.FormatUtils.ThreadSafeDateFormat;
import com.jaeksoft.searchlib.util.FormatUtils.ThreadSafeSimpleDateFormat;
import com.jaeksoft.searchlib.util.InfoCallback;
import com.jaeksoft.searchlib.util.ThreadUtils;
import com.jaeksoft.searchlib.util.XmlWriter;

public class UrlManager extends AbstractManager {

	public static enum SearchTemplate {
		urlSearch, urlExport, hostFacet;
	}

	public UrlManager() {
	}

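	/**
	 * Opens (or creates) the embedded "web_crawler_url" index under the given
	 * data directory and binds it as the URL database of this manager.
	 */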
	public void init(Client client, File dataDir) throws SearchLibException,
			URISyntaxException, FileNotFoundException {
		dataDir = new File(dataDir, "web_crawler_url");
		if (!dataDir.exists())
			dataDir.mkdir();
		Client dbClient = new Client(dataDir,
				"/com/jaeksoft/searchlib/url_config.xml", true);
		super.init(client, dbClient);
	}

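	/**
	 * Deletes the given URLs from the target index (through the mapped URL
	 * field, when one is defined) and from the URL database.
	 */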
	public void deleteUrls(Collection<String> workDeleteUrlList)
			throws SearchLibException {
		String targetField = findIndexedFieldOfTargetIndex(
				targetClient.getWebCrawlerFieldMap(),
				UrlItemFieldEnum.INSTANCE.url.getName());
		if (targetField != null)
			targetClient.deleteDocuments(targetField, workDeleteUrlList);
		dbClient.deleteDocuments(UrlItemFieldEnum.INSTANCE.url.getName(),
				workDeleteUrlList);
	}

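	/**
	 * Returns true if the given URL is already stored in the URL database.
	 */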
	public boolean exists(String sUrl) throws SearchLibException {
		AbstractSearchRequest request = (AbstractSearchRequest) dbClient
				.getNewRequest(SearchTemplate.urlExport.name());
		request.setQueryString("url:\"" + sUrl + '"');
		return (getUrlList(request, 0, 0, null) > 0);
	}

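	/**
	 * Removes from the given link list every link whose URL is already stored
	 * in the URL database.
	 */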
	public void removeExisting(List<LinkItem> linkList)
			throws SearchLibException {
		Iterator<LinkItem> it = linkList.iterator();
		while (it.hasNext())
			if (exists(it.next().getUrl()))
				it.remove();
	}

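	/**
	 * Inserts the given URLs into the URL database, skipping those that
	 * already exist, and reports the injected/already-present counts through
	 * the optional callback.
	 */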
	public void inject(List<String> urls, InfoCallback infoCallback)
			throws SearchLibException {
		try {
			int already = 0;
			int injected = 0;
			List<IndexDocument> injectList = new ArrayList<IndexDocument>(0);
			for (String url : urls) {
				if (exists(url))
					already++;
				else {
					UrlItem item = getNewUrlItem(url);
					IndexDocument indexDocument = new IndexDocument();
					item.populate(indexDocument);
					injectList.add(indexDocument);
				}
			}
			if (injectList.size() > 0) {
				injected = dbClient.updateDocuments(injectList);
				if (injected > 0)
					dbClient.reload();
			}
			if (infoCallback != null)
				infoCallback.setInfo("Injected: " + injected + " - Already: "
						+ already);
		} catch (IOException e) {
			throw new SearchLibException(e);
		}
	}

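	/**
	 * Injects the URLs extracted from the patterns whose status is INJECTED.
	 */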
	public void injectPrefix(List<PatternItem> patternList)
			throws SearchLibException {
		Iterator<PatternItem> it = patternList.iterator();
		List<String> urlList = new ArrayList<String>(0);
		while (it.hasNext()) {
			PatternItem item = it.next();
			if (item.getStatus() == PatternItem.Status.INJECTED) {
				URL url = item.tryExtractURL();
				if (url != null)
					urlList.add(url.toExternalForm());
			}
		}
		inject(urlList, null);
	}

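	/**
	 * Adds filter queries on the fetch status and on the "when" date range to
	 * the given search request.
	 */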
	private void filterQueryToFetch(AbstractSearchRequest request,
			FetchStatus fetchStatus, Date before, Date after)
			throws ParseException {
		if (fetchStatus != null) {
			StringBuilder query = new StringBuilder();
			query.append("fetchStatus:");
			query.append(fetchStatus.value);
			request.addFilter(query.toString(), false);
		}
		if (before != null) {
			StringBuilder query = new StringBuilder();
			query.append("when:[00000000000000 TO ");
			query.append(UrlItem.whenDateFormat.format(before));
			query.append("]");
			request.addFilter(query.toString(), false);
		}
		if (after != null) {
			StringBuilder query = new StringBuilder();
			query.append("when:[");
			query.append(UrlItem.whenDateFormat.format(after));
			query.append(" TO 99999999999999]");
			request.addFilter(query.toString(), false);
		}
	}

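	/**
	 * Executes the faceted request and copies up to "limit" non-empty facet
	 * terms (with their counts) into the given list.
	 */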
	private void getFacetLimit(ItemField field,
			AbstractSearchRequest searchRequest, int limit, List<NamedItem> list)
			throws SearchLibException {
		AbstractResultSearch result = (AbstractResultSearch) dbClient
				.request(searchRequest);
		Facet facet = result.getFacetList().getByField(field.getName());
		for (FacetItem facetItem : facet) {
			if (limit-- == 0)
				break;
			if (facetItem.getCount() == 0)
				continue;
			String term = facetItem.getTerm();
			if (term == null)
				continue;
			if (term.length() == 0)
				continue;
			synchronized (list) {
				list.add(new NamedItem(term, facetItem.getCount()));
			}
		}
	}

	private AbstractSearchRequest getHostFacetSearchRequest() {
		AbstractSearchRequest searchRequest = new SearchPatternRequest(dbClient);
		searchRequest.setDefaultOperator("OR");
		searchRequest.setRows(0);
		searchRequest.getFacetFieldList().put(
				new FacetField("host", 1, false, false, null));
		return searchRequest;
	}

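	/**
	 * Fills hostList with the hosts that still have URLs matching the given
	 * fetch status and date range, using a facet on the host field.
	 */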
	public void getHostToFetch(FetchStatus fetchStatus, Date before,
			Date after, int limit, List<NamedItem> hostList)
			throws SearchLibException {
		AbstractSearchRequest searchRequest = getHostFacetSearchRequest();
		searchRequest.setEmptyReturnsAll(true);
		try {
			filterQueryToFetch(searchRequest, fetchStatus, before, after);
		} catch (ParseException e) {
			throw new SearchLibException(e);
		}
		getFacetLimit(UrlItemFieldEnum.INSTANCE.host, searchRequest, limit,
				hostList);
	}

	public void getStartingWith(String queryString, ItemField field,
			String start, int limit, List<NamedItem> list)
			throws ParseException, IOException, SyntaxError,
			URISyntaxException, ClassNotFoundException, InterruptedException,
			SearchLibException, InstantiationException, IllegalAccessException {
		AbstractSearchRequest searchRequest = (AbstractSearchRequest) dbClient
				.getNewRequest(field + "Facet");
		searchRequest.setQueryString(queryString);
		searchRequest.getFilterList().add(
				new QueryFilter(field + ":" + start + "*", false,
						FilterAbstract.Source.REQUEST, null));
		getFacetLimit(field, searchRequest, limit, list);
	}

	public final UrlItem getNewUrlItem(LinkItem linkItem) {
		UrlItem ui = new UrlItem();
		ui.setUrl(linkItem.getUrl());
		ui.setParentUrl(linkItem.getParentUrl());
		ui.setOrigin(linkItem.getOrigin());
		return ui;
	}

	final protected UrlItem getNewUrlItem(ResultDocument item) {
		UrlItem ui = new UrlItem();
		ui.init(item);
		return ui;
	}

	final protected UrlItem getNewUrlItem(SiteMapUrl siteMapUrl) {
		UrlItem ui = new UrlItem();
		ui.setUrl(siteMapUrl.getLoc().toString());
		ui.setOrigin(Origin.sitemap);
		return ui;
	}

	public final UrlItem getNewUrlItem(String url) {
		UrlItem ui = new UrlItem();
		ui.setUrl(url);
		ui.setOrigin(Origin.manual);
		return ui;
	}

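	/**
	 * Fetches up to "limit" UrlItems of the given host matching the fetch
	 * status and date range.
	 */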
	public void getUrlToFetch(NamedItem host, FetchStatus fetchStatus,
			Date before, Date after, long limit, List<UrlItem> urlList)
			throws SearchLibException {
		AbstractSearchRequest searchRequest = (AbstractSearchRequest) dbClient
				.getNewRequest("urlSearch");
		try {
			searchRequest.addFilter(
					"host:\"" + QueryUtils.escapeQuery(host.getName()) + "\"",
					false);
			searchRequest.setEmptyReturnsAll(true);
			filterQueryToFetch(searchRequest, fetchStatus, before, after);
		} catch (ParseException e) {
			throw new SearchLibException(e);
		}
		searchRequest.setRows((int) limit);
		AbstractResultSearch result = (AbstractResultSearch) dbClient
				.request(searchRequest);
		for (ResultDocument item : result)
			urlList.add(getNewUrlItem(item));
	}

	public UrlItem getUrlToFetch(URL url) throws SearchLibException {
		AbstractSearchRequest searchRequest = (AbstractSearchRequest) dbClient
				.getNewRequest("urlSearch");
		return getUrl(searchRequest, url.toExternalForm());
	}

	public AbstractSearchRequest getSearchRequest(
			SearchTemplate urlSearchTemplate) throws SearchLibException {
		return (AbstractSearchRequest) dbClient.getNewRequest(urlSearchTemplate
				.name());
	}

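	/**
	 * Counts the parsed documents whose inlinks or outlinks contain the given
	 * URL.
	 */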
	public int countBackLinks(String url) throws SearchLibException {
		try {
			AbstractSearchRequest searchRequest = (AbstractSearchRequest) dbClient
					.getNewRequest("urlExport");
			StringBuilder sb = new StringBuilder();
			UrlItemFieldEnum.INSTANCE.inlink.addQuery(sb, url, true);
			sb.append(" OR");
			UrlItemFieldEnum.INSTANCE.outlink.addQuery(sb, url, true);
			UrlItemFieldEnum.INSTANCE.parserStatus.addFilterQuery(
					searchRequest, ParserStatus.PARSED.value, false, false);
			searchRequest.setQueryString(sb.toString());
			searchRequest.setRows(0);
			AbstractResultSearch result = (AbstractResultSearch) dbClient
					.request(searchRequest);
			return result.getNumFound();
		} catch (ParseException e) {
			throw new SearchLibException(e);
		}
	}

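	/**
	 * Builds a search request on the URL database from the given optional
	 * criteria: URL "like" pattern, host (with or without sub-domains),
	 * language, content type, charset, encoding, content length, statuses,
	 * response code, crawl date and last-modified date ranges.
	 */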
	public AbstractSearchRequest getSearchRequest(
			SearchTemplate urlSearchTemplate, String like, String host,
			boolean includingSubDomain, String lang, String langMethod,
			String contentBaseType, String contentTypeCharset,
			String contentEncoding, Integer minContentLength,
			Integer maxContentLength, RobotsTxtStatus robotsTxtStatus,
			FetchStatus fetchStatus, Integer responseCode,
			ParserStatus parserStatus, IndexStatus indexStatus, Date startDate,
			Date endDate, Date startModifiedDate, Date endModifiedDate)
			throws SearchLibException {
		try {
			AbstractSearchRequest searchRequest = (AbstractSearchRequest) dbClient
					.getNewRequest(urlSearchTemplate.name());
			StringBuilder query = new StringBuilder();
			if (like != null) {
				like = like.trim();
				if (like.length() > 0) {
					like = QueryUtils.escapeQuery(like,
							QueryUtils.CONTROL_CHARS);
					like = QueryUtils.escapeQuery(like, QueryUtils.RANGE_CHARS);
					like = QueryUtils.escapeQuery(like,
							QueryUtils.AND_OR_NOT_CHARS);
					UrlItemFieldEnum.INSTANCE.url.addQuery(query, like, false);
				}
			}
			if (host != null) {
				host = host.trim();
				if (host.length() > 0)
					if (includingSubDomain)
						UrlItemFieldEnum.INSTANCE.subhost.addFilterQuery(
								searchRequest, QueryUtils.escapeQuery(host),
								false, false);
					else
						UrlItemFieldEnum.INSTANCE.host.addFilterQuery(
								searchRequest, QueryUtils.escapeQuery(host),
								false, false);
			}
			if (lang != null) {
				lang = lang.trim();
				if (lang.length() > 0)
					UrlItemFieldEnum.INSTANCE.lang.addFilterQuery(
							searchRequest, QueryUtils.escapeQuery(lang), false,
							false);
			}
			if (langMethod != null) {
				langMethod = langMethod.trim();
				if (langMethod.length() > 0)
					UrlItemFieldEnum.INSTANCE.langMethod.addFilterQuery(
							searchRequest, QueryUtils.escapeQuery(langMethod),
							true, false);
			}
			if (contentBaseType != null) {
				contentBaseType = contentBaseType.trim();
				if (contentBaseType.length() > 0)
					UrlItemFieldEnum.INSTANCE.contentBaseType.addFilterQuery(
							searchRequest,
							QueryUtils.escapeQuery(contentBaseType), true,
							false);
			}
			if (contentTypeCharset != null) {
				contentTypeCharset = contentTypeCharset.trim();
				if (contentTypeCharset.length() > 0)
					UrlItemFieldEnum.INSTANCE.contentTypeCharset
							.addFilterQuery(searchRequest,
									QueryUtils.escapeQuery(contentTypeCharset),
									false, false);
			}
			if (contentEncoding != null) {
				contentEncoding = contentEncoding.trim();
				if (contentEncoding.length() > 0)
					UrlItemFieldEnum.INSTANCE.contentEncoding.addFilterQuery(
							searchRequest,
							QueryUtils.escapeQuery(contentEncoding), true,
							false);
			}

			if (robotsTxtStatus != null
					&& robotsTxtStatus != RobotsTxtStatus.ALL)
				UrlItemFieldEnum.INSTANCE.robotsTxtStatus.addFilterQuery(
						searchRequest, robotsTxtStatus.value, false, false);
			if (responseCode != null)
				UrlItemFieldEnum.INSTANCE.responseCode.addFilterQuery(
						searchRequest, responseCode, false, false);
			if (fetchStatus != null && fetchStatus != FetchStatus.ALL)
				UrlItemFieldEnum.INSTANCE.fetchStatus.addFilterQuery(
						searchRequest, fetchStatus.value, false, false);
			if (parserStatus != null && parserStatus != ParserStatus.ALL)
				UrlItemFieldEnum.INSTANCE.parserStatus.addFilterQuery(
						searchRequest, parserStatus.value, false, false);
			if (indexStatus != null && indexStatus != IndexStatus.ALL)
				UrlItemFieldEnum.INSTANCE.indexStatus.addFilterQuery(
						searchRequest, indexStatus.value, false, false);

			if (minContentLength != null || maxContentLength != null) {
				String from, to;
				if (minContentLength == null)
					from = UrlItem.longFormat.format(0);
				else
					from = UrlItem.longFormat.format(minContentLength);
				if (maxContentLength == null)
					to = UrlItem.longFormat.format(Integer.MAX_VALUE);
				else
					to = UrlItem.longFormat.format(maxContentLength);
				UrlItemFieldEnum.INSTANCE.contentLength.addQueryRange(query,
						from, to, false);
			}

			if (startDate != null || endDate != null) {
				String from, to;
				if (startDate == null)
					from = "00000000000000";
				else
					from = UrlItem.whenDateFormat.format(startDate);
				if (endDate == null)
					to = "99999999999999";
				else
					to = UrlItem.whenDateFormat.format(endDate);
				UrlItemFieldEnum.INSTANCE.when.addFilterRange(searchRequest,
						from, to, false, false);
			}

			if (startModifiedDate != null || endModifiedDate != null) {
				String from, to;
				if (startModifiedDate == null)
					from = "00000000000000";
				else
					from = UrlItem.whenDateFormat.format(startModifiedDate);
				if (endModifiedDate == null)
					to = "99999999999999";
				else
					to = UrlItem.whenDateFormat.format(endModifiedDate);
				UrlItemFieldEnum.INSTANCE.lastModifiedDate.addFilterRange(
						searchRequest, from, to, false, false);
			}

			searchRequest.setEmptyReturnsAll(true);
			searchRequest.setQueryString(query.toString().trim());
			return searchRequest;
		} catch (ParseException e) {
			throw new SearchLibException(e);
		}
	}

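	/**
	 * Returns the UrlItem stored for the given URL, or null if the URL is
	 * unknown.
	 */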
	private UrlItem getUrl(AbstractSearchRequest request, String sUrl)
			throws SearchLibException {
		if (request == null)
			request = (AbstractSearchRequest) dbClient
					.getNewRequest(SearchTemplate.urlSearch.name());
		else
			request.reset();
		request.setQueryString("url:\"" + QueryUtils.escapeQuery(sUrl) + '"');
		request.setStart(0);
		request.setRows(1);
		try {
			AbstractResultSearch result = (AbstractResultSearch) dbClient
					.request(request);
			for (ResultDocument doc : result)
				return getNewUrlItem(doc);
			return null;
		} catch (RuntimeException e) {
			throw new SearchLibException(e);
		}
	}

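	/**
	 * Executes the request with the given paging window, fills the optional
	 * list with the matching UrlItems and returns the total number of matches.
	 */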
	public long getUrlList(AbstractSearchRequest searchRequest, long start,
			long rows, List<UrlItem> list) throws SearchLibException {
		searchRequest.reset();
		searchRequest.setStart((int) start);
		searchRequest.setRows((int) rows);
		try {
			AbstractResultSearch result = (AbstractResultSearch) dbClient
					.request(searchRequest);
			if (list != null)
				for (ResultDocument doc : result)
					list.add(getNewUrlItem(doc));
			return result.getNumFound();
		} catch (RuntimeException e) {
			throw new SearchLibException(e);
		}
	}

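	/**
	 * Returns the host facet of the whole URL database, keeping only the hosts
	 * with at least minCount URLs.
	 */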
	public Facet getHostFacetList(int minCount) throws SearchLibException {
		try {
			AbstractSearchRequest searchRequest = (AbstractSearchRequest) dbClient
					.getNewRequest(UrlManager.SearchTemplate.hostFacet.name());
			searchRequest.setEmptyReturnsAll(true);
			FacetField facetField = searchRequest.getFacetFieldList().get(
					UrlItemFieldEnum.INSTANCE.host.getName());
			if (minCount < 0)
				minCount = 0;
			facetField.setMinCount(minCount);
			AbstractResultSearch result = (AbstractResultSearch) dbClient
					.request(searchRequest);
			if (result == null)
				return null;
			return result.getFacetList().getByField(
					UrlItemFieldEnum.INSTANCE.host.getName());
		} catch (RuntimeException e) {
			throw new SearchLibException(e);
		}
	}

	public void updateUrlItem(UrlItem urlItem) throws SearchLibException {
		try {
			IndexDocument indexDocument = new IndexDocument();
			urlItem.populate(indexDocument);
			dbClient.updateDocument(indexDocument);
		} catch (IOException e) {
			throw new SearchLibException(e);
		}
	}

	public void updateUrlItems(List<UrlItem> urlItems)
			throws SearchLibException {
		try {
			if (urlItems == null)
				return;
			List<IndexDocument> documents = new ArrayList<IndexDocument>(
					urlItems.size());
			for (UrlItem urlItem : urlItems) {
				if (urlItem == null)
					continue;
				IndexDocument indexDocument = new IndexDocument();
				urlItem.populate(indexDocument);
				documents.add(indexDocument);
			}
			if (documents.size() > 0)
				dbClient.updateDocuments(documents);
		} catch (IOException e) {
			throw new SearchLibException(e);
		}
	}

	/**
	 * Update the target index with the crawl results.
	 * 
	 * @param crawls
	 *            the completed crawls whose target documents are updated or
	 *            deleted according to their target status
	 * @throws SearchLibException
	 */
	public void updateCrawlTarget(List<Crawl> crawls) throws SearchLibException {
		try {
			if (crawls == null)
				return;
			// Update target index
			List<IndexDocument> documentsToUpdate = new ArrayList<IndexDocument>(
					crawls.size());
			List<String> documentsToDelete = new ArrayList<String>(
					crawls.size());
			String uniqueField = targetClient.getSchema().getUniqueField();
			for (Crawl crawl : crawls) {
				if (crawl == null)
					continue;
				if (crawl.getHostUrlList().getListType() == ListType.DBCRAWL)
					continue;
				UrlItem currentUrlItem = crawl.getUrlItem();
				List<IndexDocument> indexDocuments = crawl
						.getTargetIndexDocuments();
				TargetStatus targetStatus = currentUrlItem.getIndexStatus().targetStatus;
				if (targetStatus == TargetStatus.TARGET_UPDATE) {
					if (CollectionUtils.isEmpty(indexDocuments)) {
						currentUrlItem
								.setIndexStatus(IndexStatus.NOTHING_TO_INDEX);
						continue;
					}
					for (IndexDocument indexDocument : indexDocuments) {
						if (indexDocument == null)
							continue;
						if (uniqueField != null
								&& !indexDocument.hasContent(uniqueField)) {
							currentUrlItem
									.setIndexStatus(IndexStatus.INDEX_ERROR);
						} else
							documentsToUpdate.add(indexDocument);
					}
				} else if (targetStatus == TargetStatus.TARGET_DELETE)
					documentsToDelete.add(crawl.getUrlItem().getUrl());
			}
			if (documentsToUpdate.size() > 0) {
				targetClient.updateDocuments(documentsToUpdate);
				for (Crawl crawl : crawls) {
					UrlItem currentUrlItem = crawl.getUrlItem();
					IndexStatus indexStatus = currentUrlItem.getIndexStatus();
					if (indexStatus == IndexStatus.TO_INDEX
							|| indexStatus == IndexStatus.NOT_INDEXED)
						currentUrlItem.setIndexStatus(IndexStatus.INDEXED);
				}
			}
			if (documentsToDelete.size() > 0) {
				String targetField = findIndexedFieldOfTargetIndex(
						targetClient.getWebCrawlerFieldMap(),
						UrlItemFieldEnum.INSTANCE.url.getName());
				if (targetField != null)
					targetClient
							.deleteDocuments(targetField, documentsToDelete);
				targetClient.getScreenshotManager().delete(documentsToDelete);
			}
		} catch (IOException e) {
			throw new SearchLibException(e);
		} catch (URISyntaxException e) {
			throw new SearchLibException(e);
		}
	}

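	/**
	 * Exports the URLs matching the request to a temporary text file, one URL
	 * per line, paging through the index 1000 rows at a time.
	 */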
	public File exportURLs(AbstractSearchRequest searchRequest)
			throws SearchLibException {
		PrintWriter pw = null;
		File tempFile = null;
		try {
			tempFile = File.createTempFile("OSS_web_crawler_URLs", ".txt");
			pw = new PrintWriter(tempFile);
			int currentPos = 0;
			List<UrlItem> uList = new ArrayList<UrlItem>();
			for (;;) {
				int totalSize = (int) getUrlList(searchRequest, currentPos,
						1000, uList);
				for (UrlItem u : uList)
					pw.println(u.getUrl());
				if (uList.size() == 0)
					break;
				uList.clear();
				currentPos += 1000;
				if (currentPos >= totalSize)
					break;
			}
			pw.close();
			pw = null;

		} catch (IOException e) {
			throw new SearchLibException(e);
		} finally {
			if (pw != null)
				pw.close();
		}
		return tempFile;
	}

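	/**
	 * Exports the cached crawl content of the matching URLs to a temporary ZIP
	 * archive. Fails if the crawl cache is disabled.
	 */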
	public File exportCrawlCache(AbstractSearchRequest searchRequest)
			throws IOException, SearchLibException {
		File tempFile = null;
		ZipArchiveOutputStream zipOutput = null;
		CrawlCacheManager crawlCacheManager = ClientCatalog
				.getCrawlCacheManager();
		if (crawlCacheManager.isDisabled())
			throw new SearchLibException("The crawlCache is disabled.");
		try {
			tempFile = File
					.createTempFile("OSS_web_crawler_crawlcache", ".zip");
			zipOutput = new ZipArchiveOutputStream(tempFile);
			int currentPos = 0;
			List<UrlItem> uList = new ArrayList<UrlItem>();
			for (;;) {
				int totalSize = (int) getUrlList(searchRequest, currentPos,
						1000, uList);
				if (uList.size() == 0)
					break;
				for (UrlItem u : uList) {
					URL url = u.getURL();
					if (url == null)
						continue;
					DownloadItem downloadItem = crawlCacheManager.loadCache(url
							.toURI());
					if (downloadItem == null)
						continue;
					downloadItem.writeToZip(zipOutput);
				}
				uList.clear();
				currentPos += 1000;
				if (currentPos >= totalSize)
					break;
			}
			zipOutput.close();
			zipOutput = null;
			return tempFile;
		} catch (JSONException e) {
			throw new IOException(e);
		} catch (URISyntaxException e) {
			throw new IOException(e);
		} finally {
			if (zipOutput != null)
				IOUtils.closeQuietly(zipOutput);
		}
	}

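	/**
	 * Exports the matching URLs as a sitemaps.org XML file, writing the last
	 * modification date when it is known.
	 */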
	public File exportSiteMap(AbstractSearchRequest searchRequest)
			throws SearchLibException {
		PrintWriter pw = null;
		File tempFile = null;
		try {
			tempFile = File.createTempFile("OSS_web_crawler_URLs", ".xml");
			pw = new PrintWriter(tempFile);
			ThreadSafeDateFormat dateformat = new ThreadSafeSimpleDateFormat(
					"yyyy-MM-dd");
			XmlWriter xmlWriter = new XmlWriter(pw, "UTF-8");
			xmlWriter.startElement("urlset", "xmlns",
					"http://www.sitemaps.org/schemas/sitemap/0.9");
			int currentPos = 0;
			List<UrlItem> uList = new ArrayList<UrlItem>();
			for (;;) {
				int totalSize = (int) getUrlList(searchRequest, currentPos,
						1000, uList);
				for (UrlItem u : uList) {
					xmlWriter.startElement("url");
					xmlWriter.writeSubTextNodeIfAny("loc", u.getUrl());
					if (u.getLastModifiedDate() != null)
						xmlWriter.writeSubTextNodeIfAny("lastmod",
								dateformat.format(u.getLastModifiedDate()));
					xmlWriter.endElement();
				}
				if (uList.size() == 0)
					break;
				uList.clear();
				currentPos += 1000;
				if (currentPos >= totalSize)
					break;
			}
			xmlWriter.endElement();
			xmlWriter.endDocument();
			pw.close();
			pw = null;
		} catch (IOException e) {
			throw new SearchLibException(e);
		} catch (SAXException e) {
			throw new SearchLibException(e);
		} catch (TransformerConfigurationException e) {
			throw new SearchLibException(e);
		} finally {
			if (pw != null)
				pw.close();
		}
		return tempFile;
	}

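	/**
	 * Deletes every URL matching the request from the URL database, working in
	 * buffers of the given size and reporting progress to the task log.
	 */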
	public long deleteUrls(AbstractSearchRequest searchRequest, int bufferSize,
			TaskLog taskLog) throws SearchLibException {
		setCurrentTaskLog(taskLog);
		try {
			long total = 0;
			List<UrlItem> urlItemList = new ArrayList<UrlItem>();
			long last = 0;
			for (;;) {
				urlItemList.clear();
				long len = getUrlList(searchRequest, 0, bufferSize, urlItemList);
				if (urlItemList.size() == 0)
					break;
				if (len == last) {
					Logging.warn("URLManager loop redundancy (deleteUrls): "
							+ len + "/" + total);
					break;
				}
				last = len;
				List<String> urlList = new ArrayList<String>(urlItemList.size());
				for (UrlItem urlItem : urlItemList)
					urlList.add(urlItem.getUrl());
				dbClient.deleteDocuments(
						UrlItemFieldEnum.INSTANCE.url.getName(), urlList);
				total += urlItemList.size();
				taskLog.setInfo(total + " URL(s) deleted");
				if (taskLog.isAbortRequested())
					throw new SearchLibException.AbortException();
				ThreadUtils.sleepMs(100);
			}
			return total;
		} finally {
			resetCurrentTaskLog();
		}
	}

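	/**
	 * Delegates to the generic AbstractManager synchronization, mapping the
	 * URL field of the URL database to its indexed counterpart in the target
	 * index.
	 */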
	public long synchronizeIndex(AbstractSearchRequest searchRequest,
			int bufferSize, TaskLog taskLog) throws SearchLibException {
		String targetField = findIndexedFieldOfTargetIndex(
				targetClient.getWebCrawlerFieldMap(),
				UrlItemFieldEnum.INSTANCE.url.getName());
		return synchronizeIndex(searchRequest, targetField,
				UrlItemFieldEnum.INSTANCE.url.getName(), bufferSize, taskLog);
	}

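	/**
	 * Sets the given fetch status on every URL matching the request; the
	 * request is filtered so that URLs already carrying that status are
	 * skipped, and the update runs in buffers of the given size.
	 */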
	public long updateFetchStatus(AbstractSearchRequest searchRequest,
			FetchStatus fetchStatus, int bufferSize, TaskLog taskLog)
			throws SearchLibException, IOException {
		setCurrentTaskLog(taskLog);
		try {
			long total = 0;
			UrlItemFieldEnum.INSTANCE.fetchStatus.addFilterQuery(searchRequest,
					fetchStatus.value, false, true);
			List<UrlItem> urlItemList = new ArrayList<UrlItem>();
			long last = 0;
			for (;;) {
				urlItemList.clear();
				long len = getUrlList(searchRequest, 0, bufferSize, urlItemList);
				if (urlItemList.size() == 0)
					break;
				if (len == last) {
					Logging.warn("URLManager loop redundancy (updateFetchStatus): "
							+ len + "/" + total);
					break;
				}
				last = len;
				for (UrlItem urlItem : urlItemList)
					urlItem.setFetchStatus(fetchStatus);
				updateUrlItems(urlItemList);
				total += urlItemList.size();
				taskLog.setInfo(total + " URL(s) updated");
				if (taskLog.isAbortRequested())
					throw new SearchLibException.AbortException();
				ThreadUtils.sleepMs(100);
			}
			return total;
		} catch (ParseException e) {
			throw new SearchLibException(e);
		} finally {
			resetCurrentTaskLog();
		}
	}

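	/**
	 * Loads every configured sitemap, inserts the URLs that are not yet in the
	 * URL database and switches already-known URLs to FETCH_FIRST when they
	 * are still unfetched or when the sitemap change frequency indicates they
	 * are due for a new fetch.
	 */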
	public long updateSiteMap(TaskLog taskLog) throws SearchLibException {
		setCurrentTaskLog(taskLog);
		HttpDownloader httpDownloader = null;
		try {
			AbstractSearchRequest request = (AbstractSearchRequest) dbClient
					.getNewRequest(SearchTemplate.urlSearch.name());
			long inserted = 0;
			long existing = 0;
			long setToFetchFirst = 0;
			int everyTen = 0;
			targetClient.getSiteMapList();
			httpDownloader = targetClient.getWebCrawlMaster()
					.getNewHttpDownloader(true);
			Set<SiteMapUrl> siteMapUrlSet = new HashSet<SiteMapUrl>(0);
			List<UrlItem> urlItemList = new ArrayList<UrlItem>(0);
			long now = System.currentTimeMillis();
			for (SiteMapItem siteMapItem : targetClient.getSiteMapList()
					.getArray()) {
				taskLog.setInfo("Loading " + siteMapItem.getUri());
				siteMapUrlSet.clear();
				urlItemList.clear();
				siteMapItem.load(httpDownloader, siteMapUrlSet);
				for (SiteMapUrl siteMapUrl : siteMapUrlSet) {
					UrlItem urlItem = getUrl(request, siteMapUrl.getLoc()
							.toString());
					if (urlItem == null) {
						urlItemList.add(getNewUrlItem(siteMapUrl));
						inserted++;
					} else {
						existing++;
						long timeDistanceMs = now - urlItem.getWhen().getTime();
						FetchStatus fetchStatus = urlItem.getFetchStatus();
						if (fetchStatus == FetchStatus.UN_FETCHED
								|| (fetchStatus == FetchStatus.FETCHED && siteMapUrl
										.getChangeFreq().needUpdate(
												timeDistanceMs))) {
							if (fetchStatus != FetchStatus.FETCH_FIRST) {
								urlItem.setFetchStatus(FetchStatus.FETCH_FIRST);
								urlItemList.add(urlItem);
								setToFetchFirst++;
							}
						}
					}
					if (everyTen == 10) {
						if (taskLog.isAbortRequested())
							throw new SearchLibException.AbortException();
						everyTen = 0;
						taskLog.setInfo(inserted + "/" + existing
								+ " URL(s) inserted/existing");
					} else
						everyTen++;
				}
				if (urlItemList.size() > 0)
					updateUrlItems(urlItemList);
			}
			taskLog.setInfo(inserted + "/" + existing + "/" + setToFetchFirst
					+ " URL(s) inserted/existing/fetchFirst");
			Logging.info(taskLog.getInfo());
			return inserted + existing;
		} finally {
			if (httpDownloader != null)
				httpDownloader.release();
			resetCurrentTaskLog();
		}
	}

	/**
	 * Update the URL database with the crawl results.
	 * 
	 * @param crawls
	 *            the completed crawls whose UrlItems are written back to the
	 *            URL database
	 * @throws SearchLibException
	 */
	public void updateCrawlUrlDb(List<Crawl> crawls) throws SearchLibException {
		if (crawls == null)
			return;
		List<UrlItem> urlItems = new ArrayList<UrlItem>();
		for (Crawl crawl : crawls) {
			if (crawl == null)
				continue;
			urlItems.add(crawl.getUrlItem());
		}
		updateUrlItems(urlItems);
	}

}