All downloads are free. The search and download features use the official Maven repository.

com.jaeksoft.searchlib.crawler.file.process.CrawlFileThread Maven / Gradle / Ivy

Go to download

OpenSearchServer is a powerful, enterprise-class search engine program. Using the web user interface, the crawlers (web, file, database, ...) and the REST/RESTful API, you can quickly and easily integrate advanced full-text search capabilities into your application. OpenSearchServer runs on Windows and Linux/Unix/BSD.

There is a newer version: 1.5.14
Show newest version
/**   
 * License Agreement for OpenSearchServer
 *
 * Copyright (C) 2008-2014 Emmanuel Keller / Jaeksoft
 * 
 * http://www.open-search-server.com
 * 
 * This file is part of OpenSearchServer.
 *
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with OpenSearchServer. 
 *  If not, see <http://www.gnu.org/licenses/>.
 **/

package com.jaeksoft.searchlib.crawler.file.process;

import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;

import com.jaeksoft.searchlib.Logging;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.config.Config;
import com.jaeksoft.searchlib.crawler.common.database.FetchStatus;
import com.jaeksoft.searchlib.crawler.common.database.IndexStatus;
import com.jaeksoft.searchlib.crawler.common.database.ParserStatus;
import com.jaeksoft.searchlib.crawler.common.process.CrawlStatistics;
import com.jaeksoft.searchlib.crawler.common.process.CrawlStatus;
import com.jaeksoft.searchlib.crawler.common.process.CrawlThreadAbstract;
import com.jaeksoft.searchlib.crawler.file.database.FileCrawlQueue;
import com.jaeksoft.searchlib.crawler.file.database.FileInfo;
import com.jaeksoft.searchlib.crawler.file.database.FileItem;
import com.jaeksoft.searchlib.crawler.file.database.FileManager;
import com.jaeksoft.searchlib.crawler.file.database.FilePathItem;
import com.jaeksoft.searchlib.crawler.file.database.FileTypeEnum;
import com.jaeksoft.searchlib.crawler.file.spider.CrawlFile;

/**
 * Crawl thread that walks the entries produced by a
 * {@link FilePathItemIterator} for one {@link FilePathItem}, decides for each
 * entry whether it must be (re)crawled, hands crawled entries to the
 * {@link FileCrawlQueue}, and removes index entries whose files are no longer
 * listed in their directory.
 * 
 * NOTE(review): the superclass is referenced without type arguments here. If
 * CrawlThreadAbstract is declared generic, its type arguments should be
 * restored - confirm against the superclass declaration.
 */
public class CrawlFileThread extends
		CrawlThreadAbstract {

	/** Item currently being processed; published through the accessors. */
	private FileItem currentFileItem;
	private FileManager fileManager;
	/** Minimum delay between two crawls, taken from the file path item. */
	private long delayBetweenAccesses;
	private FilePathItem filePathItem;
	/** Earliest time (epoch milliseconds) at which the next crawl may start. */
	private long nextTimeTarget;

	/**
	 * Creates a crawl thread for one file path item.
	 * 
	 * @param config
	 *            the configuration providing the file manager
	 * @param crawlMaster
	 *            the master that owns this thread
	 * @param sessionStats
	 *            the statistics used to build this thread's own statistics
	 * @param filePathItem
	 *            the location to crawl; also provides the access delay
	 * @throws SearchLibException
	 *             if the file manager cannot be obtained
	 */
	protected CrawlFileThread(Config config, CrawlFileMaster crawlMaster,
			CrawlStatistics sessionStats, FilePathItem filePathItem)
			throws SearchLibException {
		super(config, crawlMaster, null);
		this.fileManager = config.getFileManager();
		currentStats = new CrawlStatistics(sessionStats);
		delayBetweenAccesses = filePathItem.getDelay();
		nextTimeTarget = 0;
		this.filePathItem = filePathItem;
	}

	/**
	 * Waits until the time target set by the previous call is reached, then
	 * schedules the next target at "now + delayBetweenAccesses".
	 * 
	 * @param max
	 *            upper bound, in milliseconds, for the wait
	 */
	private void sleepInterval(long max) {
		long now = System.currentTimeMillis();
		long remaining = nextTimeTarget - now;
		// The next target is computed from the time of this call, not from
		// the end of the wait.
		nextTimeTarget = now + delayBetweenAccesses;
		if (remaining < 0) {
			return;
		}
		sleepMs(Math.min(remaining, max));
	}

	/**
	 * Iterates over every entry of the file path item, crawls the entries that
	 * are new or modified and asks the queue to index after each crawl. The
	 * loop stops as soon as this thread or its master is aborted.
	 */
	@Override
	public void runner() throws Exception {

		CrawlFileMaster crawlMaster = (CrawlFileMaster) getThreadMaster();
		FileCrawlQueue crawlQueue = (FileCrawlQueue) crawlMaster
				.getCrawlQueue();

		FilePathItemIterator filePathIterator = new FilePathItemIterator(
				filePathItem);

		ItemIterator itemIterator;

		while ((itemIterator = filePathIterator.next()) != null) {

			if (isAborted() || crawlMaster.isAborted()) {
				break;
			}

			FileInstanceAbstract fileInstance = itemIterator.getFileInstance();
			FileItem fileItem = fileManager.getNewFileItem(fileInstance);
			// Publish through the synchronized setter so that readers using
			// getCurrentFileItem() / getCurrentInfo() see the new item.
			setCurrentFileItem(fileItem);

			// Entries that are neither a directory nor a file are crawled
			// without any check.
			FileTypeEnum type = fileItem.getFileType();
			if (type == FileTypeEnum.directory) {
				if (!checkDirectory((ItemDirectoryIterator) itemIterator,
						crawlQueue)) {
					continue;
				}
			} else if (type == FileTypeEnum.file) {
				if (!checkFile()) {
					continue;
				}
			}

			CrawlFile crawl = crawl(fileInstance);
			if (crawl != null) {
				crawlQueue.add(currentStats, crawl);
			}

			setStatus(CrawlStatus.INDEXATION);
			crawlQueue.index(false);

		}
		// NOTE(review): the flag presumably forces the queue to be flushed
		// when the master is no longer running - confirm in FileCrawlQueue.
		crawlQueue.index(!crawlMaster.isRunning());
	}

	/**
	 * Crawls the current file item and updates the statistics.
	 * 
	 * The duration stored on the item is measured from before the throttling
	 * wait, so it includes the time spent in {@link #sleepInterval(long)}.
	 * 
	 * @param fileInstance
	 *            the file instance matching the current file item
	 * @return the crawl that was executed
	 * @throws SearchLibException
	 *             if the crawl cannot be built
	 */
	private CrawlFile crawl(FileInstanceAbstract fileInstance)
			throws SearchLibException {

		long startTime = System.currentTimeMillis();

		// Respect the configured delay, but never wait more than 60 seconds.
		sleepInterval(60000);

		setStatus(CrawlStatus.CRAWL);
		currentStats.incUrlCount();

		CrawlFile crawl = new CrawlFile(fileInstance, currentFileItem,
				getConfig(), currentStats);

		// Fetch started
		currentStats.incFetchedCount();

		crawl.download();

		boolean indexable = currentFileItem.getFetchStatus() == FetchStatus.FETCHED
				&& currentFileItem.getParserStatus() == ParserStatus.PARSED
				&& currentFileItem.getIndexStatus() != IndexStatus.META_NOINDEX;
		if (indexable) {
			currentFileItem.setIndexStatus(IndexStatus.TO_INDEX);
			currentStats.incParsedCount();
		} else {
			currentStats.incIgnoredCount();
		}

		currentFileItem.setTime((int) (System.currentTimeMillis() - startTime));
		return crawl;
	}

	/**
	 * Queues the deletion of an index entry and, when the entry is a
	 * directory, recursively of every entry the file manager lists under it.
	 * URI problems while listing the children are logged and do not interrupt
	 * the crawl.
	 * 
	 * @param crawlQueue
	 *            the queue receiving the deletions
	 * @param fileInfo
	 *            the index entry to delete
	 * @throws SearchLibException
	 *             if the deletion or the listing fails
	 */
	private void smartDelete(FileCrawlQueue crawlQueue, FileInfo fileInfo)
			throws SearchLibException {
		crawlQueue.delete(currentStats, fileInfo.getUri());
		if (fileInfo.getFileType() != FileTypeEnum.directory) {
			return;
		}
		HashMap<String, FileInfo> indexFileMap = new HashMap<String, FileInfo>();
		try {
			fileManager.getFileInfoList(new URI(fileInfo.getUri()),
					indexFileMap);
			for (FileInfo child : indexFileMap.values()) {
				smartDelete(crawlQueue, child);
			}
		} catch (UnsupportedEncodingException e) {
			Logging.warn(e);
		} catch (URISyntaxException e) {
			Logging.warn(e);
		}
	}

	/**
	 * Synchronizes the index with the content of a directory: index entries
	 * that are no longer listed in the directory are deleted, as are indexed
	 * sub-directories when the file path item does not allow sub-directories.
	 * 
	 * @param itemDirectory
	 *            the directory being visited
	 * @param crawlQueue
	 *            the queue receiving the deletions
	 * @return the result of {@link #checkFile()} for the directory itself
	 */
	private boolean checkDirectory(ItemDirectoryIterator itemDirectory,
			FileCrawlQueue crawlQueue) throws UnsupportedEncodingException,
			SearchLibException, URISyntaxException {

		// Load directory from Index
		FileInstanceAbstract fileInstance = itemDirectory.getFileInstance();
		FilePathItem instancePathItem = fileInstance.getFilePathItem();
		HashMap<String, FileInfo> indexFileMap = new HashMap<String, FileInfo>();
		fileManager.getFileInfoList(fileInstance.getURI(), indexFileMap);

		// If the filePathItem does not support subdir
		boolean withSubDir = instancePathItem.isWithSubDir();
		if (!withSubDir) {
			for (FileInfo fileInfo : indexFileMap.values()) {
				if (fileInfo.getFileType() == FileTypeEnum.directory) {
					smartDelete(crawlQueue, fileInfo);
				}
			}
		}

		// Remove existing files from the map
		FileInstanceAbstract[] files = itemDirectory.getFiles();
		if (files != null) {
			for (FileInstanceAbstract file : files) {
				indexFileMap.remove(file.getURI().toASCIIString());
			}
		}

		// The files that remain in the map can be removed
		for (FileInfo fileInfo : indexFileMap.values()) {
			// Sub-directories were already deleted by the first pass when
			// sub-directories are not supported; do not delete them twice.
			if (!withSubDir
					&& fileInfo.getFileType() == FileTypeEnum.directory) {
				continue;
			}
			smartDelete(crawlQueue, fileInfo);
		}

		return checkFile();
	}

	/**
	 * Tells whether the current file item has to be crawled.
	 * 
	 * @return true if the file manager has no entry for the item or if the
	 *         existing entry reports that a new crawl is needed; false
	 *         otherwise, in which case the item is counted as ignored
	 */
	private boolean checkFile() throws UnsupportedEncodingException,
			SearchLibException, URISyntaxException {
		FileInfo oldFileInfo = fileManager
				.getFileInfo(currentFileItem.getUri());
		// The file is a new file
		if (oldFileInfo == null) {
			return true;
		}
		// The file has been modified
		if (oldFileInfo.isNewCrawlNeeded(currentFileItem)) {
			return true;
		}
		// The file has not changed, we don't need to crawl it
		currentStats.incIgnoredCount();
		return false;
	}

	/**
	 * @return the item currently being processed, or null if there is none
	 */
	public FileItem getCurrentFileItem() {
		synchronized (this) {
			return currentFileItem;
		}
	}

	/**
	 * @param item
	 *            the item to expose as the one currently being processed
	 */
	public void setCurrentFileItem(FileItem item) {
		synchronized (this) {
			currentFileItem = item;
		}
	}

	/**
	 * @return the directory of the current file item, or an empty string when
	 *         no item is being processed
	 */
	@Override
	public String getCurrentInfo() {
		// Take a single snapshot so the null check and the dereference see
		// the same object.
		FileItem item = getCurrentFileItem();
		if (item != null) {
			return item.getDirectory();
		}
		return "";
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy