
com.jaeksoft.searchlib.crawler.file.process.CrawlFileMaster


OpenSearchServer is a powerful, enterprise-class search engine program. Using the web user interface, the crawlers (web, file, database, ...) and the REST/RESTful API, you can quickly and easily integrate advanced full-text search capabilities into your application. OpenSearchServer runs on Windows and Linux/Unix/BSD.

/**   
 * License Agreement for OpenSearchServer
 *
 * Copyright (C) 2008-2013 Emmanuel Keller / Jaeksoft
 * 
 * http://www.open-search-server.com
 * 
 * This file is part of OpenSearchServer.
 *
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with OpenSearchServer.
 *  If not, see <http://www.gnu.org/licenses/>.
 **/

package com.jaeksoft.searchlib.crawler.file.process;

import java.io.IOException;
import java.net.URISyntaxException;
import java.util.LinkedList;

import com.jaeksoft.searchlib.Logging;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.config.Config;
import com.jaeksoft.searchlib.crawler.common.process.CrawlMasterAbstract;
import com.jaeksoft.searchlib.crawler.common.process.CrawlQueueAbstract;
import com.jaeksoft.searchlib.crawler.common.process.CrawlStatistics;
import com.jaeksoft.searchlib.crawler.common.process.CrawlStatus;
import com.jaeksoft.searchlib.crawler.file.database.FileCrawlQueue;
import com.jaeksoft.searchlib.crawler.file.database.FilePathItem;
import com.jaeksoft.searchlib.crawler.file.database.FilePathManager;
import com.jaeksoft.searchlib.crawler.file.database.FilePropertyManager;
import com.jaeksoft.searchlib.function.expression.SyntaxError;
import com.jaeksoft.searchlib.query.ParseException;
import com.jaeksoft.searchlib.scheduler.TaskManager;

public class CrawlFileMaster extends
		CrawlMasterAbstract<CrawlFileMaster, CrawlFileThread> {

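	// Queue buffering crawled file documents until they are flushed to the index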
	private FileCrawlQueue fileCrawlQueue;

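	// File path items waiting to be crawled in the current session; access is synchronized on the list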
	private final LinkedList<FilePathItem> filePathList;

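	// Builds the file crawl master for the given index configuration and
	// starts it immediately if file crawling is enabled in the properties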
	public CrawlFileMaster(Config config) throws SearchLibException {
		super(config);
		FilePropertyManager filePropertyManager = config
				.getFilePropertyManager();
		fileCrawlQueue = new FileCrawlQueue(config);
		filePathList = new LinkedList<FilePathItem>();
		if (filePropertyManager.getCrawlEnabled().getValue()) {
			Logging.info("The file crawler is starting for "
					+ config.getIndexName());
			start(false);
		}
	}

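	// Main loop of the file crawl master: runs crawl sessions until the thread is aborted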
	@Override
	public void runner() throws Exception {
		Config config = getConfig();
		FilePropertyManager propertyManager = config.getFilePropertyManager();
		fileCrawlQueue.setMaxBufferSize(propertyManager
				.getIndexDocumentBufferSize().getValue());

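		// Each iteration of this loop is one complete crawl session with its own statistics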
		while (!isAborted()) {

			currentStats = new CrawlStatistics();
			addStatistics(currentStats);
			fileCrawlQueue.setStatistiques(currentStats);

			int threadNumber = propertyManager.getMaxThreadNumber().getValue();
			String schedulerJobName = propertyManager
					.getSchedulerAfterSession().getValue();

			synchronized (filePathList) {
				filePathList.clear();
			}

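			// Collect the file path items scheduled for fetching in this session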
			extractFilePathList();

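			// Start one crawl thread per file path item, throttled to the configured maximum thread count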
			while (!isAborted()) {

				FilePathItem filePathItem = getNextFilePathItem();
				if (filePathItem == null)
					break;

				CrawlFileThread crawlThread = new CrawlFileThread(config, this,
						currentStats, filePathItem);
				add(crawlThread);

				while (getThreadsCount() >= threadNumber && !isAborted())
					sleepSec(5);
			}
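			// All file path items have been dispatched: wait for the remaining crawl threads to finish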
			setStatus(CrawlStatus.WAITING_CHILD);
			while (getThreadsCount() > 0) {
				waitForChild(1800);
				if (isAborted())
					break;
			}
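			// Flush the crawl queue, then reload the file manager if the queue held any data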
			setStatus(CrawlStatus.INDEXATION);
			fileCrawlQueue.index(true);
			if (fileCrawlQueue.hasContainedData())
				config.getFileManager().reload(false, null);

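			// Optionally run a scheduler job once the crawl session is complete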
			if (schedulerJobName != null && schedulerJobName.length() > 0) {
				setStatus(CrawlStatus.EXECUTE_SCHEDULER_JOB);
				TaskManager.getInstance().executeJob(config.getIndexName(),
						schedulerJobName);
			}

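			// In "run once" mode stop after a single session, otherwise pause before the next one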
			if (isOnce())
				break;
			sleepSec(5);
		}
		fileCrawlQueue.index(true);
		setStatus(CrawlStatus.NOT_RUNNING);
	}

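	// Fills filePathList with the file paths selected for fetching by the FilePathManager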
	private void extractFilePathList() throws IOException, ParseException,
			SyntaxError, URISyntaxException, ClassNotFoundException,
			InterruptedException, SearchLibException, InstantiationException,
			IllegalAccessException {
		Config config = getConfig();
		setStatus(CrawlStatus.EXTRACTING_FILEPATHLIST);

		FilePathManager filePathManager = config.getFilePathManager();

		filePathManager.getFilePathsToFetch(filePathList);
		currentStats.addHostListSize(filePathList.size());
	}

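	// Pops the next file path item, or returns null when no item is left for this session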
	private FilePathItem getNextFilePathItem() {
		synchronized (filePathList) {
			int s = filePathList.size();
			if (s == 0)
				return null;
			FilePathItem filePathItem = filePathList.remove(0);
			if (filePathItem == null)
				return null;
			currentStats.incHostCount();
			return filePathItem;
		}
	}

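	// Returns the crawl queue used to buffer and index crawled file documents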
	public CrawlQueueAbstract getCrawlQueue() {
		return fileCrawlQueue;
	}

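	// Provides a typed array of crawl threads, as required by CrawlMasterAbstract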
	@Override
	protected CrawlFileThread[] getNewArray(int size) {
		return new CrawlFileThread[size];
	}

}