All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.jaeksoft.searchlib.scheduler.task.TaskUrlManagerAction Maven / Gradle / Ivy

Go to download

OpenSearchServer is a powerful, enterprise-class search engine program. Using the web user interface, the crawlers (web, file, database, ...) and the REST/RESTful API, you will be able to quickly and easily integrate advanced full-text search capabilities into your application. OpenSearchServer runs on Windows and Linux/Unix/BSD.

The newest version!
/**   
 * License Agreement for OpenSearchServer
 *
 * Copyright (C) 2010-2013 Emmanuel Keller / Jaeksoft
 * 
 * http://www.open-search-server.com
 * 
 * This file is part of OpenSearchServer.
 *
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with OpenSearchServer. 
 *  If not, see <http://www.gnu.org/licenses/>.
 **/

package com.jaeksoft.searchlib.scheduler.task;

import java.io.IOException;

import com.jaeksoft.searchlib.Client;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.analysis.ClassPropertyEnum;
import com.jaeksoft.searchlib.config.Config;
import com.jaeksoft.searchlib.crawler.common.database.FetchStatus;
import com.jaeksoft.searchlib.crawler.common.database.IndexStatus;
import com.jaeksoft.searchlib.crawler.common.database.ParserStatus;
import com.jaeksoft.searchlib.crawler.web.database.RobotsTxtStatus;
import com.jaeksoft.searchlib.crawler.web.database.UrlManager;
import com.jaeksoft.searchlib.request.AbstractSearchRequest;
import com.jaeksoft.searchlib.scheduler.TaskAbstract;
import com.jaeksoft.searchlib.scheduler.TaskLog;
import com.jaeksoft.searchlib.scheduler.TaskProperties;
import com.jaeksoft.searchlib.scheduler.TaskPropertyDef;
import com.jaeksoft.searchlib.scheduler.TaskPropertyType;
import com.jaeksoft.searchlib.util.Variables;

/**
 * Scheduler task that runs one command against the web crawler URL database
 * (the {@link UrlManager} obtained from the {@link Client}).
 * <p>
 * The command and the URL selection filters normally come from the task
 * properties. They can instead be supplied programmatically through
 * {@link #setManual(AbstractSearchRequest, String, int)}, in which case the
 * task properties are ignored by {@link #execute}.
 */
public class TaskUrlManagerAction extends TaskAbstract {

	/** Buffer size used when the "Buffer size" property is missing or blank. */
	private static final String DEFAULT_BUFFER_SIZE = "10000";

	private final TaskPropertyDef propCommand = new TaskPropertyDef(TaskPropertyType.comboBox, "Command", "Command",
			"Select the command to execute", 30);

	private final TaskPropertyDef propFilterUrl = new TaskPropertyDef(TaskPropertyType.textBox, "URL prefix",
			"URL prefix", "Filter prefix on the URL", 50);

	private final TaskPropertyDef propFilterLang = new TaskPropertyDef(TaskPropertyType.textBox, "Lang", "Lang",
			"Filter on the lang", 5);

	private final TaskPropertyDef propFilterContentBaseType = new TaskPropertyDef(TaskPropertyType.textBox,
			"ContentBaseType", "Content type", "Filter on the content type", 30);

	private final TaskPropertyDef propFilterContentTypeCharset = new TaskPropertyDef(TaskPropertyType.textBox,
			"ContentTypeCharset", "Content charset", "Filter on the content type charset", 10);

	private final TaskPropertyDef propFilterContentEncoding = new TaskPropertyDef(TaskPropertyType.textBox,
			"ContentEncoding", "Content encoding", "Filter on the content encoding", 10);

	private final TaskPropertyDef propFilterMinContentLength = new TaskPropertyDef(TaskPropertyType.textBox,
			"MinContentLength", "Min length", "Filter on the minimum content length", 10);

	private final TaskPropertyDef propFilterMaxContentLength = new TaskPropertyDef(TaskPropertyType.textBox,
			"MaxContentLength", "Max length", "Filter on the maximum content length", 10);

	private final TaskPropertyDef propFilterHost = new TaskPropertyDef(TaskPropertyType.textBox, "Hostname", "Hostname",
			"Filter on the hostname", 30);

	private final TaskPropertyDef propFilterWithSubDomain = new TaskPropertyDef(TaskPropertyType.listBox,
			"WithSubDomain", "With sub domain", "Filter on the sub domain", 10);

	private final TaskPropertyDef propRobotsTxtStatus = new TaskPropertyDef(TaskPropertyType.listBox,
			"Robots.txt status", "Robots.txt status", "Filter on the Robots.txt status", 20);

	private final TaskPropertyDef propFetchStatus = new TaskPropertyDef(TaskPropertyType.listBox, "Fetch status",
			"Fetch status", "Filter on the fetch status", 20);

	private final TaskPropertyDef propParserStatus = new TaskPropertyDef(TaskPropertyType.listBox, "Parser status",
			"Parser status", "Filter on the Parser status", 20);

	private final TaskPropertyDef propIndexStatus = new TaskPropertyDef(TaskPropertyType.listBox, "Index status",
			"Index status", "Filter on the index status", 20);

	private final TaskPropertyDef propBufferSize = new TaskPropertyDef(TaskPropertyType.textBox, "Buffer size",
			"Buffer size", "Buffer size", 10);

	/** Every property of this task, in the order returned by {@link #getPropertyList()}. */
	private final TaskPropertyDef[] taskPropertyDefs = { propCommand, propFilterUrl, propFilterLang,
			propFilterContentBaseType, propFilterContentTypeCharset, propFilterContentEncoding,
			propFilterMinContentLength, propFilterMaxContentLength, propFilterHost, propFilterWithSubDomain,
			propRobotsTxtStatus, propFetchStatus, propParserStatus, propIndexStatus, propBufferSize };

	// Command labels. They are both the values offered for the "Command"
	// property and the values compared against in execute().
	public static final String CommandDoNothing = "Do nothing";
	public static final String CommandSetToUnfetched = "Set to unfetched";
	public static final String CommandSetToFetchFirst = "Set to fetch first";
	public static final String CommandDeleteAll = "Delete all";
	public static final String CommandDeleteSelection = "Delete selection";
	public static final String CommandLoadSitemap = "Load Sitemap(s)";
	public static final String CommandOptimize = "Optimize";
	public static final String CommandSynchronize = "Synchronize";

	/** Values offered for the "Command" property; the first entry is the default. */
	private static final String[] CommandList = { CommandDoNothing, CommandSetToUnfetched, CommandSetToFetchFirst,
			CommandDeleteSelection, CommandDeleteAll, CommandLoadSitemap, CommandSynchronize, CommandOptimize };

	/**
	 * @return the display name of this task
	 */
	@Override
	public String getName() {
		return "Web crawler - URL database";
	}

	/**
	 * @return the property definitions of this task (the internal array, not a
	 *         copy)
	 */
	@Override
	public TaskPropertyDef[] getPropertyList() {
		return taskPropertyDefs;
	}

	/**
	 * Returns the list of selectable values for a property.
	 *
	 * @param config
	 *            not used by this implementation
	 * @param propertyDef
	 *            the property being queried; matched by identity against this
	 *            task's own definitions
	 * @param taskProperties
	 *            not used by this implementation
	 * @return the selectable values for the command, the four status filters
	 *         and the sub domain flag; {@code null} for any other property
	 */
	@Override
	public String[] getPropertyValues(Config config, TaskPropertyDef propertyDef, TaskProperties taskProperties) {
		if (propertyDef == propCommand)
			return CommandList;
		else if (propertyDef == propRobotsTxtStatus)
			return RobotsTxtStatus.getNames();
		else if (propertyDef == propFetchStatus)
			return FetchStatus.getNames();
		else if (propertyDef == propParserStatus)
			return ParserStatus.getNames();
		else if (propertyDef == propIndexStatus)
			return IndexStatus.getNames();
		else if (propertyDef == propFilterWithSubDomain)
			return ClassPropertyEnum.BOOLEAN_LIST;
		return null;
	}

	/**
	 * Returns the default value of a property.
	 *
	 * @param config
	 *            not used by this implementation
	 * @param propertyDef
	 *            the property being queried; matched by identity against this
	 *            task's own definitions
	 * @return the first command for the command property, the {@code ALL} name
	 *         for each status filter, the default buffer size, {@code "false"}
	 *         for the sub domain flag; {@code null} for any other property
	 */
	@Override
	public String getDefaultValue(Config config, TaskPropertyDef propertyDef) {
		if (propertyDef == propCommand)
			return CommandList[0];
		else if (propertyDef == propRobotsTxtStatus)
			return RobotsTxtStatus.ALL.name;
		else if (propertyDef == propFetchStatus)
			return FetchStatus.ALL.name;
		else if (propertyDef == propParserStatus)
			return ParserStatus.ALL.name;
		else if (propertyDef == propIndexStatus)
			return IndexStatus.ALL.name;
		else if (propertyDef == propBufferSize)
			return DEFAULT_BUFFER_SIZE;
		else if (propertyDef == propFilterWithSubDomain)
			return Boolean.FALSE.toString();
		return null;
	}

	/** Selection supplied through setManual(); only used when manualCommand is set. */
	private AbstractSearchRequest selectionRequest = null;

	/** Command supplied through setManual(); non-null switches execute() to manual mode. */
	private String manualCommand = null;

	/** Buffer size supplied through setManual(). */
	private Integer manualBufferSize = null;

	/**
	 * Supplies the command, selection and buffer size directly, bypassing the
	 * task properties on the next {@link #execute} call.
	 *
	 * @param selectionRequest
	 *            the request selecting the URLs the command applies to
	 * @param manualCommand
	 *            one of the {@code Command*} labels; when {@code null},
	 *            {@link #execute} keeps reading the task properties
	 * @param bufferSize
	 *            the buffer size passed to the URL manager
	 */
	public void setManual(AbstractSearchRequest selectionRequest, String manualCommand, int bufferSize) {
		this.selectionRequest = selectionRequest;
		this.manualCommand = manualCommand;
		this.manualBufferSize = bufferSize;
	}

	/**
	 * Runs the selected command on the client's URL manager.
	 *
	 * @param client
	 *            provides the {@link UrlManager} to operate on
	 * @param properties
	 *            the task properties; ignored when a manual command was set
	 * @param variables
	 *            not used by this implementation
	 * @param taskLog
	 *            receives progress messages and is passed on to the URL
	 *            manager
	 * @throws SearchLibException
	 *             propagated from the URL manager
	 * @throws IOException
	 *             propagated from the URL manager
	 * @throws NumberFormatException
	 *             if the "Buffer size" property is set to a non-numeric value
	 */
	@Override
	public void execute(Client client, TaskProperties properties, Variables variables, TaskLog taskLog)
			throws SearchLibException, IOException {
		UrlManager urlManager = client.getUrlManager();
		taskLog.setInfo("URL manager Action started");

		final String command;
		final int bufferSize;
		// Kept local so that a scheduled run does not overwrite the manual
		// selection stored on the instance.
		final AbstractSearchRequest request;

		if (manualCommand != null) {
			command = manualCommand;
			bufferSize = manualBufferSize;
			request = selectionRequest;
		} else {
			command = properties.getValue(propCommand);
			bufferSize = parseBufferSize(properties.getValue(propBufferSize));
			request = buildSelectionRequest(urlManager, properties);
		}

		// NOTE(review): CommandDoNothing and CommandOptimize have no branch
		// below, so selecting either makes no URL manager call. That is the
		// expected outcome for "Do nothing"; confirm whether "Optimize" is
		// intentionally a no-op here.
		if (CommandLoadSitemap.equals(command)) {
			taskLog.setInfo("URL manager: Handle SiteMaps");
			urlManager.updateSiteMap(taskLog);
		} else if (CommandSetToFetchFirst.equals(command)) {
			updateFetchStatus(urlManager, request, FetchStatus.FETCH_FIRST, bufferSize, taskLog);
		} else if (CommandSetToUnfetched.equals(command)) {
			updateFetchStatus(urlManager, request, FetchStatus.UN_FETCHED, bufferSize, taskLog);
		} else if (CommandDeleteAll.equals(command)) {
			taskLog.setInfo("URL manager: Delete All");
			urlManager.deleteAll(taskLog);
		} else if (CommandDeleteSelection.equals(command)) {
			taskLog.setInfo("URL manager: Delete selection");
			urlManager.deleteUrls(request, bufferSize, taskLog);
		} else if (CommandSynchronize.equals(command)) {
			taskLog.setInfo("URL manager: synchronize");
			urlManager.synchronizeIndex(request, bufferSize, taskLog);
		}
	}

	/**
	 * Parses the buffer size property, falling back to the declared default
	 * when the property is missing or blank.
	 *
	 * @param value
	 *            the raw property value, possibly {@code null}
	 * @return the parsed buffer size
	 * @throws NumberFormatException
	 *             if the value is non-blank and not a valid integer
	 */
	private static int parseBufferSize(String value) {
		String trimmed = value == null ? "" : value.trim();
		if (trimmed.length() == 0)
			trimmed = DEFAULT_BUFFER_SIZE;
		return Integer.parseInt(trimmed);
	}

	/**
	 * Builds the URL selection request from the filter properties of the task.
	 */
	private AbstractSearchRequest buildSelectionRequest(UrlManager urlManager, TaskProperties properties)
			throws SearchLibException, IOException {
		String urlLike = properties.getValue(propFilterUrl);
		String lang = properties.getValue(propFilterLang);
		String contentBaseType = properties.getValue(propFilterContentBaseType);
		String contentTypeCharset = properties.getValue(propFilterContentTypeCharset);
		String contentEncoding = properties.getValue(propFilterContentEncoding);
		Integer minContentLength = properties.getValueInteger(propFilterMinContentLength);
		Integer maxContentLength = properties.getValueInteger(propFilterMaxContentLength);
		String host = properties.getValue(propFilterHost);
		boolean withSubDomain = properties.getValueBoolean(propFilterWithSubDomain, false);
		RobotsTxtStatus robotsTxtStatus = RobotsTxtStatus.findByName(properties.getValue(propRobotsTxtStatus));
		FetchStatus fetchStatus = FetchStatus.findByName(properties.getValue(propFetchStatus));
		ParserStatus parserStatus = ParserStatus.findByName(properties.getValue(propParserStatus));
		IndexStatus indexStatus = IndexStatus.findByName(properties.getValue(propIndexStatus));
		return urlManager.getSearchRequest(UrlManager.SearchTemplate.urlSearch, urlLike, host, withSubDomain, lang,
				null, contentBaseType, contentTypeCharset, contentEncoding, minContentLength, maxContentLength,
				robotsTxtStatus, fetchStatus, null, parserStatus, indexStatus, null, null, null, null);
	}

	/**
	 * Logs the target fetch status, then applies it to the selected URLs.
	 */
	private void updateFetchStatus(UrlManager urlManager, AbstractSearchRequest request, FetchStatus fetchStatus,
			int bufferSize, TaskLog taskLog) throws SearchLibException, IOException {
		// The status name is appended so the log states which status was
		// applied; the message previously ended after "to ".
		taskLog.setInfo("URL manager: Update status to " + fetchStatus.name);
		urlManager.updateFetchStatus(request, fetchStatus, bufferSize, taskLog);
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy