
// com.jaeksoft.searchlib.webservice.crawler.webcrawler.WebCrawlerImpl Maven / Gradle / Ivy
/**
* License Agreement for OpenSearchServer
*
* Copyright (C) 2011-2014 Emmanuel Keller / Jaeksoft
*
* http://www.open-search-server.com
*
* This file is part of OpenSearchServer.
*
* OpenSearchServer is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* OpenSearchServer is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with OpenSearchServer.
* If not, see <http://www.gnu.org/licenses/>.
**/
package com.jaeksoft.searchlib.webservice.crawler.webcrawler;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import javax.xml.ws.WebServiceException;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang.BooleanUtils;
import com.jaeksoft.searchlib.Client;
import com.jaeksoft.searchlib.ClientFactory;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.crawler.web.database.CredentialManager;
import com.jaeksoft.searchlib.crawler.web.database.HostUrlList;
import com.jaeksoft.searchlib.crawler.web.database.UrlItem;
import com.jaeksoft.searchlib.crawler.web.database.UrlManager;
import com.jaeksoft.searchlib.crawler.web.database.UrlManager.SearchTemplate;
import com.jaeksoft.searchlib.crawler.web.database.WebPropertyManager;
import com.jaeksoft.searchlib.crawler.web.database.pattern.PatternItem;
import com.jaeksoft.searchlib.crawler.web.database.pattern.PatternManager;
import com.jaeksoft.searchlib.crawler.web.process.WebCrawlThread;
import com.jaeksoft.searchlib.crawler.web.screenshot.ScreenshotManager;
import com.jaeksoft.searchlib.crawler.web.spider.Crawl;
import com.jaeksoft.searchlib.function.expression.SyntaxError;
import com.jaeksoft.searchlib.index.IndexDocument;
import com.jaeksoft.searchlib.query.ParseException;
import com.jaeksoft.searchlib.request.AbstractSearchRequest;
import com.jaeksoft.searchlib.user.Role;
import com.jaeksoft.searchlib.user.User;
import com.jaeksoft.searchlib.util.IOUtils;
import com.jaeksoft.searchlib.util.LinkUtils;
import com.jaeksoft.searchlib.web.ScreenshotServlet;
import com.jaeksoft.searchlib.webservice.CommonListResult;
import com.jaeksoft.searchlib.webservice.CommonResult;
import com.jaeksoft.searchlib.webservice.CommonServices;
import com.jaeksoft.searchlib.webservice.RestApplication;
import com.jaeksoft.searchlib.webservice.crawler.CrawlerUtils;
import com.jaeksoft.searchlib.webservice.query.document.FieldValueList;
public class WebCrawlerImpl extends CommonServices implements SoapWebCrawler,
RestWebCrawler {
/**
 * Starts the web crawler of an index.
 * <p>
 * The caller is authenticated through {@code getLoggedClient} with the
 * {@code WEB_CRAWLER_START_STOP} role. When {@code once} is true the call is
 * delegated to {@code CrawlerUtils.runOnce}; otherwise the crawl-enabled
 * property is switched on before {@code CrawlerUtils.runForever} is called.
 *
 * @param use
 *            index identifier forwarded to {@code getLoggedClient}
 * @param login
 *            login forwarded to {@code getLoggedClient}
 * @param key
 *            key forwarded to {@code getLoggedClient}
 * @param once
 *            true to call {@code runOnce}, false to enable crawling and call
 *            {@code runForever}
 * @return the result returned by {@code CrawlerUtils}
 * @throws CommonServiceException
 *             wrapping any IOException, SearchLibException or
 *             InterruptedException raised while processing
 */
@Override
public CommonResult run(String use, String login, String key, boolean once) {
    try {
        Client client = getLoggedClient(use, login, key,
                Role.WEB_CRAWLER_START_STOP);
        ClientFactory.INSTANCE.properties.checkApi();
        if (once) {
            return CrawlerUtils.runOnce(client.getWebCrawlMaster());
        }
        client.getWebPropertyManager().getCrawlEnabled().setValue(true);
        return CrawlerUtils.runForever(client.getWebCrawlMaster());
    } catch (IOException e) {
        throw new CommonServiceException(e);
    } catch (SearchLibException e) {
        throw new CommonServiceException(e);
    } catch (InterruptedException e) {
        // Restore the interrupt flag so the calling thread can still
        // observe the interruption after the exception is translated.
        Thread.currentThread().interrupt();
        throw new CommonServiceException(e);
    }
}
/**
 * Stops the web crawler of an index.
 * <p>
 * The caller is authenticated through {@code getLoggedClient} with the
 * {@code WEB_CRAWLER_START_STOP} role. The crawl-enabled property is switched
 * off and the call is then delegated to {@code CrawlerUtils.stop}.
 *
 * @param use
 *            index identifier forwarded to {@code getLoggedClient}
 * @param login
 *            login forwarded to {@code getLoggedClient}
 * @param key
 *            key forwarded to {@code getLoggedClient}
 * @return the result returned by {@code CrawlerUtils.stop}
 * @throws CommonServiceException
 *             wrapping any IOException, SearchLibException or
 *             InterruptedException raised while processing
 */
@Override
public CommonResult stop(String use, String login, String key) {
    try {
        Client client = getLoggedClient(use, login, key,
                Role.WEB_CRAWLER_START_STOP);
        ClientFactory.INSTANCE.properties.checkApi();
        client.getWebPropertyManager().getCrawlEnabled().setValue(false);
        return CrawlerUtils.stop(client.getWebCrawlMaster());
    } catch (IOException e) {
        throw new CommonServiceException(e);
    } catch (SearchLibException e) {
        throw new CommonServiceException(e);
    } catch (InterruptedException e) {
        // Restore the interrupt flag so the calling thread can still
        // observe the interruption after the exception is translated.
        Thread.currentThread().interrupt();
        throw new CommonServiceException(e);
    }
}
/**
 * Reports the status of the web crawler of an index.
 * <p>
 * The caller is authenticated through {@code getLoggedClientAnyRole} with
 * {@code Role.GROUP_WEB_CRAWLER}; the result comes from
 * {@code CrawlerUtils.status}.
 *
 * @param use
 *            index identifier forwarded to {@code getLoggedClientAnyRole}
 * @param login
 *            login forwarded to {@code getLoggedClientAnyRole}
 * @param key
 *            key forwarded to {@code getLoggedClientAnyRole}
 * @return the result returned by {@code CrawlerUtils.status}
 * @throws CommonServiceException
 *             wrapping any SearchLibException, InterruptedException or
 *             IOException raised while processing
 */
@Override
public CommonResult status(String use, String login, String key) {
    try {
        Client client = getLoggedClientAnyRole(use, login, key,
                Role.GROUP_WEB_CRAWLER);
        ClientFactory.INSTANCE.properties.checkApi();
        return CrawlerUtils.status(client.getWebCrawlMaster());
    } catch (SearchLibException e) {
        throw new CommonServiceException(e);
    } catch (InterruptedException e) {
        // Restore the interrupt flag so the calling thread can still
        // observe the interruption after the exception is translated.
        Thread.currentThread().interrupt();
        throw new CommonServiceException(e);
    } catch (IOException e) {
        throw new CommonServiceException(e);
    }
}
/**
 * Builds the search request used by the export methods.
 * <p>
 * The request is created from the {@code SearchTemplate.urlExport} template
 * with the match-all query string {@code *:*}. When a non-empty host is
 * given, a {@code host:"..."} filter is added.
 *
 * @param urlManager
 *            manager providing the request template
 * @param host
 *            host to restrict the request to; null or empty means no filter
 * @return the configured search request
 * @throws SearchLibException
 *             propagated from the request construction
 * @throws ParseException
 *             propagated from the request construction
 */
private AbstractSearchRequest getRequest(UrlManager urlManager, String host)
        throws SearchLibException, ParseException {
    AbstractSearchRequest searchRequest = urlManager
            .getSearchRequest(SearchTemplate.urlExport);
    searchRequest.setQueryString("*:*");
    if (host != null && host.length() > 0) {
        // The host comes from the web service caller: escape backslashes and
        // double quotes so the value cannot terminate the quoted clause early.
        // NOTE(review): assumes the filter parser honours Lucene-style
        // backslash escapes inside quotes - confirm.
        String escapedHost = host.replace("\\", "\\\\").replace("\"", "\\\"");
        searchRequest.addFilter("host:\"" + escapedHost + '"', false);
    }
    return searchRequest;
}
/**
 * Exports the URLs of an index and returns the exported file content.
 * <p>
 * The caller is authenticated through {@code getLoggedClientAnyRole} with
 * {@code Role.GROUP_WEB_CRAWLER}. The file produced by
 * {@code UrlManager.exportURLs} (unfiltered request) is read fully into
 * memory.
 *
 * @param use
 *            index identifier forwarded to {@code getLoggedClientAnyRole}
 * @param login
 *            login forwarded to {@code getLoggedClientAnyRole}
 * @param key
 *            key forwarded to {@code getLoggedClientAnyRole}
 * @return the bytes of the exported file
 * @throws CommonServiceException
 *             wrapping any checked exception raised while processing
 */
@Override
public byte[] exportURLs(String use, String login, String key) {
    try {
        Client client = getLoggedClientAnyRole(use, login, key,
                Role.GROUP_WEB_CRAWLER);
        ClientFactory.INSTANCE.properties.checkApi();
        File file = client.getUrlManager().exportURLs(
                getRequest(client.getUrlManager(), null));
        // Close the stream explicitly: it was previously left open, leaking a
        // file descriptor on every call.
        FileInputStream input = new FileInputStream(file);
        try {
            return IOUtils.toByteArray(input);
        } finally {
            input.close();
        }
    } catch (SearchLibException e) {
        throw new CommonServiceException(e);
    } catch (FileNotFoundException e) {
        throw new CommonServiceException(e);
    } catch (IOException e) {
        throw new CommonServiceException(e);
    } catch (ParseException e) {
        throw new CommonServiceException(e);
    } catch (InterruptedException e) {
        // Restore the interrupt flag so the calling thread can still
        // observe the interruption after the exception is translated.
        Thread.currentThread().interrupt();
        throw new CommonServiceException(e);
    }
}
/**
 * Exports a site map and returns the exported file content.
 * <p>
 * The caller is authenticated through {@code getLoggedClientAnyRole} with
 * {@code Role.GROUP_WEB_CRAWLER}. The file produced by
 * {@code UrlManager.exportSiteMap} is read fully into memory; the request is
 * filtered on {@code host} when one is given.
 *
 * @param use
 *            index identifier forwarded to {@code getLoggedClientAnyRole}
 * @param host
 *            host used to filter the request; null or empty means no filter
 * @param login
 *            login forwarded to {@code getLoggedClientAnyRole}
 * @param key
 *            key forwarded to {@code getLoggedClientAnyRole}
 * @return the bytes of the exported file
 * @throws CommonServiceException
 *             wrapping any checked exception raised while processing
 */
@Override
public byte[] exportSiteMap(String use, String host, String login,
        String key) {
    try {
        Client client = getLoggedClientAnyRole(use, login, key,
                Role.GROUP_WEB_CRAWLER);
        ClientFactory.INSTANCE.properties.checkApi();
        File file = client.getUrlManager().exportSiteMap(
                getRequest(client.getUrlManager(), host));
        // Close the stream explicitly: it was previously left open, leaking a
        // file descriptor on every call.
        FileInputStream input = new FileInputStream(file);
        try {
            return IOUtils.toByteArray(input);
        } finally {
            input.close();
        }
    } catch (SearchLibException e) {
        throw new CommonServiceException(e);
    } catch (FileNotFoundException e) {
        throw new CommonServiceException(e);
    } catch (IOException e) {
        throw new CommonServiceException(e);
    } catch (ParseException e) {
        throw new CommonServiceException(e);
    } catch (InterruptedException e) {
        // Restore the interrupt flag so the calling thread can still
        // observe the interruption after the exception is translated.
        Thread.currentThread().interrupt();
        throw new CommonServiceException(e);
    }
}
/**
 * Injects patterns into the inclusion or the exclusion pattern list.
 * <p>
 * The caller is authenticated through {@code getLoggedClientAnyRole} with
 * {@code Role.WEB_CRAWLER_EDIT_PATTERN_LIST}. The given patterns are
 * converted with {@code PatternManager.getPatternList}, added to the selected
 * manager, and the entries whose status is {@code INJECTED} are counted.
 *
 * @param index
 *            index identifier forwarded to {@code getLoggedClientAnyRole}
 * @param login
 *            login forwarded to {@code getLoggedClientAnyRole}
 * @param key
 *            key forwarded to {@code getLoggedClientAnyRole}
 * @param replaceAll
 *            flag forwarded to {@code PatternManager.addList}
 * @param patterns
 *            patterns to inject
 * @param inclusion
 *            true to target the inclusion manager, false the exclusion one
 * @return a successful result whose message is "N patterns injected"
 * @throws CommonServiceException
 *             wrapping any SearchLibException, InterruptedException or
 *             IOException raised while processing
 */
private CommonResult injectPatterns(String index, String login, String key,
        boolean replaceAll, List patterns, boolean inclusion) {
    try {
        Client client = getLoggedClientAnyRole(index, login, key,
                Role.WEB_CRAWLER_EDIT_PATTERN_LIST);
        ClientFactory.INSTANCE.properties.checkApi();
        List patternList = PatternManager.getPatternList(patterns);
        PatternManager patternManager = inclusion
                ? client.getInclusionPatternManager()
                : client.getExclusionPatternManager();
        patternManager.addList(patternList, replaceAll);
        int count = PatternManager.countStatus(patternList,
                PatternItem.Status.INJECTED);
        return new CommonResult(true, count + " patterns injected");
    } catch (SearchLibException e) {
        throw new CommonServiceException(e);
    } catch (InterruptedException e) {
        // Restore the interrupt flag so the calling thread can still
        // observe the interruption after the exception is translated.
        Thread.currentThread().interrupt();
        throw new CommonServiceException(e);
    } catch (IOException e) {
        throw new CommonServiceException(e);
    }
}
/**
 * Injects patterns into the inclusion list by delegating to
 * {@code injectPatterns} with the inclusion flag set.
 *
 * @return the result of {@code injectPatterns}
 */
@Override
public CommonResult injectPatternsInclusion(String index, String login,
        String key, boolean replaceAll, List patterns) {
    final boolean inclusionList = true;
    CommonResult outcome = injectPatterns(index, login, key, replaceAll,
            patterns, inclusionList);
    return outcome;
}
/**
 * Injects patterns into the exclusion list by delegating to
 * {@code injectPatterns} with the inclusion flag cleared.
 *
 * @return the result of {@code injectPatterns}
 */
@Override
public CommonResult injectPatternsExclusion(String index, String login,
        String key, boolean replaceAll, List patterns) {
    final boolean inclusionList = false;
    CommonResult outcome = injectPatterns(index, login, key, replaceAll,
            patterns, inclusionList);
    return outcome;
}
/**
 * Builds a successful result (null message) that carries the current values
 * of the inclusion-enabled and exclusion-enabled properties as the details
 * {@code inclusion_enabled} and {@code exclusion_enabled}.
 *
 * @param webPropertyManager
 *            source of the two properties
 * @return the populated result
 */
private CommonResult getPatternStatusResult(
        WebPropertyManager webPropertyManager) {
    final CommonResult status = new CommonResult(true, null);
    status.addDetail("inclusion_enabled",
            webPropertyManager.getInclusionEnabled().getValue());
    status.addDetail("exclusion_enabled",
            webPropertyManager.getExclusionEnabled().getValue());
    return status;
}
/**
 * Returns whether the inclusion and exclusion pattern lists are enabled.
 * <p>
 * The caller is authenticated through {@code getLoggedClientAnyRole} with
 * {@code Role.WEB_CRAWLER_EDIT_PATTERN_LIST}; the result is built by
 * {@code getPatternStatusResult}.
 *
 * @param index
 *            index identifier forwarded to {@code getLoggedClientAnyRole}
 * @param login
 *            login forwarded to {@code getLoggedClientAnyRole}
 * @param key
 *            key forwarded to {@code getLoggedClientAnyRole}
 * @return a result with the {@code inclusion_enabled} and
 *         {@code exclusion_enabled} details
 * @throws CommonServiceException
 *             wrapping any SearchLibException, InterruptedException or
 *             IOException raised while processing
 */
@Override
public CommonResult getPatternStatus(String index, String login, String key) {
    try {
        Client client = getLoggedClientAnyRole(index, login, key,
                Role.WEB_CRAWLER_EDIT_PATTERN_LIST);
        ClientFactory.INSTANCE.properties.checkApi();
        WebPropertyManager webPropertyManager = client
                .getWebPropertyManager();
        return getPatternStatusResult(webPropertyManager);
    } catch (SearchLibException e) {
        throw new CommonServiceException(e);
    } catch (InterruptedException e) {
        // Restore the interrupt flag so the calling thread can still
        // observe the interruption after the exception is translated.
        Thread.currentThread().interrupt();
        throw new CommonServiceException(e);
    } catch (IOException e) {
        throw new CommonServiceException(e);
    }
}
/**
 * Enables or disables the inclusion and/or exclusion pattern lists.
 * <p>
 * The caller is authenticated through {@code getLoggedClientAnyRole} with
 * {@code Role.WEB_CRAWLER_EDIT_PATTERN_LIST}. Each property is only written
 * when its argument is non-null; the resulting state is then returned.
 *
 * @param index
 *            index identifier forwarded to {@code getLoggedClientAnyRole}
 * @param login
 *            login forwarded to {@code getLoggedClientAnyRole}
 * @param key
 *            key forwarded to {@code getLoggedClientAnyRole}
 * @param inclusion
 *            new value of the inclusion-enabled property, or null to leave it
 *            untouched
 * @param exclusion
 *            new value of the exclusion-enabled property, or null to leave it
 *            untouched
 * @return a result with the {@code inclusion_enabled} and
 *         {@code exclusion_enabled} details
 * @throws CommonServiceException
 *             wrapping any SearchLibException, InterruptedException or
 *             IOException raised while processing
 */
@Override
public CommonResult setPatternStatus(String index, String login,
        String key, Boolean inclusion, Boolean exclusion) {
    try {
        Client client = getLoggedClientAnyRole(index, login, key,
                Role.WEB_CRAWLER_EDIT_PATTERN_LIST);
        ClientFactory.INSTANCE.properties.checkApi();
        WebPropertyManager webPropertyManager = client
                .getWebPropertyManager();
        if (inclusion != null) {
            webPropertyManager.getInclusionEnabled().setValue(inclusion);
        }
        if (exclusion != null) {
            webPropertyManager.getExclusionEnabled().setValue(exclusion);
        }
        return getPatternStatusResult(webPropertyManager);
    } catch (SearchLibException e) {
        throw new CommonServiceException(e);
    } catch (InterruptedException e) {
        // Restore the interrupt flag so the calling thread can still
        // observe the interruption after the exception is translated.
        Thread.currentThread().interrupt();
        throw new CommonServiceException(e);
    } catch (IOException e) {
        throw new CommonServiceException(e);
    }
}
/**
 * Deletes patterns from the inclusion or the exclusion pattern list.
 * <p>
 * The caller is authenticated through {@code getLoggedClientAnyRole} with
 * {@code Role.WEB_CRAWLER_EDIT_PATTERN_LIST}. The deletion itself is done by
 * {@code PatternManager.delPattern}, whose return value is reported as the
 * number of deleted patterns.
 *
 * @param index
 *            index identifier forwarded to {@code getLoggedClientAnyRole}
 * @param login
 *            login forwarded to {@code getLoggedClientAnyRole}
 * @param key
 *            key forwarded to {@code getLoggedClientAnyRole}
 * @param patterns
 *            patterns to delete
 * @param inclusion
 *            true to target the inclusion manager, false the exclusion one
 * @return a successful result whose message is "N patterns deleted"
 * @throws CommonServiceException
 *             wrapping any SearchLibException, InterruptedException or
 *             IOException raised while processing
 */
private CommonResult deletePatterns(String index, String login, String key,
        List patterns, boolean inclusion) {
    try {
        Client client = getLoggedClientAnyRole(index, login, key,
                Role.WEB_CRAWLER_EDIT_PATTERN_LIST);
        ClientFactory.INSTANCE.properties.checkApi();
        PatternManager patternManager = inclusion
                ? client.getInclusionPatternManager()
                : client.getExclusionPatternManager();
        int count = patternManager.delPattern(patterns);
        return new CommonResult(true, count + " patterns deleted");
    } catch (SearchLibException e) {
        throw new CommonServiceException(e);
    } catch (InterruptedException e) {
        // Restore the interrupt flag so the calling thread can still
        // observe the interruption after the exception is translated.
        Thread.currentThread().interrupt();
        throw new CommonServiceException(e);
    } catch (IOException e) {
        throw new CommonServiceException(e);
    }
}
/**
 * Deletes patterns from the inclusion list by delegating to
 * {@code deletePatterns} with the inclusion flag set.
 *
 * @return the result of {@code deletePatterns}
 */
@Override
public CommonResult deletePatternsInclusion(String index, String login,
        String key, List deleteList) {
    final boolean inclusionList = true;
    CommonResult outcome = deletePatterns(index, login, key, deleteList,
            inclusionList);
    return outcome;
}
/**
 * Deletes patterns from the exclusion list by delegating to
 * {@code deletePatterns} with the inclusion flag cleared.
 *
 * @return the result of {@code deletePatterns}
 */
@Override
public CommonResult deletePatternsExclusion(String index, String login,
        String key, List deleteList) {
    final boolean inclusionList = false;
    CommonResult outcome = deletePatterns(index, login, key, deleteList,
            inclusionList);
    return outcome;
}
/**
 * Lists patterns of the inclusion or the exclusion pattern list.
 * <p>
 * The caller is authenticated through {@code getLoggedClientAnyRole} with
 * {@code Role.GROUP_WEB_CRAWLER}. A new list is filled by
 * {@code PatternManager.getPatterns(startsWith, list)} and wrapped in the
 * returned result.
 *
 * @param index
 *            index identifier forwarded to {@code getLoggedClientAnyRole}
 * @param login
 *            login forwarded to {@code getLoggedClientAnyRole}
 * @param key
 *            key forwarded to {@code getLoggedClientAnyRole}
 * @param startsWith
 *            value forwarded to {@code PatternManager.getPatterns}
 * @param inclusion
 *            true to read the inclusion manager, false the exclusion one
 * @return a list result holding the collected patterns
 * @throws CommonServiceException
 *             wrapping any SearchLibException, InterruptedException or
 *             IOException raised while processing
 */
public CommonListResult extractPatterns(String index, String login,
        String key, String startsWith, boolean inclusion) {
    try {
        Client client = getLoggedClientAnyRole(index, login, key,
                Role.GROUP_WEB_CRAWLER);
        ClientFactory.INSTANCE.properties.checkApi();
        PatternManager patternManager = inclusion
                ? client.getInclusionPatternManager()
                : client.getExclusionPatternManager();
        List patterns = new ArrayList();
        patternManager.getPatterns(startsWith, patterns);
        return new CommonListResult(patterns);
    } catch (SearchLibException e) {
        throw new CommonServiceException(e);
    } catch (InterruptedException e) {
        // Restore the interrupt flag so the calling thread can still
        // observe the interruption after the exception is translated.
        Thread.currentThread().interrupt();
        throw new CommonServiceException(e);
    } catch (IOException e) {
        throw new CommonServiceException(e);
    }
}
/**
 * Lists inclusion patterns by delegating to {@code extractPatterns} with the
 * inclusion flag set.
 *
 * @return the result of {@code extractPatterns}
 */
@Override
public CommonListResult extractPatternsInclusion(String index,
        String login, String key, String startsWith) {
    final boolean inclusionList = true;
    CommonListResult outcome = extractPatterns(index, login, key,
            startsWith, inclusionList);
    return outcome;
}
/**
 * Lists exclusion patterns by delegating to {@code extractPatterns} with the
 * inclusion flag cleared.
 *
 * @return the result of {@code extractPatterns}
 */
@Override
public CommonListResult extractPatternsExclusion(String index,
        String login, String key, String startsWith) {
    final boolean inclusionList = false;
    CommonListResult outcome = extractPatterns(index, login, key,
            startsWith, inclusionList);
    return outcome;
}
@Override
public CommonResult crawl(String use, String login, String key, String url,
Boolean returnData) {
try {
Client client = getLoggedClient(use, login, key,
Role.WEB_CRAWLER_START_STOP);
ClientFactory.INSTANCE.properties.checkApi();
WebCrawlThread webCrawlThread = client.getWebCrawlMaster()
.manualCrawl(LinkUtils.newEncodedURL(url),
HostUrlList.ListType.MANUAL);
if (!webCrawlThread.waitForStart(120))
throw new WebServiceException("Time out reached (120 seconds)");
if (!webCrawlThread.waitForEnd(3600))
throw new WebServiceException("Time out reached (3600 seconds)");
UrlItem urlItem = webCrawlThread.getCurrentUrlItem();
CommonResult cr = null;
if (BooleanUtils.isTrue(returnData)) {
Crawl crawl = webCrawlThread.getCurrentCrawl();
if (crawl != null) {
List indexDocuments = crawl
.getTargetIndexDocuments();
if (CollectionUtils.isNotEmpty(indexDocuments)) {
CommonListResult> clr = new CommonListResult>(
indexDocuments.size());
for (IndexDocument indexDocument : indexDocuments) {
List list = FieldValueList
.getNewList(indexDocument);
if (list != null)
clr.items.add(list);
}
cr = clr;
}
}
}
String message = urlItem != null ? "Result: "
+ urlItem.getFetchStatus() + " - "
+ urlItem.getParserStatus() + " - "
+ urlItem.getIndexStatus() : null;
if (cr == null)
cr = new CommonResult(true, message);
cr.addDetail("URL", urlItem.getUrl());
cr.addDetail("HttpResponseCode", urlItem.getResponseCode());
cr.addDetail("RobotsTxtStatus", urlItem.getRobotsTxtStatus());
cr.addDetail("FetchStatus", urlItem.getFetchStatus());
cr.addDetail("ParserStatus", urlItem.getParserStatus());
cr.addDetail("IndexStatus", urlItem.getIndexStatus());
cr.addDetail("RedirectionURL", urlItem.getRedirectionUrl());
cr.addDetail("ContentBaseType", urlItem.getContentBaseType());
cr.addDetail("ContentTypeCharset", urlItem.getContentTypeCharset());
cr.addDetail("ContentLength", urlItem.getContentLength());
return cr;
} catch (MalformedURLException e) {
throw new CommonServiceException(e);
} catch (SearchLibException e) {
throw new CommonServiceException(e);
} catch (ParseException e) {
throw new CommonServiceException(e);
} catch (IOException e) {
throw new CommonServiceException(e);
} catch (SyntaxError e) {
throw new CommonServiceException(e);
} catch (URISyntaxException e) {
throw new CommonServiceException(e);
} catch (ClassNotFoundException e) {
throw new CommonServiceException(e);
} catch (InterruptedException e) {
throw new CommonServiceException(e);
} catch (InstantiationException e) {
throw new CommonServiceException(e);
} catch (IllegalAccessException e) {
throw new CommonServiceException(e);
}
}
/**
 * Variant of {@code crawl} that forwards every argument unchanged and returns
 * its result.
 *
 * @return the result of {@code crawl}
 */
@Override
public CommonResult crawlPost(String use, String login, String key,
        String url, Boolean returnData) {
    CommonResult crawlResult = crawl(use, login, key, url, returnData);
    return crawlResult;
}
/**
 * Captures a screenshot of a URL.
 * <p>
 * The caller is authenticated through {@code getLoggedClientAnyRole} with
 * {@code Role.GROUP_WEB_CRAWLER}. The capture is done by
 * {@code ScreenshotServlet.doCapture} with the index's screenshot and web
 * credential managers; the first argument of that call is null.
 *
 * @param use
 *            index identifier forwarded to {@code getLoggedClientAnyRole}
 * @param login
 *            login forwarded to {@code getLoggedClientAnyRole}
 * @param key
 *            key forwarded to {@code getLoggedClientAnyRole}
 * @param url
 *            URL to capture, encoded with {@code LinkUtils.newEncodedURL}
 * @return a successful result whose message is "Captured URL " followed by
 *         the URL
 * @throws CommonServiceException
 *             wrapping any checked exception raised while processing
 */
@Override
public CommonResult captureScreenshot(String use, String login, String key,
        String url) {
    try {
        Client client = getLoggedClientAnyRole(use, login, key,
                Role.GROUP_WEB_CRAWLER);
        ClientFactory.INSTANCE.properties.checkApi();
        ScreenshotManager screenshotManager = client.getScreenshotManager();
        CredentialManager credentialManager = client
                .getWebCredentialManager();
        ScreenshotServlet.doCapture(null, screenshotManager,
                credentialManager, LinkUtils.newEncodedURL(url));
        String message = "Captured URL " + url;
        return new CommonResult(true, message);
    } catch (MalformedURLException e) {
        throw new CommonServiceException(e);
    } catch (SearchLibException e) {
        throw new CommonServiceException(e);
    } catch (URISyntaxException e) {
        throw new CommonServiceException(e);
    } catch (InterruptedException e) {
        // Restore the interrupt flag so the calling thread can still
        // observe the interruption after the exception is translated.
        Thread.currentThread().interrupt();
        throw new CommonServiceException(e);
    } catch (IOException e) {
        throw new CommonServiceException(e);
    }
}
/**
 * Checks the screenshot of a URL.
 * <p>
 * The caller is authenticated through {@code getLoggedClientAnyRole} with
 * {@code Role.GROUP_WEB_CRAWLER}. The message of the result is whatever
 * {@code ScreenshotServlet.doCheck} returns for the encoded URL.
 *
 * @param use
 *            index identifier forwarded to {@code getLoggedClientAnyRole}
 * @param login
 *            login forwarded to {@code getLoggedClientAnyRole}
 * @param key
 *            key forwarded to {@code getLoggedClientAnyRole}
 * @param url
 *            URL to check, encoded with {@code LinkUtils.newEncodedURL}
 * @return a successful result carrying the message from {@code doCheck}
 * @throws CommonServiceException
 *             wrapping any checked exception raised while processing
 */
@Override
public CommonResult checkScreenshot(String use, String login, String key,
        String url) {
    try {
        Client client = getLoggedClientAnyRole(use, login, key,
                Role.GROUP_WEB_CRAWLER);
        ClientFactory.INSTANCE.properties.checkApi();
        ScreenshotManager screenshotManager = client.getScreenshotManager();
        String message = ScreenshotServlet.doCheck(screenshotManager,
                LinkUtils.newEncodedURL(url));
        return new CommonResult(true, message);
    } catch (MalformedURLException e) {
        throw new CommonServiceException(e);
    } catch (SearchLibException e) {
        throw new CommonServiceException(e);
    } catch (InterruptedException e) {
        // Restore the interrupt flag so the calling thread can still
        // observe the interruption after the exception is translated.
        Thread.currentThread().interrupt();
        throw new CommonServiceException(e);
    } catch (IOException e) {
        throw new CommonServiceException(e);
    } catch (URISyntaxException e) {
        throw new CommonServiceException(e);
    }
}
/**
 * Builds the REST URL of the crawl service for the given URL, using
 * {@code RestApplication.getRestURL} with the {@code url} parameter.
 * <p>
 * NOTE(review): the path is identical to the one used by
 * {@code getCrawlJSON}; presumably the format is selected elsewhere - confirm.
 *
 * @param user
 *            user forwarded to {@code getRestURL}
 * @param client
 *            client forwarded to {@code getRestURL}
 * @param url
 *            value of the {@code url} parameter
 * @return the REST URL
 * @throws UnsupportedEncodingException
 *             propagated from {@code getRestURL}
 */
public static String getCrawlXML(User user, Client client, String url)
        throws UnsupportedEncodingException {
    final String servicePath = "/index/{index}/crawler/web/crawl";
    final String restUrl = RestApplication.getRestURL(servicePath, user,
            client, "url", url);
    return restUrl;
}
/**
 * Builds the REST URL of the crawl service for the given URL, using
 * {@code RestApplication.getRestURL} with the {@code url} parameter.
 * <p>
 * NOTE(review): the path is identical to the one used by
 * {@code getCrawlXML}; presumably the format is selected elsewhere - confirm.
 *
 * @param user
 *            user forwarded to {@code getRestURL}
 * @param client
 *            client forwarded to {@code getRestURL}
 * @param url
 *            value of the {@code url} parameter
 * @return the REST URL
 * @throws UnsupportedEncodingException
 *             propagated from {@code getRestURL}
 */
public static String getCrawlJSON(User user, Client client, String url)
        throws UnsupportedEncodingException {
    final String servicePath = "/index/{index}/crawler/web/crawl";
    final String restUrl = RestApplication.getRestURL(servicePath, user,
            client, "url", url);
    return restUrl;
}
/**
 * Injects URLs into the URL database of an index.
 * <p>
 * The caller is authenticated through {@code getLoggedClientAnyRole} with
 * {@code Role.WEB_CRAWLER_EDIT_PARAMETERS}. When {@code replaceAll} is true,
 * {@code UrlManager.deleteAll(null)} is called before the injection. The
 * result object is handed to {@code UrlManager.inject} together with the URLs
 * and then returned.
 *
 * @param index
 *            index identifier forwarded to {@code getLoggedClientAnyRole}
 * @param login
 *            login forwarded to {@code getLoggedClientAnyRole}
 * @param key
 *            key forwarded to {@code getLoggedClientAnyRole}
 * @param replaceAll
 *            true to delete the existing URLs first
 * @param urls
 *            URLs to inject
 * @return the result object passed to {@code UrlManager.inject}
 * @throws CommonServiceException
 *             wrapping any SearchLibException, InterruptedException or
 *             IOException raised while processing
 */
@Override
public CommonResult injectUrls(String index, String login, String key,
        boolean replaceAll, List urls) {
    try {
        Client client = getLoggedClientAnyRole(index, login, key,
                Role.WEB_CRAWLER_EDIT_PARAMETERS);
        ClientFactory.INSTANCE.properties.checkApi();
        UrlManager urlManager = client.getUrlManager();
        CommonResult result = new CommonResult(true, null);
        if (replaceAll) {
            urlManager.deleteAll(null);
        }
        urlManager.inject(urls, result);
        return result;
    } catch (SearchLibException e) {
        throw new CommonServiceException(e);
    } catch (InterruptedException e) {
        // Restore the interrupt flag so the calling thread can still
        // observe the interruption after the exception is translated.
        Thread.currentThread().interrupt();
        throw new CommonServiceException(e);
    } catch (IOException e) {
        throw new CommonServiceException(e);
    }
}
/**
 * Authenticates the caller and checks API availability, then returns null.
 * <p>
 * NOTE(review): this looks like an unfinished stub - no URL is read and the
 * method always returns null after the access checks succeed. Callers must be
 * prepared for a null result until it is implemented.
 *
 * @param index
 *            index identifier forwarded to {@code getLoggedClientAnyRole}
 * @param login
 *            login forwarded to {@code getLoggedClientAnyRole}
 * @param key
 *            key forwarded to {@code getLoggedClientAnyRole}
 * @return always null
 * @throws CommonServiceException
 *             wrapping any InterruptedException or IOException raised by the
 *             access checks
 */
@Override
public CommonResult getUrls(String index, String login, String key) {
    try {
        // Only the access check matters here; the returned client is unused.
        getLoggedClientAnyRole(index, login, key, Role.GROUP_WEB_CRAWLER);
        ClientFactory.INSTANCE.properties.checkApi();
        return null;
    } catch (InterruptedException e) {
        // Restore the interrupt flag so the calling thread can still
        // observe the interruption after the exception is translated.
        Thread.currentThread().interrupt();
        throw new CommonServiceException(e);
    } catch (IOException e) {
        throw new CommonServiceException(e);
    }
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy