com.jaeksoft.searchlib.crawler.rest.RestCrawlThread Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of opensearchserver Show documentation
OpenSearchServer is a powerful, enterprise-class, search engine program. Using the web user interface, the crawlers (web, file, database, ...) and the REST/RESTFul API you will be able to integrate quickly and easily advanced full-text search capabilities in your application. OpenSearchServer runs on Windows and Linux/Unix/BSD.
The newest version!
/**
* License Agreement for OpenSearchServer
*
* Copyright (C) 2013-2015 Emmanuel Keller / Jaeksoft
*
* http://www.open-search-server.com
*
* This file is part of OpenSearchServer.
*
* OpenSearchServer is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* OpenSearchServer is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with OpenSearchServer.
* If not, see <http://www.gnu.org/licenses/>.
**/
package com.jaeksoft.searchlib.crawler.rest;
import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.commons.io.filefilter.FileFileFilter;
import org.apache.commons.lang3.StringUtils;
import org.apache.cxf.helpers.FileUtils;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.utils.URIBuilder;
import com.jaeksoft.searchlib.Client;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.crawler.FieldMapContext;
import com.jaeksoft.searchlib.crawler.common.process.CrawlStatus;
import com.jaeksoft.searchlib.crawler.common.process.CrawlThreadAbstract;
import com.jaeksoft.searchlib.crawler.rest.RestCrawlItem.CallbackMode;
import com.jaeksoft.searchlib.crawler.web.spider.DownloadItem;
import com.jaeksoft.searchlib.crawler.web.spider.HttpDownloader;
import com.jaeksoft.searchlib.crawler.web.spider.HttpDownloader.Method;
import com.jaeksoft.searchlib.index.IndexDocument;
import com.jaeksoft.searchlib.schema.SchemaField;
import com.jaeksoft.searchlib.util.InfoCallback;
import com.jaeksoft.searchlib.util.Variables;
import com.jaeksoft.searchlib.webservice.CommonListResult;
import com.jayway.jsonpath.JsonPath;
import com.jayway.jsonpath.PathNotFoundException;
public class RestCrawlThread extends CrawlThreadAbstract {

	/** Index client the crawled documents are pushed to. */
	protected final Client client;

	/** Private copy of the crawl configuration (variables already applied). */
	private final RestCrawlItem restCrawlItem;

	/** Number of documents buffered but not yet committed to the index. */
	private final AtomicLong pendingIndexDocumentCount;

	/** Number of documents successfully committed to the index. */
	private final AtomicLong updatedIndexDocumentCount;

	// Primary keys of indexed documents are appended here when the caller
	// supplied a CommonListResult as InfoCallback; null otherwise.
	// Restored element type: index() adds unique-field String values to it.
	private final Collection<String> idsCallback;

	/** Shared context (client, language) used when mapping JSON to fields. */
	private final FieldMapContext fieldMapContext;
/**
 * Per-crawl working state: the HTTP downloader, the compiled JSONPath
 * selecting documents from the REST response, the field mapping, and the
 * buffer of documents awaiting indexation.
 */
private static class RestCrawlContext {

	private final HttpDownloader downloader;

	// Documents accumulate here until bufferSize is reached, then are
	// flushed by index(). Element type restored: index() iterates this
	// list as IndexDocument instances.
	private final List<IndexDocument> indexDocumentList;

	private final RestFieldMap restFieldMap;

	// Number of documents to accumulate before each index update.
	private final int bufferSize;

	// Compiled JSONPath locating the document(s) inside the response body.
	private final JsonPath jsonPath;

	private RestCrawlContext(HttpDownloader downloader, RestCrawlItem restCrawlItem) throws SearchLibException {
		this.downloader = downloader;
		jsonPath = JsonPath.compile(restCrawlItem.getPathDocument());
		restFieldMap = restCrawlItem.getFieldMap();
		bufferSize = restCrawlItem.getBufferSize();
		indexDocumentList = new ArrayList<IndexDocument>(bufferSize);
	}
}
/**
 * Builds a crawl thread over a private copy of the crawl item, with the
 * given variables substituted into it.
 *
 * @param client       index client the documents are pushed to
 * @param crawlMaster  master managing this thread
 * @param restCrawlItem crawl configuration; duplicated, never mutated
 * @param variables    variables substituted into the duplicated item
 * @param infoCallback optional progress callback; when it is a
 *                     CommonListResult, indexed primary keys are collected
 *                     into its items collection
 * @throws SearchLibException on configuration or index schema errors
 */
@SuppressWarnings("unchecked")
public RestCrawlThread(Client client, RestCrawlMaster crawlMaster, RestCrawlItem restCrawlItem, Variables variables,
		InfoCallback infoCallback) throws SearchLibException {
	super(client, crawlMaster, restCrawlItem, infoCallback);
	this.restCrawlItem = restCrawlItem.duplicate();
	this.restCrawlItem.apply(variables);
	this.client = client;
	pendingIndexDocumentCount = new AtomicLong();
	updatedIndexDocumentCount = new AtomicLong();
	fieldMapContext = new FieldMapContext(client, restCrawlItem.getLang());
	// instanceof is null-safe, so no separate null check is needed.
	// The cast is unchecked by nature (erasure) and covered by the
	// @SuppressWarnings above.
	this.idsCallback = infoCallback instanceof CommonListResult<?>
			? ((CommonListResult<String>) infoCallback).items
			: null;
}
/**
 * @return the updated-document count followed by the pending count in
 *         parentheses, e.g. {@code "42 (7)"}
 */
public String getCountInfo() {
	return getUpdatedIndexDocumentCount() + " (" + getPendingIndexDocumentCount() + ")";
}
/**
 * @return the number of documents buffered but not yet committed to the
 *         index
 */
public final long getPendingIndexDocumentCount() {
	return pendingIndexDocumentCount.get();
}
/**
 * @return the number of documents successfully committed to the index
 */
public final long getUpdatedIndexDocumentCount() {
	return updatedIndexDocumentCount.get();
}
/**
 * @return this thread's private copy of the crawl configuration
 */
public RestCrawlItem getRestCrawlItem() {
	return restCrawlItem;
}
@Override
protected String getCurrentInfo() {
	// No per-item progress detail is exposed; counts are reported via
	// getCountInfo() instead.
	return "";
}
/**
 * Fires one callback request: rebuilds the URI with the given query string
 * (user-info dropped) and issues it with the configured callback HTTP
 * method and credentials. Any HTTP status outside {200, 201, 202, 203}
 * is reported as an error.
 */
private void callback(HttpDownloader downloader, URI uri, String query)
		throws URISyntaxException, ClientProtocolException, IllegalStateException, IOException, SearchLibException {
	final URI callbackUri =
			new URI(uri.getScheme(), null, uri.getHost(), uri.getPort(), uri.getPath(), query, uri.getFragment());
	final DownloadItem downloadItem = downloader.request(callbackUri, restCrawlItem.getCallbackMethod(),
			restCrawlItem.getCredential(), null, null, null);
	downloadItem.checkNoErrorList(200, 201, 202, 203);
}
/**
 * Invokes the callback for a single document, appending
 * {@code queryPrefix=key} (or just {@code queryPrefix} when the key is
 * empty) to the URI's existing query string.
 */
private final void callbackPerDoc(HttpDownloader downloader, URI uri, String queryPrefix, String key)
		throws ClientProtocolException, IllegalStateException, IOException, URISyntaxException, SearchLibException {
	final StringBuilder newQuery = new StringBuilder();
	final String existingQuery = uri.getQuery();
	if (existingQuery != null)
		newQuery.append(existingQuery);
	if (!StringUtils.isEmpty(queryPrefix)) {
		// Separate from any pre-existing query parameters.
		if (newQuery.length() > 0)
			newQuery.append('&');
		newQuery.append(queryPrefix);
		if (!StringUtils.isEmpty(key))
			newQuery.append('=').append(key);
	}
	callback(downloader, uri, newQuery.toString());
}
/**
 * Invokes the callback once for all documents, appending one
 * {@code queryPrefix=primaryKey} pair per document to the URI's existing
 * query string.
 *
 * @param pkList primary keys of the indexed documents; may be null, in
 *               which case only the original query string is sent
 */
private final void callbackAllDocs(HttpDownloader downloader, URI uri, String queryPrefix, List<String> pkList)
		throws ClientProtocolException, IllegalStateException, IOException, URISyntaxException, SearchLibException {
	StringBuilder queryString = new StringBuilder();
	String query = uri.getQuery();
	if (query != null)
		queryString.append(query);
	if (!StringUtils.isEmpty(queryPrefix) && pkList != null) {
		for (String key : pkList) {
			if (queryString.length() != 0)
				queryString.append('&');
			queryString.append(queryPrefix);
			queryString.append('=');
			queryString.append(key);
		}
	}
	callback(downloader, uri, queryString.toString());
}
/**
 * Performs the configured post-indexation callback, if any.
 *
 * @param downloader HTTP client used for the callback request(s)
 * @param pkList     primary keys of the documents just indexed; may be
 *                   null when the schema has no unique field
 */
private final void doCallBack(HttpDownloader downloader, List<String> pkList)
		throws ClientProtocolException, IllegalStateException, IOException, URISyntaxException, SearchLibException {
	CallbackMode mode = restCrawlItem.getCallbackMode();
	if (mode == CallbackMode.NO_CALL)
		return;
	String url = restCrawlItem.getCallbackUrl();
	String queryParameter = restCrawlItem.getCallbackQueryParameter();
	URI uri = new URI(url);
	switch (mode) {
	case ONE_CALL_PER_DOCUMENT:
		// One HTTP request per indexed document.
		if (pkList != null)
			for (String key : pkList)
				callbackPerDoc(downloader, uri, queryParameter, key);
		break;
	case ONE_CALL_FOR_ALL_DOCUMENTS:
		// A single HTTP request carrying every primary key.
		callbackAllDocs(downloader, uri, queryParameter, pkList);
		break;
	default:
		break;
	}
}
/**
 * Flushes the buffered documents to the index once the buffer has reached
 * the given limit, then performs the configured callback and updates the
 * pending/updated counters.
 *
 * @param limit minimum number of buffered documents required to trigger
 *              the flush (callers pass the buffer size, or a low value to
 *              force a final flush)
 * @return true when an index update was performed, false when the buffer
 *         was empty or below the limit
 */
private final boolean index(RestCrawlContext context, int limit)
		throws NoSuchAlgorithmException, IOException, URISyntaxException, SearchLibException,
		InstantiationException, IllegalAccessException, ClassNotFoundException {
	final int documentCount = context.indexDocumentList.size();
	if (documentCount == 0 || documentCount < limit)
		return false;
	setStatus(CrawlStatus.INDEXATION);
	client.updateDocuments(context.indexDocumentList);
	// Collect the unique-field value of each document as its primary key,
	// for the callback and the optional idsCallback collection.
	SchemaField uniqueField = client.getSchema().getFieldList().getUniqueField();
	List<String> pkList = null;
	if (uniqueField != null) {
		pkList = new ArrayList<String>(documentCount);
		String fieldName = uniqueField.getName();
		for (IndexDocument indexDocument : context.indexDocumentList)
			pkList.add(indexDocument.getFieldValueString(fieldName, 0));
		if (idsCallback != null)
			idsCallback.addAll(pkList);
	}
	doCallBack(context.downloader, pkList);
	pendingIndexDocumentCount.addAndGet(-documentCount);
	updatedIndexDocumentCount.addAndGet(documentCount);
	context.indexDocumentList.clear();
	if (infoCallback != null)
		infoCallback.setInfo(updatedIndexDocumentCount + " document(s) indexed");
	return true;
}
/**
 * Maps one JSON document to an IndexDocument, buffers it, and flushes the
 * buffer through index() when it is full.
 */
private void runDocument(RestCrawlContext context, Object document) throws Exception {
	setStatus(CrawlStatus.CRAWL);
	final IndexDocument indexDocument = new IndexDocument(fieldMapContext.lang);
	context.restFieldMap.mapJson(fieldMapContext, document, indexDocument);
	context.indexDocumentList.add(indexDocument);
	pendingIndexDocumentCount.incrementAndGet();
	// index() switches the status to INDEXATION while flushing; put it
	// back to CRAWL afterwards.
	if (index(context, context.bufferSize))
		setStatus(CrawlStatus.CRAWL);
}
private int runDocumentList(RestCrawlContext context, Object jsonDoc) throws Exception {
if (jsonDoc == null)
return 0;
if (jsonDoc instanceof Map, ?>) {
runDocument(context, jsonDoc);
return 1;
}
if (jsonDoc instanceof List>) {
List> documents = (List>) jsonDoc;
for (Object document : documents)
runDocument(context, document);
return documents.size();
}
return 0;
}
private int runDownload(RestCrawlContext context, URI uri) throws Exception {
DownloadItem dlItem = context.downloader.request(uri, restCrawlItem.getMethod(), restCrawlItem.getCredential(),
null, null, null);
try {
List