com.jaeksoft.searchlib.scheduler.task.TaskFtpXmlFeed Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of opensearchserver Show documentation
Show all versions of opensearchserver Show documentation
OpenSearchServer is a powerful, enterprise-class search engine. Using the web user interface, the crawlers (web, file, database, ...) and the REST/RESTful API, you can quickly and easily integrate advanced full-text search capabilities into your application. OpenSearchServer runs on Windows and Linux/Unix/BSD.
The newest version!
/**
* License Agreement for OpenSearchServer
*
* Copyright (C) 2012-2013 Emmanuel Keller / Jaeksoft
*
* http://www.open-search-server.com
*
* This file is part of OpenSearchServer.
*
* OpenSearchServer is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* OpenSearchServer is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with OpenSearchServer.
* If not, see <http://www.gnu.org/licenses/>.
**/
package com.jaeksoft.searchlib.scheduler.task;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URISyntaxException;
import java.security.NoSuchAlgorithmException;
import java.util.Arrays;
import java.util.regex.Pattern;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.stream.StreamSource;
import javax.xml.xpath.XPathExpressionException;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.net.ftp.FTPClient;
import org.apache.commons.net.ftp.FTPConnectionClosedException;
import org.apache.commons.net.ftp.FTPFile;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;
import com.jaeksoft.searchlib.Client;
import com.jaeksoft.searchlib.Logging;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.analysis.ClassPropertyEnum;
import com.jaeksoft.searchlib.config.Config;
import com.jaeksoft.searchlib.crawler.file.process.fileInstances.FtpFileInstance;
import com.jaeksoft.searchlib.crawler.web.spider.HttpDownloader;
import com.jaeksoft.searchlib.scheduler.TaskAbstract;
import com.jaeksoft.searchlib.scheduler.TaskLog;
import com.jaeksoft.searchlib.scheduler.TaskProperties;
import com.jaeksoft.searchlib.scheduler.TaskPropertyDef;
import com.jaeksoft.searchlib.scheduler.TaskPropertyType;
import com.jaeksoft.searchlib.util.DomUtils;
import com.jaeksoft.searchlib.util.IOUtils;
import com.jaeksoft.searchlib.util.Variables;
/**
 * Scheduler task that downloads XML files from an FTP server and feeds them
 * into the index.
 * <p>
 * The files found in the configured remote path are processed in ascending
 * file name order. Each file is optionally filtered by a regular expression,
 * optionally transformed by an XSL stylesheet, parsed, and then passed to
 * {@code Client.updateXmlDocuments} and {@code Client.deleteXmlDocuments}.
 * Depending on the configuration, the index is truncated before the first
 * file is loaded and each remote file is deleted once it has been loaded.
 */
public class TaskFtpXmlFeed extends TaskAbstract {

	/** Number of documents per transaction when no buffer size is configured. */
	private static final int DEFAULT_BUFFER_SIZE = 50;

	private final TaskPropertyDef propServer = new TaskPropertyDef(TaskPropertyType.textBox, "FTP server",
			"FTP server (hostname)", "The hostname of the FTP server", 100);

	private final TaskPropertyDef propPath = new TaskPropertyDef(TaskPropertyType.textBox, "Path", "Path",
			"The remote path", 100);

	private final TaskPropertyDef propLogin = new TaskPropertyDef(TaskPropertyType.textBox, "Login", "Login",
			"The username on the FTP server", 50);

	private final TaskPropertyDef propPassword = new TaskPropertyDef(TaskPropertyType.password, "Password",
			"Password", "The password on the FTP server", 50);

	private final TaskPropertyDef propFileNamePattern = new TaskPropertyDef(TaskPropertyType.textBox,
			"File name pattern", "File name pattern", "A regular expression to filter which files will be handled",
			50);

	private final TaskPropertyDef propXsl = new TaskPropertyDef(TaskPropertyType.multilineTextBox, "XSL", "XSL",
			"An optional XSL stylesheet", 100, 30);

	private final TaskPropertyDef propDeleteAfterLoad = new TaskPropertyDef(TaskPropertyType.listBox,
			"Delete after load", "Delete after load", "Decide if the document will be deleted after being loaded",
			10);

	private final TaskPropertyDef propTruncateIndexWhenFilesFound = new TaskPropertyDef(TaskPropertyType.listBox,
			"Truncate index when files are found", "Truncate index when files are found",
			"Decide to truncate the index before loading the XML file", 10);

	private final TaskPropertyDef propBuffersize = new TaskPropertyDef(TaskPropertyType.textBox, "Buffer size",
			"Buffer size", "How many documents will be write to the index in each transaction", 10);

	private final TaskPropertyDef[] taskPropertyDefs = { propServer, propPath, propLogin, propPassword,
			propFileNamePattern, propXsl, propDeleteAfterLoad, propTruncateIndexWhenFilesFound, propBuffersize };

	@Override
	public String getName() {
		return "FTP XML feed ";
	}

	@Override
	public TaskPropertyDef[] getPropertyList() {
		return taskPropertyDefs;
	}

	/**
	 * Returns the selectable values of a list-box property.
	 *
	 * @return the boolean value list for the two list-box properties of this
	 *         task, {@code null} for every other property
	 */
	@Override
	public String[] getPropertyValues(Config config, TaskPropertyDef propertyDef, TaskProperties taskProperties)
			throws SearchLibException {
		if (propertyDef == propDeleteAfterLoad || propertyDef == propTruncateIndexWhenFilesFound)
			return ClassPropertyEnum.BOOLEAN_LIST;
		return null;
	}

	/**
	 * Returns the default value of a property.
	 *
	 * @return {@code "/"} for the path, the default buffer size,
	 *         {@code "false"} for the two boolean properties, {@code null} for
	 *         every other property
	 */
	@Override
	public String getDefaultValue(Config config, TaskPropertyDef propertyDef) {
		if (propertyDef == propPath)
			return "/";
		if (propertyDef == propBuffersize)
			return Integer.toString(DEFAULT_BUFFER_SIZE);
		if (propertyDef == propDeleteAfterLoad || propertyDef == propTruncateIndexWhenFilesFound)
			return Boolean.FALSE.toString();
		return null;
	}

	/**
	 * Ensures that the FTP client holds a live, authenticated session.
	 * <p>
	 * If the client is connected and answers a NOOP, nothing is done.
	 * Otherwise any stale connection is dropped and a new session is opened.
	 *
	 * @param ftp
	 *            the FTP client to check
	 * @param server
	 *            the hostname of the FTP server
	 * @param login
	 *            the username
	 * @param password
	 *            the password
	 * @throws IOException
	 *             if the connection cannot be established or the server
	 *             rejects the credentials
	 */
	private void checkConnect(FTPClient ftp, String server, String login, String password) throws IOException {
		try {
			if (ftp.isConnected() && ftp.sendNoOp())
				return;
		} catch (IOException e) {
			// The control connection is gone (this includes
			// FTPConnectionClosedException): log it and reconnect below.
			Logging.warn(e);
		}
		// Release the previous socket, otherwise connect() would leak it.
		if (ftp.isConnected()) {
			try {
				ftp.disconnect();
			} catch (IOException e) {
				Logging.warn(e);
			}
		}
		ftp.setConnectTimeout(120000);
		ftp.setControlKeepAliveTimeout(180);
		ftp.setDataTimeout(120000);
		ftp.connect(server);
		// login() reports a refusal through its return value, not an exception.
		if (!ftp.login(login, password))
			throw new IOException("FTP login failed on " + server + ": " + ftp.getReplyString());
	}

	/**
	 * Parses the configured buffer size.
	 *
	 * @param value
	 *            the raw property value, may be {@code null} or empty
	 * @return the parsed value, or the default buffer size when no value is
	 *         set
	 * @throws SearchLibException
	 *             if the value is not a valid integer
	 */
	private static int parseBufferSize(String value) throws SearchLibException {
		if (value == null || value.length() == 0)
			return DEFAULT_BUFFER_SIZE;
		try {
			return Integer.parseInt(value.trim());
		} catch (NumberFormatException e) {
			throw new SearchLibException("Invalid buffer size: " + value);
		}
	}

	/**
	 * Compiles the optional file name filter.
	 *
	 * @param regex
	 *            the raw property value, may be {@code null} or empty
	 * @return the compiled pattern, or {@code null} when no filter is set
	 * @throws SearchLibException
	 *             if the regular expression is not valid
	 */
	private static Pattern compileFileNamePattern(String regex) throws SearchLibException {
		if (regex == null || regex.length() == 0)
			return null;
		try {
			return Pattern.compile(regex);
		} catch (IllegalArgumentException e) {
			// PatternSyntaxException is an IllegalArgumentException
			throw new SearchLibException(e);
		}
	}

	/**
	 * Connects to the FTP server, then loads every matching XML file of the
	 * remote path into the index.
	 *
	 * @param client
	 *            the index client receiving the documents
	 * @param properties
	 *            the values of the task properties
	 * @param variables
	 *            not used by this task
	 * @param taskLog
	 *            receives the progress information
	 * @throws SearchLibException
	 *             if the configuration is invalid, or if any FTP, XML or
	 *             indexing step fails
	 */
	@Override
	public void execute(Client client, TaskProperties properties, Variables variables, TaskLog taskLog)
			throws SearchLibException, IOException {
		String server = properties.getValue(propServer);
		String path = properties.getValue(propPath);
		String login = properties.getValue(propLogin);
		String password = properties.getValue(propPassword);
		boolean deleteAfterLoad = Boolean.TRUE.toString().equals(properties.getValue(propDeleteAfterLoad));
		boolean truncateWhenFilesFound = Boolean.TRUE.toString()
				.equals(properties.getValue(propTruncateIndexWhenFilesFound));
		Pattern pattern = compileFileNamePattern(properties.getValue(propFileNamePattern));
		int bufferSize = parseBufferSize(properties.getValue(propBuffersize));
		String xsl = properties.getValue(propXsl);
		boolean useXsl = xsl != null && xsl.length() > 0;

		File xmlTempResult = null;
		InputStream inputStream = null;
		FTPClient ftp = new FTPClient();
		HttpDownloader httpDownloader = client.getWebCrawlMaster().getNewHttpDownloader(true);
		try {
			// FTP Connection
			checkConnect(ftp, server, login, password);
			FTPFile[] files = ftp.listFiles(path, new FtpFileInstance.FtpInstanceFileFilter(true, false, null));
			if (files == null)
				return;

			// Sort by ascending file name
			String[] fileNames = new String[files.length];
			for (int i = 0; i < files.length; i++)
				fileNames[i] = files[i].getName();
			Arrays.sort(fileNames);

			int ignored = 0;
			int loaded = 0;
			boolean alreadyTruncated = false;
			for (String fileName : fileNames) {
				if (pattern != null && !pattern.matcher(fileName).find()) {
					ignored++;
					continue;
				}
				// concat() uses the platform separator; FTP paths always use '/'
				String filePathName = FilenameUtils.separatorsToUnix(FilenameUtils.concat(path, fileName));
				taskLog.setInfo("Working on: " + filePathName);

				inputStream = ftp.retrieveFileStream(filePathName);
				// retrieveFileStream() returns null when the transfer cannot
				// be started
				if (inputStream == null)
					throw new SearchLibException(
							"FTP Error: unable to retrieve " + filePathName + " - " + ftp.getReplyString());

				Node xmlDoc;
				if (useXsl) {
					xmlTempResult = File.createTempFile("ossftpfeed", ".xml");
					DomUtils.xslt(new StreamSource(inputStream), xsl, xmlTempResult);
					xmlDoc = DomUtils.readXml(new StreamSource(xmlTempResult), false);
				} else
					xmlDoc = DomUtils.readXml(new StreamSource(inputStream), false);

				// The document is now in memory: finish the transfer before
				// the (possibly long) indexing, so the pending command is not
				// left open while the control connection may time out.
				inputStream.close();
				inputStream = null;
				if (!ftp.completePendingCommand())
					throw new SearchLibException("FTP Error");
				if (xmlTempResult != null) {
					xmlTempResult.delete();
					xmlTempResult = null;
				}

				// Truncate only once a file has been successfully retrieved
				// and parsed, so a failing download does not empty the index.
				if (truncateWhenFilesFound && !alreadyTruncated) {
					client.deleteAll();
					alreadyTruncated = true;
				}
				client.updateXmlDocuments(xmlDoc, bufferSize, null, httpDownloader, taskLog);
				client.deleteXmlDocuments(xmlDoc, bufferSize, taskLog);

				// The session may have expired during the indexing
				checkConnect(ftp, server, login, password);
				if (deleteAfterLoad && !ftp.deleteFile(filePathName))
					// Best effort: the file is loaded, only report the failure
					Logging.warn(
							new IOException("Unable to delete " + filePathName + ": " + ftp.getReplyString()));
				loaded++;
			}
			taskLog.setInfo(loaded + " file(s) loaded - " + ignored + " file(s) ignored");
		} catch (XPathExpressionException e) {
			throw new SearchLibException(e);
		} catch (NoSuchAlgorithmException e) {
			throw new SearchLibException(e);
		} catch (ParserConfigurationException e) {
			throw new SearchLibException(e);
		} catch (SAXException e) {
			throw new SearchLibException(e);
		} catch (IOException e) {
			throw new SearchLibException(e);
		} catch (URISyntaxException e) {
			throw new SearchLibException(e);
		} catch (InstantiationException e) {
			throw new SearchLibException(e);
		} catch (IllegalAccessException e) {
			throw new SearchLibException(e);
		} catch (ClassNotFoundException e) {
			throw new SearchLibException(e);
		} catch (TransformerException e) {
			throw new SearchLibException(e);
		} finally {
			if (xmlTempResult != null)
				xmlTempResult.delete();
			IOUtils.close(inputStream);
			try {
				if (ftp.isConnected())
					ftp.disconnect();
			} catch (IOException e) {
				Logging.warn(e);
			}
			if (httpDownloader != null)
				httpDownloader.release();
		}
	}
}