
// com.jaeksoft.searchlib.scheduler.task.TaskFtpXmlFeed Maven / Gradle / Ivy
/**
* License Agreement for OpenSearchServer
*
* Copyright (C) 2012-2013 Emmanuel Keller / Jaeksoft
*
* http://www.open-search-server.com
*
* This file is part of OpenSearchServer.
*
* OpenSearchServer is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* OpenSearchServer is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with OpenSearchServer.
* If not, see <http://www.gnu.org/licenses/>.
**/
package com.jaeksoft.searchlib.scheduler.task;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URISyntaxException;
import java.security.NoSuchAlgorithmException;
import java.util.Arrays;
import java.util.regex.Pattern;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.stream.StreamSource;
import javax.xml.xpath.XPathExpressionException;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.net.ftp.FTPClient;
import org.apache.commons.net.ftp.FTPConnectionClosedException;
import org.apache.commons.net.ftp.FTPFile;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;
import com.jaeksoft.searchlib.Client;
import com.jaeksoft.searchlib.Logging;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.analysis.ClassPropertyEnum;
import com.jaeksoft.searchlib.config.Config;
import com.jaeksoft.searchlib.crawler.file.process.fileInstances.FtpFileInstance;
import com.jaeksoft.searchlib.crawler.web.spider.HttpDownloader;
import com.jaeksoft.searchlib.scheduler.TaskAbstract;
import com.jaeksoft.searchlib.scheduler.TaskLog;
import com.jaeksoft.searchlib.scheduler.TaskProperties;
import com.jaeksoft.searchlib.scheduler.TaskPropertyDef;
import com.jaeksoft.searchlib.scheduler.TaskPropertyType;
import com.jaeksoft.searchlib.util.DomUtils;
import com.jaeksoft.searchlib.util.IOUtils;
import com.jaeksoft.searchlib.util.Variables;
/**
 * Scheduler task that lists the files of a remote FTP directory, downloads
 * each matching file, optionally transforms it with an XSL stylesheet, and
 * passes the resulting XML document to the client for update and deletion
 * processing.
 * <p>
 * Files are handled in ascending file name order. Depending on the task
 * properties, the index can be truncated once before the first matching file
 * is loaded, and each file can be deleted from the FTP server after it has
 * been loaded.
 */
public class TaskFtpXmlFeed extends TaskAbstract {

    /** Number of documents per transaction when no buffer size is set. */
    private static final int DEFAULT_BUFFER_SIZE = 50;

    final private TaskPropertyDef propServer = new TaskPropertyDef(
            TaskPropertyType.textBox, "FTP server", "FTP server (hostname)",
            "The hostname of the FTP server", 100);

    final private TaskPropertyDef propPath = new TaskPropertyDef(
            TaskPropertyType.textBox, "Path", "Path", "The remote path", 100);

    final private TaskPropertyDef propLogin = new TaskPropertyDef(
            TaskPropertyType.textBox, "Login", "Login",
            "The username on the FTP server", 50);

    final private TaskPropertyDef propPassword = new TaskPropertyDef(
            TaskPropertyType.password, "Password", "Password",
            "The password on the FTP server", 50);

    final private TaskPropertyDef propFileNamePattern = new TaskPropertyDef(
            TaskPropertyType.textBox, "File name pattern", "File name pattern",
            "A regular expression to filter which files will be handled", 50);

    final private TaskPropertyDef propXsl = new TaskPropertyDef(
            TaskPropertyType.multilineTextBox, "XSL", "XSL",
            "An optional XSL stylesheet", 100, 30);

    final private TaskPropertyDef propDeleteAfterLoad = new TaskPropertyDef(
            TaskPropertyType.listBox, "Delete after load", "Delete after load",
            "Decide if the document will be deleted after being loaded", 10);

    final private TaskPropertyDef propTruncateIndexWhenFilesFound = new TaskPropertyDef(
            TaskPropertyType.listBox, "Truncate index when files are found",
            "Truncate index when files are found",
            "Decide to truncate the index before loading the XML file", 10);

    final private TaskPropertyDef propBuffersize = new TaskPropertyDef(
            TaskPropertyType.textBox,
            "Buffer size",
            "Buffer size",
            "How many documents will be write to the index in each transaction",
            10);

    /** All the properties exposed by this task, in display order. */
    final private TaskPropertyDef[] taskPropertyDefs = { propServer, propPath,
            propLogin, propPassword, propFileNamePattern, propXsl,
            propDeleteAfterLoad, propTruncateIndexWhenFilesFound,
            propBuffersize };

    /**
     * @return the display name of this task
     */
    @Override
    public String getName() {
        return "FTP XML feed ";
    }

    /**
     * @return the property definitions of this task
     */
    @Override
    public TaskPropertyDef[] getPropertyList() {
        return taskPropertyDefs;
    }

    /**
     * Returns the selectable values of a list box property.
     *
     * @return the boolean value list for the "delete after load" and
     *         "truncate index" properties, {@code null} for any other
     *         property
     */
    @Override
    public String[] getPropertyValues(Config config,
            TaskPropertyDef propertyDef, TaskProperties taskProperties)
            throws SearchLibException {
        if (propertyDef == propDeleteAfterLoad
                || propertyDef == propTruncateIndexWhenFilesFound) {
            return ClassPropertyEnum.BOOLEAN_LIST;
        }
        return null;
    }

    /**
     * Returns the default value of a property.
     *
     * @return {@code "/"} for the path, {@code "50"} for the buffer size,
     *         {@code "false"} for both boolean properties, {@code null} for
     *         any other property
     */
    @Override
    public String getDefaultValue(Config config, TaskPropertyDef propertyDef) {
        if (propertyDef == propPath) {
            return "/";
        }
        if (propertyDef == propBuffersize) {
            return Integer.toString(DEFAULT_BUFFER_SIZE);
        }
        if (propertyDef == propDeleteAfterLoad
                || propertyDef == propTruncateIndexWhenFilesFound) {
            return Boolean.FALSE.toString();
        }
        return null;
    }

    /**
     * Ensures that the FTP client is connected and logged in.
     * <p>
     * If the client is connected and answers a NOOP command, nothing is
     * done. Otherwise any stale connection is closed and a new connection is
     * opened and authenticated.
     *
     * @param ftp
     *            the FTP client to check
     * @param server
     *            the hostname of the FTP server
     * @param login
     *            the username
     * @param password
     *            the password
     * @throws IOException
     *             if the connection fails or if the server rejects the login
     */
    private void checkConnect(FTPClient ftp, String server, String login,
            String password) throws IOException {
        try {
            if (ftp.isConnected() && ftp.sendNoOp()) {
                return;
            }
        } catch (FTPConnectionClosedException e) {
            Logging.warn(e);
        }
        // The client may still hold a socket (NOOP refused, or the server
        // closed its side): release it before connecting again so it does not
        // leak.
        if (ftp.isConnected()) {
            try {
                ftp.disconnect();
            } catch (IOException e) {
                Logging.warn(e);
            }
        }
        ftp.setConnectTimeout(120000);
        ftp.setControlKeepAliveTimeout(180);
        ftp.setDataTimeout(120000);
        ftp.connect(server);
        // login() reports a refused authentication by returning false.
        if (!ftp.login(login, password)) {
            throw new IOException("FTP login failed for user " + login
                    + " on " + server + ": "
                    + String.valueOf(ftp.getReplyString()).trim());
        }
    }

    /**
     * Compiles the optional file name filter.
     *
     * @param fileNamePattern
     *            the regular expression, may be {@code null} or empty
     * @return the compiled pattern, or {@code null} if no filter is set
     * @throws SearchLibException
     *             if the regular expression is not valid
     */
    private Pattern compilePattern(String fileNamePattern)
            throws SearchLibException {
        if (fileNamePattern == null || fileNamePattern.length() == 0) {
            return null;
        }
        try {
            return Pattern.compile(fileNamePattern);
        } catch (IllegalArgumentException e) {
            // PatternSyntaxException is an IllegalArgumentException
            throw new SearchLibException(e);
        }
    }

    /**
     * Parses the buffer size property.
     *
     * @param value
     *            the property value, may be {@code null} or empty
     * @return the parsed value, or the default buffer size if none is set
     * @throws SearchLibException
     *             if the value is not an integer
     */
    private int parseBufferSize(String value) throws SearchLibException {
        if (value == null || value.length() == 0) {
            return DEFAULT_BUFFER_SIZE;
        }
        try {
            return Integer.parseInt(value);
        } catch (NumberFormatException e) {
            throw new SearchLibException(e);
        }
    }

    /**
     * Extracts the names of the given files, sorted in ascending order.
     */
    private String[] sortedFileNames(FTPFile[] files) {
        String[] fileNames = new String[files.length];
        for (int i = 0; i < files.length; i++) {
            fileNames[i] = files[i].getName();
        }
        Arrays.sort(fileNames);
        return fileNames;
    }

    /**
     * Loads the XML files found in the remote directory.
     * <p>
     * For each file whose name matches the optional pattern: the index is
     * truncated first if requested (only once per execution), the file is
     * downloaded, transformed by the XSL stylesheet if one is set, then
     * passed to {@code updateXmlDocuments} and {@code deleteXmlDocuments}.
     * The remote file is deleted afterwards if requested.
     *
     * @param client
     *            the client receiving the documents
     * @param properties
     *            the values of the task properties
     * @param variables
     *            not used by this task
     * @param taskLog
     *            receives the progress information
     * @throws SearchLibException
     *             if a property is invalid, or if the FTP transfer, the XML
     *             processing or the indexing fails
     */
    @Override
    public void execute(Client client, TaskProperties properties,
            Variables variables, TaskLog taskLog) throws SearchLibException {
        String server = properties.getValue(propServer);
        String path = properties.getValue(propPath);
        String login = properties.getValue(propLogin);
        String password = properties.getValue(propPassword);
        boolean deleteAfterLoad = Boolean.TRUE.toString().equals(
                properties.getValue(propDeleteAfterLoad));
        boolean truncateWhenFilesFound = Boolean.TRUE.toString().equals(
                properties.getValue(propTruncateIndexWhenFilesFound));
        String xsl = properties.getValue(propXsl);
        boolean useXsl = xsl != null && xsl.length() > 0;

        // Validate the user supplied values before acquiring any resource.
        Pattern pattern = compilePattern(properties
                .getValue(propFileNamePattern));
        int bufferSize = parseBufferSize(properties.getValue(propBuffersize));

        File xmlTempResult = null;
        HttpDownloader httpDownloader = client.getWebCrawlMaster()
                .getNewHttpDownloader(true);
        FTPClient ftp = null;
        InputStream inputStream = null;
        try {
            // FTP Connection
            ftp = new FTPClient();
            checkConnect(ftp, server, login, password);
            FTPFile[] files = ftp
                    .listFiles(path, new FtpFileInstance.FtpInstanceFileFilter(
                            true, false, null));
            if (files == null) {
                return;
            }

            int ignored = 0;
            int loaded = 0;
            boolean alreadyTruncated = false;
            for (String fileName : sortedFileNames(files)) {
                if (pattern != null && !pattern.matcher(fileName).find()) {
                    ignored++;
                    continue;
                }
                // FTP paths use '/' whatever the local platform is, while
                // concat() applies the local separator.
                String filePathName = FilenameUtils
                        .separatorsToUnix(FilenameUtils.concat(path, fileName));
                if (truncateWhenFilesFound && !alreadyTruncated) {
                    client.deleteAll();
                    alreadyTruncated = true;
                }
                taskLog.setInfo("Working on: " + filePathName);

                inputStream = ftp.retrieveFileStream(filePathName);
                // A null stream means the server refused the transfer.
                if (inputStream == null) {
                    throw new SearchLibException("FTP Error: unable to retrieve "
                            + filePathName + " - "
                            + String.valueOf(ftp.getReplyString()).trim());
                }

                Node xmlDoc;
                if (useXsl) {
                    xmlTempResult = File.createTempFile("ossftpfeed", ".xml");
                    DomUtils.xslt(new StreamSource(inputStream), xsl,
                            xmlTempResult);
                    xmlDoc = DomUtils.readXml(new StreamSource(xmlTempResult),
                            false);
                } else {
                    xmlDoc = DomUtils.readXml(new StreamSource(inputStream),
                            false);
                }
                client.updateXmlDocuments(xmlDoc, bufferSize, null,
                        httpDownloader, taskLog);
                client.deleteXmlDocuments(xmlDoc, bufferSize, taskLog);

                // The stream must be closed before the transfer is finalized.
                inputStream.close();
                inputStream = null;
                if (!ftp.completePendingCommand()) {
                    throw new SearchLibException("FTP Error");
                }
                if (xmlTempResult != null) {
                    xmlTempResult.delete();
                    xmlTempResult = null;
                }

                // Indexing may have taken a while: make sure the connection
                // is still usable before going on.
                checkConnect(ftp, server, login, password);
                if (deleteAfterLoad) {
                    ftp.deleteFile(filePathName);
                }
                loaded++;
            }
            taskLog.setInfo(loaded + " file(s) loaded - " + ignored
                    + " file(s) ignored");
        } catch (XPathExpressionException e) {
            throw new SearchLibException(e);
        } catch (NoSuchAlgorithmException e) {
            throw new SearchLibException(e);
        } catch (ParserConfigurationException e) {
            throw new SearchLibException(e);
        } catch (SAXException e) {
            throw new SearchLibException(e);
        } catch (IOException e) {
            throw new SearchLibException(e);
        } catch (URISyntaxException e) {
            throw new SearchLibException(e);
        } catch (InstantiationException e) {
            throw new SearchLibException(e);
        } catch (IllegalAccessException e) {
            throw new SearchLibException(e);
        } catch (ClassNotFoundException e) {
            throw new SearchLibException(e);
        } catch (TransformerException e) {
            throw new SearchLibException(e);
        } finally {
            if (xmlTempResult != null) {
                xmlTempResult.delete();
            }
            IOUtils.close(inputStream);
            try {
                if (ftp != null && ftp.isConnected()) {
                    ftp.disconnect();
                }
            } catch (IOException e) {
                Logging.warn(e);
            }
            if (httpDownloader != null) {
                httpDownloader.release();
            }
        }
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy