// org.archive.modules.fetcher.FetchFTP (artifact: heritrix-modules — Maven / Gradle / Ivy)
// (See the heritrix-modules artifact documentation for all published versions.)
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.modules.fetcher;
import static org.archive.modules.CoreAttributeConstants.A_FTP_CONTROL_CONVERSATION;
import static org.archive.modules.CoreAttributeConstants.A_FTP_FETCH_STATUS;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.net.URLEncoder;
import java.net.UnknownHostException;
import java.security.MessageDigest;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.net.SocketFactory;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.net.ftp.FTP;
import org.apache.commons.net.ftp.FTPCommand;
import org.archive.io.RecordingInputStream;
import org.archive.io.ReplayCharSequence;
import org.archive.modules.CrawlURI;
import org.archive.modules.Processor;
import org.archive.modules.extractor.Hop;
import org.archive.modules.extractor.LinkContext;
import org.archive.net.ClientFTP;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.Recorder;
/**
* Fetches documents and directory listings using FTP. This class will also
* try to extract FTP "links" from directory listings. For this class to
* archive a directory listing, the remote FTP server must support the NLIST
* command. Most modern FTP servers should.
*
* @author pjack
*
*/
public class FetchFTP extends Processor {
@SuppressWarnings("unused")
private static final long serialVersionUID = 1L;

/** Logger for this class. */
private static final Logger logger = Logger.getLogger(FetchFTP.class.getName());

/**
 * Pattern for matching directory entries in an NLST listing: every
 * non-empty line is taken to be one file name.
 */
private static final Pattern DIR = Pattern.compile("(.+)$", Pattern.MULTILINE);
/**
 * The username to send to FTP servers. By convention, the default value of
 * "anonymous" is used for publicly available FTP sites.
 */
{
    setUsername("anonymous");
}

/**
 * Returns the username presented to FTP servers.
 *
 * @return the configured FTP username
 */
public String getUsername() {
    Object value = kp.get("username");
    return (String) value;
}

/**
 * Sets the username to present to FTP servers.
 *
 * @param username the FTP username
 */
public void setUsername(String username) {
    kp.put("username", username);
}
/**
 * The password to send to FTP servers. By convention, anonymous users send
 * their email address in this field.
 */
{
    setPassword("password");
}

/**
 * Returns the password presented to FTP servers.
 *
 * @return the configured FTP password
 */
public String getPassword() {
    Object value = kp.get("password");
    return (String) value;
}

/**
 * Sets the password to present to FTP servers.
 *
 * @param pw the FTP password
 */
public void setPassword(String pw) {
    kp.put("password", pw);
}
/**
 * Set to true to extract further URIs from FTP directories. Default is
 * true.
 */
{
    setExtractFromDirs(true);
}

/**
 * Returns whether fetched FTP directory listings are mined for further
 * FTP URIs.
 *
 * @return true if links are extracted from directory listings
 */
public boolean getExtractFromDirs() {
    Object flag = kp.get("extractFromDirs");
    return (Boolean) flag;
}

/**
 * Sets whether fetched FTP directory listings should be mined for further
 * FTP URIs.
 *
 * @param extractFromDirs true to extract links from directory listings
 */
public void setExtractFromDirs(boolean extractFromDirs) {
    kp.put("extractFromDirs", extractFromDirs);
}
/**
 * Set to true to extract the parent URI from all FTP URIs. Default is true.
 */
{
    setExtractParent(true);
}

/**
 * Returns whether the parent directory of each FTP URI is added as a
 * discovered link.
 *
 * @return true if parent URIs are extracted
 */
public boolean getExtractParent() {
    Object flag = kp.get("extractParent");
    return (Boolean) flag;
}

/**
 * Sets whether the parent directory of each FTP URI should be added as a
 * discovered link.
 *
 * @param extractParent true to extract parent URIs
 */
public void setExtractParent(boolean extractParent) {
    kp.put("extractParent", extractParent);
}
/**
 * Whether or not to perform an on-the-fly digest hash of retrieved
 * content-bodies.
 */
{
    setDigestContent(true);
}

/**
 * Returns whether retrieved content-bodies are digested on the fly.
 *
 * @return true if content digesting is enabled
 */
public boolean getDigestContent() {
    Object flag = kp.get("digestContent");
    return (Boolean) flag;
}

/**
 * Sets whether retrieved content-bodies should be digested on the fly.
 *
 * @param digest true to enable content digesting
 */
public void setDigestContent(boolean digest) {
    kp.put("digestContent", digest);
}
/**
 * Which algorithm (for example MD5 or SHA-1) to use to perform an
 * on-the-fly digest hash of retrieved content-bodies.
 */
protected String digestAlgorithm = "sha1";

/**
 * Returns the name of the digest algorithm applied to content-bodies.
 *
 * @return the digest algorithm name (e.g. "sha1")
 */
public String getDigestAlgorithm() {
    return this.digestAlgorithm;
}

/**
 * Sets the digest algorithm applied to content-bodies.
 *
 * @param digestAlgorithm the digest algorithm name
 */
public void setDigestAlgorithm(String digestAlgorithm) {
    this.digestAlgorithm = digestAlgorithm;
}
/**
 * Maximum length in bytes to fetch. Fetch is truncated at this length. A
 * value of 0 means no limit.
 */
{
    setMaxLengthBytes(0L); // no limit
}

/**
 * Returns the maximum number of bytes to download per document.
 *
 * @return the byte limit; 0 means no limit
 */
public long getMaxLengthBytes() {
    return (Long) kp.get("maxLengthBytes");
}

/**
 * Sets the maximum number of bytes to download per document.
 *
 * @param maxLengthBytes the byte limit; 0 disables the limit
 */
public void setMaxLengthBytes(long maxLengthBytes) {
    // parameter renamed from the misleading "timeout" — this is a size, not a time
    kp.put("maxLengthBytes", maxLengthBytes);
}
/**
 * The maximum KB/sec to use when fetching data from a server. The default
 * of 0 means no maximum.
 */
{
    setMaxFetchKBSec(0); // no limit
}

/**
 * Returns the bandwidth cap used when fetching data.
 *
 * @return the maximum rate in KB/sec; 0 means unlimited
 */
public int getMaxFetchKBSec() {
    Object rate = kp.get("maxFetchKBSec");
    return (Integer) rate;
}

/**
 * Sets the bandwidth cap used when fetching data.
 *
 * @param rate the maximum rate in KB/sec; 0 for unlimited
 */
public void setMaxFetchKBSec(int rate) {
    kp.put("maxFetchKBSec", rate);
}
/**
 * If the fetch is not completed in this number of seconds, give up (and
 * retry later).
 */
{
    setTimeoutSeconds(20 * 60); // 20 minutes
}

/**
 * Returns the overall fetch timeout.
 *
 * @return the timeout in seconds
 */
public int getTimeoutSeconds() {
    Object seconds = kp.get("timeoutSeconds");
    return (Integer) seconds;
}

/**
 * Sets the overall fetch timeout.
 *
 * @param timeout the timeout in seconds
 */
public void setTimeoutSeconds(int timeout) {
    kp.put("timeoutSeconds", timeout);
}
/**
 * If the socket is unresponsive for this number of milliseconds, give up.
 * Set to zero for no timeout (not recommended: could hang a thread on an
 * unresponsive server). This timeout is used for timing out socket opens
 * and for timing out each socket read. Make sure this value is
 * &lt; {@link #getTimeoutSeconds()} (in its seconds unit) for optimal
 * configuration: it ensures at least one retry read.
 */
{
setSoTimeoutMs(20*1000); // 20 seconds
}

/**
 * Returns the socket (SO_TIMEOUT/connect) timeout.
 *
 * @return the timeout in milliseconds
 */
public int getSoTimeoutMs() {
return (Integer) kp.get("soTimeoutMs");
}

/**
 * Sets the socket (SO_TIMEOUT/connect) timeout.
 *
 * @param timeout the timeout in milliseconds; 0 means no timeout
 */
public void setSoTimeoutMs(int timeout) {
kp.put("soTimeoutMs",timeout);
}
/**
 * Constructs a new {@code FetchFTP}. All keyed settings receive their
 * defaults via the instance initializers above.
 */
public FetchFTP() {
}
/**
 * Accepts only URIs whose scheme is "ftp".
 *
 * @param curi the candidate URI
 * @return true if this processor should handle the URI
 */
@Override
protected boolean shouldProcess(CrawlURI curi) {
    String scheme = curi.getUURI().getScheme();
    return scheme.equals("ftp");
}
/**
 * Processes the given FTP URI by connecting to the server and attempting
 * a fetch via {@link #fetch}.
 *
 * <p>If the remote CD to the URI's path succeeds, the URI is treated as a
 * directory and its listing is fetched with NLST (and, if configured,
 * mined for further FTP links); otherwise the URI is treated as a file
 * and fetched with RETR. All transfers use binary mode and local passive
 * mode. On IOException the failure is recorded as non-fatal and the fetch
 * status is set to connect-failed. The FTP control conversation is always
 * stored on the curi, and the client is always disconnected.
 *
 * @param curi the curi to process
 * @throws InterruptedException if the thread is interrupted during
 * processing
 */
@Override
protected void innerProcess(CrawlURI curi) throws InterruptedException {
    curi.setFetchBeginTime(System.currentTimeMillis());
    ClientFTP ftpClient = new ClientFTP();
    Recorder rec = curi.getRecorder();
    try {
        if (logger.isLoggable(Level.FINE)) {
            logger.fine("attempting to fetch ftp uri: " + curi);
        }
        fetch(curi, ftpClient, rec);
    } catch (IOException e) {
        if (logger.isLoggable(Level.INFO)) {
            logger.info(curi + ": " + e);
        }
        curi.getNonFatalFailures().add(e);
        curi.setFetchStatus(FetchStatusCodes.S_CONNECT_FAILED);
    } finally {
        disconnect(ftpClient);
        curi.setFetchCompletedTime(System.currentTimeMillis());
        curi.getData().put(A_FTP_CONTROL_CONVERSATION, ftpClient.getControlConversation());
    }
}
/**
 * A {@link SocketFactory} much like javax.net.DefaultSocketFactory,
 * except that the createSocket() methods that open connections honor a
 * configurable connect timeout.
 */
public class SocketFactoryWithTimeout extends SocketFactory {
    // 0 means "no timeout" per Socket.connect(SocketAddress, int)
    protected int connectTimeoutMs = 0;

    /** Returns the connect timeout in milliseconds (0 = no timeout). */
    public int getConnectTimeoutMs() {
        return connectTimeoutMs;
    }

    /** Sets the connect timeout in milliseconds (0 = no timeout). */
    public void setConnectTimeoutMs(int connectTimeoutMs) {
        this.connectTimeoutMs = connectTimeoutMs;
    }

    public Socket createSocket() {
        return new Socket();
    }

    public Socket createSocket(String host, int port) throws IOException,
            UnknownHostException {
        return connectWithTimeout(createSocket(), new InetSocketAddress(host, port));
    }

    public Socket createSocket(InetAddress host, int port)
            throws IOException {
        return connectWithTimeout(createSocket(), new InetSocketAddress(host, port));
    }

    public Socket createSocket(String host, int port,
            InetAddress localHost, int localPort) throws IOException,
            UnknownHostException {
        Socket sock = createSocket();
        sock.bind(new InetSocketAddress(localHost, localPort));
        return connectWithTimeout(sock, new InetSocketAddress(host, port));
    }

    public Socket createSocket(InetAddress address, int port,
            InetAddress localAddress, int localPort) throws IOException {
        Socket sock = createSocket();
        sock.bind(new InetSocketAddress(localAddress, localPort));
        return connectWithTimeout(sock, new InetSocketAddress(address, port));
    }

    /** Connects the socket to the remote address using the configured timeout. */
    private Socket connectWithTimeout(Socket sock, InetSocketAddress remote)
            throws IOException {
        sock.connect(remote, connectTimeoutMs);
        return sock;
    }
}
/** Lazily created in fetch() and reused for subsequent connections. */
protected SocketFactoryWithTimeout socketFactory;
/**
 * Fetches a document from an FTP server.
 *
 * <p>Connects (port 21 when the URI has none), logs in with credentials
 * from {@link #getAuth}, then probes with CWD: success means the URI is a
 * directory (fetched via NLST in ASCII mode), failure means a file
 * (fetched via RETR in binary mode). Transfers use local passive mode.
 * The data is saved to the recorder, optionally digested, and for
 * directories further links may be extracted. Fetch status, content size,
 * content type and the A_FTP_FETCH_STATUS reply line are recorded on the
 * curi; the parent link is added at the end regardless of outcome.
 *
 * @param curi the URI of the document to fetch
 * @param client the FTPClient to use for the fetch
 * @param recorder the recorder to preserve the document in
 * @throws IOException if a network or protocol error occurs
 * @throws InterruptedException if the thread is interrupted
 */
private void fetch(CrawlURI curi, ClientFTP client, Recorder recorder)
        throws IOException, InterruptedException {
    // Connect to the FTP server.
    UURI uuri = curi.getUURI();
    int port = uuri.getPort();
    if (port == -1) {
        port = 21; // default FTP control port
    }
    // The shared factory enforces a connect timeout; the plain
    // setConnectTimeout() below covers clients not using the factory path.
    if (socketFactory == null) {
        socketFactory = new SocketFactoryWithTimeout();
    }
    socketFactory.setConnectTimeoutMs(getSoTimeoutMs());
    client.setSocketFactory(socketFactory);
    client.setConnectTimeout(getSoTimeoutMs());
    client.setDefaultTimeout(getSoTimeoutMs());
    client.setDataTimeout(getSoTimeoutMs());
    client.connect(uuri.getHost(), port);
    client.setSoTimeout(getSoTimeoutMs()); // must be after connect()
    // Authenticate.
    // NOTE(review): the login reply code is not checked here; presumably a
    // rejected login surfaces later via the data-connection attempt — confirm.
    String[] auth = getAuth(curi);
    client.login(auth[0], auth[1]);
    // The given resource may or may not be a directory.
    // To figure out which is which, execute a CD command to
    // the UURI's path. If CD works, it's a directory.
    boolean isDirectory = client.changeWorkingDirectory(uuri.getPath());
    // Get a data socket. This will either be the result of a NLST
    // command for a directory, or a RETR command for a file.
    int command;
    String path;
    if (isDirectory) {
        curi.getAnnotations().add("ftpDirectoryList");
        command = FTPCommand.NLST;
        client.setFileType(FTP.ASCII_FILE_TYPE);
        path = "."; // we already CWD'd into the directory above
    } else {
        command = FTPCommand.RETR;
        client.setFileType(FTP.BINARY_FILE_TYPE);
        path = uuri.getPath();
    }
    client.enterLocalPassiveMode();
    Socket socket = null;
    try {
        socket = client.openDataConnection(command, path);
        // if "227 Entering Passive Mode" these will get reset later
        curi.setFetchStatus(client.getReplyCode());
        curi.getData().put(A_FTP_FETCH_STATUS, client.getReplyStrings()[0]);
    } catch (IOException e) {
        // try it again, see AbstractFrontier.needsRetrying()
        curi.setFetchStatus(FetchStatusCodes.S_CONNECT_LOST);
    }
    // Save the streams in the CURI, where downstream processors
    // expect to find them.
    if (socket != null) {
        if (socket.getSoTimeout() != getSoTimeoutMs()) {
            logger.warning("data socket timeout " + socket.getSoTimeout() + "ms is not expected value " + getSoTimeoutMs() + "ms");
        }
        // Shall we get a digest on the content downloaded?
        boolean digestContent = getDigestContent();
        String algorithm = null;
        if (digestContent) {
            algorithm = getDigestAlgorithm();
            recorder.getRecordedInput().setDigest(algorithm);
            recorder.getRecordedInput().startDigest();
        } else {
            // clear any digest left over from a previous use of the recorder
            recorder.getRecordedInput().setDigest((MessageDigest)null);
        }
        try {
            saveToRecorder(curi, socket, recorder);
        } finally {
            // Even on failure, close the transfer and record whatever
            // status/size information the server gives us.
            recorder.close();
            client.closeDataConnection(); // does socket.close()
            curi.setContentSize(recorder.getRecordedInput().getSize());
            // "226 Transfer complete."
            client.getReply();
            curi.setFetchStatus(client.getReplyCode());
            curi.getData().put(A_FTP_FETCH_STATUS, client.getReplyStrings()[0]);
            if (isDirectory) {
                curi.setContentType("text/plain");
            } else {
                curi.setContentType("application/octet-stream");
            }
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("read " + recorder.getRecordedInput().getSize()
                        + " bytes from ftp data socket");
            }
            if (digestContent) {
                curi.setContentDigest(algorithm,
                        recorder.getRecordedInput().getDigestValue());
            }
        }
        if (isDirectory) {
            extract(curi, recorder);
        }
    } else {
        // no data - without this, content size is -1
        curi.setContentSize(0);
    }
    addParent(curi);
}
/**
 * Saves the given socket's streams to the given recorder, reading the
 * remote file or directory listing in its entirety (subject to the
 * configured length, time and rate limits).
 *
 * @param curi the curi that owns the recorder (currently unused here)
 * @param socket the socket whose streams to save
 * @param recorder the recorder to save them to
 * @throws IOException if a network or file error occurs
 * @throws InterruptedException if the thread is interrupted
 */
private void saveToRecorder(CrawlURI curi,
        Socket socket, Recorder recorder)
        throws IOException, InterruptedException {
    recorder.inputWrap(socket.getInputStream());
    recorder.outputWrap(socket.getOutputStream());
    recorder.markContentBegin();
    // Apply the configured limits, then drain the data connection.
    RecordingInputStream recorded = recorder.getRecordedInput();
    long timeoutMs = (long) getTimeoutSeconds() * 1000L;
    recorded.setLimits(getMaxLengthBytes(), timeoutMs, getMaxFetchKBSec());
    recorded.readFullyOrUntil(0L); // softMax 0: no soft cutoff
}
/**
 * Extracts FTP links in a directory listing. The listing must already be
 * saved to the given recorder. Does nothing when extract-from-dirs is
 * disabled. Extraction errors are logged, never propagated.
 *
 * @param curi The curi to save extracted links to
 * @param recorder The recorder containing the directory listing
 */
private void extract(CrawlURI curi, Recorder recorder) {
    if (!getExtractFromDirs()) {
        return;
    }
    ReplayCharSequence seq = null;
    try {
        seq = recorder.getContentReplayCharSequence();
        extract(curi, seq);
    } catch (IOException e) {
        logger.log(Level.SEVERE, "IO error during extraction.", e);
    } catch (RuntimeException e) {
        // Was misleadingly logged as "IO error during extraction."
        logger.log(Level.SEVERE, "Unexpected error during extraction.", e);
    } finally {
        close(seq);
    }
}
/**
 * Extracts FTP links from a directory listing: each line of the listing
 * is treated as a file name and handed to {@link #addExtracted}.
 *
 * @param curi The curi to save extracted links to
 * @param dir The directory listing to extract links from
 */
private void extract(CrawlURI curi, ReplayCharSequence dir) {
    logger.log(Level.FINEST, "Extracting URIs from FTP directory.");
    for (Matcher m = DIR.matcher(dir); m.find();) {
        addExtracted(curi, m.group(1));
    }
}
/**
 * Adds an extracted filename to the curi. A new URI is formed by taking
 * the given curi (which should represent the directory the file lives in)
 * and appending the percent-encoded file name.
 *
 * @param curi the curi to store the discovered link in
 * @param file the filename of the discovered link
 */
private void addExtracted(CrawlURI curi, String file) {
    try {
        // URLEncoder produces application/x-www-form-urlencoded, which
        // encodes a space as '+'; in a URI path '+' is a literal plus, so
        // convert to the correct percent-encoding.
        file = URLEncoder.encode(file, "UTF-8").replace("+", "%20");
    } catch (UnsupportedEncodingException e) {
        throw new AssertionError(e); // UTF-8 is always supported
    }
    if (logger.isLoggable(Level.FINEST)) {
        logger.log(Level.FINEST, "Found " + file);
    }
    String base = curi.toString();
    if (base.endsWith("/")) {
        base = base.substring(0, base.length() - 1);
    }
    try {
        UURI n = UURIFactory.getInstance(base + "/" + file);
        CrawlURI link = curi.createCrawlURI(n, LinkContext.NAVLINK_MISC, Hop.NAVLINK);
        curi.getOutLinks().add(link);
    } catch (URIException e) {
        logger.log(Level.WARNING, "URI error during extraction.", e);
    }
}
/**
 * Extracts the parent URI from the given curi, then adds that parent
 * URI as a discovered link to the curi.
 *
 * <p>Does nothing when extract-parent is disabled, or when the curi's
 * path is already "/". Otherwise the parent is the URI with the lowest
 * path segment removed; e.g. the parent of ftp://foo.com/one/two is
 * ftp://foo.com/one/.
 *
 * @param curi the curi whose parent to add
 */
private void addParent(CrawlURI curi) {
    if (!getExtractParent()) {
        return;
    }
    UURI uuri = curi.getUURI();
    try {
        if (uuri.getPath().equals("/")) {
            return; // already at the root: there's no parent to add
        }
        String parentUri = uuri.getScheme() + "://"
                + uuri.getEscapedAuthority()
                + uuri.getEscapedCurrentHierPath();
        UURI parent = UURIFactory.getInstance(parentUri);
        CrawlURI link = curi.createCrawlURI(parent, LinkContext.NAVLINK_MISC,
                Hop.NAVLINK);
        curi.getOutLinks().add(link);
    } catch (URIException e) {
        logger.log(Level.WARNING, "URI error during extraction.", e);
    }
}
/**
 * Returns the username and password for the given URI, always as an array
 * of length 2: element 0 is the username, element 1 the password.
 *
 * <p>If the URI itself contains userinfo of the form
 * {@code username:password}, those credentials are returned; otherwise
 * the configured username and password settings are used.
 *
 * @param curi the curi whose username and password to return
 * @return an array containing the username and password
 */
private String[] getAuth(CrawlURI curi) {
    UURI uuri = curi.getUURI();
    String userinfo = null;
    try {
        userinfo = uuri.getUserinfo();
    } catch (URIException e) {
        assert false; // not expected to happen for an already-parsed UURI
        logger.finest("getUserinfo raised URIException.");
    }
    if (userinfo != null) {
        int colon = userinfo.indexOf(':');
        if (colon > 0) {
            String user = userinfo.substring(0, colon);
            String pass = userinfo.substring(colon + 1);
            return new String[] { user, pass };
        }
    }
    // Fall back to the configured credentials.
    return new String[] { getUsername(), getPassword() };
}
/**
 * Quietly closes the given sequence.
 * If an IOException is raised, this method logs it as a warning.
 *
 * @param seq the sequence to close, may be null
 */
private static void close(ReplayCharSequence seq) {
    if (seq != null) {
        try {
            seq.close();
        } catch (IOException e) {
            logger.log(Level.WARNING, "IO error closing ReplayCharSequence.",
                    e);
        }
    }
}
/**
 * Quietly logs out of and disconnects from the given FTP client.
 * Failures are logged, never propagated: this is best-effort cleanup
 * run from a finally block.
 *
 * @param client the client to disconnect
 */
private static void disconnect(ClientFTP client) {
    if (client.isConnected()) {
        try {
            client.logout();
        } catch (IOException e) {
            // Previously swallowed silently; still best-effort, but record it.
            logger.finest("Could not log out from FTP client: " + e);
        }
    }
    if (client.isConnected()) {
        try {
            client.disconnect();
        } catch (IOException e) {
            logger.warning("Could not disconnect from FTP client: " + e);
        }
    }
}
}