All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.archive.modules.fetcher.FetchFTP Maven / Gradle / Ivy

Go to download

This project contains some of the configurable modules used within the Heritrix application to crawl the web. The modules in this project can be used in applications other than Heritrix, however.

There is a newer version: 3.5.0
Show newest version
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.modules.fetcher;

import static org.archive.modules.CoreAttributeConstants.A_FTP_CONTROL_CONVERSATION;
import static org.archive.modules.CoreAttributeConstants.A_FTP_FETCH_STATUS;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.net.URLEncoder;
import java.net.UnknownHostException;
import java.security.MessageDigest;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.net.SocketFactory;

import org.apache.commons.httpclient.URIException;
import org.apache.commons.net.ftp.FTP;
import org.apache.commons.net.ftp.FTPCommand;
import org.archive.io.RecordingInputStream;
import org.archive.io.ReplayCharSequence;
import org.archive.modules.CrawlURI;
import org.archive.modules.Processor;
import org.archive.modules.extractor.Hop;
import org.archive.modules.extractor.LinkContext;
import org.archive.net.ClientFTP;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.Recorder;


/**
 * Fetches documents and directory listings using FTP.  This class will also
 * try to extract FTP "links" from directory listings.  For this class to
 * archive a directory listing, the remote FTP server must support the NLST
 * command.  Most modern FTP servers should.
 * 
 * @author pjack
 *
 */
public class FetchFTP extends Processor  {
    @SuppressWarnings("unused")
    private static final long serialVersionUID = 1L;

    /** Logger for this class. Never reassigned, so declared final. */
    private static final Logger logger =
        Logger.getLogger(FetchFTP.class.getName());

    /**
     * Pattern for matching directory entries in an NLST listing: each
     * non-empty line of the listing is captured as one file name.
     * Compiled once and never reassigned, so declared final.
     */
    private static final Pattern DIR = 
     Pattern.compile("(.+)$", Pattern.MULTILINE);

    
    /**
     * The username to send to FTP servers. By convention, the default value of
     * "anonymous" is used for publicly available FTP sites.
     */
    {
        setUsername("anonymous");
    }

    /**
     * Returns the username presented to FTP servers at login.
     *
     * @return the configured FTP username
     */
    public String getUsername() {
        Object username = kp.get("username");
        return (String) username;
    }

    /**
     * Sets the username to present to FTP servers at login.
     *
     * @param username the FTP username to use
     */
    public void setUsername(String username) {
        kp.put("username", username);
    }

    /**
     * The password to send to FTP servers. By convention, anonymous users send
     * their email address in this field.
     */
    {
        setPassword("password");
    }

    /**
     * Returns the password presented to FTP servers at login.
     *
     * @return the configured FTP password
     */
    public String getPassword() {
        Object password = kp.get("password");
        return (String) password;
    }

    /**
     * Sets the password to present to FTP servers at login.
     *
     * @param pw the FTP password to use
     */
    public void setPassword(String pw) {
        kp.put("password", pw);
    }

    /**
     * Set to true to extract further URIs from FTP directories.  Default is
     * true.
     */
    {
        setExtractFromDirs(true);
    }

    /**
     * Tells whether fetched FTP directory listings are mined for further
     * URIs.
     *
     * @return true if directory listings are a source of extracted links
     */
    public boolean getExtractFromDirs() {
        Boolean extract = (Boolean) kp.get("extractFromDirs");
        return extract;
    }

    /**
     * Enables or disables link extraction from FTP directory listings.
     *
     * @param extractFromDirs true to extract links from directory listings
     */
    public void setExtractFromDirs(boolean extractFromDirs) {
        kp.put("extractFromDirs", extractFromDirs);
    }

    /**
     * Set to true to extract the parent URI from all FTP URIs.  Default is
     * true.
     */
    {
        setExtractParent(true);
    }

    /**
     * Tells whether the parent directory of each FTP URI is also queued
     * as a discovered link.
     *
     * @return true if parent URIs are extracted
     */
    public boolean getExtractParent() {
        Boolean extract = (Boolean) kp.get("extractParent");
        return extract;
    }

    /**
     * Enables or disables extraction of parent URIs.
     *
     * @param extractParent true to extract the parent of each FTP URI
     */
    public void setExtractParent(boolean extractParent) {
        kp.put("extractParent", extractParent);
    }

    /**
     * Whether or not to perform an on-the-fly digest hash of retrieved
     * content-bodies.
     */
    {
        setDigestContent(true);
    }

    /**
     * Tells whether fetched content is digested as it is downloaded.
     *
     * @return true if content bodies are digested on the fly
     */
    public boolean getDigestContent() {
        Boolean digest = (Boolean) kp.get("digestContent");
        return digest;
    }

    /**
     * Enables or disables on-the-fly digesting of fetched content.
     *
     * @param digest true to digest content bodies as they are downloaded
     */
    public void setDigestContent(boolean digest) {
        kp.put("digestContent", digest);
    }

    /**
     * Which algorithm (for example MD5 or SHA-1) to use to perform an
     * on-the-fly digest hash of retrieved content-bodies.
     */
    protected String digestAlgorithm = "sha1";

    /**
     * Returns the name of the digest algorithm in use.
     *
     * @return the digest algorithm name, e.g. "sha1"
     */
    public String getDigestAlgorithm() {
        return digestAlgorithm;
    }

    /**
     * Sets the digest algorithm used for content hashing.
     *
     * @param digestAlgorithm the digest algorithm name, e.g. "sha1"
     */
    public void setDigestAlgorithm(String digestAlgorithm) {
        this.digestAlgorithm = digestAlgorithm;
    }


    /**
     * Maximum length in bytes to fetch.  Fetch is truncated at this length.
     * A value of 0 means no limit.
     */
    {
        setMaxLengthBytes(0L); // no limit
    }

    /**
     * Returns the maximum number of bytes to download per document.
     *
     * @return the maximum fetch length in bytes; 0 means unlimited
     */
    public long getMaxLengthBytes() {
        Long max = (Long) kp.get("maxLengthBytes");
        return max;
    }

    /**
     * Sets the maximum number of bytes to download per document.
     *
     * @param timeout the maximum fetch length in bytes; 0 means unlimited
     */
    public void setMaxLengthBytes(long timeout) {
        kp.put("maxLengthBytes", timeout);
    }

    /**
     * The maximum KB/sec to use when fetching data from a server.  The
     * default of 0 means no maximum.
     */
    {
        setMaxFetchKBSec(0); // no limit
    }

    /**
     * Returns the fetch rate cap.
     *
     * @return the maximum rate in KB/sec; 0 means unlimited
     */
    public int getMaxFetchKBSec() {
        Integer rate = (Integer) kp.get("maxFetchKBSec");
        return rate;
    }

    /**
     * Sets the fetch rate cap.
     *
     * @param rate the maximum rate in KB/sec; 0 means unlimited
     */
    public void setMaxFetchKBSec(int rate) {
        kp.put("maxFetchKBSec", rate);
    }

    /**
     * If the fetch is not completed in this number of seconds, give up (and
     * retry later).
     */
    {
        setTimeoutSeconds(20 * 60); // 20 minutes
    }

    /**
     * Returns the overall fetch timeout.
     *
     * @return the fetch timeout, in seconds
     */
    public int getTimeoutSeconds() {
        Integer seconds = (Integer) kp.get("timeoutSeconds");
        return seconds;
    }

    /**
     * Sets the overall fetch timeout.
     *
     * @param timeout the fetch timeout, in seconds
     */
    public void setTimeoutSeconds(int timeout) {
        kp.put("timeoutSeconds", timeout);
    }

    /**
     * If the socket is unresponsive for this number of milliseconds, give up.
     * Set to zero for no timeout (not recommended: it could hang a thread on
     * an unresponsive server).  This timeout is used for timing out socket
     * opens and for timing out each socket read.  For optimal configuration,
     * keep this value below the overall fetch timeout (see
     * {@link #getTimeoutSeconds()}) so that at least one retry read is
     * possible.
     */
    {
        setSoTimeoutMs(20 * 1000); // 20 seconds
    }

    /**
     * Returns the socket timeout.
     *
     * @return the socket timeout in milliseconds; zero means no timeout
     */
    public int getSoTimeoutMs() {
        Integer ms = (Integer) kp.get("soTimeoutMs");
        return ms;
    }

    /**
     * Sets the socket timeout.
     *
     * @param timeout the socket timeout in milliseconds; zero disables it
     */
    public void setSoTimeoutMs(int timeout) {
        kp.put("soTimeoutMs", timeout);
    }

    /**
     * Constructs a new FetchFTP.
     */
    public FetchFTP() {
    }
    
    /**
     * Accepts only URIs whose scheme is "ftp"; all other URIs are left
     * for other processors.
     *
     * @param curi the URI under consideration
     * @return true if this processor should handle the given URI
     */
    @Override
    protected boolean shouldProcess(CrawlURI curi) {
        String scheme = curi.getUURI().getScheme();
        return scheme.equals("ftp");
    }

    /**
     * Processes the given URI.  If the given URI is not an FTP URI, then
     * this method does nothing.  Otherwise an attempt is made to connect
     * to the FTP server.
     *
     * <p>If the connection is successful, an attempt will be made to CD to
     * the path specified in the URI.  If the remote CD command succeeds,
     * then it is assumed that the URI represents a directory.  If the
     * CD command fails, then it is assumed that the URI represents a file.
     *
     * <p>For directories, the directory listing will be fetched using
     * the FTP NLST command, and saved to the HttpRecorder.  If the
     * <code>extract.from.dirs</code> attribute is set to true, then
     * the files in the fetched list will be added to the curi as
     * extracted FTP links.  (It was easier to do that here, rather
     * than writing a separate FTPExtractor.)
     *
     * <p>For files, the file will be fetched using the FTP RETR
     * command, and saved to the HttpRecorder.
     *
     * <p>All file transfers (including directory listings) occur using
     * Binary mode transfer.  Also, the local passive transfer mode
     * is always used, to play well with firewalls.
     *
     * @param curi the curi to process
     * @throws InterruptedException if the thread is interrupted during
     *   processing
     */
    @Override
    protected void innerProcess(CrawlURI curi) throws InterruptedException {
        curi.setFetchBeginTime(System.currentTimeMillis());
        ClientFTP client = new ClientFTP();
        Recorder recorder = curi.getRecorder();

        try {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("attempting to fetch ftp uri: " + curi);
            }
            fetch(curi, client, recorder);
        } catch (IOException e) {
            if (logger.isLoggable(Level.INFO)) {
                logger.info(curi + ": " + e);
            }
            curi.getNonFatalFailures().add(e);
            curi.setFetchStatus(FetchStatusCodes.S_CONNECT_FAILED);
        } finally {
            // Always record the control conversation, even on failure.
            disconnect(client);
            curi.setFetchCompletedTime(System.currentTimeMillis());
            curi.getData().put(A_FTP_CONTROL_CONVERSATION,
                    client.getControlConversation());
        }
    }

    /**
     * A {@link SocketFactory} much like javax.net.DefaultSocketFactory,
     * except that the createSocket() methods that open connections support a
     * connect timeout.
     */
    public class SocketFactoryWithTimeout extends SocketFactory {
        /** Connect timeout in milliseconds; 0 means wait indefinitely. */
        protected int connectTimeoutMs = 0;

        public int getConnectTimeoutMs() {
            return connectTimeoutMs;
        }

        public void setConnectTimeoutMs(int connectTimeoutMs) {
            this.connectTimeoutMs = connectTimeoutMs;
        }

        public Socket createSocket() {
            return new Socket();
        }

        public Socket createSocket(String host, int port) throws IOException,
                UnknownHostException {
            Socket sock = createSocket();
            sock.connect(new InetSocketAddress(host, port), connectTimeoutMs);
            return sock;
        }

        public Socket createSocket(InetAddress host, int port)
                throws IOException {
            Socket sock = createSocket();
            sock.connect(new InetSocketAddress(host, port), connectTimeoutMs);
            return sock;
        }

        public Socket createSocket(String host, int port,
                InetAddress localHost, int localPort) throws IOException,
                UnknownHostException {
            Socket sock = createSocket();
            sock.bind(new InetSocketAddress(localHost, localPort));
            sock.connect(new InetSocketAddress(host, port), connectTimeoutMs);
            return sock;
        }

        public Socket createSocket(InetAddress address, int port,
                InetAddress localAddress, int localPort) throws IOException {
            Socket sock = createSocket();
            sock.bind(new InetSocketAddress(localAddress, localPort));
            sock.connect(new InetSocketAddress(address, port),
                    connectTimeoutMs);
            return sock;
        }
    }

    /** Lazily created socket factory shared by all fetches. */
    protected SocketFactoryWithTimeout socketFactory;

    /**
     * Fetches a document from an FTP server.
     *
     * @param curi the URI of the document to fetch
     * @param client the FTPClient to use for the fetch
     * @param recorder the recorder to preserve the document in
     * @throws IOException if a network or protocol error occurs
     * @throws InterruptedException if the thread is interrupted
     */
    private void fetch(CrawlURI curi, ClientFTP client, Recorder recorder)
    throws IOException, InterruptedException {
        // Connect to the FTP server.
        UURI uuri = curi.getUURI();
        int port = uuri.getPort();
        if (port == -1) {
            port = 21; // standard FTP control port
        }

        if (socketFactory == null) {
            socketFactory = new SocketFactoryWithTimeout();
        }
        socketFactory.setConnectTimeoutMs(getSoTimeoutMs());
        client.setSocketFactory(socketFactory);
        client.setConnectTimeout(getSoTimeoutMs());
        client.setDefaultTimeout(getSoTimeoutMs());
        client.setDataTimeout(getSoTimeoutMs());

        client.connect(uuri.getHost(), port);

        client.setSoTimeout(getSoTimeoutMs());  // must be after connect()

        // Authenticate.
        String[] auth = getAuth(curi);
        client.login(auth[0], auth[1]);

        // The given resource may or may not be a directory.
        // To figure out which is which, execute a CD command to
        // the UURI's path.  If CD works, it's a directory.
        boolean isDirectory = client.changeWorkingDirectory(uuri.getPath());

        // Get a data socket.  This will either be the result of a NLST
        // command for a directory, or a RETR command for a file.
        int command;
        String path;
        if (isDirectory) {
            curi.getAnnotations().add("ftpDirectoryList");
            command = FTPCommand.NLST;
            client.setFileType(FTP.ASCII_FILE_TYPE);
            path = ".";
        } else {
            command = FTPCommand.RETR;
            client.setFileType(FTP.BINARY_FILE_TYPE);
            path = uuri.getPath();
        }

        client.enterLocalPassiveMode();
        Socket socket = null;
        try {
            socket = client.openDataConnection(command, path);
            // if "227 Entering Passive Mode" these will get reset later
            curi.setFetchStatus(client.getReplyCode());
            curi.getData().put(A_FTP_FETCH_STATUS,
                    client.getReplyStrings()[0]);
        } catch (IOException e) {
            // try it again, see AbstractFrontier.needsRetrying()
            curi.setFetchStatus(FetchStatusCodes.S_CONNECT_LOST);
        }

        // Save the streams in the CURI, where downstream processors
        // expect to find them.
        if (socket != null) {
            if (socket.getSoTimeout() != getSoTimeoutMs()) {
                logger.warning("data socket timeout "
                        + socket.getSoTimeout() + "ms is not expected value "
                        + getSoTimeoutMs() + "ms");
            }
            // Shall we get a digest on the content downloaded?
            boolean digestContent = getDigestContent();
            String algorithm = null;
            if (digestContent) {
                algorithm = getDigestAlgorithm();
                recorder.getRecordedInput().setDigest(algorithm);
                recorder.getRecordedInput().startDigest();
            } else {
                // clear
                recorder.getRecordedInput().setDigest((MessageDigest)null);
            }

            try {
                saveToRecorder(curi, socket, recorder);
            } finally {
                recorder.close();
                client.closeDataConnection(); // does socket.close()
                curi.setContentSize(recorder.getRecordedInput().getSize());

                // "226 Transfer complete."
                client.getReply();
                curi.setFetchStatus(client.getReplyCode());
                curi.getData().put(A_FTP_FETCH_STATUS,
                        client.getReplyStrings()[0]);

                if (isDirectory) {
                    curi.setContentType("text/plain");
                } else {
                    curi.setContentType("application/octet-stream");
                }

                if (logger.isLoggable(Level.FINE)) {
                    logger.fine("read "
                            + recorder.getRecordedInput().getSize()
                            + " bytes from ftp data socket");
                }

                if (digestContent) {
                    curi.setContentDigest(algorithm,
                            recorder.getRecordedInput().getDigestValue());
                }
            }
            if (isDirectory) {
                extract(curi, recorder);
            }
        } else {
            // no data - without this, content size is -1
            curi.setContentSize(0);
        }
        addParent(curi);
    }

    /**
     * Saves the given socket to the given recorder.
     *
     * @param curi the curi that owns the recorder
     * @param socket the socket whose streams to save
     * @param recorder the recorder to save them to
     * @throws IOException if a network or file error occurs
     * @throws InterruptedException if the thread is interrupted
     */
    private void saveToRecorder(CrawlURI curi, Socket socket,
            Recorder recorder) throws IOException, InterruptedException {
        recorder.inputWrap(socket.getInputStream());
        recorder.outputWrap(socket.getOutputStream());
        recorder.markContentBegin();

        // Read the remote file/dir listing in its entirety.
        long softMax = 0;
        long hardMax = getMaxLengthBytes();
        long timeout = (long)getTimeoutSeconds() * 1000L;
        int maxRate = getMaxFetchKBSec();
        RecordingInputStream input = recorder.getRecordedInput();
        input.setLimits(hardMax, timeout, maxRate);
        input.readFullyOrUntil(softMax);
    }

    /**
     * Extract FTP links in a directory listing.
     * The listing must already be saved to the given recorder.
     *
     * @param curi The curi to save extracted links to
     * @param recorder The recorder containing the directory listing
     */
    private void extract(CrawlURI curi, Recorder recorder) {
        if (!getExtractFromDirs()) {
            return;
        }

        ReplayCharSequence seq = null;
        try {
            seq = recorder.getContentReplayCharSequence();
            extract(curi, seq);
        } catch (IOException e) {
            logger.log(Level.SEVERE, "IO error during extraction.", e);
        } catch (RuntimeException e) {
            // Fixed: previously logged the copy-pasted "IO error" message,
            // misreporting runtime failures as I/O errors.
            logger.log(Level.SEVERE, "Runtime error during extraction.", e);
        } finally {
            close(seq);
        }
    }

    /**
     * Extracts FTP links in a directory listing.
     *
     * @param curi The curi to save extracted links to
     * @param dir The directory listing to extract links from
     * @throws URIException if an extracted link is invalid
     */
    private void extract(CrawlURI curi, ReplayCharSequence dir) {
        logger.log(Level.FINEST, "Extracting URIs from FTP directory.");
        Matcher matcher = DIR.matcher(dir);
        while (matcher.find()) {
            String file = matcher.group(1);
            addExtracted(curi, file);
        }
    }

    /**
     * Adds an extracted filename to the curi.  A new URI will be formed
     * by taking the given curi (which should represent the directory the
     * file lives in) and appending the file.
     *
     * @param curi the curi to store the discovered link in
     * @param file the filename of the discovered link
     */
    private void addExtracted(CrawlURI curi, String file) {
        try {
            file = URLEncoder.encode(file, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            // UTF-8 is guaranteed by the JLS, so this cannot happen.
            throw new AssertionError(e);
        }
        if (logger.isLoggable(Level.FINEST)) {
            logger.log(Level.FINEST, "Found " + file);
        }
        String base = curi.toString();
        if (base.endsWith("/")) {
            base = base.substring(0, base.length() - 1);
        }
        try {
            UURI n = UURIFactory.getInstance(base + "/" + file);
            CrawlURI link = curi.createCrawlURI(n, LinkContext.NAVLINK_MISC,
                    Hop.NAVLINK);
            curi.getOutLinks().add(link);
        } catch (URIException e) {
            logger.log(Level.WARNING, "URI error during extraction.", e);
        }
    }

    /**
     * Extracts the parent URI from the given curi, then adds that parent
     * URI as a discovered link to the curi.
     *
     * <p>If the <code>extract-parent</code> attribute is false, then this
     * method does nothing.  Also, if the path of the given curi is
     * <code>/</code>, then this method does nothing.
     *
     * <p>Otherwise the parent is determined by eliminating the lowest part
     * of the URI's path.  Eg, the parent of <code>ftp://foo.com/one/two</code>
     * is <code>ftp://foo.com/one</code>.
     *
     * @param curi the curi whose parent to add
     */
    private void addParent(CrawlURI curi) {
        if (!getExtractParent()) {
            return;
        }
        UURI uuri = curi.getUURI();
        try {
            if (uuri.getPath().equals("/")) {
                // There's no parent to add.
                return;
            }
            String scheme = uuri.getScheme();
            String auth = uuri.getEscapedAuthority();
            String path = uuri.getEscapedCurrentHierPath();
            UURI parent = UURIFactory.getInstance(scheme + "://" + auth
                    + path);
            CrawlURI link = curi.createCrawlURI(parent,
                    LinkContext.NAVLINK_MISC, Hop.NAVLINK);
            curi.getOutLinks().add(link);
        } catch (URIException e) {
            logger.log(Level.WARNING, "URI error during extraction.", e);
        }
    }

    /**
     * Returns the username and password for the given URI.  This method
     * always returns an array of length 2.  The first element in the returned
     * array is the username for the URI, and the second element is the
     * password.
     *
     * <p>If the URI itself contains the username and password (i.e., it looks
     * like <code>ftp://username:password@host/path</code>) then that username
     * and password are returned.
     *
     * <p>Otherwise the settings system is probed for the username
     * and password attributes for this <code>FetchFTP</code>
     * and the given curi context.  The values of those attributes are then
     * returned.
     *
     * @param curi the curi whose username and password to return
     * @return an array containing the username and password
     */
    private String[] getAuth(CrawlURI curi) {
        String[] result = new String[2];
        UURI uuri = curi.getUURI();
        String userinfo;
        try {
            userinfo = uuri.getUserinfo();
        } catch (URIException e) {
            // Shouldn't happen for an already-parsed UURI.
            assert false;
            logger.finest("getUserinfo raised URIException.");
            userinfo = null;
        }
        if (userinfo != null) {
            int p = userinfo.indexOf(':');
            if (p > 0) {
                result[0] = userinfo.substring(0,p);
                result[1] = userinfo.substring(p + 1);
                return result;
            }
        }
        // No usable credentials in the URI; fall back to configuration.
        result[0] = getUsername();
        result[1] = getPassword();
        return result;
    }

    /**
     * Quietly closes the given sequence.
     * If an IOException is raised, this method logs it as a warning.
     *
     * @param seq the sequence to close
     */
    private static void close(ReplayCharSequence seq) {
        if (seq == null) {
            return;
        }
        try {
            seq.close();
        } catch (IOException e) {
            logger.log(Level.WARNING,
                    "IO error closing ReplayCharSequence.", e);
        }
    }

    /**
     * Quietly disconnects from the given FTP client.
     * If an IOException is raised, this method logs it as a warning.
     *
     * @param client the client to disconnect
     */
    private static void disconnect(ClientFTP client) {
        if (client.isConnected()) {
            try {
                client.logout();
            } catch (IOException e) {
                // Best-effort logout; failures are deliberately ignored.
            }
        }
        if (client.isConnected()) {
            try {
                client.disconnect();
            } catch (IOException e) {
                logger.warning("Could not disconnect from FTP client: " + e);
            }
        }
    }
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy