
org.htmlparser.parserapplications.SiteCapturer

SiteCapturer is an example application for spidering and saving a website.
// HTMLParser Library - A java-based parser for HTML
// http://htmlparser.org
// Copyright (C) 2006 Derrick Oswald
//
// Revision Control Information
//
// $URL: https://htmlparser.svn.sourceforge.net/svnroot/htmlparser/tags/HTMLParserProject-2.1/sitecapturer/src/main/java/org/htmlparser/parserapplications/SiteCapturer.java $
// $Author: derrickoswald $
// $Date: 2006-09-18 03:02:25 +0200 (Mon, 18 Sep 2006) $
// $Revision: 8 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the Common Public License; either
// version 1.0 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// Common Public License for more details.
//
// You should have received a copy of the Common Public License
// along with this library; if not, the license is available from
// the Open Source Initiative (OSI) website:
// http://opensource.org/licenses/cpl1.0.php
package org.htmlparser.parserapplications;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashSet;
import javax.swing.JFileChooser;
import javax.swing.JOptionPane;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.BaseHrefTag;
import org.htmlparser.tags.FrameTag;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.MetaTag;
import org.htmlparser.util.EncodingChangeException;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
/**
* Save a web site locally.
* An illustrative program to save a web site's contents locally.
* It was created to demonstrate URL rewriting in its simplest form.
* It uses customized tags in the NodeFactory to alter the URLs.
* This program has a number of limitations:
*
* - it doesn't capture forms, since this would involve too many assumptions
* - it doesn't capture script references, so funky onMouseOver and other
* non-static content will not be faithfully reproduced
* - it doesn't handle style sheets
* - it doesn't dig into attributes that might reference resources, so
* for example, background images won't necessarily be captured
* - worst of all, it gets confused when a URL both has content and is
* the prefix for other content,
* i.e. http://whatever.com/top and http://whatever.com/top/sub.html both
* yield content, since this cannot be faithfully replicated to a static
* directory structure (this happens a lot with servlet based sites)
*
*/
public class SiteCapturer
{
/**
* The web site to capture.
* This is used as the base URL in deciding whether to adjust a link
* and whether to capture a page or not.
*/
protected String mSource;
/**
* The local directory to capture to.
* This is used as a base prefix for files saved locally.
*/
protected String mTarget;
/**
* The list of pages to capture.
* Links are added to this list as they are discovered, and removed in
* sequential order (FIFO queue), leading to a breadth-first
* traversal of the web site space.
*/
protected ArrayList mPages;
/**
* The set of pages already captured.
* Used to avoid repeated acquisition of the same page.
*/
protected HashSet mFinished;
/**
* The list of resources to copy.
* Images and other resources are added to this list as they are discovered.
*/
protected ArrayList mImages;
/**
* The set of resources already copied.
* Used to avoid repeated acquisition of the same images and other resources.
*/
protected HashSet mCopied;
/**
* The parser to use for processing.
*/
protected Parser mParser;
/**
* If true, save resources locally too;
* otherwise, leave resource links pointing to the original page.
*/
protected boolean mCaptureResources;
/**
* The filter to apply to the nodes retrieved.
*/
protected NodeFilter mFilter;
/**
* Copy buffer size.
* Resources are moved to disk in chunks this size or less.
*/
protected final int TRANSFER_SIZE = 4096;
/**
* Create a web site capturer.
*/
public SiteCapturer ()
{
PrototypicalNodeFactory factory;
mSource = null;
mTarget = null;
mPages = new ArrayList ();
mFinished = new HashSet ();
mImages = new ArrayList ();
mCopied = new HashSet ();
mParser = new Parser ();
factory = new PrototypicalNodeFactory ();
factory.registerTag (new LocalLinkTag ());
factory.registerTag (new LocalFrameTag ());
factory.registerTag (new LocalBaseHrefTag ());
factory.registerTag (new LocalImageTag ());
mParser.setNodeFactory (factory);
mCaptureResources = true;
mFilter = null;
}
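// Note: because the prototypes registered above replace the stock LinkTag,
// FrameTag, BaseHrefTag and ImageTag, every such tag the parser creates is
// one of the Local* subclasses below, whose doSemanticAction () (or toHtml ()
// for the base tag) rewrites the URL while the page is being parsed.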
/**
* Getter for property source.
* @return Value of property source.
*/
public String getSource ()
{
return (mSource);
}
/**
* Setter for property source.
* This is the base URL to capture. URLs that don't start with this prefix
* are ignored (left as is), while the ones with this URL as a base are
* re-homed to the local target.
* @param source New value of property source.
*/
public void setSource (String source)
{
if (source.endsWith ("/"))
source = source.substring (0, source.length () - 1);
mSource = source;
}
/**
* Getter for property target.
* @return Value of property target.
*/
public String getTarget ()
{
return (mTarget);
}
/**
* Setter for property target.
* This is the local directory under which to save the site's pages.
* @param target New value of property target.
*/
public void setTarget (String target)
{
mTarget = target;
}
/**
* Getter for property captureResources.
* If true, the images and other resources referenced by
* the site and within the base URL tree are also copied locally to the
* target directory. If false, the image links are left 'as
* is', still referring to the original site.
* @return Value of property captureResources.
*/
public boolean getCaptureResources ()
{
return (mCaptureResources);
}
/**
* Setter for property captureResources.
* @param capture New value of property captureResources.
*/
public void setCaptureResources (boolean capture)
{
mCaptureResources = capture;
}
/** Getter for property filter.
* @return Value of property filter.
*
*/
public NodeFilter getFilter ()
{
return (mFilter);
}
/** Setter for property filter.
* @param filter New value of property filter.
*
*/
public void setFilter (NodeFilter filter)
{
mFilter = filter;
}
/**
* Returns true if the link is one we are interested in.
* @param link The link to be checked.
* @return true if the link has the source URL as a prefix
* and doesn't contain '?' or '#'; the former because we won't be able to
* handle server side queries in the static target directory structure and
* the latter because presumably the full page with that reference has
* already been captured previously. This performs a case insensitive
* comparison, which is cheating really, but it's cheap.
*/
protected boolean isToBeCaptured (String link)
{
return (
link.toLowerCase ().startsWith (getSource ().toLowerCase ())
&& (-1 == link.indexOf ("?"))
&& (-1 == link.indexOf ("#")));
}
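// A quick sketch of the behaviour (example URLs assumed, with the source
// property set to "http://whatever.com/top"):
//   isToBeCaptured ("http://whatever.com/top/sub.html") -> true
//   isToBeCaptured ("http://whatever.com/top/page?x=1") -> false (query)
//   isToBeCaptured ("http://whatever.com/top/a.html#b") -> false (fragment)
//   isToBeCaptured ("http://other.com/whatever")        -> false (wrong prefix)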
/**
* Returns true if the link contains text/html content.
* @param link The URL to check for content type.
* @return true if the HTTP header indicates the type is
* "text/html".
* @exception ParserException If the supplied URL can't be read from.
*/
protected boolean isHtml (String link)
throws
ParserException
{
URL url;
URLConnection connection;
String type;
boolean ret;
ret = false;
try
{
url = new URL (link);
connection = url.openConnection ();
type = connection.getContentType ();
if (type == null)
ret = false;
else
ret = type.startsWith ("text/html");
}
catch (Exception e)
{
throw new ParserException ("URL " + link + " has a problem", e);
}
return (ret);
}
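// For example, a link the server reports as "text/html; charset=UTF-8"
// yields true, while one reported as "image/png" yields false (content
// types assumed; note that getContentType () actually fetches the headers).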
/**
* Converts a link to local.
* A relative link can be used to construct both a URL and a file name.
* Basically, the operation is to strip off the base URL, if any,
* and then prepend as many dot-dots as necessary to make
* it relative to the current page.
* A bit of a kludge handles the root page specially by calling it
* index.html, even though that probably isn't its real file name.
* This isn't pretty, but it works for me.
* @param link The link to make relative.
* @param current The current page URL, or empty if it's an absolute URL
* that needs to be converted.
* @return The URL relative to the current page.
*/
protected String makeLocalLink (String link, String current)
{
int i;
int j;
String ret;
if (link.equals (getSource ()) || (!getSource ().endsWith ("/") && link.equals (getSource () + "/")))
ret = "index.html"; // handle the root page specially
else if (link.startsWith (getSource ())
&& (link.length () > getSource ().length ()))
ret = link.substring (getSource ().length () + 1);
else
ret = link; // give up
// make it relative to the current page by prepending "../" for
// each '/' in the current local path
if ((null != current)
&& link.startsWith (getSource ())
&& (current.length () > getSource ().length ()))
{
current = current.substring (getSource ().length () + 1);
i = 0;
while (-1 != (j = current.indexOf ('/', i)))
{
ret = "../" + ret;
i = j + 1;
}
}
return (ret);
}
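// Worked examples (URLs assumed, with the source property set to
// "http://whatever.com/top"):
//   makeLocalLink ("http://whatever.com/top", "")
//       -> "index.html"   (the root page kludge)
//   makeLocalLink ("http://whatever.com/top/a/b.html", "")
//       -> "a/b.html"
//   makeLocalLink ("http://whatever.com/top/b.html",
//                  "http://whatever.com/top/a/c.html")
//       -> "../b.html"    (one "../" per '/' in the current local path)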
/**
* Unescape a URL to form a file name.
* Very crude.
* @param raw The escaped URI.
* @return The native URI.
*/
protected String decode (String raw)
{
int length;
int start;
int index;
int value;
StringBuffer ret;
ret = new StringBuffer (raw.length ());
length = raw.length ();
start = 0;
while (-1 != (index = raw.indexOf ('%', start)))
{ // append the part up to the % sign
ret.append (raw.substring (start, index));
// there must be two hex digits after the percent sign
if (index + 2 < length)
{
try
{
value = Integer.parseInt (raw.substring (index + 1, index + 3), 16);
ret.append ((char)value);
start = index + 3;
}
catch (NumberFormatException nfe)
{
ret.append ('%');
start = index + 1;
}
}
else
{ // this case is actually illegal in a URI, but...
ret.append ('%');
start = index + 1;
}
}
ret.append (raw.substring (start));
return (ret.toString ());
}
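// For example (inputs assumed), decode ("my%20page%2Dname.html") yields
// "my page-name.html", while malformed escapes such as "100%" or "a%zzb"
// pass the '%' through unchanged.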
/**
* Copy a resource (image) locally.
* Removes one element from the 'to be copied' list and saves the
* resource it points to locally as a file.
*/
protected void copy ()
{
String link;
String raw;
String name;
File file;
File dir;
URL source;
byte[] data;
InputStream in;
FileOutputStream out;
int read;
link = (String)mImages.remove (0);
mCopied.add (link);
if (getCaptureResources ())
{
raw = makeLocalLink (link, "");
name = decode (raw);
file = new File (getTarget (), name);
System.out.println ("copying " + link + " to " + file.getAbsolutePath ());
// ensure directory exists
dir = file.getParentFile ();
if (!dir.exists ())
dir.mkdirs ();
try
{
source = new URL (link);
data = new byte [TRANSFER_SIZE];
try
{
in = source.openStream ();
try
{
out = new FileOutputStream (file);
try
{
while (-1 != (read = in.read (data, 0, data.length)))
out.write (data, 0, read);
}
finally
{
out.close ();
}
}
catch (FileNotFoundException fnfe)
{
fnfe.printStackTrace ();
}
finally
{
in.close ();
}
}
catch (FileNotFoundException fnfe)
{
System.err.println ("broken link " + fnfe.getMessage () + " ignored");
}
}
catch (MalformedURLException murle)
{
murle.printStackTrace ();
}
catch (IOException ioe)
{
ioe.printStackTrace ();
}
}
}
/**
* Process a single page.
* @param filter The filter to apply to the collected nodes.
* @exception ParserException If a parse error occurs.
*/
protected void process (NodeFilter filter)
throws
ParserException
{
String url;
int bookmark;
NodeList list;
NodeList robots;
MetaTag robot;
String content;
File file;
File dir;
PrintWriter out;
// get the next URL and add it to the done pile
url = (String)mPages.remove (0);
System.out.println ("processing " + url);
mFinished.add (url);
try
{
bookmark = mPages.size ();
// fetch the page and gather the list of nodes
mParser.setURL (url);
try
{
list = new NodeList ();
for (NodeIterator e = mParser.elements (); e.hasMoreNodes (); )
list.add (e.nextNode ()); // URL conversion occurs in the tags
}
catch (EncodingChangeException ece)
{
// fix bug #998195 SiteCapturer just crashed
// try again with the encoding now set correctly
// hopefully mPages, mImages, mCopied and mFinished won't be corrupted
mParser.reset ();
list = new NodeList ();
for (NodeIterator e = mParser.elements (); e.hasMoreNodes (); )
list.add (e.nextNode ());
}
// handle robots meta tag according to http://www.robotstxt.org/wc/meta-user.html,
// e.g. <meta name="robots" content="index,follow" />
// or   <meta name="robots" content="noindex,nofollow" />
robots = list.extractAllNodesThatMatch (
new AndFilter (
new NodeClassFilter (MetaTag.class),
new HasAttributeFilter ("name", "robots")), true);
if (0 != robots.size ())
{
robot = (MetaTag)robots.elementAt (0);
content = robot.getAttribute ("content").toLowerCase ();
if ((-1 != content.indexOf ("none")) || (-1 != content.indexOf ("nofollow")))
// reset mPages to its size before this page was parsed; remove from
// the end so the shrinking list doesn't cause entries to be skipped
while (mPages.size () > bookmark)
mPages.remove (mPages.size () - 1);
if ((-1 != content.indexOf ("none")) || (-1 != content.indexOf ("noindex")))
return;
}
if (null != filter)
list.keepAllNodesThatMatch (filter, true);
// save the page locally
file = new File (getTarget (), makeLocalLink (url, ""));
dir = file.getParentFile ();
if (!dir.exists ())
dir.mkdirs ();
else if (!dir.isDirectory ())
{
dir = new File (dir.getParentFile (), dir.getName () + ".content");
if (!dir.exists ())
dir.mkdirs ();
file = new File (dir, file.getName ());
}
try
{
out = new PrintWriter (new FileOutputStream (file));
for (int i = 0; i < list.size (); i++)
out.print (list.elementAt (i).toHtml ());
out.close ();
}
catch (FileNotFoundException fnfe)
{
fnfe.printStackTrace ();
}
}
catch (ParserException pe)
{
String message;
// this exception handling is suboptimal,
// but it recognizes resources that aren't text/html
message = pe.getMessage ();
if ((null != message) && (message.endsWith ("does not contain text")))
{
if (!mCopied.contains (url))
if (!mImages.contains (url))
mImages.add (url);
mFinished.remove (url);
}
else
throw pe;
}
}
/**
* Link tag that rewrites the HREF.
* The HREF is changed to a local target if it matches the source.
*/
class LocalLinkTag extends LinkTag
{
public void doSemanticAction ()
throws
ParserException
{
boolean html;
String link;
// get the link
link = getLink ();
// check if it needs to be captured
if (isToBeCaptured (link))
{
// add the link to a list to be processed
if (mFinished.contains (link))
html = true;
else if (mPages.contains (link))
html = true;
else if (mCopied.contains (link))
html = false;
else if (mImages.contains (link))
html = false;
else
{ // this test is expensive, do it reluctantly
html = isHtml (link);
if (html)
mPages.add (link);
else
mImages.add (link);
}
// alter the link
if (html || (!html && getCaptureResources ()))
link = makeLocalLink (link, mParser.getLexer ().getPage ().getUrl ());
setLink (link);
}
}
}
/**
* Frame tag that rewrites the SRC URLs.
* The SRC URLs are mapped to local targets if they match the source.
*/
class LocalFrameTag extends FrameTag
{
public void doSemanticAction ()
throws
ParserException
{
boolean html;
String link;
// get the link
link = getFrameLocation ();
// check if it needs to be captured
if (isToBeCaptured (link))
{
// add the link to a list to be processed
if (mFinished.contains (link))
html = true;
else if (mPages.contains (link))
html = true;
else if (mCopied.contains (link))
html = false;
else if (mImages.contains (link))
html = false;
else
{ // this test is expensive, do it reluctantly
html = isHtml (link);
if (html)
mPages.add (link);
else
mImages.add (link);
}
// alter the link
if (html || (!html && getCaptureResources ()))
link = makeLocalLink (link, mParser.getLexer ().getPage ().getUrl ());
setFrameLocation (link);
}
}
}
/**
* Image tag that rewrites the SRC URL.
* If resources are being captured the SRC is mapped to a local target if
* it matches the source, otherwise it is converted to a full URL to point
* back to the original site.
*/
class LocalImageTag extends ImageTag
{
public void doSemanticAction ()
throws
ParserException
{
String image;
// get the image url
image = getImageURL ();
// check if it needs to be captured
if (isToBeCaptured (image))
{ // add the image to the list needing to be copied
if (!mCopied.contains (image))
if (!mImages.contains (image))
mImages.add (image);
if (getCaptureResources ())
image = makeLocalLink (image, mParser.getLexer ().getPage ().getUrl ());
// alter the link
setImageURL (image);
}
}
}
/**
* Base tag that doesn't show.
* The toHtml() method is overridden to return an empty string,
* effectively shutting off the base reference.
*/
class LocalBaseHrefTag extends BaseHrefTag
{
// we don't want to have a base pointing back at the source page
public String toHtml ()
{
return ("");
}
}
/**
* Perform the capture.
*/
public void capture ()
{
mPages.clear ();
mPages.add (getSource ());
while (0 != mPages.size ())
try
{
process (getFilter ());
while (0 != mImages.size ())
copy ();
}
catch (ParserException pe)
{ // this exception handling is suboptimal,
// but it reports broken links correctly
Throwable throwable;
throwable = pe.getThrowable ();
if (null != throwable)
{
throwable = throwable.getCause ();
if (throwable instanceof FileNotFoundException)
System.err.println ("broken link " + ((FileNotFoundException)throwable).getMessage () + " ignored");
else
pe.printStackTrace ();
}
else
pe.printStackTrace ();
}
}
/**
* Mainline to capture a web site locally.
* @param args The command line arguments.
* There are three arguments: the web site to capture, the local directory
* to save it to, and a flag (true or false) to indicate whether resources
* such as images and video are to be captured as well.
* These are requested via dialog boxes if not supplied.
* @exception MalformedURLException If the supplied URL is invalid.
* @exception IOException If an error occurs reading the page or resources.
*/
public static void main (String[] args)
throws
MalformedURLException,
IOException
{
SiteCapturer worker;
String url;
JFileChooser chooser;
URL source;
String path;
File target;
Boolean capture;
int ret;
worker = new SiteCapturer ();
if (0 >= args.length)
{
url = (String)JOptionPane.showInputDialog (
null,
"Enter the URL to capture:",
"Web Site",
JOptionPane.PLAIN_MESSAGE,
null,
null,
"http://htmlparser.sourceforge.net");
if (null != url)
worker.setSource (url);
else
System.exit (1);
}
else
worker.setSource (args[0]);
if (1 >= args.length)
{
url = worker.getSource ();
source = new URL (url);
path = new File (new File ("." + File.separator), source.getHost () + File.separator).getCanonicalPath ();
target = new File (path);
chooser = new JFileChooser (target);
chooser.setDialogType (JFileChooser.SAVE_DIALOG);
chooser.setFileSelectionMode (JFileChooser.DIRECTORIES_ONLY);
chooser.setSelectedFile (target); // this doesn't frickin' work
chooser.setMultiSelectionEnabled (false);
chooser.setDialogTitle ("Target Directory");
ret = chooser.showSaveDialog (null);
if (ret == JFileChooser.APPROVE_OPTION)
worker.setTarget (chooser.getSelectedFile ().getAbsolutePath ());
else
System.exit (1);
}
else
worker.setTarget (args[1]);
if (2 >= args.length)
{
capture = (Boolean)JOptionPane.showInputDialog (
null,
"Should resources be captured:",
"Capture Resources",
JOptionPane.PLAIN_MESSAGE,
null,
new Object[] { Boolean.TRUE, Boolean.FALSE},
Boolean.TRUE);
if (null != capture)
worker.setCaptureResources (capture.booleanValue ());
else
System.exit (1);
}
else
worker.setCaptureResources ((Boolean.valueOf (args[2]).booleanValue ()));
worker.capture ();
System.exit (0);
}
}
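
// A minimal usage sketch (the target path is an assumption; the HTMLParser
// classes must be on the classpath). From the command line:
//
//   java org.htmlparser.parserapplications.SiteCapturer \
//       http://htmlparser.sourceforge.net ./htmlparser.sourceforge.net true
//
// or programmatically:
//
//   SiteCapturer capturer = new SiteCapturer ();
//   capturer.setSource ("http://htmlparser.sourceforge.net");
//   capturer.setTarget ("./htmlparser.sourceforge.net");
//   capturer.setCaptureResources (true);
//   capturer.capture ();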