// HTMLParser Library $Name: v1_5 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2003 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications/SiteCapturer.java,v $
// $Author: derrickoswald $
// $Date: 2005/04/12 11:27:41 $
// $Revision: 1.9 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
package org.htmlparser.parserapplications;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashSet;
import javax.swing.JFileChooser;
import javax.swing.JOptionPane;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.BaseHrefTag;
import org.htmlparser.tags.FrameTag;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.MetaTag;
import org.htmlparser.util.EncodingChangeException;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
/**
* Save a web site locally.
* Illustrative program to save a web site's contents locally.
* It was created to demonstrate URL rewriting in its simplest form.
* It uses customized tags in the NodeFactory to alter the URLs.
* This program has a number of limitations:
*
* - it doesn't capture forms, this would involve too many assumptions
* - it doesn't capture script references, so funky onMouseOver and other
* non-static content will not be faithfully reproduced
* - it doesn't handle style sheets
* - it doesn't dig into attributes that might reference resources, so
* for example, background images won't necessarily be captured
* - worst of all, it gets confused when a URL both has content and is
* the prefix for other content,
* i.e. http://whatever.com/top and http://whatever.com/top/sub.html both
* yield content, since this cannot be faithfully replicated to a static
* directory structure (this happens a lot with servlet based sites)
*
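* A minimal programmatic usage sketch (the URL and directory below are
* illustrative placeholders):
* <pre>
* SiteCapturer capturer = new SiteCapturer ();
* capturer.setSource ("http://example.com/docs");
* capturer.setTarget ("/tmp/mirror");
* capturer.setCaptureResources (true);
* capturer.capture ();
* </pre>
*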
*/
public class SiteCapturer
{
/**
* The web site to capture.
* This is used as the base URL in deciding whether to adjust a link
* and whether to capture a page or not.
*/
protected String mSource;
/**
* The local directory to capture to.
* This is used as a base prefix for files saved locally.
*/
protected String mTarget;
/**
* The list of pages to capture.
* Links are added to this list as they are discovered, and removed in
* sequential order (FIFO queue) leading to a breadth
* first traversal of the web site space.
*/
protected ArrayList mPages;
/**
* The set of pages already captured.
* Used to avoid repeated acquisition of the same page.
*/
protected HashSet mFinished;
/**
* The list of resources to copy.
* Images and other resources are added to this list as they are discovered.
*/
protected ArrayList mImages;
/**
* The set of resources already copied.
* Used to avoid repeated acquisition of the same images and other resources.
*/
protected HashSet mCopied;
/**
* The parser to use for processing.
*/
protected Parser mParser;
/**
* If true, save resources locally too,
* otherwise, leave resource links pointing to the original page.
*/
protected boolean mCaptureResources;
/**
* The filter to apply to the nodes retrieved.
*/
protected NodeFilter mFilter;
/**
* Copy buffer size.
* Resources are moved to disk in chunks this size or less.
*/
protected final int TRANSFER_SIZE = 4096;
/**
* Create a web site capturer.
*/
public SiteCapturer ()
{
PrototypicalNodeFactory factory;
mSource = null;
mTarget = null;
mPages = new ArrayList ();
mFinished = new HashSet ();
mImages = new ArrayList ();
mCopied = new HashSet ();
mParser = new Parser ();
factory = new PrototypicalNodeFactory ();
factory.registerTag (new LocalLinkTag ());
factory.registerTag (new LocalFrameTag ());
factory.registerTag (new LocalBaseHrefTag ());
factory.registerTag (new LocalImageTag ());
mParser.setNodeFactory (factory);
mCaptureResources = true;
mFilter = null;
}
/**
* Getter for property source.
* @return Value of property source.
*/
public String getSource ()
{
return (mSource);
}
/**
* Setter for property source.
* This is the base URL to capture. URLs that don't start with this prefix
* are ignored (left as is), while the ones with this URL as a base are
* re-homed to the local target.
* @param source New value of property source.
*/
public void setSource (String source)
{
if (source.endsWith ("/"))
source = source.substring (0, source.length () - 1);
mSource = source;
}
/**
* Getter for property target.
* @return Value of property target.
*/
public String getTarget ()
{
return (mTarget);
}
/**
* Setter for property target.
* This is the local directory under which to save the site's pages.
* @param target New value of property target.
*/
public void setTarget (String target)
{
mTarget = target;
}
/**
* Getter for property captureResources.
* If true, the images and other resources referenced by
* the site and within the base URL tree are also copied locally to the
* target directory. If false, the image links are left 'as is',
* still referring to the original site.
* @return Value of property captureResources.
*/
public boolean getCaptureResources ()
{
return (mCaptureResources);
}
/**
* Setter for property captureResources.
* @param capture New value of property captureResources.
*/
public void setCaptureResources (boolean capture)
{
mCaptureResources = capture;
}
/** Getter for property filter.
* @return Value of property filter.
*
*/
public NodeFilter getFilter ()
{
return (mFilter);
}
/** Setter for property filter.
* @param filter New value of property filter.
*
*/
public void setFilter (NodeFilter filter)
{
mFilter = filter;
}
/**
* Returns true if the link is one we are interested in.
* @param link The link to be checked.
* @return true if the link has the source URL as a prefix
* and doesn't contain '?' or '#'; the former because we won't be able to
* handle server side queries in the static target directory structure and
* the latter because presumably the full page with that reference has
* already been captured previously. This performs a case insensitive
* comparison, which is cheating really, but it's cheap.
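* For example (hypothetical URLs), with source http://example.com,
* http://example.com/a.html qualifies, while http://other.org/b.html
* and http://example.com/search?q=1 do not.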
*/
protected boolean isToBeCaptured (String link)
{
return (
link.toLowerCase ().startsWith (getSource ().toLowerCase ())
&& (-1 == link.indexOf ("?"))
&& (-1 == link.indexOf ("#")));
}
/**
* Returns true if the link contains text/html content.
* @param link The URL to check for content type.
* @return true if the HTTP header indicates the type is
* "text/html".
* @exception ParserException If the supplied URL can't be read from.
*/
protected boolean isHtml (String link)
throws
ParserException
{
URL url;
URLConnection connection;
String type;
boolean ret;
ret = false;
try
{
url = new URL (link);
connection = url.openConnection ();
type = connection.getContentType ();
if (type == null)
ret = false;
else
ret = type.startsWith ("text/html");
}
catch (Exception e)
{
throw new ParserException ("URL " + link + " has a problem", e);
}
return (ret);
}
/**
* Converts a link to local.
* A relative link can be used to construct both a URL and a file name.
* Basically, the operation is to strip off the base url, if any,
* and then prepend as many dot-dots as necessary to make
* it relative to the current page.
* A bit of a kludge handles the root page specially by calling it
* index.html, even though that probably isn't its real file name.
* This isn't pretty, but it works for me.
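* For example (hypothetical values): with source http://example.com,
* the link http://example.com/images/pic.gif seen on the page
* http://example.com/docs/page.html becomes ../images/pic.gif.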
* @param link The link to make relative.
* @param current The current page URL, or empty if it's an absolute URL
* that needs to be converted.
* @return The URL relative to the current page.
*/
protected String makeLocalLink (String link, String current)
{
int i;
int j;
String ret;
if (link.equals (getSource ()) || (!getSource ().endsWith ("/") && link.equals (getSource () + "/")))
ret = "index.html"; // handle the root page specially
else if (link.startsWith (getSource ())
&& (link.length () > getSource ().length ()))
ret = link.substring (getSource ().length () + 1);
else
ret = link; // give up
// make it relative to the current page by prepending "../" for
// each '/' in the current local path
if ((null != current)
&& link.startsWith (getSource ())
&& (current.length () > getSource ().length ()))
{
current = current.substring (getSource ().length () + 1);
i = 0;
while (-1 != (j = current.indexOf ('/', i)))
{
ret = "../" + ret;
i = j + 1;
}
}
return (ret);
}
/**
* Unescape a URL to form a file name.
* Very crude.
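* For example, "my%20page.html" decodes to "my page.html".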
* @param raw The escaped URI.
* @return The native URI.
*/
protected String decode (String raw)
{
int length;
int start;
int index;
int value;
StringBuilder ret;
ret = new StringBuilder (raw.length ());
length = raw.length ();
start = 0;
while (-1 != (index = raw.indexOf ('%', start)))
{ // append the part up to the % sign
ret.append (raw.substring (start, index));
// there must be two hex digits after the percent sign
if (index + 2 < length)
{
try
{
value = Integer.parseInt (raw.substring (index + 1, index + 3), 16);
ret.append ((char)value);
start = index + 3;
}
catch (NumberFormatException nfe)
{
ret.append ('%');
start = index + 1;
}
}
else
{ // this case is actually illegal in a URI, but...
ret.append ('%');
start = index + 1;
}
}
ret.append (raw.substring (start));
return (ret.toString ());
}
/**
* Copy a resource (image) locally.
* Removes one element from the 'to be copied' list and saves the
* resource it points to locally as a file.
*/
protected void copy ()
{
String link;
String raw;
String name;
File file;
File dir;
URL source;
byte[] data;
InputStream in;
FileOutputStream out;
int read;
link = (String)mImages.remove (0);
mCopied.add (link);
if (getCaptureResources ())
{
raw = makeLocalLink (link, "");
name = decode (raw);
file = new File (getTarget (), name);
System.out.println ("copying " + link + " to " + file.getAbsolutePath ());
// ensure directory exists
dir = file.getParentFile ();
if (!dir.exists ())
dir.mkdirs ();
try
{
source = new URL (link);
data = new byte [TRANSFER_SIZE];
try
{
in = source.openStream ();
try
{
out = new FileOutputStream (file);
try
{
while (-1 != (read = in.read (data, 0, data.length)))
out.write (data, 0, read);
}
finally
{
out.close ();
}
}
catch (FileNotFoundException fnfe)
{
fnfe.printStackTrace ();
}
finally
{
in.close ();
}
}
catch (FileNotFoundException fnfe)
{
System.err.println ("broken link " + fnfe.getMessage () + " ignored");
}
}
catch (MalformedURLException murle)
{
murle.printStackTrace ();
}
catch (IOException ioe)
{
ioe.printStackTrace ();
}
}
}
/**
* Process a single page.
* @param filter The filter to apply to the collected nodes.
* @exception ParserException If a parse error occurs.
*/
protected void process (NodeFilter filter)
throws
ParserException
{
String url;
int bookmark;
NodeList list;
NodeList robots;
MetaTag robot;
String content;
File file;
File dir;
PrintWriter out;
// get the next URL and add it to the done pile
url = (String)mPages.remove (0);
System.out.println ("processing " + url);
mFinished.add (url);
try
{
bookmark = mPages.size ();
// fetch the page and gather the list of nodes
mParser.setURL (url);
try
{
list = new NodeList ();
for (NodeIterator e = mParser.elements (); e.hasMoreNodes (); )
list.add (e.nextNode ()); // URL conversion occurs in the tags
}
catch (EncodingChangeException ece)
{
// fix bug #998195 SiteCapturer just crashed
// try again with the encoding now set correctly
// hopefully mPages, mImages, mCopied and mFinished won't be corrupted
mParser.reset ();
list = new NodeList ();
for (NodeIterator e = mParser.elements (); e.hasMoreNodes (); )
list.add (e.nextNode ());
}
// handle robots meta tag according to http://www.robotstxt.org/wc/meta-user.html
robots = list.extractAllNodesThatMatch (
new AndFilter (
new NodeClassFilter (MetaTag.class),
new HasAttributeFilter ("name", "robots")), true);
if (0 != robots.size ())
{
robot = (MetaTag)robots.elementAt (0);
content = robot.getAttribute ("content").toLowerCase ();
if ((-1 != content.indexOf ("none")) || (-1 != content.indexOf ("nofollow")))
// reset mPages to the bookmark, discarding links gathered from this page;
// remove at a fixed index so entries aren't skipped as the list shrinks
while (mPages.size () > bookmark)
mPages.remove (bookmark);
if ((-1 != content.indexOf ("none")) || (-1 != content.indexOf ("noindex")))
return;
}
if (null != filter)
list.keepAllNodesThatMatch (filter, true);
// save the page locally
file = new File (getTarget (), makeLocalLink (url, ""));
dir = file.getParentFile ();
if (!dir.exists ())
dir.mkdirs ();
else if (!dir.isDirectory ())
{
dir = new File (dir.getParentFile (), dir.getName () + ".content");
if (!dir.exists ())
dir.mkdirs ();
file = new File (dir, file.getName ());
}
try
{
out = new PrintWriter (new FileOutputStream (file));
for (int i = 0; i < list.size (); i++)
out.print (list.elementAt (i).toHtml ());
out.close ();
}
catch (FileNotFoundException fnfe)
{
fnfe.printStackTrace ();
}
}
catch (ParserException pe)
{
String message;
// this exception handling is suboptimal,
// but it recognizes resources that aren't text/html
message = pe.getMessage ();
if ((null != message) && (message.endsWith ("does not contain text")))
{
if (!mCopied.contains (url))
if (!mImages.contains (url))
mImages.add (url);
mFinished.remove (url);
}
else
throw pe;
}
}
/**
* Link tag that rewrites the HREF.
* The HREF is changed to a local target if it matches the source.
*/
class LocalLinkTag extends LinkTag
{
public void doSemanticAction ()
throws
ParserException
{
boolean html;
String link;
// get the link
link = getLink ();
// check if it needs to be captured
if (isToBeCaptured (link))
{
// add the link to a list to be processed
if (mFinished.contains (link))
html = true;
else if (mPages.contains (link))
html = true;
else if (mCopied.contains (link))
html = false;
else if (mImages.contains (link))
html = false;
else
{ // this test is expensive, do it reluctantly
html = isHtml (link);
if (html)
mPages.add (link);
else
mImages.add (link);
}
// alter the link
if (html || getCaptureResources ())
link = makeLocalLink (link, mParser.getLexer ().getPage ().getUrl ());
setLink (link);
}
}
}
/**
* Frame tag that rewrites the SRC URLs.
* The SRC URLs are mapped to local targets if they match the source.
*/
class LocalFrameTag extends FrameTag
{
public void doSemanticAction ()
throws
ParserException
{
boolean html;
String link;
// get the link
link = getFrameLocation ();
// check if it needs to be captured
if (isToBeCaptured (link))
{
// add the link to a list to be processed
if (mFinished.contains (link))
html = true;
else if (mPages.contains (link))
html = true;
else if (mCopied.contains (link))
html = false;
else if (mImages.contains (link))
html = false;
else
{ // this test is expensive, do it reluctantly
html = isHtml (link);
if (html)
mPages.add (link);
else
mImages.add (link);
}
// alter the link
if (html || getCaptureResources ())
link = makeLocalLink (link, mParser.getLexer ().getPage ().getUrl ());
setFrameLocation (link);
}
}
}
/**
* Image tag that rewrites the SRC URL.
* If resources are being captured the SRC is mapped to a local target if
* it matches the source, otherwise it is converted to a full URL to point
* back to the original site.
*/
class LocalImageTag extends ImageTag
{
public void doSemanticAction ()
throws
ParserException
{
String image;
// get the image url
image = getImageURL ();
// check if it needs to be captured
if (isToBeCaptured (image))
{ // add the image to the list needing to be copied
if (!mCopied.contains (image))
if (!mImages.contains (image))
mImages.add (image);
if (getCaptureResources ())
image = makeLocalLink (image, mParser.getLexer ().getPage ().getUrl ());
// alter the link
setImageURL (image);
}
}
}
/**
* Base tag that doesn't show.
* The toHtml() method is overridden to return an empty string,
* effectively shutting off the base reference.
*/
class LocalBaseHrefTag extends BaseHrefTag
{
// we don't want to have a base pointing back at the source page
public String toHtml ()
{
return ("");
}
}
/**
* Perform the capture.
*/
public void capture ()
{
mPages.clear ();
mPages.add (getSource ());
while (0 != mPages.size ())
try
{
process (getFilter ());
while (0 != mImages.size ())
copy ();
}
catch (ParserException pe)
{ // this exception handling is suboptimal,
// but it messages correctly about broken links
Throwable throwable;
throwable = pe.getThrowable ();
if (null != throwable)
{
throwable = throwable.getCause ();
if (throwable instanceof FileNotFoundException)
System.err.println ("broken link " + ((FileNotFoundException)throwable).getMessage () + " ignored");
else
pe.printStackTrace ();
}
else
pe.printStackTrace ();
}
}
/**
* Mainline to capture a web site locally.
* @param args The command line arguments.
* There are three arguments: the web site to capture, the local directory
* to save it to, and a flag (true or false) to indicate whether resources
* such as images and video are to be captured as well.
* These are requested via dialog boxes if not supplied.
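* A sample invocation (jar name and paths are illustrative only):
* <pre>
* java -cp htmlparser.jar org.htmlparser.parserapplications.SiteCapturer \
*     http://example.com /tmp/mirror true
* </pre>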
* @exception MalformedURLException If the supplied URL is invalid.
* @exception IOException If an error occurs reading the page or resources.
*/
public static void main (String[] args)
throws
MalformedURLException,
IOException
{
SiteCapturer worker;
String url;
JFileChooser chooser;
URL source;
String path;
File target;
Boolean capture;
int ret;
worker = new SiteCapturer ();
if (0 >= args.length)
{
url = (String)JOptionPane.showInputDialog (
null,
"Enter the URL to capture:",
"Web Site",
JOptionPane.PLAIN_MESSAGE,
null,
null,
"http://htmlparser.sourceforge.net/wiki");
if (null != url)
worker.setSource (url);
else
System.exit (1);
}
else
worker.setSource (args[0]);
if (1 >= args.length)
{
url = worker.getSource ();
source = new URL (url);
path = new File (new File ("." + File.separator), source.getHost () + File.separator).getCanonicalPath ();
target = new File (path);
chooser = new JFileChooser (target);
chooser.setDialogType (JFileChooser.SAVE_DIALOG);
chooser.setFileSelectionMode (JFileChooser.DIRECTORIES_ONLY);
chooser.setSelectedFile (target); // this doesn't frickin' work
chooser.setMultiSelectionEnabled (false);
chooser.setDialogTitle ("Target Directory");
ret = chooser.showSaveDialog (null);
if (ret == JFileChooser.APPROVE_OPTION)
worker.setTarget (chooser.getSelectedFile ().getAbsolutePath ());
else
System.exit (1);
}
else
worker.setTarget (args[1]);
if (2 >= args.length)
{
capture = (Boolean)JOptionPane.showInputDialog (
null,
"Should resources be captured:",
"Capture Resources",
JOptionPane.PLAIN_MESSAGE,
null,
new Object[] { Boolean.TRUE, Boolean.FALSE},
Boolean.TRUE);
if (null != capture)
worker.setCaptureResources (capture.booleanValue ());
else
System.exit (1);
}
else
worker.setCaptureResources ((Boolean.valueOf (args[2]).booleanValue ()));
worker.capture ();
System.exit (0);
}
}