org.htmlparser.parserapplications.WikiCapturer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of sitecapturer Show documentation
SiteCapturer is an example application for spidering and saving a website.
The newest version!
// HTMLParser Library - A java-based parser for HTML
// http://htmlparser.org
// Copyright (C) 2006 Derrick Oswald
//
// Revision Control Information
//
// $URL: https://htmlparser.svn.sourceforge.net/svnroot/htmlparser/tags/HTMLParserProject-2.1/sitecapturer/src/main/java/org/htmlparser/parserapplications/WikiCapturer.java $
// $Author: derrickoswald $
// $Date: 2006-09-18 03:02:25 +0200 (Mon, 18 Sep 2006) $
// $Revision: 8 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the Common Public License; either
// version 1.0 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// Common Public License for more details.
//
// You should have received a copy of the Common Public License
// along with this library; if not, the license is available from
// the Open Source Initiative (OSI) website:
//   http://opensource.org/licenses/cpl1.0.php

package org.htmlparser.parserapplications;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import javax.swing.JFileChooser;
import javax.swing.JOptionPane;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NotFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.TagNameFilter;

/**
 * Save a wikiwikiweb locally.
 * Illustrative program to save a wiki locally. It behaves like its
 * superclass, except that links to a fixed list of PhpWiki pages are not
 * captured, and a node filter is installed that excludes the
 * <code>DIV</code> tags with ids <code>navbar</code>, <code>actionbar</code>
 * and <code>xhtml-validator</code>.
 */
public class WikiCapturer
    extends
        SiteCapturer
{
    /**
     * Page names of PhpWiki specific pages.
     * A link ending with any of these names is not captured.
     */
    private static final String[] PHPWIKI_PAGES =
    {
        "PhpWikiAdministration",
        "PhpWikiDocumentation",
        "TextFormattingRules",
        "NewMarkupTestPage",
        "OldMarkupTestPage",
        // already covered by "TextFormattingRules" above; kept so the list
        // still names every page that is meant to be skipped
        "OldTextFormattingRules",
        "PgsrcTranslation",
        "HowToUseWiki",
        "MoreAboutMechanics",
        "AddingPages",
        "WikiWikiWeb",
        "UserPreferences",
        "PhpWiki",
        "WabiSabi",
        "EditText",
        "FindPage",
        "RecentChanges",
        "RecentEdits",
        "RecentVisitors",
        "SteveWainstead"
    };

    /**
     * Create a wikicapturer.
     */
    public WikiCapturer ()
    {
    }

    /**
     * Returns true if the link is one we are interested in.
     * The decision is first delegated to the superclass; a link the
     * superclass accepts is then rejected if it ends with the name of one
     * of the PhpWiki specific pages. The suffix comparison is case sensitive.
     * <p>NOTE(review): according to the documentation this method previously
     * carried, the superclass accepts links that have the source URL as a
     * prefix and contain neither '?' nor '#' - confirm against SiteCapturer.
     * @param link The link to be checked.
     * @return <code>true</code> if the superclass accepts the link and it
     * does not end with one of the PhpWiki page names,
     * <code>false</code> otherwise.
     */
    protected boolean isToBeCaptured (String link)
    {
        boolean ret;

        ret = super.isToBeCaptured (link);

        // eliminate PhpWiki specific pages; the loop stops at the first match
        for (int i = 0; ret && (i < PHPWIKI_PAGES.length); i++)
        {
            if (link.endsWith (PHPWIKI_PAGES[i]))
            {
                ret = false;
            }
        }

        return (ret);
    }

    /**
     * Ask the user for the URL to capture.
     * Terminates the JVM with exit status 1 if the dialog is cancelled.
     * @return The text entered by the user.
     */
    private static String askForSource ()
    {
        String url;

        url = (String)JOptionPane.showInputDialog (
            null,
            "Enter the URL to capture:",
            "Web Site",
            JOptionPane.PLAIN_MESSAGE,
            null,
            null,
            "http://htmlparser.sourceforge.net");
        if (null == url)
            System.exit (1);

        return (url);
    }

    /**
     * Ask the user for the directory to save the site to.
     * The chooser starts at a directory named after the host of the source
     * URL, located under the current working directory.
     * Terminates the JVM with exit status 1 if the dialog is not approved.
     * @param url The URL being captured.
     * @return The absolute path of the directory chosen by the user.
     * @exception MalformedURLException If the supplied URL is invalid.
     * @exception IOException If the canonical path of the suggested
     * directory cannot be determined.
     */
    private static String askForTarget (String url)
        throws
            MalformedURLException,
            IOException
    {
        URL source;
        String path;
        File target;
        JFileChooser chooser;
        int ret;

        source = new URL (url);
        path = new File (new File ("." + File.separator), source.getHost () + File.separator).getCanonicalPath ();
        target = new File (path);
        chooser = new JFileChooser (target);
        chooser.setDialogType (JFileChooser.SAVE_DIALOG);
        chooser.setFileSelectionMode (JFileChooser.DIRECTORIES_ONLY);
        // the original author notes that this preselection does not take effect
        chooser.setSelectedFile (target);
        chooser.setMultiSelectionEnabled (false);
        chooser.setDialogTitle ("Target Directory");
        ret = chooser.showSaveDialog (null);
        if (ret != JFileChooser.APPROVE_OPTION)
            System.exit (1);

        return (chooser.getSelectedFile ().getAbsolutePath ());
    }

    /**
     * Ask the user whether resources should be captured too.
     * Terminates the JVM with exit status 1 if the dialog is cancelled.
     * @return The choice made by the user.
     */
    private static boolean askForCaptureResources ()
    {
        Boolean capture;

        capture = (Boolean)JOptionPane.showInputDialog (
            null,
            "Should resources be captured:",
            "Capture Resources",
            JOptionPane.PLAIN_MESSAGE,
            null,
            new Object[] { Boolean.TRUE, Boolean.FALSE},
            Boolean.TRUE);
        if (null == capture)
            System.exit (1);

        return (capture.booleanValue ());
    }

    /**
     * Build the node filter handed to the capturer.
     * @return The negation of "a <code>DIV</code> tag whose <code>id</code>
     * attribute is <code>navbar</code>, <code>actionbar</code> or
     * <code>xhtml-validator</code>".
     */
    private static NotFilter createPageFilter ()
    {
        return (
            new NotFilter (
                new OrFilter (
                    new AndFilter (
                        new TagNameFilter ("DIV"),
                        new HasAttributeFilter ("id", "navbar")),
                    new OrFilter (
                        new AndFilter (
                            new TagNameFilter ("DIV"),
                            new HasAttributeFilter ("id", "actionbar")),
                        new AndFilter (
                            new TagNameFilter ("DIV"),
                            new HasAttributeFilter ("id", "xhtml-validator"))))));
    }

    /**
     * Mainline to capture a web site locally.
     * @param args The command line arguments.
     * There are three arguments: the web site to capture, the local directory
     * to save it to, and a flag (true or false) to indicate whether resources
     * such as images and video are to be captured as well.
     * Each argument that is not supplied is requested via a dialog box;
     * cancelling any dialog exits with status 1. After the capture completes
     * the JVM is exited with status 0.
     * @exception MalformedURLException If the supplied URL is invalid.
     * @exception IOException If an error occurs reading the pages or resources.
     */
    public static void main (String[] args)
        throws
            MalformedURLException,
            IOException
    {
        WikiCapturer worker;

        worker = new WikiCapturer ();

        // the web site to capture
        if (0 >= args.length)
            worker.setSource (askForSource ());
        else
            worker.setSource (args[0]);

        // the local directory to save it to; the suggestion offered to the
        // user is derived from the source as stored by the worker
        if (1 >= args.length)
            worker.setTarget (askForTarget (worker.getSource ()));
        else
            worker.setTarget (args[1]);

        // whether resources are to be captured as well
        if (2 >= args.length)
            worker.setCaptureResources (askForCaptureResources ());
        else
            worker.setCaptureResources ((Boolean.valueOf (args[2]).booleanValue ()));

        worker.setFilter (createPageFilter ());
        worker.capture ();

        // explicit exit, as in the original mainline
        System.exit (0);
    }
}