org.htmlparser.parserapplications.LinkExtractor Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of bboss-htmlparser Show documentation
bboss is a j2ee framework include aop/ioc,mvc,persistent,taglib,rpc,event ,bean-xml serializable and so on.http://www.bbossgroups.com
The newest version!
// HTMLParser Library $Name: v1_5 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2003 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications/LinkExtractor.java,v $
// $Author: derrickoswald $
// $Date: 2005/04/12 11:27:41 $
// $Revision: 1.52 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

package org.htmlparser.parserapplications;

import javax.swing.JOptionPane;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * LinkExtractor extracts all the links from the given webpage
 * and prints them on standard output.
 */
public class LinkExtractor
{
    /**
     * Run the link extractor.
     * @param args [0] Optional url to extract links from.
     * An input dialog is displayed if it is not supplied.
     */
    public static void main (String[] args)
    {
        String url;
        Parser parser;
        NodeFilter filter;
        NodeList list;

        if (0 >= args.length)
        {
            url = (String)JOptionPane.showInputDialog (
                null,
                "Enter the URL to extract links from:",
                "Web Site",
                JOptionPane.PLAIN_MESSAGE,
                null,
                null,
                "http://htmlparser.sourceforge.net/wiki/");
            if (null == url)
                System.exit (1);
        }
        else
            url = args[0];
        filter = new NodeClassFilter (LinkTag.class);
        if ((1 < args.length) && args[1].equalsIgnoreCase ("-maillinks"))
            filter = new AndFilter (
                filter,
                new NodeFilter ()
                {
                    public boolean accept (Node node)
                    {
                        return (((LinkTag)node).isMailLink ());
                    }
                }
            );
        try
        {
            parser = new Parser (url);
            list = parser.extractAllNodesThatMatch (filter);
            for (int i = 0; i < list.size (); i++)
                System.out.println (list.elementAt (i).toHtml ());
        }
        catch (ParserException e)
        {
            e.printStackTrace ();
        }
        System.exit (0);
    }
}