org.htmlparser.parserapplications.StringExtractor Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of bboss-htmlparser Show documentation
Show all versions of bboss-htmlparser Show documentation
bboss is a j2ee framework include aop/ioc,mvc,persistent,taglib,rpc,event ,bean-xml serializable and so on.http://www.bbossgroups.com
The newest version!
// HTMLParser Library $Name: v1_5 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Somik Raha
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications/StringExtractor.java,v $
// $Author: derrickoswald $
// $Date: 2005/04/12 11:27:42 $
// $Revision: 1.48 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
package org.htmlparser.parserapplications;
import org.htmlparser.beans.StringBean;
import org.htmlparser.util.ParserException;
/**
* Extract plaintext strings from a web page.
* Illustrative program to gather the textual contents of a web page.
* Uses a {@link org.htmlparser.beans.StringBean StringBean} to accumulate
* the user visible text (what a browser would display) into a single string.
*/
public class StringExtractor
{
private String resource;
/**
* Construct a StringExtractor to read from the given resource.
* @param resource Either a URL or a file name.
*/
public StringExtractor (String resource)
{
this.resource = resource;
}
/**
* Extract the text from a page.
* @return The textual contents of the page.
* @param links if true
include hyperlinks in output.
* @exception ParserException If a parse error occurs.
*/
public String extractStrings (boolean links)
throws
ParserException
{
StringBean sb;
sb = new StringBean ();
sb.setLinks (links);
sb.setURL (resource);
return (sb.getStrings ());
}
/**
* Mainline.
* @param args The command line arguments.
*/
public static void main (String[] args)
{
boolean links;
String url;
StringExtractor se;
links = false;
url = null;
for (int i = 0; i < args.length; i++)
if (args[i].equalsIgnoreCase ("-links"))
links = true;
else
url = args[i];
if (null != url)
{
se = new StringExtractor (url);
try
{
System.out.println (se.extractStrings (links));
}
catch (ParserException e)
{
e.printStackTrace ();
}
}
else
System.out.println ("Usage: java -classpath htmlparser.jar org.htmlparser.parserapplications.StringExtractor [-links] url");
}
}