All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.htmlparser.beans.StringBean Maven / Gradle / Ivy

// HTMLParser Library $Name: v1_5 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/beans/StringBean.java,v $
// $Author: derrickoswald $
// $Date: 2005/05/15 11:49:03 $
// $Revision: 1.44 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

package org.htmlparser.beans;

import java.beans.PropertyChangeListener;
import java.beans.PropertyChangeSupport;
import java.io.Serializable;
import java.net.URLConnection;

import org.htmlparser.Parser;
import org.htmlparser.Text;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.Tag;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.EncodingChangeException;
import org.htmlparser.util.Translate;
import org.htmlparser.visitors.NodeVisitor;

/**
 * Extract strings from a URL.
 * 

Text within <SCRIPT></SCRIPT> tags is removed.

*

The text within <PRE></PRE> tags is not altered.

*

The property Strings, which is the output property is null * until a URL is set. So a typical usage is:

*
 *     StringBean sb = new StringBean ();
 *     sb.setLinks (false);
 *     sb.setReplaceNonBreakingSpaces (true);
 *     sb.setCollapse (true);
 *     sb.setURL ("http://www.netbeans.org"); // the HTTP is performed here
 *     String s = sb.getStrings ();
 * 
* You can also use the StringBean as a NodeVisitor on your own parser, * in which case you have to refetch your page if you change one of the * properties because it resets the Strings property:

*
 *     StringBean sb = new StringBean ();
 *     Parser parser = new Parser ("http://cbc.ca");
 *     parser.visitAllNodesWith (sb);
 *     String s = sb.getStrings ();
 *     sb.setLinks (true);
 *     parser.reset ();
 *     parser.visitAllNodesWith (sb);
 *     String sl = sb.getStrings ();
 * 
* According to Nick Burch, who contributed the patch, this is handy if you * don't want StringBean to wander off and get the content itself, either * because you already have it, it's not on a website etc. */ public class StringBean extends NodeVisitor implements Serializable { /** * Property name in event where the URL contents changes. */ public static final String PROP_STRINGS_PROPERTY = "strings"; /** * Property name in event where the 'embed links' state changes. */ public static final String PROP_LINKS_PROPERTY = "links"; /** * Property name in event where the URL changes. */ public static final String PROP_URL_PROPERTY = "URL"; /** * Property name in event where the 'replace non-breaking spaces' * state changes. */ public static final String PROP_REPLACE_SPACE_PROPERTY = "replaceNonBreakingSpaces"; /** * Property name in event where the 'collapse whitespace' state changes. */ public static final String PROP_COLLAPSE_PROPERTY = "collapse"; /** * Property name in event where the connection changes. */ public static final String PROP_CONNECTION_PROPERTY = "connection"; /** * A newline. */ private static final String NEWLINE = System.getProperty ("line.separator"); /** * The length of the NEWLINE. */ private static final int NEWLINE_SIZE = NEWLINE.length (); /** * Bound property support. */ protected PropertyChangeSupport mPropertySupport; /** * The parser used to extract strings. */ protected Parser mParser; /** * The strings extracted from the URL. */ protected String mStrings; /** * If true the link URLs are embedded in the text output. */ protected boolean mLinks; /** * If true regular space characters are substituted for * non-breaking spaces in the text output. */ protected boolean mReplaceSpace; /** * If true sequences of whitespace characters are replaced * with a single space character. */ protected boolean mCollapse; /** * The buffer text is stored in while traversing the HTML. */ protected StringBuilder mBuffer; /** * Set true when traversing a SCRIPT tag. */ protected boolean mIsScript; /** * Set true when traversing a PRE tag. */ protected boolean mIsPre; /** * Set true when traversing a STYLE tag. */ protected boolean mIsStyle; /** * Create a StringBean object. * Default property values are set to 'do the right thing': *

Links is set false so text appears like a * browser would display it, albeit without the colour or underline clues * normally associated with a link.

*

ReplaceNonBreakingSpaces is set true, so * that printing the text works, but the extra information regarding these * formatting marks is available if you set it false.

*

Collapse is set true, so text appears * compact like a browser would display it.

*/ public StringBean () { super (true, true); mPropertySupport = new PropertyChangeSupport (this); mParser = new Parser (); mStrings = null; mLinks = false; mReplaceSpace = true; mCollapse = true; mBuffer = new StringBuilder (4096); mIsScript = false; mIsPre = false; mIsStyle = false; } // // internals // /** * Appends a newline to the buffer if there isn't one there already. * Except if the buffer is empty. */ protected void carriageReturn () { int length; length = mBuffer.length (); if ((0 != length) // don't append newlines to the beginning of a buffer && ((NEWLINE_SIZE <= length) // not enough chars to hold a NEWLINE && (!mBuffer.substring ( length - NEWLINE_SIZE, length).equals (NEWLINE)))) mBuffer.append (NEWLINE); } /** * Add the given text collapsing whitespace. * Use a little finite state machine: *
     * state 0: whitepace was last emitted character
     * state 1: in whitespace
     * state 2: in word
     * A whitespace character moves us to state 1 and any other character
     * moves us to state 2, except that state 0 stays in state 0 until
     * a non-whitespace and going from whitespace to word we emit a space
     * before the character:
     *    input:     whitespace   other-character
     * state\next
     *    0               0             2
     *    1               1        space then 2
     *    2               1             2
     * 
* @param buffer The buffer to append to. * @param string The string to append. */ protected void collapse (StringBuilder buffer, String string) { int chars; int length; int state; char character; chars = string.length (); if (0 != chars) { length = buffer.length (); state = ((0 == length) || (buffer.charAt (length - 1) == ' ') || ((NEWLINE_SIZE <= length) && buffer.substring ( length - NEWLINE_SIZE, length).equals (NEWLINE))) ? 0 : 1; for (int i = 0; i < chars; i++) { character = string.charAt (i); switch (character) { // see HTML specification section 9.1 White space // http://www.w3.org/TR/html4/struct/text.html#h-9.1 case '\u0020': case '\u0009': case '\u000C': case '\u200B': case '\r': case '\n': if (0 != state) state = 1; break; default: if (1 == state) buffer.append (' '); state = 2; buffer.append (character); } } } } /** * Extract the text from a page. * @return The textual contents of the page. * @exception ParserException If a parse error occurs. */ protected String extractStrings () throws ParserException { String ret; mParser.visitAllNodesWith (this); ret = mBuffer.toString (); mBuffer = new StringBuilder(4096); return (ret); } /** * Assign the Strings property, firing the property change. * @param strings The new value of the Strings property. */ protected void updateStrings (String strings) { String oldValue; if ((null == mStrings) || !mStrings.equals (strings)) { oldValue = mStrings; mStrings = strings; mPropertySupport.firePropertyChange ( PROP_STRINGS_PROPERTY, oldValue, strings); } } /** * Fetch the URL contents. * Only do work if there is a valid parser with it's URL set. */ protected void setStrings () { if (null != getURL ()) try { try { mParser.visitAllNodesWith (this); updateStrings (mBuffer.toString ()); } finally { mBuffer = new StringBuilder (4096); } } catch (EncodingChangeException ece) { mIsPre = false; mIsScript = false; mIsStyle = false; try { // try again with the encoding now in force mParser.reset (); mBuffer = new StringBuilder (4096); mParser.visitAllNodesWith (this); updateStrings (mBuffer.toString ()); } catch (ParserException pe) { updateStrings (pe.toString ()); } finally { mBuffer = new StringBuilder (4096); } } catch (ParserException pe) { updateStrings (pe.toString ()); } else { // reset in case this StringBean is used as a visitor // on another parser, not it's own mStrings = null; mBuffer = new StringBuilder (4096); } } /** * Refetch the URL contents. * Only need to worry if there is already a valid parser and it's * been spent fetching the string contents. */ private void resetStrings () { if (null != mStrings) try { mParser.setURL (getURL ()); setStrings (); } catch (ParserException pe) { updateStrings (pe.toString ()); } } // // Property change support. // /** * Add a PropertyChangeListener to the listener list. * The listener is registered for all properties. * @param listener The PropertyChangeListener to be added. */ public void addPropertyChangeListener (PropertyChangeListener listener) { mPropertySupport.addPropertyChangeListener (listener); } /** * Remove a PropertyChangeListener from the listener list. * This removes a registered PropertyChangeListener. * @param listener The PropertyChangeListener to be removed. */ public void removePropertyChangeListener (PropertyChangeListener listener) { mPropertySupport.removePropertyChangeListener (listener); } // // Properties // /** * Return the textual contents of the URL. * This is the primary output of the bean. * @return The user visible (what would be seen in a browser) text. */ public String getStrings () { if (null == mStrings) if (0 == mBuffer.length ()) setStrings (); else updateStrings (mBuffer.toString ()); return (mStrings); } /** * Get the current 'include links' state. * @return true if link text is included in the text extracted * from the URL, false otherwise. */ public boolean getLinks () { return (mLinks); } /** * Set the 'include links' state. * If the setting is changed after the URL has been set, the text from the * URL will be reacquired, which is possibly expensive. * @param links Use true if link text is to be included in the * text extracted from the URL, false otherwise. */ public void setLinks (boolean links) { boolean oldValue = mLinks; if (oldValue != links) { mLinks = links; mPropertySupport.firePropertyChange ( PROP_LINKS_PROPERTY, oldValue, links); resetStrings (); } } /** * Get the current URL. * @return The URL from which text has been extracted, or null * if this property has not been set yet. */ public String getURL () { return ((null != mParser) ? mParser.getURL () : null); } /** * Set the URL to extract strings from. * The text from the URL will be fetched, which may be expensive, so this * property should be set last. * @param url The URL that text should be fetched from. */ public void setURL (String url) { String old; URLConnection conn; old = getURL (); conn = getConnection (); if (((null == old) && (null != url)) || ((null != old) && !old.equals (url))) { try { if (null == mParser) mParser = new Parser (url); else mParser.setURL (url); mPropertySupport.firePropertyChange ( PROP_URL_PROPERTY, old, getURL ()); mPropertySupport.firePropertyChange ( PROP_CONNECTION_PROPERTY, conn, mParser.getConnection ()); setStrings (); } catch (ParserException pe) { updateStrings (pe.toString ()); } } } /** * Get the current 'replace non breaking spaces' state. * @return true if non-breaking spaces (character '\u00a0', * numeric character reference &#160; or character entity * reference &nbsp;) are to be replaced with normal * spaces (character '\u0020'). */ public boolean getReplaceNonBreakingSpaces () { return (mReplaceSpace); } /** * Set the 'replace non breaking spaces' state. * If the setting is changed after the URL has been set, the text from the * URL will be reacquired, which is possibly expensive. * @param replace true if non-breaking spaces * (character '\u00a0', numeric character reference &#160; * or character entity reference &nbsp;) are to be replaced with normal * spaces (character '\u0020'). */ public void setReplaceNonBreakingSpaces (boolean replace) { boolean oldValue = mReplaceSpace; if (oldValue != replace) { mReplaceSpace = replace; mPropertySupport.firePropertyChange (PROP_REPLACE_SPACE_PROPERTY, oldValue, replace); resetStrings (); } } /** * Get the current 'collapse whitespace' state. * If set to true this emulates the operation of browsers * in interpretting text where user agents should collapse input * white space sequences when producing output inter-word space. * See HTML specification section 9.1 White space * * http://www.w3.org/TR/html4/struct/text.html#h-9.1. * @return true if sequences of whitespace (space '\u0020', * tab '\u0009', form feed '\u000C', zero-width space '\u200B', * carriage-return '\r' and NEWLINE '\n') are to be replaced with a single * space. */ public boolean getCollapse () { return (mCollapse); } /** * Set the current 'collapse whitespace' state. * If the setting is changed after the URL has been set, the text from the * URL will be reacquired, which is possibly expensive. * @param collapse If true, sequences of whitespace * will be reduced to a single space. */ public void setCollapse (boolean collapse) { boolean oldValue = mCollapse; if (oldValue != collapse) { mCollapse = collapse; mPropertySupport.firePropertyChange ( PROP_COLLAPSE_PROPERTY, oldValue, collapse); resetStrings (); } } /** * Get the current connection. * @return The connection that the parser has or null if it * hasn't been set or the parser hasn't been constructed yet. */ public URLConnection getConnection () { return ((null != mParser) ? mParser.getConnection () : null); } /** * Set the parser's connection. * The text from the URL will be fetched, which may be expensive, so this * property should be set last. * @param connection New value of property Connection. */ public void setConnection (URLConnection connection) { String url; URLConnection conn; url = getURL (); conn = getConnection (); if (((null == conn) && (null != connection)) || ((null != conn) && !conn.equals (connection))) { try { if (null == mParser) mParser = new Parser (connection); else mParser.setConnection (connection); mPropertySupport.firePropertyChange ( PROP_URL_PROPERTY, url, getURL ()); mPropertySupport.firePropertyChange ( PROP_CONNECTION_PROPERTY, conn, mParser.getConnection ()); setStrings (); } catch (ParserException pe) { updateStrings (pe.toString ()); } } } // // NodeVisitor overrides // /** * Appends the text to the output. * @param string The text node. */ public void visitStringNode (Text string) { if (!mIsScript && !mIsStyle) { String text = string.getText (); if (!mIsPre) { text = Translate.decode (text); if (getReplaceNonBreakingSpaces ()) text = text.replace ('\u00a0', ' '); if (getCollapse ()) collapse (mBuffer, text); else mBuffer.append (text); } else mBuffer.append (text); } } /** * Appends a NEWLINE to the output if the tag breaks flow, and * possibly sets the state of the PRE and SCRIPT flags. * @param tag The tag to examine. */ public void visitTag (Tag tag) { String name; if (tag instanceof LinkTag) if (getLinks ()) { // appends the link as text between angle brackets to the output. mBuffer.append ("<"); mBuffer.append (((LinkTag)tag).getLink ()); mBuffer.append (">"); } name = tag.getTagName (); if (name.equalsIgnoreCase ("PRE")) mIsPre = true; else if (name.equalsIgnoreCase ("SCRIPT")) mIsScript = true; else if (name.equalsIgnoreCase ("STYLE")) mIsStyle = true; if (tag.breaksFlow ()) carriageReturn (); } /** * Resets the state of the PRE and SCRIPT flags. * @param tag The end tag to process. */ public void visitEndTag (Tag tag) { String name; name = tag.getTagName (); if (name.equalsIgnoreCase ("PRE")) mIsPre = false; else if (name.equalsIgnoreCase ("SCRIPT")) mIsScript = false; else if (name.equalsIgnoreCase ("STYLE")) mIsStyle = false; } /** * Unit test. * @param args Pass arg[0] as the URL to process. */ public static void main (String[] args) { if (0 >= args.length) System.out.println ("Usage: java -classpath htmlparser.jar" + " org.htmlparser.beans.StringBean "); else { StringBean sb = new StringBean (); sb.setLinks (false); sb.setReplaceNonBreakingSpaces (true); sb.setCollapse (true); sb.setURL (args[0]); System.out.println (sb.getStrings ()); } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy