org.htmlparser.util.ParserUtils Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of bboss-htmlparser Show documentation
Show all versions of bboss-htmlparser Show documentation
bboss is a J2EE framework that includes AOP/IoC, MVC, persistence, taglib, RPC, events, bean-XML serialization and so on. http://www.bbossgroups.com
The newest version!
// HTMLParser Library $Name: v1_5 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Somik Raha
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/ParserUtils.java,v $
// $Author: derrickoswald $
// $Date: 2005/05/15 11:49:05 $
// $Revision: 1.47 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
package org.htmlparser.util;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.tags.CompositeTag;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
public class ParserUtils
{
/**
 * Returns a copy of the given string with every occurrence of one character deleted.
 * @param s The string to filter.
 * @param occur The character to drop.
 * @return The content of <code>s</code> without any <code>occur</code> characters.
 */
public static String removeChars(String s, char occur) {
    final StringBuilder kept = new StringBuilder();
    for (char current : s.toCharArray()) {
        if (current == occur) {
            continue;
        }
        kept.append(current);
    }
    return kept.toString();
}
/**
 * Returns a copy of the given string with all carriage returns, line feeds
 * and tabs removed.
 * The string is filtered in a single pass instead of once per character,
 * which avoids building two throw-away intermediate strings.
 * @param inputString The string to clean.
 * @return The input without any '\r', '\n' or '\t' characters.
 */
public static String removeEscapeCharacters(String inputString) {
    final int length = inputString.length();
    final StringBuilder cleaned = new StringBuilder(length);
    for (int i = 0; i < length; i++) {
        final char ch = inputString.charAt(i);
        if (ch != '\r' && ch != '\n' && ch != '\t') {
            cleaned.append(ch);
        }
    }
    return cleaned.toString();
}
/**
 * Returns the given string without its trailing space characters.
 * Only the ' ' character is stripped; tabs and line breaks are kept.
 * An empty or all-blank input yields the empty string (the previous
 * implementation threw StringIndexOutOfBoundsException in those cases).
 * @param text The string to strip.
 * @return The text up to and including its last non-space character.
 */
public static String removeTrailingBlanks(String text) {
    // Walk back from the end; the "end > 0" guard is what keeps empty and
    // all-blank strings from indexing position -1.
    int end = text.length();
    while (end > 0 && text.charAt(end - 1) == ' ') {
        end--;
    }
    // One substring at the end instead of one per removed blank.
    return text.substring(0, end);
}
/**
 * Gather, starting from the given node, every node accepted by a
 * NodeClassFilter built for the requested class.
 * @param node The node whose collectInto method performs the search.
 * @param type The class handed to the NodeClassFilter.
 * @return The collected nodes as an array.
 */
public static Node[] findTypeInNode(Node node, Class type)
{
    final NodeList matches = new NodeList ();
    node.collectInto (matches, new NodeClassFilter (type));
    return matches.toNodeArray ();
}
/**
* Split the input string, treating every non-numeric character as a separator,
* with the sole exception of the characters specified in the charsDoNotBeRemoved param.
*
* For example, if you call splitButDigits("<DIV> +12.5, +3.4 </DIV>", "+."),
* you obtain the array of strings {"+12.5", "+3.4"} as output
* (1, 2, 3, 4 and 5 are digits, and + and . are chars that are not removed).
* @param input The input string.
* @param charsDoNotBeRemoved The chars that must not be removed.
* @return The array of strings as output.
*/
public static String[] splitButDigits (String input, String charsDoNotBeRemoved)
{
ArrayList output = new ArrayList();
int minCapacity = 0;
StringBuilder str = new StringBuilder();
boolean charFound = false;
boolean toBeAdd = false;
for (int index=0; indexFor example if you call trimButDigits("<DIV> +12.5 </DIV>", "+."),
*
you obtain a string "+12.5" as output (1,2 and 5 are digits and +,. are chars that do not be removed).
*
For example if you call trimButDigits("<DIV> +1 2 . 5 </DIV>", "+."),
*
you obtain a string "+12.5" as output (the spaces between 1 and 2, 2 and ., . and 5 are removed).
* @param input The string in input.
* @param charsDoNotBeRemoved The chars that do not be removed.
* @return The string as output.
*/
public static String trimButDigits (String input, String charsDoNotBeRemoved)
{
StringBuilder output = new StringBuilder();
boolean charFound=false;
for (int index=0; indexThe removal process removes only chars at the beginning and at the end of the string.
*
For example if you call trimButDigitsBeginEnd("<DIV> +12.5 </DIV>", "+."),
*
you obtain a string "+12.5" as output (1,2 and 5 are digits and +,. are chars that do not be removed).
*
For example if you call trimButDigitsBeginEnd("<DIV> +1 2 . 5 </DIV>", "+."),
*
you obtain a string "+1 2 . 5" as output (the spaces inside the string are not removed).
* @param input - The string in input.
* @param charsDoNotBeRemoved - The chars that do not be removed.
* @return The string as output.
*/
public static String trimButDigitsBeginEnd (String input, String charsDoNotBeRemoved)
{
String output = new String();
int begin=0;
int end=input.length()-1;
boolean charFound=false;
boolean ok=true;
for (int index=begin; (index=0) && ok; index--)
{
charFound=false;
for (int charsCount=0; charsCountFor example if you call splitSpaces("<DIV> +12.5, +3.4 </DIV>", "<>DIV/,"),
* <BR>you obtain an array of strings {"+12.5", "+3.4"} as output (space chars and <,>,D,I,V,/ and the comma are chars that must be removed).
* @param input The string in input.
* @param charsToBeRemoved The chars to be removed.
* @return The array of strings as output.
*/
public static String[] splitSpaces (String input, String charsToBeRemoved)
{
ArrayList output = new ArrayList();
int minCapacity = 0;
StringBuilder str = new StringBuilder();
boolean charFound = false;
boolean toBeAdd = false;
for (int index=0; indexFor example if you call trimSpaces("<DIV> +12.5 </DIV>", "<>DIV/"),
*
you obtain a string "+12.5" as output (space chars and <,>,D,I,V,/ are chars that must be removed).
*
For example if you call trimSpaces("<DIV> Trim All Spaces Also The Ones Inside The String </DIV>", "<>DIV/"),
*
you obtain a string "TrimAllSpacesAlsoTheOnesInsideTheString" as output (all the spaces inside the string are removed).
* @param input The string in input.
* @param charsToBeRemoved The chars to be removed.
* @return The string as output.
*/
public static String trimSpaces (String input, String charsToBeRemoved)
{
StringBuilder output = new StringBuilder();
boolean charFound=false;
for (int index=0; indexThe removal process removes only chars at the beginning and at the end of the string.
*
For example if you call trimSpacesBeginEnd("<DIV> +12.5 </DIV>", "<>DIV/"),
*
you obtain a string "+12.5" as output (space chars and <,>,D,I,V,/ are chars that must be removed).
*
For example if you call trimSpacesBeginEnd("<DIV> Trim all spaces but not the ones inside the string </DIV>", "<>DIV/"),
*
you obtain a string "Trim all spaces but not the ones inside the string" as output (all the spaces inside the string are preserved).
* @param input The string in input.
* @param charsToBeRemoved The chars to be removed.
* @return The string as output.
*/
public static String trimSpacesBeginEnd (String input, String charsToBeRemoved)
{
String output = new String();
int begin=0;
int end=input.length()-1;
boolean charFound=false;
boolean ok=true;
for (int index=begin; (index=0) && ok; index--)
{
charFound=false;
for (int charsCount=0; charsCountFor example if you call splitButChars("<DIV> +12.5, +3.4 </DIV>", "+.1234567890"),
*
you obtain an array of strings {"+12.5", "+3.4"} as output (+,.,1,2,3,4,5,6,7,8,9,0 are chars that do not be removed).
* @param input The string in input.
* @param charsDoNotBeRemoved The chars that do not be removed.
* @return The array of strings as output.
*/
public static String[] splitButChars (String input, String charsDoNotBeRemoved)
{
ArrayList output = new ArrayList();
int minCapacity = 0;
StringBuilder str = new StringBuilder();
boolean charFound = false;
boolean toBeAdd = false;
for (int index=0; indexFor example if you call trimButChars("<DIV> +12.5 </DIV>", "+.1234567890"),
*
you obtain a string "+12.5" as output (+,.,1,2,3,4,5,6,7,8,9,0 are chars that do not be removed).
*
For example if you call trimButChars("<DIV> +1 2 . 5 </DIV>", "+.1234567890"),
*
you obtain a string "+12.5" as output (the spaces between 1 and 2, 2 and ., . and 5 are removed).
* @param input The string in input.
* @param charsDoNotBeRemoved The chars that do not be removed.
* @return The string as output.
*/
public static String trimButChars (String input, String charsDoNotBeRemoved)
{
StringBuilder output = new StringBuilder();
boolean charFound=false;
for (int index=0; indexThe removal process removes only chars at the beginning and at the end of the string.
*
For example if you call trimButCharsBeginEnd("<DIV> +12.5 </DIV>", "+.1234567890"),
*
you obtain a string "+12.5" as output (+,.,1,2,3,4,5,6,7,8,9,0 are chars that do not be removed).
*
For example if you call trimButCharsBeginEnd("<DIV> +1 2 . 5 </DIV>", "+.1234567890"),
*
you obtain a string "+1 2 . 5" as output (the spaces inside the string are not removed).
* @param input The string in input.
* @param charsDoNotBeRemoved The chars that do not be removed.
* @return The string as output.
*/
public static String trimButCharsBeginEnd (String input, String charsDoNotBeRemoved)
{
String output = new String();
int begin=0;
int end=input.length()-1;
boolean charFound=false;
boolean ok=true;
for (int index=begin; (index=0) && ok; index--)
{
charFound=false;
for (int charsCount=0; charsCountFor example if you call splitChars("<DIV> +12.5, +3.4 </DIV>", " <>DIV/,"),
*
you obtain an array of strings {"+12.5", "+3.4"} as output (space chars and <,>,D,I,V,/ and the comma are chars that must be removed).
* @param input The string in input.
* @param charsToBeRemoved The chars to be removed.
* @return The array of strings as output.
*/
public static String[] splitChars (String input, String charsToBeRemoved)
{
ArrayList output = new ArrayList();
int minCapacity = 0;
StringBuilder str = new StringBuilder();
boolean charFound = false;
boolean toBeAdd = false;
for (int index=0; indexFor example if you call trimChars("<DIV> +12.5 </DIV>", "<>DIV/ "),
*
you obtain a string "+12.5" as output (<,>,D,I,V,/ and space char are chars that must be removed).
*
For example if you call trimChars("<DIV> Trim All Chars Also The Ones Inside The String </DIV>", "<>DIV/ "),
*
you obtain a string "TrimAllCharsAlsoTheOnesInsideTheString" as output (all the spaces inside the string are removed).
* @param input The string in input.
* @param charsToBeRemoved The chars to be removed.
* @return The string as output.
*/
public static String trimChars (String input, String charsToBeRemoved)
{
StringBuilder output = new StringBuilder();
boolean charFound=false;
for (int index=0; indexThe removal process removes only chars at the beginning and at the end of the string.
*
For example if you call trimCharsBeginEnd("<DIV> +12.5 </DIV>", "<>DIV/ "),
*
you obtain a string "+12.5" as output (' ' is a space char and <,>,D,I,V,/ are chars that must be removed).
*
For example if you call trimCharsBeginEnd("<DIV> Trim all spaces but not the ones inside the string </DIV>", "<>DIV/ "),
*
you obtain a string "Trim all spaces but not the ones inside the string" as output (all the spaces inside the string are preserved).
* @param input The string in input.
* @param charsToBeRemoved The chars to be removed.
* @return The string as output.
*/
public static String trimCharsBeginEnd (String input, String charsToBeRemoved)
{
String output = new String();
int begin=0;
int end=input.length()-1;
boolean charFound=false;
boolean ok=true;
for (int index=begin; (index=0) && ok; index--)
{
charFound=false;
for (int charsCount=0; charsCountFor example if you call splitTags("Begin <DIV><DIV> +12.5 </DIV></DIV> ALL OK", new String[] {"DIV"}),
*
you obtain a string array {"Begin ", " ALL OK"} as output (splitted <DIV> tags and their content recursively).
*
For example if you call splitTags("Begin <DIV><DIV> +12.5 </DIV></DIV> ALL OK", new String[] {"DIV"}, false, false),
*
you obtain a string array {"Begin ", "<DIV> +12.5 </DIV>", " ALL OK"} as output (splitted <DIV> tags and not their content and no recursively).
*
For example if you call splitTags("Begin <DIV><DIV> +12.5 </DIV></DIV> ALL OK", new String[] {"DIV"}, true, false),
*
you obtain a string array {"Begin ", " +12.5 ", " ALL OK"} as output (splitted <DIV> tags and not their content recursively).
*
For example if you call splitTags("Begin <DIV><DIV> +12.5 </DIV></DIV> ALL OK", new String[] {"DIV"}, false, true),
*
you obtain a string array {"Begin ", " ALL OK"} as output (splitted <DIV> tags and their content).
* @param input The string in input.
* @param tags The tags to be used as splitting delimiter.
* @param recursive Optional parameter (true if not present), if true delete all the tags recursively.
* @param insideTag Optional parameter (true if not present), if true delete also the content of the tags.
* @return The string array containing the strings delimited by tags.
*/
public static String[] splitTags (String input, String[] tags, boolean recursive, boolean insideTag)
throws ParserException, UnsupportedEncodingException
{
ArrayList outputArrayList = new ArrayList();
int minCapacity = 0;
String output = new String();
String inputModified = new String(input);
String[] outputStr = new String[] {};
String dummyString = createDummyString (' ', input.length());
// loop inside the different tags to be trimmed
for (int i=0; iUse Class class as input parameter
* instead of tags[] string array.
* @see ParserUtils#splitTags (String input, String[] tags, boolean recursive, boolean insideTag).
*/
/**
 * Split the input string in a string array, taking the node class
 * as the description of the delimiting tags.
 * Wraps the class in a NodeClassFilter and delegates to the four-argument
 * filter overload with both the recursive and insideTag flags set to true.
 */
public static String[] splitTags (String input, Class nodeType)
throws ParserException, UnsupportedEncodingException
{
    final NodeClassFilter classFilter = new NodeClassFilter (nodeType);
    return splitTags (input, classFilter, true, true);
}
/**
 * Split the input string in a string array, using the tags as
 * splitting delimiters.
 * This variant takes a Class instead of a tags[] string array: the class
 * is wrapped in a NodeClassFilter and the call is forwarded, together with
 * the two flags, to the four-argument filter overload.
 * @param input The string in input.
 * @param nodeType The class handed to the NodeClassFilter.
 * @param recursive Forwarded unchanged to the filter overload.
 * @param insideTag Forwarded unchanged to the filter overload.
 * @return The string array produced by the filter overload.
 */
public static String[] splitTags (String input, Class nodeType, boolean recursive, boolean insideTag)
throws ParserException, UnsupportedEncodingException
{
    final NodeClassFilter classFilter = new NodeClassFilter (nodeType);
    return splitTags (input, classFilter, recursive, insideTag);
}
/**
 * Split the input string in a string array, using the tags as
 * splitting delimiters.
 * This variant takes a NodeFilter instead of a tags[] string array and
 * forwards to the four-argument filter overload with both flags set to true.
 * @param input The string in input.
 * @param filter The filter forwarded to the four-argument overload.
 * @return The string array produced by the four-argument overload.
 */
public static String[] splitTags (String input, NodeFilter filter)
throws ParserException, UnsupportedEncodingException
{
    // Defaults used when the caller does not specify the optional flags.
    final boolean recursive = true;
    final boolean insideTag = true;
    return splitTags (input, filter, recursive, insideTag);
}
/**
* Split the input string in a string array,
* considering the tags as delimiter for splitting.
*
Use NodeFilter class as input parameter
* instead of tags[] string array.
* @see ParserUtils#splitTags (String input, String[] tags, boolean recursive, boolean insideTag).
*/
public static String[] splitTags (String input, NodeFilter filter, boolean recursive, boolean insideTag)
throws ParserException, UnsupportedEncodingException
{
ArrayList outputArrayList = new ArrayList();
int minCapacity = 0;
String output = new String();
String dummyString = createDummyString (' ', input.length());
// loop inside the tags of the same type
NodeList links = getLinks (input, filter, recursive);
for (int j=0; jThe method trims all the substrings included in the input string of the following type:
* "<XXX>", where XXX could be a string of any type.
*
If you set to true the inside parameter, the method deletes also the YYY string in the following input string:
* "<XXX>YYY<ZZZ>", note that ZZZ is not necessary the closing tag of XXX.
* @param input The string in input.
* @param inside If true, it forces the method to delete also what is inside the tags.
* @return The string without tags.
*/
public static String trimAllTags (String input, boolean inside)
{
StringBuilder output = new StringBuilder();
if (inside) {
if ((input.indexOf('<')==-1) || (input.lastIndexOf('>')==-1) || (input.lastIndexOf('>')')+1, input.length()));
}
} else {
boolean write = true;
for (int index=0; index' && (!write))
write = true;
}
}
return output.toString();
}
/**
 * Trim the given tags from the input string and return the remaining text.
 * Forwards to the four-argument tags[] overload with both the recursive
 * and insideTag flags set to true.
 * @param input The string in input.
 * @param tags The tags forwarded to the four-argument overload.
 * @return The string produced by the four-argument overload.
 */
public static String trimTags (String input, String[] tags)
throws ParserException, UnsupportedEncodingException
{
    // Defaults used when the caller does not specify the optional flags.
    final boolean recursive = true;
    final boolean insideTag = true;
    return trimTags (input, tags, recursive, insideTag);
}
/**
* Trim all tags in the input string and
* return a string like the input one
* without the tags and their content (optional).
*
For example if you call trimTags("<DIV><DIV> +12.5 </DIV></DIV> ALL OK", new String[] {"DIV"}),
*
you obtain a string " ALL OK" as output (trimmed <DIV> tags and their content recursively).
*
For example if you call trimTags("<DIV><DIV> +12.5 </DIV></DIV> ALL OK", new String[] {"DIV"}, false, false),
*
you obtain a string "<DIV> +12.5 </DIV> ALL OK" as output (trimmed <DIV> tags and not their content and no recursively).
*
For example if you call trimTags("<DIV><DIV> +12.5 </DIV></DIV> ALL OK", new String[] {"DIV"}, true, false),
*
you obtain a string " +12.5 ALL OK" as output (trimmed <DIV> tags and not their content recursively).
*
For example if you call trimTags("<DIV><DIV> +12.5 </DIV></DIV> ALL OK", new String[] {"DIV"}, false, true),
*
you obtain a string " ALL OK" as output (trimmed <DIV> tags and their content).
* @param input The string in input.
* @param tags The tags to be removed.
* @param recursive Optional parameter (true if not present), if true delete all the tags recursively.
* @param insideTag Optional parameter (true if not present), if true delete also the content of the tags.
* @return The string without tags.
*/
public static String trimTags (String input, String[] tags, boolean recursive, boolean insideTag)
throws ParserException, UnsupportedEncodingException
{
StringBuilder output = new StringBuilder();
String inputModified = new String(input);
String dummyString = createDummyString (' ', input.length());
// loop inside the different tags to be trimmed
for (int i=0; iUse Class class as input parameter
* instead of tags[] string array.
* @see ParserUtils#trimTags (String input, String[] tags, boolean recursive, boolean insideTag).
*/
/**
 * Trim tags from the input string, taking the node class as the
 * description of the tags to remove.
 * Wraps the class in a NodeClassFilter and delegates to the four-argument
 * filter overload with both the recursive and insideTag flags set to true.
 */
public static String trimTags (String input, Class nodeType)
throws ParserException, UnsupportedEncodingException
{
    final NodeClassFilter classFilter = new NodeClassFilter (nodeType);
    return trimTags (input, classFilter, true, true);
}
/**
 * Trim tags from the input string and return a string like the input one
 * without the tags and, optionally, their content.
 * This variant takes a Class instead of a tags[] string array: the class
 * is wrapped in a NodeClassFilter and the call is forwarded, together with
 * the two flags, to the four-argument filter overload.
 * @param input The string in input.
 * @param nodeType The class handed to the NodeClassFilter.
 * @param recursive Forwarded unchanged to the filter overload.
 * @param insideTag Forwarded unchanged to the filter overload.
 * @return The string produced by the filter overload.
 */
public static String trimTags (String input, Class nodeType, boolean recursive, boolean insideTag)
throws ParserException, UnsupportedEncodingException
{
    final NodeClassFilter classFilter = new NodeClassFilter (nodeType);
    return trimTags (input, classFilter, recursive, insideTag);
}
/**
 * Trim tags from the input string and return a string like the input one
 * without the tags and their content.
 * This variant takes a NodeFilter instead of a tags[] string array and
 * forwards to the four-argument filter overload with both flags set to true.
 * @param input The string in input.
 * @param filter The filter forwarded to the four-argument overload.
 * @return The string produced by the four-argument overload.
 */
public static String trimTags (String input, NodeFilter filter)
throws ParserException, UnsupportedEncodingException
{
    // Defaults used when the caller does not specify the optional flags.
    final boolean recursive = true;
    final boolean insideTag = true;
    return trimTags (input, filter, recursive, insideTag);
}
/**
* Trim all tags in the input string and
* return a string like the input one
* without the tags and their content (optional).
*
Use NodeFilter class as input parameter
* instead of tags[] string array.
* @see ParserUtils#trimTags (String input, String[] tags, boolean recursive, boolean insideTag).
*/
public static String trimTags (String input, NodeFilter filter, boolean recursive, boolean insideTag)
throws ParserException, UnsupportedEncodingException
{
StringBuilder output = new StringBuilder();
String dummyString = createDummyString (' ', input.length());
// loop inside the tags of the same type
NodeList links = getLinks (input, filter, recursive);
for (int j=0; jThe string will be parsed as it would be a file.
* @param input The string in input.
* @return The Parser Object with the string as input stream.
*/
/**
 * Create a Parser whose Lexer reads from a Page built on the given string.
 * @param input The string in input.
 * @return A new Parser whose lexer has been set to one reading that page.
 */
public static Parser createParserParsingAnInputString (String input)
throws ParserException, UnsupportedEncodingException
{
    // Objects are created in the same order as before: parser, lexer, page.
    final Parser result = new Parser();
    final Lexer source = new Lexer();
    source.setPage(new Page(input));
    result.setLexer(source);
    return result;
}
private static NodeList getLinks (String output, String tag, boolean recursive)
throws ParserException, UnsupportedEncodingException
{
Parser parser = new Parser();
NodeFilter filterLink = new TagNameFilter (tag);
NodeList links = new NodeList ();
parser = createParserParsingAnInputString(output);
links = parser.extractAllNodesThatMatch(filterLink);
// loop to remove tags added recursively
// so if you have selected 'not recursive option'
// you have only the tag container and not the contained tags.
if (!recursive)
{
for (int j=0; jjStartTagBegin) && (kEndTagEndjStartTagBegin) && (kEndTagEnd