All downloads are free. The search and download functionality uses the official Maven repository.

org.htmlparser.util.ParserUtils Maven / Gradle / Ivy

Go to download

bboss is a J2EE framework that includes AOP/IoC, MVC, persistence, taglib, RPC, event handling, bean-XML serialization, and more. http://www.bbossgroups.com

The newest version!
// HTMLParser Library $Name: v1_5 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Somik Raha
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/ParserUtils.java,v $
// $Author: derrickoswald $
// $Date: 2005/05/15 11:49:05 $
// $Revision: 1.47 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

package org.htmlparser.util;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.tags.CompositeTag;

import java.io.UnsupportedEncodingException;
import java.util.ArrayList;


public class ParserUtils
{
    /**
     * Returns a copy of the given string with every occurrence of one character dropped.
     *
     * @param s The string to filter.
     * @param occur The character to leave out of the result.
     * @return The characters of <code>s</code>, in their original order,
     * excluding those equal to <code>occur</code>.
     */
    public static String removeChars(String s, char occur) {
        final StringBuilder kept = new StringBuilder();
        for (char current : s.toCharArray()) {
            if (current == occur) {
                continue; // skip the unwanted character
            }
            kept.append(current);
        }
        return kept.toString();
    }

    /**
     * Removes every carriage return, line feed and tab character from the given string.
     * The string is filtered in a single pass, so no intermediate strings are built.
     *
     * @param inputString The string to clean.
     * @return The contents of <code>inputString</code> without any
     * <code>'\r'</code>, <code>'\n'</code> or <code>'\t'</code> characters.
     */
    public static String removeEscapeCharacters(String inputString) {
        StringBuilder cleaned = new StringBuilder(inputString.length());
        for (int i = 0; i < inputString.length(); i++) {
            char ch = inputString.charAt(i);
            if (ch != '\r' && ch != '\n' && ch != '\t') {
                cleaned.append(ch);
            }
        }
        return cleaned.toString();
    }

    /**
     * Removes all space characters (<code>' '</code>) from the end of the given string.
     * Only the space character is stripped; tabs and line breaks are left in place.
     * An empty string, or a string made up entirely of spaces, yields an empty string.
     *
     * @param text The string to trim.
     * @return The contents of <code>text</code> without its trailing spaces.
     */
    public static String removeTrailingBlanks(String text) {
        int end = text.length();
        // Walk back from the end; the bound check keeps empty and all-blank input safe.
        while (end > 0 && text.charAt(end - 1) == ' ') {
            end--;
        }
        return text.substring(0, end);
    }

    /**
     * Gather the objects of a given type found by searching a node.
     * The node is asked to collect into a fresh list using a
     * <code>NodeClassFilter</code> built for the requested class.
     * @param node The node to search.
     * @param type The class to search for.
     * @return A node array with the matching nodes.
     */
    public static Node[] findTypeInNode(Node node, Class type)
    {
        NodeList matches = new NodeList ();
        node.collectInto (matches, new NodeClassFilter (type));
        return matches.toNodeArray ();
    }

    /**
     * Split the input string considering as string separator
     * all the not numerical characters
     * with the only exception of the characters specified in charsDoNotBeRemoved param.
     * 
For example if you call splitButDigits("<DIV> +12.5, +3.4 </DIV>", "+."), *
you obtain an array of strings {"+12.5", "+3.4"} as output (1,2,3,4 and 5 are digits and +,. are chars that do not be removed). * @param input The string in input. * @param charsDoNotBeRemoved The chars that do not be removed. * @return The array of strings as output. */ public static String[] splitButDigits (String input, String charsDoNotBeRemoved) { ArrayList output = new ArrayList(); int minCapacity = 0; StringBuilder str = new StringBuilder(); boolean charFound = false; boolean toBeAdd = false; for (int index=0; indexFor example if you call trimButDigits("<DIV> +12.5 </DIV>", "+."), *
you obtain a string "+12.5" as output (1,2 and 5 are digits and +,. are chars that do not be removed). *
For example if you call trimButDigits("<DIV> +1 2 . 5 </DIV>", "+."), *
you obtain a string "+12.5" as output (the spaces between 1 and 2, 2 and ., . and 5 are removed). * @param input The string in input. * @param charsDoNotBeRemoved The chars that do not be removed. * @return The string as output. */ public static String trimButDigits (String input, String charsDoNotBeRemoved) { StringBuilder output = new StringBuilder(); boolean charFound=false; for (int index=0; indexThe removal process removes only chars at the beginning and at the end of the string. *
For example if you call trimButDigitsBeginEnd("<DIV> +12.5 </DIV>", "+."), *
you obtain a string "+12.5" as output (1,2 and 5 are digits and +,. are chars that do not be removed). *
For example if you call trimButDigitsBeginEnd("<DIV> +1 2 . 5 </DIV>", "+."), *
you obtain a string "+1 2 . 5" as output (the spacess inside the string are not removed). * @param input - The string in input. * @param charsDoNotBeRemoved - The chars that do not be removed. * @return The string as output. */ public static String trimButDigitsBeginEnd (String input, String charsDoNotBeRemoved) { String output = new String(); int begin=0; int end=input.length()-1; boolean charFound=false; boolean ok=true; for (int index=begin; (index=0) && ok; index--) { charFound=false; for (int charsCount=0; charsCountFor example if you call splitSpaces("<DIV> +12.5, +3.4 </DIV>", "<>DIV/,"), * <BR>you obtain an array of strings {"+12.5", "+3.4"} as output (space chars and <,>,D,I,V,/ and the comma are chars that must be removed). * @param input The string in input. * @param charsToBeRemoved The chars to be removed. * @return The array of strings as output. */ public static String[] splitSpaces (String input, String charsToBeRemoved) { ArrayList output = new ArrayList(); int minCapacity = 0; StringBuilder str = new StringBuilder(); boolean charFound = false; boolean toBeAdd = false; for (int index=0; indexFor example if you call trimSpaces("<DIV> +12.5 </DIV>", "<>DIV/"), *
you obtain a string "+12.5" as output (space chars and <,>,D,I,V,/ are chars that must be removed). *
For example if you call trimSpaces("<DIV> Trim All Spaces Also The Ones Inside The String </DIV>", "<>DIV/"), *
you obtain a string "TrimAllSpacesAlsoTheOnesInsideTheString" as output (all the spaces inside the string are removed). * @param input The string in input. * @param charsToBeRemoved The chars to be removed. * @return The string as output. */ public static String trimSpaces (String input, String charsToBeRemoved) { StringBuilder output = new StringBuilder(); boolean charFound=false; for (int index=0; indexThe removal process removes only chars at the beginning and at the end of the string. *
For example if you call trimSpacesBeginEnd("<DIV> +12.5 </DIV>", "<>DIV/"), *
you obtain a string "+12.5" as output (space chars and <,>,D,I,V,/ are chars that must be removed). *
For example if you call trimSpacesBeginEnd("<DIV> Trim all spaces but not the ones inside the string </DIV>", "<>DIV/"), *
you obtain a string "Trim all spaces but not the ones inside the string" as output (all the spaces inside the string are preserved). * @param input The string in input. * @param charsToBeRemoved The chars to be removed. * @return The string as output. */ public static String trimSpacesBeginEnd (String input, String charsToBeRemoved) { String output = new String(); int begin=0; int end=input.length()-1; boolean charFound=false; boolean ok=true; for (int index=begin; (index=0) && ok; index--) { charFound=false; for (int charsCount=0; charsCountFor example if you call splitButChars("<DIV> +12.5, +3.4 </DIV>", "+.1234567890"), *
you obtain an array of strings {"+12.5", "+3.4"} as output (+,.,1,2,3,4,5,6,7,8,9,0 are chars that do not be removed). * @param input The string in input. * @param charsDoNotBeRemoved The chars that do not be removed. * @return The array of strings as output. */ public static String[] splitButChars (String input, String charsDoNotBeRemoved) { ArrayList output = new ArrayList(); int minCapacity = 0; StringBuilder str = new StringBuilder(); boolean charFound = false; boolean toBeAdd = false; for (int index=0; indexFor example if you call trimButChars("<DIV> +12.5 </DIV>", "+.1234567890"), *
you obtain a string "+12.5" as output (+,.,1,2,3,4,5,6,7,8,9,0 are chars that do not be removed). *
For example if you call trimButChars("<DIV> +1 2 . 5 </DIV>", "+.1234567890"), *
you obtain a string "+12.5" as output (the spaces between 1 and 2, 2 and ., . and 5 are removed). * @param input The string in input. * @param charsDoNotBeRemoved The chars that do not be removed. * @return The string as output. */ public static String trimButChars (String input, String charsDoNotBeRemoved) { StringBuilder output = new StringBuilder(); boolean charFound=false; for (int index=0; indexThe removal process removes only chars at the beginning and at the end of the string. *
For example if you call trimButCharsBeginEnd("<DIV> +12.5 </DIV>", "+.1234567890"), *
you obtain a string "+12.5" as output (+,.,1,2,3,4,5,6,7,8,9,0 are chars that do not be removed). *
For example if you call trimButCharsBeginEnd("<DIV> +1 2 . 5 </DIV>", "+.1234567890"), *
you obtain a string "+1 2 . 5" as output (the spaces inside the string are not removed). * @param input The string in input. * @param charsDoNotBeRemoved The chars that do not be removed. * @return The string as output. */ public static String trimButCharsBeginEnd (String input, String charsDoNotBeRemoved) { String output = new String(); int begin=0; int end=input.length()-1; boolean charFound=false; boolean ok=true; for (int index=begin; (index=0) && ok; index--) { charFound=false; for (int charsCount=0; charsCountFor example if you call splitChars("<DIV> +12.5, +3.4 </DIV>", " <>DIV/,"), *
you obtain an array of strings {"+12.5", "+3.4"} as output (space chars and <,>,D,I,V,/ and the comma are chars that must be removed). * @param input The string in input. * @param charsToBeRemoved The chars to be removed. * @return The array of strings as output. */ public static String[] splitChars (String input, String charsToBeRemoved) { ArrayList output = new ArrayList(); int minCapacity = 0; StringBuilder str = new StringBuilder(); boolean charFound = false; boolean toBeAdd = false; for (int index=0; indexFor example if you call trimChars("<DIV> +12.5 </DIV>", "<>DIV/ "), *
you obtain a string "+12.5" as output (<,>,D,I,V,/ and space char are chars that must be removed). *
For example if you call trimChars("<DIV> Trim All Chars Also The Ones Inside The String </DIV>", "<>DIV/ "), *
you obtain a string "TrimAllCharsAlsoTheOnesInsideTheString" as output (all the spaces inside the string are removed). * @param input The string in input. * @param charsToBeRemoved The chars to be removed. * @return The string as output. */ public static String trimChars (String input, String charsToBeRemoved) { StringBuilder output = new StringBuilder(); boolean charFound=false; for (int index=0; indexThe removal process removes only chars at the beginning and at the end of the string. *
For example if you call trimCharsBeginEnd("<DIV> +12.5 </DIV>", "<>DIV/ "), *
you obtain a string "+12.5" as output (' ' is a space char and <,>,D,I,V,/ are chars that must be removed). *
For example if you call trimCharsBeginEnd("<DIV> Trim all spaces but not the ones inside the string </DIV>", "<>DIV/ "), *
you obtain a string "Trim all spaces but not the ones inside the string" as output (all the spaces inside the string are preserved). * @param input The string in input. * @param charsToBeRemoved The chars to be removed. * @return The string as output. */ public static String trimCharsBeginEnd (String input, String charsToBeRemoved) { String output = new String(); int begin=0; int end=input.length()-1; boolean charFound=false; boolean ok=true; for (int index=begin; (index=0) && ok; index--) { charFound=false; for (int charsCount=0; charsCountFor example if you call splitTags("Begin <DIV><DIV> +12.5 </DIV></DIV> ALL OK", new String[] {"DIV"}), *
you obtain a string array {"Begin ", " ALL OK"} as output (splitted <DIV> tags and their content recursively). *
For example if you call splitTags("Begin <DIV><DIV> +12.5 </DIV></DIV> ALL OK", new String[] {"DIV"}, false, false), *
you obtain a string array {"Begin ", "<DIV> +12.5 </DIV>", " ALL OK"} as output (splitted <DIV> tags and not their content and no recursively). *
For example if you call splitTags("Begin <DIV><DIV> +12.5 </DIV></DIV> ALL OK", new String[] {"DIV"}, true, false), *
you obtain a string array {"Begin ", " +12.5 ", " ALL OK"} as output (splitted <DIV> tags and not their content recursively). *
For example if you call splitTags("Begin <DIV><DIV> +12.5 </DIV></DIV> ALL OK", new String[] {"DIV"}, false, true), *
you obtain a string array {"Begin ", " ALL OK"} as output (splitted <DIV> tags and their content). * @param input The string in input. * @param tags The tags to be used as splitting delimiter. * @param recursive Optional parameter (true if not present), if true delete all the tags recursively. * @param insideTag Optional parameter (true if not present), if true delete also the content of the tags. * @return The string array containing the strings delimited by tags. */ public static String[] splitTags (String input, String[] tags, boolean recursive, boolean insideTag) throws ParserException, UnsupportedEncodingException { ArrayList outputArrayList = new ArrayList(); int minCapacity = 0; String output = new String(); String inputModified = new String(input); String[] outputStr = new String[] {}; String dummyString = createDummyString (' ', input.length()); // loop inside the different tags to be trimmed for (int i=0; iUse Class class as input parameter * instead of tags[] string array. * @see ParserUtils#splitTags (String input, String[] tags, boolean recursive, boolean insideTag). */ public static String[] splitTags (String input, Class nodeType) throws ParserException, UnsupportedEncodingException { return splitTags (input, new NodeClassFilter (nodeType), true, true); } /** * Split the input string in a string array, * considering the tags as delimiter for splitting. *
Use Class class as input parameter * instead of tags[] string array. * @see ParserUtils#splitTags (String input, String[] tags, boolean recursive, boolean insideTag). */ public static String[] splitTags (String input, Class nodeType, boolean recursive, boolean insideTag) throws ParserException, UnsupportedEncodingException { return splitTags (input, new NodeClassFilter (nodeType), recursive, insideTag); } /** * Split the input string in a string array, * considering the tags as delimiter for splitting. *
Use NodeFilter class as input parameter * instead of tags[] string array. * @see ParserUtils#splitTags (String input, String[] tags, boolean recursive, boolean insideTag). */ public static String[] splitTags (String input, NodeFilter filter) throws ParserException, UnsupportedEncodingException { return splitTags (input, filter, true, true); } /** * Split the input string in a string array, * considering the tags as delimiter for splitting. *
Use NodeFilter class as input parameter * instead of tags[] string array. * @see ParserUtils#splitTags (String input, String[] tags, boolean recursive, boolean insideTag). */ public static String[] splitTags (String input, NodeFilter filter, boolean recursive, boolean insideTag) throws ParserException, UnsupportedEncodingException { ArrayList outputArrayList = new ArrayList(); int minCapacity = 0; String output = new String(); String dummyString = createDummyString (' ', input.length()); // loop inside the tags of the same type NodeList links = getLinks (input, filter, recursive); for (int j=0; jThe method trims all the substrings included in the input string of the following type: * "<XXX>", where XXX could be a string of any type. *
If you set to true the inside parameter, the method deletes also the YYY string in the following input string: * "<XXX>YYY<ZZZ>", note that ZZZ is not necessary the closing tag of XXX. * @param input The string in input. * @param inside If true, it forces the method to delete also what is inside the tags. * @return The string without tags. */ public static String trimAllTags (String input, boolean inside) { StringBuilder output = new StringBuilder(); if (inside) { if ((input.indexOf('<')==-1) || (input.lastIndexOf('>')==-1) || (input.lastIndexOf('>')')+1, input.length())); } } else { boolean write = true; for (int index=0; index' && (!write)) write = true; } } return output.toString(); } /** * Trim all tags in the input string and * return a string like the input one * without the tags and their content. * @see ParserUtils#trimTags (String input, String[] tags, boolean recursive, boolean insideTag). */ public static String trimTags (String input, String[] tags) throws ParserException, UnsupportedEncodingException { return trimTags (input, tags, true, true); } /** * Trim all tags in the input string and * return a string like the input one * without the tags and their content (optional). *
For example if you call trimTags("<DIV><DIV> +12.5 </DIV></DIV> ALL OK", new String[] {"DIV"}), *
you obtain a string " ALL OK" as output (trimmed <DIV> tags and their content recursively). *
For example if you call trimTags("<DIV><DIV> +12.5 </DIV></DIV> ALL OK", new String[] {"DIV"}, false, false), *
you obtain a string "<DIV> +12.5 </DIV> ALL OK" as output (trimmed <DIV> tags and not their content and no recursively). *
For example if you call trimTags("<DIV><DIV> +12.5 </DIV></DIV> ALL OK", new String[] {"DIV"}, true, false), *
you obtain a string " +12.5 ALL OK" as output (trimmed <DIV> tags and not their content recursively). *
For example if you call trimTags("<DIV><DIV> +12.5 </DIV></DIV> ALL OK", new String[] {"DIV"}, false, true), *
you obtain a string " ALL OK" as output (trimmed <DIV> tags and their content). * @param input The string in input. * @param tags The tags to be removed. * @param recursive Optional parameter (true if not present), if true delete all the tags recursively. * @param insideTag Optional parameter (true if not present), if true delete also the content of the tags. * @return The string without tags. */ public static String trimTags (String input, String[] tags, boolean recursive, boolean insideTag) throws ParserException, UnsupportedEncodingException { StringBuilder output = new StringBuilder(); String inputModified = new String(input); String dummyString = createDummyString (' ', input.length()); // loop inside the different tags to be trimmed for (int i=0; iUse Class class as input parameter * instead of tags[] string array. * @see ParserUtils#trimTags (String input, String[] tags, boolean recursive, boolean insideTag). */ public static String trimTags (String input, Class nodeType) throws ParserException, UnsupportedEncodingException { return trimTags (input, new NodeClassFilter (nodeType), true, true); } /** * Trim all tags in the input string and * return a string like the input one * without the tags and their content (optional). *
Use Class class as input parameter * instead of tags[] string array. * @see ParserUtils#trimTags (String input, String[] tags, boolean recursive, boolean insideTag). */ public static String trimTags (String input, Class nodeType, boolean recursive, boolean insideTag) throws ParserException, UnsupportedEncodingException { return trimTags (input, new NodeClassFilter (nodeType), recursive, insideTag); } /** * Trim all tags in the input string and * return a string like the input one * without the tags and their content. *
Use NodeFilter class as input parameter * instead of tags[] string array. * @see ParserUtils#trimTags (String input, String[] tags, boolean recursive, boolean insideTag). */ public static String trimTags (String input, NodeFilter filter) throws ParserException, UnsupportedEncodingException { return trimTags (input, filter, true, true); } /** * Trim all tags in the input string and * return a string like the input one * without the tags and their content (optional). *
Use NodeFilter class as input parameter * instead of tags[] string array. * @see ParserUtils#trimTags (String input, String[] tags, boolean recursive, boolean insideTag). */ public static String trimTags (String input, NodeFilter filter, boolean recursive, boolean insideTag) throws ParserException, UnsupportedEncodingException { StringBuilder output = new StringBuilder(); String dummyString = createDummyString (' ', input.length()); // loop inside the tags of the same type NodeList links = getLinks (input, filter, recursive); for (int j=0; jThe string will be parsed as it would be a file. * @param input The string in input. * @return The Parser Object with the string as input stream. */ public static Parser createParserParsingAnInputString (String input) throws ParserException, UnsupportedEncodingException { Parser parser = new Parser(); Lexer lexer = new Lexer(); Page page = new Page(input); lexer.setPage(page); parser.setLexer(lexer); return parser; } private static NodeList getLinks (String output, String tag, boolean recursive) throws ParserException, UnsupportedEncodingException { Parser parser = new Parser(); NodeFilter filterLink = new TagNameFilter (tag); NodeList links = new NodeList (); parser = createParserParsingAnInputString(output); links = parser.extractAllNodesThatMatch(filterLink); // loop to remove tags added recursively // so if you have selected 'not recursive option' // you have only the tag container and not the contained tags. if (!recursive) { for (int j=0; jjStartTagBegin) && (kEndTagEndjStartTagBegin) && (kEndTagEnd




© 2015 - 2024 Weber Informatics LLC | Privacy Policy