All Downloads are FREE. Search and download functionalities are using the official Maven repository.

sunlabs.brazil.handler.HtmlRewriter Maven / Gradle / Ivy

The newest version!
/*
 * HtmlRewriter.java
 *
 * Brazil project web application toolkit,
 * export version: 2.3 
 * Copyright (c) 1999-2006 Sun Microsystems, Inc.
 *
 * Sun Public License Notice
 *
 * The contents of this file are subject to the Sun Public License Version 
 * 1.0 (the "License"). You may not use this file except in compliance with 
 * the License. A copy of the License is included as the file "license.terms",
 * and also available at http://www.sun.com/
 * 
 * The Original Code is from:
 *    Brazil project web application toolkit release 2.3.
 * The Initial Developer of the Original Code is: cstevens.
 * Portions created by cstevens are Copyright (C) Sun Microsystems, Inc.
 * All Rights Reserved.
 * 
 * Contributor(s): cstevens, guym, suhler.
 *
 * Version:  2.6
 * Created by cstevens on 99/09/29
 * Last modified by suhler on 06/04/25 14:28:22
 *
 * Version Histories:
 *
 * 2.6 06/04/25-14:28:22 (suhler)
 *   add "getMap()" to return a copy of the tag attributes
 *
 * 2.5 05/06/16-08:00:02 (suhler)
 *   Make "deQuote" public, and move to Format.
 *
 * 2.4 04/04/05-14:49:18 (suhler)
 *   add token counters, change quoting semantics
 *
 * 2.3 03/07/17-10:40:26 (suhler)
 *   change quote() again.  It now always adds "'s if plausible.  Although this
 *   perturbs the original document more than necessary, it's always safe, and
 *   gets around problems where ${} expansions might cause "'s to be required
 *
 * 2.2 03/07/07-15:54:14 (suhler)
 *   Merged changes between child workspace "/home/suhler/brazil/naws" and
 *   parent workspace "/net/mack.eng/export/ws/brazil/naws".
 *
 * 1.13.1.1 03/07/07-15:52:34 (suhler)
 *   modify quoting convensions a little
 *
 * 2.1 02/10/01-16:36:25 (suhler)
 *   version change
 *
 * 1.13 02/07/24-10:45:30 (suhler)
 *   doc updates
 *
 * 1.12 02/05/01-11:21:08 (suhler)
 *   fix sccs version info
 *
 * 1.11 01/08/21-13:00:22 (guym)
 *   Fixed a bug where string map was null when get was being called
 *
 * 1.10 01/03/08-16:03:45 (cstevens)
 *   Handle singleton HTML tags like <br/> or <a name=foo/>.  Before, these forms
 *   were interpreted incorrectly as the "br/" tag or the "a" tag with the
 *   attribute "name" and the value "foo/".
 *
 * 1.9 00/12/27-12:12:09 (suhler)
 *   accumulate now returns its previous setting
 *
 * 1.8 00/10/31-10:18:10 (suhler)
 *   doc fixes
 *
 * 1.7 99/11/16-13:18:00 (cstevens)
 *   Rename "getValue" to "get" to make it more compatible with Dictionary
 *   naming scheme.
 *
 * 1.6 99/10/21-18:06:58 (cstevens)
 *   HtmlRewriter didn't re-emit parsed comments as comments.
 *
 * 1.5 99/10/19-18:35:52 (cstevens)
 *
 * 1.4 99/10/14-14:57:20 (cstevens)
 *   resolve wildcard imports.
 *
 * 1.3 99/10/07-12:59:08 (cstevens)
 *   Javadocs for HtmlRewriter.
 *
 * 1.2 99/09/30-14:10:05 (cstevens)
 *   Improperly quoting and dequoting HTML tag attributes led to forms/templates
 *   not working.
 *
 * 1.2 99/09/29-16:13:26 (Codemgr)
 *   SunPro Code Manager data about conflicts, renames, etc...
 *   Name history : 1 0 handlers/HtmlRewriter.java
 *
 * 1.1 99/09/29-16:13:25 (cstevens)
 *   date and time created 99/09/29 16:13:25 by cstevens
 *
 */

package sunlabs.brazil.handler;

import sunlabs.brazil.util.Format;
import sunlabs.brazil.util.LexHTML;
import sunlabs.brazil.util.StringMap;
import java.util.Enumeration;

/**
 * This class helps with parsing and rewriting an HTML document.  The
 * source document is not changed; a new HTML document is built.
 * <p>
 * The user can sequentially examine and rewrite each token in the source
 * HTML document.  As each token in the document is seen, the user has
 * two choices:
 * <ul>
 * <li> modify the current token.
 * <li> don't modify the current token.
 * </ul>
 * If the user modifies (or replaces, deletes, etc.) the current token,
 * then the resultant HTML document will contain that modification.  On
 * the other hand, if the user doesn't do anything with the current token,
 * it will appear, unchanged, in the resultant HTML document.
 * <p>
 * Parsing is implemented lazily, meaning, for example, that unless the
 * user actually asks for attributes of an HTML tag, this parser
 * does not have to spend the time breaking up the attributes.
 * <p>
* This class is used by HTML filters to maintain the state of the * document and allow the filters to perform arbitrary rewriting. * * @author Colin Stevens ([email protected]) * @version @(#)HtmlRewriter.java 2.6 */ public class HtmlRewriter { /** * The parser for the source HTML document. */ public LexHTML lex; /** * Storage holding the resultant HTML document. */ public StringBuffer sb; /** * true if the last token was pushed back and should be * presented again next time. Made false once the * pushedback token is presented. */ boolean pushback; /** * true if nextToken should automatically * append unmodified tokens to the result. */ boolean accumulate; /** * true if the user has already explicitly appended * something, so nextToken shouldn't append the * unmodified token. */ boolean appendToken; /** * true if the user has modified the tag name or * attributes of the current tag, so when this tag is appended, we * need to write out its parts rather than just emitting the raw token. */ boolean tokenModified; int type; boolean singleton; String token; String tag; StringMap map; int tokenCount; // count tokens int tagCount; // count just tags /** * Creates a new HtmlRewriter from the given HTML parser. * * @param lex * The HTML parser. */ public HtmlRewriter(LexHTML lex) { this.lex = lex; sb = new StringBuffer(); accumulate = true; tokenCount=0; tagCount=0; } /** * Creates a new HtmlRewriter that will operate on the given * string. * * @param str * The HTML document. */ public HtmlRewriter(String str) { this(new LexHTML(str)); } /** * Returns the "new" rewritten HTML document. This is normally called * once all of the tokens have been processed, and the user wants to * send on this rewritten document. *

* At any time, this method can be called to return the current state * of the HTML document. The return value is the result of * processing the source document up to this point in time; the * unprocessed remainder of the source document is not considered. *

* Due to the implementation, calling this method may be expensive. * Specifically, calling this method a second (or further) time for * a given HtmlRewriter may involve copying temporary * strings around. The pessimal case would be to call this method * every time a new token is appended. * * @return The rewritten HTML document, up to this point in time. */ public String toString() { return sb.toString(); } /** * Advances to the next token in the source HTML document. *

* The other purpose of this function is to "do the right thing", which * is to append the token we just processed to the resultant HTML * document, unless the user has already appended something else. *

* A sample program follows. This program changes all * <img> tags to <form> tags, * deletes all <table> tags, capitalizes * and bolds each string token, and passes all other tokens through * unchanged, to illustrate how nextToken interacts with * some of the other methods in this class. *

     * HtmlRewriter hr = new HtmlRewriter(str);
     * while (hr.nextToken()) {
     *     switch (hr.getType()) {
     *     case LexHTML.TAG: 
     *         if (hr.getTag().equals("img")) {
     *             // Change the tag name w/o affecting the attributes.
     *             
     *             hr.setTag("form");
     *         } else if (hr.getTag().equals("table")) {
     *             // Eliminate the entire "table" token.
     *             
     *             hr.killToken();
     *         } 
     *         break;
     *             
     *     case LexHTML.STRING:
     *         // Append a new sequence in place of the existing token.
     *
     *         hr.append("<b>" + hr.getToken().toUpperCase() + "</b>");
     *         break;
     *     }
     *     // Any tokens we didn't modify get copied through unchanged.
     * }
     * 
* * @return true if there are tokens left to process, * false otherwise. */ public boolean nextToken() { tokenCount++; if (pushback) { pushback = false; return true; } if (appendToken && accumulate) { appendToken(); } token = null; tag = null; map = null; appendToken = true; tokenModified = false; if (lex.nextToken()) { type = lex.getType(); if (type == LexHTML.TAG) { tagCount++; } singleton = lex.isSingleton(); return true; } return false; } /** * A convenence method built on top of nextToken. * Advances to the next HTML tag. All intervening strings and comments * between the last tag and the new current tag are copied through * unchanged. This method can be used when the caller wants to process * only HTML tags, without having to manually check the type of each * token to see if it is actually a tag. * * @return true if there are tokens left to process, * false otherwise. */ public boolean nextTag() { while (nextToken()) { if (getType() == LexHTML.TAG) { return true; } } return false; } /** * Gets the type of the current token. * * @return The type. * * @see LexHTML#getType */ public int getType() { return type; } /** * Sets the type of the current token. */ public void setType(int type) { this.type = type; tokenModified = true; } /** * See if the current tag a singleton. A Singleton tag ends in "/", as * in <
. */ public boolean isSingleton() { return singleton; } /** * Make the current tag a singleton. A Singleton tag ends in "/", as * in <
. */ public void setSingleton(boolean singleton) { this.singleton = singleton; tokenModified = true; } /** * Gets the raw string making up the entire current token, including * the angle brackets or comment delimiters, if applicable. * * @return The current token. * * @see LexHTML#getToken */ public String getToken() { if (token == null) { token = lex.getToken(); } return token; } /** * Gets the current tag's name. The name returned is converted to * lower case. * * @return The lower-cased tag name, or null if the * current token does not have a tag name * * @see LexHTML#getTag */ public String getTag() { if (tag == null) { tag = lex.getTag(); } return tag; } /** * Changes the current tag's name. The tag's attributes are not changed. * * @param tag * New tag name */ public void setTag(String tag) { this.tag = tag; tokenModified = true; } /** * Gets the body of the current token as a string. * * @return The body. * * @see LexHTML#getBody */ public String getBody() { return lex.getBody(); } /** * Gets the arguments of the current token as a string. * * @return The body. * * @see LexHTML#getArgs */ public String getArgs() { return lex.getArgs(); } /** * Returns the value that the specified case-insensitive key maps * to in the attributes for the current tag. For keys that were * present in the tag's attributes without a value, the value returned * is the empty string. In other words, for the tag * <table border rows=2>:
    *
  • get("border") returns the empty string "". *
  • get("rows") returns 2. *
*

* Surrounding single and double quote marks that occur in the literal * tag are removed from the values reported. So, for the tag * <a href="/foo.html" target=_top onclick='alert("hello")'>:

    *
  • get("href") returns /foo.html . *
  • get("target") returns _top . *
  • get("onclick") returns alert("hello") . *
* * @param The key to lookup in the current tag's attributes. * * @return The value to which the specified key is mapped, or * null if the key was not in the attributes. * * @see LexHTML#getAttributes */ public String get(String key) { String str; getAttributes(); if (map != null) { str = map.get(key); if (str == null) { return null; } } else { return null; } return Format.deQuote(str); } /** * Maps the given case-insensitive key to the specified value in the * current tag's attributes. *

* The value can be retrieved by calling get with a * key that is case-insensitive equal to the given key. *

* If the attributes already contained a mapping for the given key, * the old value is forgotten and the new specified value is used. * The case of the prior key is retained in that case. Otherwise * the case of the new key is used and a new mapping is made. * * @param key * The new key. May not be null. * * @param value * The new value. May be not be null. */ public void put(String key, String value) { getAttributes(); map.put(key, quote(value)); tokenModified = true; } /** * Removes the given case-insensitive key and its corresponding value * from the current tag's attributes. This method does nothing if the * key is not in the attributes. * * @param key * The key that needs to be removed. Must not be * null. */ public void remove(String key) { getAttributes(); map.remove(key); tokenModified = true; } /** * Returns an enumeration of the keys in the current tag's attributes. * The elements of the enumeration are the string keys. The keys can * be passed to get to get the values of the attributes. * * @return An enumeration of the keys. */ public Enumeration keys() { getAttributes(); return map.keys(); } /** * Instead of modifying an existing token, this method allows the user * to completely replace the current token with arbitrary new content. *

* This method may be called multiple times while processing the current * token to add more and more data to the resultant HTML document. * Before and/or after calling this method, the appendToken * method may also be called explicitly in order to add the current token * to the resultant HTML document. *

* Following is sample code illustrating how to use this method * to put bold tags around all the <a> tags. *

     * HtmlRewriter hr = new HtmlRewriter(str);
     * while (hr.nextTag()) {
     *     if (hr.getTag().equals("a")) {
     *         hr.append("<b>");
     *         hr.appendToken();
     *     } else if (hr.getTag().equals("/a")) {
     *         hr.appendToken();
     *         hr.append("</b>");
     *     }
     * }
     * 
* The calls to appendToken are necessary. Otherwise, * the HtmlRewriter could not know where and when to * append the existing token in addition to the new content provided * by the user. * * @param str * The new content to append. May be null, * in which case no new content is appended (the equivalent * of appending ""). * * @see #appendToken * @see #killToken */ public void append(String str) { if (str != null) { sb.append(str); } appendToken = false; } /** * Appends the current token to the resultant HTML document. * If the caller has changed the current token using the * setTag, set, or remove * methods, those changes will be reflected. *

* By default, this method is automatically called after each token is * processed unless the user has already appended something to the * resultant HTML document. Therefore, if the user appends something * and also wants to append the current token, or if the user wants * to append the current token a number of times, this method must * be called. * * @see #append * @see #killToken */ public void appendToken() { appendToken = false; if (tokenModified) { getTag(); getAttributes(); if (getType() == LexHTML.COMMENT) { sb.append("<--"); } else { sb.append('<'); } sb.append(tag); int length = map.size(); for (int i = 0; i < length; i++) { sb.append(' ').append(map.getKey(i)); String value = map.get(i); if ((value != null) && (value.length() > 0)) { sb.append('=').append(value); } } if (isSingleton()) { if (length > 0) { sb.append(' '); } sb.append('/'); } if (getType() == LexHTML.COMMENT) { sb.append("-->"); } else { sb.append('>'); } } else { sb.append(getToken()); } } /** * Tells this HtmlRewriter not to append the current token * to the resultant HTML document. Even if the user hasn't appended * anything else, the current token will be ignored rather than appended. * * @see #append * @see #killToken */ public void killToken() { appendToken = false; } /** * Turns on or off the automatic accumulation of each token. *

* After each token is processed, the current token is appended to * to the resultant HTML document unless the user has already appended * something else. By setting accumulate to * false, this behavior is turned off. The user must then * explicitly call appendToken to cause the current token * to be appended. *

* Turning off accumulation takes effect immediately, while turning * on accumulation takes effect on the next token. In other words, * whether the user turns this setting off or on, the current token * will not be added to the resultant HTML document unless the user * explicitly calls appendToken. *

* Following is sample code that illustrates how to use this method * to extract the contents of the <head> of the * source HTML document. *

     * HtmlRewriter hr = new HtmlRewriter(str);
     * // Don't accumulate tokens until we see the <head> below.
     * hr.accumulate(false);
     * while (hr.nextTag()) {
     *     if (hr.getTag().equals("head")) {
     *         // Start remembering the contents of the HTML document,
     *         // not including the <head> tag itself.
     *
     *         hr.accumulate(true);
     *     } else if (hr.getTag().equals("/head")) {
     *         // Return everything accumulated so far.
     *
     *         return hr.toString();
     *     }
     * }
     * 
* This method can be called any number of times while processing * the source HTML document. * * @param accumulate * true to automatically accumulate tokens in the * resultant HTML document, false to require * that the user explicitly accumulate them. * @return The previous accumulate setting * * @see #reset */ public boolean accumulate(boolean accumulate) { boolean was = this.accumulate; this.accumulate = accumulate; appendToken = false; return was; } /** * Forgets all the tokens that have been appended to the resultant * HTML document so far, including the current token. */ public void reset() { sb.setLength(0); appendToken = false; } /** * Puts the current token back. The next time nextToken * is called, it will be the current token again, rather than * advancing to the next token in the source HTML document. *

* This is useful when a code fragment needs to read an indefinite * number of tokens, but that once some distinguished token is found, * needs to push that token back so that normal processing can occur * on that token. */ public void pushback() { pushback = true; } /** * Return count of tokens seen so far */ public int tokenCount() { return tokenCount; } /** * Return count of tags seen so far */ public int tagCount() { return tagCount; } /* * The set of characters that will turn-on quoting */ // public static String needQuote="' \t%$"; // these need quoting /** * Helper class to quote a attribute's value when the value is being * written to the resultant HTML document. Values set by the * put method are automatically quoted as needed. This * method is provided in case the user is dynamically constructing a new * tag to be appended with append and needs to quote some * arbitrary values. *

* The quoting algorithm is as follows:
* If the string contains double-quotes, put single quotes around it.
* If the string contains any "special" characters, put double-quotes * around it. *

* This algorithm is, of course, insufficient for complicated * strings that include both single and double quotes. In that case, * it is the user's responsibility to escape the special characters * in the string using the HTML special symbols like * &quot; or &#34; * * @return The quoted string, or the original string if it did not * need to be quoted. */ public static String quote(String str) { if (str.indexOf('\"') >= 0) { return "\'" + str + "\'"; } else if (str.length() > 0) { return "\"" + str + "\""; } else { return ""; } } /** * see if target contains any of the strings in candidates */ static private boolean contains(String target, String candidates) { char[] check = candidates.toCharArray(); for(int i=0;i= 0) { return true; } } return false; } private void getAttributes() { if (map == null) { map = lex.getAttributes(); } } /** * Return a copy of the StringMap of attributes. */ public StringMap getMap() { getAttributes(); StringMap result = new StringMap(); result.append(map, true); return result; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy