sunlabs.brazil.handler.HtmlRewriter Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of sunlabs.brazil Show documentation
Extremely small footprint Java HTTP stack.
The newest version!
/*
 * HtmlRewriter.java
 *
 * Brazil project web application toolkit,
 * export version: 2.3 
 * Copyright (c) 1999-2006 Sun Microsystems, Inc.
 *
 * Sun Public License Notice
 *
 * The contents of this file are subject to the Sun Public License Version 
 * 1.0 (the "License"). You may not use this file except in compliance with 
 * the License. A copy of the License is included as the file "license.terms",
 * and also available at http://www.sun.com/
 * 
 * The Original Code is from:
 *    Brazil project web application toolkit release 2.3.
 * The Initial Developer of the Original Code is: cstevens.
 * Portions created by cstevens are Copyright (C) Sun Microsystems, Inc.
 * All Rights Reserved.
 * 
 * Contributor(s): cstevens, guym, suhler.
 *
 * Version:  2.6
 * Created by cstevens on 99/09/29
 * Last modified by suhler on 06/04/25 14:28:22
 *
 * Version Histories:
 *
 * 2.6 06/04/25-14:28:22 (suhler)
 *   add "getMap()" to return a copy of the tag attributes
 *
 * 2.5 05/06/16-08:00:02 (suhler)
 *   Make "deQuote" public, and move to Format.
 *
 * 2.4 04/04/05-14:49:18 (suhler)
 *   add token counters, change quoting semantics
 *
 * 2.3 03/07/17-10:40:26 (suhler)
 *   change quote() again.  It now always adds "'s if plausible.  Although this
 *   perturbs the original document more than necessary, it's always safe, and
 *   gets around problems where ${} expansions might cause "'s to be required
 *
 * 2.2 03/07/07-15:54:14 (suhler)
 *   Merged changes between child workspace "/home/suhler/brazil/naws" and
 *   parent workspace "/net/mack.eng/export/ws/brazil/naws".
 *
 * 1.13.1.1 03/07/07-15:52:34 (suhler)
 *   modify quoting conventions a little
 *
 * 2.1 02/10/01-16:36:25 (suhler)
 *   version change
 *
 * 1.13 02/07/24-10:45:30 (suhler)
 *   doc updates
 *
 * 1.12 02/05/01-11:21:08 (suhler)
 *   fix sccs version info
 *
 * 1.11 01/08/21-13:00:22 (guym)
 *   Fixed a bug where string map was null when get was being called
 *
 * 1.10 01/03/08-16:03:45 (cstevens)
 *   Handle singleton HTML tags like <br/> or <a name=foo/>.  Before, these forms
 *   were interpreted incorrectly as the "br/" tag or the "a" tag with the
 *   attribute "name" and the value "foo/".
 *
 * 1.9 00/12/27-12:12:09 (suhler)
 *   accumulate now returns its previous setting
 *
 * 1.8 00/10/31-10:18:10 (suhler)
 *   doc fixes
 *
 * 1.7 99/11/16-13:18:00 (cstevens)
 *   Rename "getValue" to "get" to make it more compatible with Dictionary
 *   naming scheme.
 *
 * 1.6 99/10/21-18:06:58 (cstevens)
 *   HtmlRewriter didn't re-emit parsed comments as comments.
 *
 * 1.5 99/10/19-18:35:52 (cstevens)
 *
 * 1.4 99/10/14-14:57:20 (cstevens)
 *   resolve wildcard imports.
 *
 * 1.3 99/10/07-12:59:08 (cstevens)
 *   Javadocs for HtmlRewriter.
 *
 * 1.2 99/09/30-14:10:05 (cstevens)
 *   Improperly quoting and dequoting HTML tag attributes led to forms/templates
 *   not working.
 *
 * 1.2 99/09/29-16:13:26 (Codemgr)
 *   SunPro Code Manager data about conflicts, renames, etc...
 *   Name history : 1 0 handlers/HtmlRewriter.java
 *
 * 1.1 99/09/29-16:13:25 (cstevens)
 *   date and time created 99/09/29 16:13:25 by cstevens
 *
 */

package sunlabs.brazil.handler;

import sunlabs.brazil.util.Format;
import sunlabs.brazil.util.LexHTML;
import sunlabs.brazil.util.StringMap;

import java.util.Enumeration;

/**
 * This class helps with parsing and rewriting an HTML document.  The
 * source document is not changed; a new HTML document is built.
 * 
 * The user can sequentially examine and rewrite each token in the source
 * HTML document.  As each token in the document is seen, the user has
 * two choices:
 * <ul>
 * <li> modify the current token.
 * <li> don't modify the current token.
 * </ul>
 * If the user modifies (or replaces, deletes, etc.) the current token,
 * then the resultant HTML document will contain that modification.  On
 * the other hand, if the user doesn't do anything with the current token,
 * it will appear, unchanged, in the resultant HTML document.
 * 
 * Parsing is implemented lazily, meaning, for example, that unless the
 * user actually asks for attributes of an HTML tag, this parser
 * does not have to spend the time breaking up the attributes.
 * <p>
 * This class is used by HTML filters to maintain the state of the
 * document and allow the filters to perform arbitrary rewriting.
 *
 * @author	Colin Stevens ([email protected])
 * @version	@(#)HtmlRewriter.java	2.6
 */
public class HtmlRewriter
{
    /**
     * The parser for the source HTML document.
     */
    public LexHTML lex;

    /**
     * Storage holding the resultant HTML document.
     */
    public StringBuffer sb;

    /**
     * true if the last token was pushed back and should be
     * presented again next time.  Made false once the
     * pushedback token is presented.
     */
    boolean pushback;

    /**
     * true if nextToken should automatically
     * append unmodified tokens to the result.
     */
    boolean accumulate;

    /**
     * true if the user has already explicitly appended
     * something, so nextToken shouldn't append the
     * unmodified token.  
     */
    boolean appendToken;

    /**
     * true if the user has modified the tag name or
     * attributes of the current tag, so when this tag is appended, we
     * need to write out its parts rather than just emitting the raw token.
     */
    boolean tokenModified;

    int type;
    boolean singleton;
    String token;
    String tag;
    StringMap map;
    int tokenCount;	// count tokens
    int tagCount;	// count just tags

    /**
     * Creates a new HtmlRewriter from the given HTML parser.
     *
     * @param	lex
     *		The HTML parser.
     */
    public
    HtmlRewriter(LexHTML lex)
    {
	this.lex = lex;

	sb = new StringBuffer();
	accumulate = true;
	tokenCount=0;
	tagCount=0;
    }

    /**
     * Creates a new HtmlRewriter that will operate on the given
     * string.
     *
     * @param	str
     *		The HTML document.
     */
    public
    HtmlRewriter(String str)
    {
	this(new LexHTML(str));
    }

    /**
     * Returns the "new" rewritten HTML document.  This is normally called
     * once all of the tokens have been processed, and the user wants to
     * send on this rewritten document.
     * 

     * At any time, this method can be called to return the current state
     * of the HTML document.  The return value is the result of
     * processing the source document up to this point in time; the
     * unprocessed remainder of the source document is not considered.
     * 

     * Due to the implementation, calling this method may be expensive.
     * Specifically, calling this method a second (or further) time for
     * a given HtmlRewriter may involve copying temporary
     * strings around.  The pessimal case would be to call this method
     * every time a new token is appended.
     *
     * @return	The rewritten HTML document, up to this point in time.
     */
    public String
    toString()
    {
	return sb.toString();
    }

    /**
     * Advances to the next token in the source HTML document.
     * 

     * The other purpose of this function is to "do the right thing", which
     * is to append the token we just processed to the resultant HTML
     * document, unless the user has already appended something else.  
     * 

     * A sample program follows.  This program changes all
     * <img> tags to <form> tags,
     * deletes all <table> tags, capitalizes
     * and bolds each string token, and passes all other tokens through
     * unchanged, to illustrate how nextToken interacts with
     * some of the other methods in this class.
     * 
     * HtmlRewriter hr = new HtmlRewriter(str);
     * while (hr.nextToken()) {
     *     switch (hr.getType()) {
     *     case LexHTML.TAG: 
     *         if (hr.getTag().equals("img")) {
     *             // Change the tag name w/o affecting the attributes.
     *             
     *             hr.setTag("form");
     *         } else if (hr.getTag().equals("table")) {
     *             // Eliminate the entire "table" token.
     *             
     *             hr.killToken();
     *         } 
     *         break;
     *             
     *     case LexHTML.STRING:
     *         // Append a new sequence in place of the existing token.
     *
     *         hr.append("<b>" + hr.getToken().toUpperCase() + "</b>");
     *         break;
     *     }
     *     // Any tokens we didn't modify get copied through unchanged.
     * }
     * 
     *
     * @return	true if there are tokens left to process,
     *		false otherwise.
     */
    public boolean
    nextToken()
    {
	tokenCount++;
	if (pushback) {
	    pushback = false;
	    return true;
	}

	if (appendToken && accumulate) {
	    appendToken();
	}

	token = null;
	tag = null;
	map = null;

	appendToken = true;
	tokenModified = false;

	if (lex.nextToken()) {
	    type = lex.getType();
	    if (type == LexHTML.TAG) {
		tagCount++;
	    }
	    singleton = lex.isSingleton();
	    return true;
	}
	return false;
    }

    /**
     * A convenence method built on top of nextToken.
     * Advances to the next HTML tag.  All intervening strings and comments
     * between the last tag and the new current tag are copied through
     * unchanged.  This method can be used when the caller wants to process
     * only HTML tags, without having to manually check the type of each
     * token to see if it is actually a tag.
     *
     * @return	true if there are tokens left to process,
     *		false otherwise.
     */
    public boolean
    nextTag()
    {
	while (nextToken()) {
	    if (getType() == LexHTML.TAG) {
		return true;
	    }
	}
	return false;
    }

    /**
     * Gets the type of the current token.
     *
     * @return	The type.
     *
     * @see	LexHTML#getType
     */
    public int
    getType()
    {
	return type;
    }

    /**
     * Sets the type of the current token.
     */
    public void
    setType(int type)
    {
	this.type = type;
	tokenModified = true;
    }

    /**
     * See if the current tag a singleton.  A Singleton tag ends in "/", as
     * in <
.
     */

    public boolean
    isSingleton()
    {
	return singleton;
    }

    /**
     * Make the current tag a singleton.  A Singleton tag ends in "/", as
     * in <
.
     */

    public void
    setSingleton(boolean singleton)
    {
	this.singleton = singleton;
	tokenModified = true;
    }

    /**
     * Gets the raw string making up the entire current token, including
     * the angle brackets or comment delimiters, if applicable.
     *
     * @return	The current token.
     *
     * @see	LexHTML#getToken
     */
    public String
    getToken()
    {
	if (token == null) {
	    token = lex.getToken();
	}
	return token;
    }

    /**
     * Gets the current tag's name.  The name returned is converted to
     * lower case.
     *
     * @return	The lower-cased tag name, or null if the
     *		current token does not have a tag name
     *
     * @see	LexHTML#getTag
     */
    public String
    getTag()
    {
	if (tag == null) {
	    tag = lex.getTag();
	}
	return tag;
    }

    /**
     * Changes the current tag's name.  The tag's attributes are not changed.
     *
     * @param	tag
     *		New tag name
     */
    public void
    setTag(String tag)
    {
	this.tag = tag;
	tokenModified = true;
    }

    /**
     * Gets the body of the current token as a string.  
     *
     * @return	The body.
     *
     * @see	LexHTML#getBody
     */
    public String
    getBody()
    {
	return lex.getBody();
    }

    /**
     * Gets the arguments of the current token as a string.  
     *
     * @return	The body.
     *
     * @see	LexHTML#getArgs
     */
    public String
    getArgs()
    {
	return lex.getArgs();
    }

    /**
     * Returns the value that the specified case-insensitive key maps
     * to in the attributes for the current tag.  For keys that were
     * present in the tag's attributes without a value, the value returned
     * is the empty string.  In other words, for the tag
     * <table border rows=2>: 
     *  get("border") returns the empty string "". 
     * 
 get("rows") returns 2.
     * 
     * 
     * Surrounding single and double quote marks that occur in the literal
     * tag are removed from the values reported.  So, for the tag
     * <a href="/foo.html" target=_top onclick='alert("hello")'>: 

     *  get("href") returns /foo.html . 
     * 
 get("target") returns _top .
     * 
 get("onclick") returns alert("hello") .
     * 
     *
     * @param	The key to lookup in the current tag's attributes.
     * 
     * @return	The value to which the specified key is mapped, or
     *		null if the key was not in the attributes.
     *
     * @see	LexHTML#getAttributes
     */
    public String
    get(String key)
    {
        String str;

	getAttributes();

        if (map != null) {
	  str = map.get(key);
	  if (str == null) {
	    return null;
	  }
        } else {
          return null;
        }
	return Format.deQuote(str);
    }

    /**
     * Maps the given case-insensitive key to the specified value in the
     * current tag's attributes.
     * 
     * The value can be retrieved by calling get with a
     * key that is case-insensitive equal to the given key.
     * 

     * If the attributes already contained a mapping for the given key,
     * the old value is forgotten and the new specified value is used.
     * The case of the prior key is retained in that case.  Otherwise
     * the case of the new key is used and a new mapping is made.
     *
     * @param	key
     *		The new key.  May not be null.
     *
     * @param	value
     *		The new value.  May be not be null.
     */
    public void
    put(String key, String value)
    {
	getAttributes();
	map.put(key, quote(value));
	tokenModified = true;
    }

    /**
     * Removes the given case-insensitive key and its corresponding value
     * from the current tag's attributes.  This method does nothing if the
     * key is not in the attributes.
     *
     * @param	key
     *		The key that needs to be removed.  Must not be
     *		null.
     */
    public void
    remove(String key)
    {
	getAttributes();
	map.remove(key);
	tokenModified = true;
    }

    /**
     * Returns an enumeration of the keys in the current tag's attributes.
     * The elements of the enumeration are the string keys.  The keys can
     * be passed to get to get the values of the attributes.
     * 
     * @return	An enumeration of the keys.
     */
    public Enumeration
    keys()
    {
	getAttributes();
	return map.keys();
    }

    /**
     * Instead of modifying an existing token, this method allows the user
     * to completely replace the current token with arbitrary new content.  
     * 

     * This method may be called multiple times while processing the current
     * token to add more and more data to the resultant HTML document.
     * Before and/or after calling this method, the appendToken
     * method may also be called explicitly in order to add the current token
     * to the resultant HTML document.
     * 

     * Following is sample code illustrating how to use this method
     * to put bold tags around all the <a> tags.
     * 
     * HtmlRewriter hr = new HtmlRewriter(str);
     * while (hr.nextTag()) {
     *     if (hr.getTag().equals("a")) {
     *         hr.append("<b>");
     *         hr.appendToken();
     *     } else if (hr.getTag().equals("/a")) {
     *         hr.appendToken();
     *         hr.append("</b>");
     *     }
     * }
     * 
     * The calls to appendToken are necessary.  Otherwise,
     * the HtmlRewriter could not know where and when to
     * append the existing token in addition to the new content provided
     * by the user.
     *
     * @param	str
     *		The new content to append.  May be null,
     *		in which case no new content is appended (the equivalent
     *		of appending "").
     *
     * @see	#appendToken
     * @see	#killToken
     */
    public void
    append(String str)
    {
	if (str != null) {
	    sb.append(str);
	}
	appendToken = false;
    }

    /**
     * Appends the current token to the resultant HTML document.
     * If the caller has changed the current token using the 
     * setTag, set, or remove
     * methods, those changes will be reflected.
     * 
     * By default, this method is automatically called after each token is
     * processed unless the user has already appended something to the
     * resultant HTML document.  Therefore, if the user appends something
     * and also wants to append the current token, or if the user wants
     * to append the current token a number of times, this method must
     * be called.
     *
     * @see	#append
     * @see	#killToken
     */
    public void
    appendToken()
    {
	appendToken = false;
	if (tokenModified) {
	    getTag();
	    getAttributes();

	    if (getType() == LexHTML.COMMENT) {
		sb.append("<--");
	    } else {
		sb.append('<');
	    }
	    sb.append(tag);
	    int length = map.size();
	    for (int i = 0; i < length; i++) {
		sb.append(' ').append(map.getKey(i));
		String value = map.get(i);
		if ((value != null) && (value.length() > 0)) {
		    sb.append('=').append(value);
		}
	    }
	    if (isSingleton()) {
		if (length > 0) {
		    sb.append(' ');
		}
		sb.append('/');
	    }
	    if (getType() == LexHTML.COMMENT) {
		sb.append("-->");
	    } else {
		sb.append('>');
	    }
	} else {
	    sb.append(getToken());
	}
    }

    /**
     * Tells this HtmlRewriter not to append the current token
     * to the resultant HTML document.  Even if the user hasn't appended
     * anything else, the current token will be ignored rather than appended.
     *
     * @see	#append
     * @see	#killToken
     */
    public void
    killToken()
    {
	appendToken = false;
    }

    /**
     * Turns on or off the automatic accumulation of each token.
     * 

     * After each token is processed, the current token is appended to
     * to the resultant HTML document unless the user has already appended
     * something else.  By setting accumulate to
     * false, this behavior is turned off.  The user must then
     * explicitly call appendToken to cause the current token
     * to be appended.
     * 

     * Turning off accumulation takes effect immediately, while turning
     * on accumulation takes effect on the next token.  In other words,
     * whether the user turns this setting off or on, the current token
     * will not be added to the resultant HTML document unless the user
     * explicitly calls appendToken. 
     * 

     * Following is sample code that illustrates how to use this method
     * to extract the contents of the <head> of the
     * source HTML document.
     * 
     * HtmlRewriter hr = new HtmlRewriter(str);
     * // Don't accumulate tokens until we see the <head> below.
     * hr.accumulate(false);
     * while (hr.nextTag()) {
     *     if (hr.getTag().equals("head")) {
     *         // Start remembering the contents of the HTML document,
     *         // not including the <head> tag itself.
     *
     *         hr.accumulate(true);
     *     } else if (hr.getTag().equals("/head")) {
     *         // Return everything accumulated so far.
     *
     *         return hr.toString();
     *     }
     * }
     * 
     * This method can be called any number of times while processing
     * the source HTML document.
     *
     * @param	accumulate
     *		true to automatically accumulate tokens in the
     *		resultant HTML document, false to require
     *		that the user explicitly accumulate them.
     * @return	The previous accumulate setting
     *
     * @see	#reset
     */
    public boolean
    accumulate(boolean accumulate)
    {
	boolean was = this.accumulate;
	this.accumulate = accumulate;
	appendToken = false;
	return was;
    }

    /**
     * Forgets all the tokens that have been appended to the resultant
     * HTML document so far, including the current token.
     */
    public void
    reset()
    {
	sb.setLength(0);
	appendToken = false;
    }

    /**
     * Puts the current token back.  The next time nextToken
     * is called, it will be the current token again, rather than
     * advancing to the next token in the source HTML document.
     * 
     * This is useful when a code fragment needs to read an indefinite
     * number of tokens, but that once some distinguished token is found,
     * needs to push that token back so that normal processing can occur
     * on that token.
     */
    public void
    pushback()
    {
	pushback = true;
    }

    /**
     * Return count of tokens seen so far
     */

    public int
    tokenCount() {
	return tokenCount;
    }

    /**
     * Return count of tags seen so far
     */

    public int
    tagCount() {
	return tagCount;
    }

    /*
     * The set of characters that will turn-on quoting
     */
    // public static String needQuote="' \t%$";  // these need quoting

    /**
     * Helper class to quote a attribute's value when the value is being
     * written to the resultant HTML document.  Values set by the
     * put method are automatically quoted as needed.  This
     * method is provided in case the user is dynamically constructing a new
     * tag to be appended with append and needs to quote some
     * arbitrary values.
     * 

     * The quoting algorithm is as follows: 

     * If the string contains double-quotes, put single quotes around it. 

     * If the string contains any "special" characters, put double-quotes
     * around it.
     * 
     * This algorithm is, of course, insufficient for complicated
     * strings that include both single and double quotes.  In that case,
     * it is the user's responsibility to escape the special characters
     * in the string using the HTML special symbols like
     * &quot; or &#34;
     *
     * @return	The quoted string, or the original string if it did not
     *		need to be quoted.
     */

    public static String
    quote(String str) {
	if (str.indexOf('\"') >= 0) {
	    return "\'" + str + "\'";
	} else if (str.length() > 0) {
	    return "\"" + str + "\"";
	} else {
	    return "";
	}
    }

    /**
     * see if target contains any of the strings in candidates
     */

    static private boolean
    contains(String target, String candidates) {
	char[] check = candidates.toCharArray();
	for(int i=0;i= 0) {
		return true;
	    }
	}
	return false;
    }

    private void
    getAttributes()
    {
	if (map == null) {
	    map = lex.getAttributes();
	}
    }

    /**
     * Return a copy of the StringMap of attributes.
     */

    public StringMap
    getMap() {
	getAttributes();
	StringMap result = new StringMap();
	result.append(map, true);
	return result;
    }
}
Related Artifacts