echopointng.ui.util.HtmlNodeLexer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of ibis-echo2 Show documentation
Echo2 bundled with Echo2_Extras, Echo2_FileTransfer and echopointng, plus various improvements and bugfixes
There is a newer version: 2.0.4
/* 
 * This file is part of the Echo Point Project.  This project is a collection
 * of Components that have extended the Echo Web Application Framework.
 *
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 */
package echopointng.ui.util;
import java.io.IOException;
import java.io.StreamTokenizer;
import java.io.StringReader;

/**
 * HtmlNodeLexer lexes a piece of HTML text and calls back out whenever a
 * text node, an element tag or a comment has been recognised.
 * <p>
 * Each text node may contain whitespace and may consist of multiple words.
 * An element is reported one tag at a time, so a start tag and its matching
 * end tag arrive as two separate callbacks.
 * <p>
 * The value returned by each callback (or an empty buffer when the callback
 * returns <code>null</code>) is appended to the output, which allows the
 * callback to rewrite or drop individual nodes.
 */
public class HtmlNodeLexer {

	/**
	 * The HtmlLexerCallBack interface is used as a callback mechanism
	 * when a specific type of HTML node is encountered.
	 */
	public interface HtmlLexerCallBack {
		/**
		 * Called when a text node has been fully lexed.  The callee can
		 * modify the text node and pass back a new text node value.
		 * 
		 * @param textNode - the StringBuffer of text
		 * @return the possibly modified text node
		 */
		public StringBuffer onTextNode(StringBuffer textNode);
		/**
		 * Called when an Element tag has been fully lexed.  The callee can
		 * modify the Element node and pass back a new Element node value.
		 * 
		 * @param element - a StringBuffer of the Element
		 * @return the possibly modified Element
		 */
		public StringBuffer onElementNode(StringBuffer element);
		/**
		 * Called when a Comment has been fully lexed.  The callee can
		 * modify the Comment node and pass back a new Comment node value.
		 * 
		 * @param comment - a StringBuffer of the Comment
		 * @return the possibly modified Comment
		 */
		public StringBuffer onCommentNode(StringBuffer comment);
	}

	/**
	 * Lexes <code>htmlText</code>, invoking <code>callBack</code> for every
	 * text node, element tag and comment found, and returns the
	 * concatenation of the buffers handed back by the callback.
	 * <p>
	 * Only '&lt;' and '&gt;' act as delimiters; character 0 is configured as
	 * whitespace and every other character is part of a word.
	 * 
	 * @param htmlText - the HTML text to lex
	 * @param callBack - receives each lexed node
	 * @return the re-assembled text, or <code>htmlText</code> itself if an
	 *         exception was raised while lexing
	 */
	public static String lex(String htmlText, HtmlLexerCallBack callBack) {
		return doTokenize(htmlText, callBack, "<>", 0, 0, 1, 0xFF, false, false);
	}

	// Lexer states.  Plain int constants are used so that the state can be
	// compared by value rather than relying on String identity.
	private static final int STATE_START   = 0;
	private static final int STATE_ELEMENT = 1;
	private static final int STATE_COMMENT = 2;
	private static final int STATE_TEXT    = 3;

	/**
	 * Configures a StreamTokenizer over <code>str</code> and runs the node
	 * lexer on it.
	 * 
	 * @param str - the text to tokenize
	 * @param callBack - receives each lexed node
	 * @param delims - characters that are returned as single-character tokens
	 * @param loWhiteSpace - low end of the whitespace character range
	 * @param hiWhiteSpace - high end of the whitespace character range
	 * @param loWordChar - low end of the word character range
	 * @param hiWordChar - high end of the word character range
	 * @param useQuotes - if true, ' and " are registered as quote characters
	 * @param noCommonWhiteSpace - if true, space, tab, CR and LF are forced
	 *        to be word characters
	 * @return the lexed output, or <code>str</code> unchanged if lexing failed
	 */
	private static String doTokenize(String str, HtmlLexerCallBack callBack, String delims, int loWhiteSpace, int hiWhiteSpace, int loWordChar, int hiWordChar, boolean useQuotes, boolean noCommonWhiteSpace) {
		StreamTokenizer st = new StreamTokenizer(new StringReader(str));
		st.resetSyntax();
		st.whitespaceChars(loWhiteSpace, hiWhiteSpace);
		st.wordChars(loWordChar, hiWordChar);
		for (int i = 0; i < delims.length(); i++) {
			st.ordinaryChar(delims.charAt(i));
		}
		if (useQuotes) {
			st.quoteChar('\'');
			st.quoteChar('"');
		}
		if (noCommonWhiteSpace) {
			st.wordChars(' ', ' ');
			st.wordChars('\t', '\t');
			st.wordChars('\n', '\n');
			st.wordChars('\r', '\r');
		}
		try {
			return doTokenizeInternal(st, callBack);
		} catch (Exception e) {
			// Deliberate best-effort: if lexing (or the callback) fails,
			// hand the caller back the untouched input.
			return str;
		}
	}

	/**
	 * Runs the state machine over the token stream.  Tokens are either a
	 * run of word characters or a single delimiter character.
	 * 
	 * @param st - the configured tokenizer
	 * @param callBack - receives each lexed node
	 * @return the concatenated callback results
	 * @throws IOException if the tokenizer fails to read
	 */
	private static String doTokenizeInternal(StreamTokenizer st, HtmlLexerCallBack callBack) throws IOException {
		StringBuffer sbTextNode = null;
		StringBuffer sbElement = null;
		StringBuffer sbComment = null;
		StringBuffer sbOut = new StringBuffer();
		int state = STATE_START;
		String token = "";
		String prevToken;

		for (int tt = st.nextToken(); tt != StreamTokenizer.TT_EOF; tt = st.nextToken()) {
			prevToken = token;
			if (tt == StreamTokenizer.TT_WORD) {
				// A word was found; the value is in sval
				token = st.sval;
			} else {
				// A regular character was found; the value is the token type itself
				token = String.valueOf((char) tt);
			}

			switch (state) {
				case STATE_START :
					if (token.equals("<")) {
						state = STATE_ELEMENT;
						sbElement = new StringBuffer(token);
					} else {
						state = STATE_TEXT;
						sbTextNode = new StringBuffer(token);
					}
					break;

				case STATE_ELEMENT :
					if (prevToken.equals("<") && token.startsWith("!--")) {
						// what we thought was an element is in fact a comment
						sbComment = new StringBuffer();
						sbComment.append(sbElement);
						sbComment.append(token);
						sbElement = null;
						state = STATE_COMMENT;
					} else if (token.equals(">")) {
						sbElement.append(token);
						sbOut.append(nvl(callBack.onElementNode(sbElement)));
						sbElement = null;

						state = STATE_TEXT;
						sbTextNode = new StringBuffer();
					} else {
						sbElement.append(token);
					}
					break;

				case STATE_COMMENT :
					sbComment.append(token);
					// A '>' only closes the comment when the text right before
					// it ends in "--".  endsWith is used (rather than comparing
					// lastIndexOf with length-2) so that a one-character
					// previous token such as "<" or ">" cannot match by accident.
					if (token.equals(">") && prevToken.endsWith("--")) {
						sbOut.append(nvl(callBack.onCommentNode(sbComment)));
						sbComment = null;

						state = STATE_TEXT;
						sbTextNode = new StringBuffer();
					}
					break;

				case STATE_TEXT :
					if (token.equals("<")) {
						// a tag is starting, so flush any pending text first
						if (sbTextNode.length() > 0) {
							sbOut.append(nvl(callBack.onTextNode(sbTextNode)));
						}
						sbTextNode = null;
						sbElement = new StringBuffer(token);
						state = STATE_ELEMENT;
					} else {
						sbTextNode.append(token);
					}
					break;

				default :
					break;
			}
		}

		// just in case we finished part way through a node
		if (sbTextNode != null && sbTextNode.length() > 0) {
			sbOut.append(nvl(callBack.onTextNode(sbTextNode)));
		}
		if (sbElement != null && sbElement.length() > 0) {
			sbOut.append(nvl(callBack.onElementNode(sbElement)));
		}
		if (sbComment != null && sbComment.length() > 0) {
			sbOut.append(nvl(callBack.onCommentNode(sbComment)));
		}
		return sbOut.toString();
	}

	/**
	 * Returns <code>sb</code>, or a new empty StringBuffer when
	 * <code>sb</code> is null.
	 */
	private static StringBuffer nvl(StringBuffer sb) {
		return (sb == null ? new StringBuffer() : sb);
	}

}