// Listing of org.apache.myfaces.renderkit.html.util.ReducedHTMLParser
// (from the tomahawk artifact).
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.myfaces.renderkit.html.util;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
* A class which detects the open/close tags in an HTML document and reports
* them to a listener class.
*
* This is unfortunately necessary when using JSF with JSP, as tags in the body
* of the document can need to output commands into the document at points
* earlier than the tag occurred (particularly into the document HEAD section).
* This can only be implemented by buffering the response and post-processing
* it to find the relevant HTML tags and modifying the buffer as needed.
*
* This class tries to do the parsing as quickly as possible; many of the
* details of HTML are not relevant for the purposes this class is used for.
*
* @version $Revision: 673833 $ $Date: 2008-07-03 16:58:05 -0500 (Thu, 03 Jul 2008) $
*/
public class ReducedHTMLParser
{
    // IMPLEMENTATION NOTE:
    //
    // Many of the methods on this class are package-scope. This is intended
    // solely for the purpose of unit-testing. This class does not expect
    // other classes in this package to access its methods.

    private static final Log log = LogFactory.getLog(ReducedHTMLParser.class);

    /** Tag-type code reported to the listener for the BODY element. */
    public static final int BODY_TAG = 0;
    /** Tag-type code reported to the listener for the HEAD element. */
    public static final int HEAD_TAG = 1;
    /** Tag-type code reported to the listener for the SCRIPT element. */
    public static final int SCRIPT_TAG = 2;

    /** Internal marker for tags that are not reported to the listener. */
    private static final int UNTRACKED_TAG = -1;

    /** Scanning plain content; only "<" is significant. */
    private static final int STATE_READY = 0;
    /** Inside a comment; only the comment terminator is significant. */
    private static final int STATE_IN_COMMENT = 1;
    /** Inside a start tag, after the element name; reading attributes. */
    private static final int STATE_IN_TAG = 2;
    /** Inside a marked section; only "]]>" is significant. */
    private static final int STATE_IN_MARKED_SECTION = 3;
    /** Skipping raw content until the next "</" (ETAGO) marker. */
    private static final int STATE_EXPECTING_ETAGO = 4;

    /** Maximum number of chars of context included in log messages. */
    private static final int MAX_SURROUNDINGS_LENGTH = 30;

    /** Index into _seq of the next char to be examined. */
    private int _offset;
    /** Line counter used in log messages; parse() resets it to 1. */
    private int _lineNumber;
    private final CharSequence _seq;
    private final CallbackListener _listener;

    /**
     * Parse the given character sequence, reporting the tags found to
     * the given listener.
     *
     * @param seq is the sequence of chars to parse.
     * @param l is the listener to invoke callbacks on.
     */
    public static void parse(CharSequence seq, CallbackListener l)
    {
        new ReducedHTMLParser(seq, l).parse();
    }

    /**
     * Constructor, package-scope for unit testing.
     *
     * @param s is the sequence of chars to parse.
     * @param l is the listener to invoke callbacks on.
     */
    ReducedHTMLParser(CharSequence s, CallbackListener l)
    {
        _seq = s;
        _listener = l;
    }

    /**
     * Return true if there are no more characters to parse.
     */
    boolean isFinished()
    {
        return _offset >= _seq.length();
    }

    /**
     * Return the current value of the line counter.
     */
    int getCurrentLineNumber()
    {
        return _lineNumber;
    }

    /**
     * Update the line counter for one character that is being skipped.
     * A CR, a lone LF, and a CR+LF pair each count as exactly one line.
     *
     * @param c is the character being skipped.
     * @param crSeen is true if the previously skipped char was a CR.
     * @return true if c is a CR; this is the crSeen value to pass in
     * for the next character.
     */
    private boolean trackLine(char c, boolean crSeen)
    {
        if (c == '\r')
        {
            ++_lineNumber;
            return true;
        }
        if ((c == '\n') && !crSeen)
        {
            ++_lineNumber;
        }
        return false;
    }

    /**
     * Advance the current parse position over any whitespace characters,
     * counting the line breaks that are skipped.
     */
    void consumeWhitespace()
    {
        boolean crSeen = false;
        while (_offset < _seq.length())
        {
            char c = _seq.charAt(_offset);
            if (!Character.isWhitespace(c))
            {
                break;
            }
            crSeen = trackLine(c, crSeen);
            ++_offset;
        }
    }

    /**
     * Eat up a sequence of non-whitespace characters and return them.
     *
     * @return the characters consumed, or null if the current position
     * is at whitespace or at the end of the input.
     */
    String consumeNonWhitespace()
    {
        int wordStart = _offset;
        while (_offset < _seq.length())
        {
            if (Character.isWhitespace(_seq.charAt(_offset)))
            {
                break;
            }
            ++_offset;
        }
        if (wordStart == _offset)
        {
            return null;
        }
        return _seq.subSequence(wordStart, _offset).toString();
    }

    /**
     * If the next chars in the input sequence exactly match the specified
     * string then skip over them and return true.
     *
     * If there is not a match then leave the current parse position
     * unchanged and return false.
     *
     * Note that an empty string always matches and consumes nothing.
     *
     * @param s is the exact string to match.
     * @return true if the input contains exactly the param s
     */
    boolean consumeMatch(String s)
    {
        int len = s.length();
        if (_offset + len > _seq.length())
        {
            // seq isn't long enough to contain the specified string
            return false;
        }
        for (int i = 0; i < len; ++i)
        {
            if (_seq.charAt(_offset + i) != s.charAt(i))
            {
                return false;
            }
        }
        _offset += len;
        return true;
    }

    /**
     * Skip leading whitespace, then eat up a sequence of chars which form
     * an element name. Letters, digits, '_', '-' and ':' are accepted.
     *
     * TODO: implement this properly in compliance with spec
     *
     * @return the name, or null if no name char was found.
     */
    String consumeElementName()
    {
        consumeWhitespace();
        int nameStart = _offset;
        while (!isFinished())
        {
            char c = _seq.charAt(_offset);
            boolean ok = Character.isLetterOrDigit(c)
                || (c == '_') || (c == '-') || (c == ':');
            if (!ok)
            {
                break;
            }
            ++_offset;
        }
        if (nameStart == _offset)
        {
            return null;
        }
        return _seq.subSequence(nameStart, _offset).toString();
    }

    /**
     * Eat up a sequence of chars which form a valid XML attribute name.
     *
     * TODO: implement this properly in compliance with spec
     *
     * @return the name, or null if no name char was found.
     */
    String consumeAttrName()
    {
        // for now, assume elements and attributes have same rules
        return consumeElementName();
    }

    /**
     * Eat up a string which is terminated with the specified quote
     * character. This means handling escaped quote chars within the
     * string.
     *
     * This method assumes that the leading quote has already been
     * consumed. The terminating quote is consumed but not returned.
     * A backslash followed by the quote char or by another backslash
     * yields just that second char; a backslash followed by anything
     * else is kept as-is. If the input ends before a terminating quote
     * is found, whatever was collected so far is returned (a backslash
     * that is the very last input char is dropped).
     *
     * @param quote is the char that terminates the string.
     * @return the string contents, without the terminating quote.
     */
    String consumeString(char quote)
    {
        // TODO: should we consider a string to be terminated by a newline?
        // that would help with runaway strings but I think that multiline
        // strings *are* allowed...
        //
        // TODO: detect newlines within strings and increment lineNumber.
        // This isn't so important, though; they aren't common and being a
        // few lines out in an error message isn't serious either.
        StringBuffer stringBuf = new StringBuffer();
        boolean escaping = false;
        while (!isFinished())
        {
            char c = _seq.charAt(_offset);
            ++_offset;
            if (c == quote)
            {
                if (!escaping)
                {
                    // unescaped quote: end of string
                    break;
                }
                stringBuf.append(c);
                escaping = false;
            }
            else if (c == '\\')
            {
                if (escaping)
                {
                    // append a real backslash
                    stringBuf.append(c);
                    escaping = false;
                }
                else
                {
                    escaping = true;
                }
            }
            else
            {
                if (escaping)
                {
                    // the backslash did not escape anything; keep it
                    stringBuf.append('\\');
                    escaping = false;
                }
                stringBuf.append(c);
            }
        }
        return stringBuf.toString();
    }

    /**
     * Assuming we have already encountered "attrname=", consume the
     * value part of the attribute definition. Note that unlike XML,
     * HTML doesn't have to quote its attribute values.
     *
     * An unquoted value ends at the first whitespace char or at the
     * first '>' so that the end of the enclosing tag is not swallowed
     * (eg in "&lt;body onload=init()&gt;").
     *
     * @return the attribute value. If the attr-value was quoted,
     * the returned value will not include the quote chars. Returns
     * null if the value is unquoted and empty.
     */
    String consumeAttrValue()
    {
        consumeWhitespace();
        if (consumeMatch("'"))
        {
            return consumeString('\'');
        }
        if (consumeMatch("\""))
        {
            return consumeString('"');
        }
        return consumeUnquotedAttrValue();
    }

    /**
     * Eat up an unquoted attribute value: everything up to, but not
     * including, the next whitespace char or '>'.
     *
     * @return the value, or null if it is empty.
     */
    private String consumeUnquotedAttrValue()
    {
        int valueStart = _offset;
        while (_offset < _seq.length())
        {
            char c = _seq.charAt(_offset);
            if (Character.isWhitespace(c) || (c == '>'))
            {
                break;
            }
            ++_offset;
        }
        if (valueStart == _offset)
        {
            return null;
        }
        return _seq.subSequence(valueStart, _offset).toString();
    }

    /**
     * Discard all characters in the input until one in the specified
     * string (character-set) is found, counting the line breaks that
     * are skipped. The matching char itself is not consumed.
     *
     * @param s is a set of characters that should not be discarded.
     */
    void consumeExcept(String s)
    {
        boolean crSeen = false;
        while (_offset < _seq.length())
        {
            char c = _seq.charAt(_offset);
            if (s.indexOf(c) >= 0)
            {
                // char is in the exception set
                return;
            }
            crSeen = trackLine(c, crSeen);
            ++_offset;
        }
    }

    /**
     * Process the entire input buffer, invoking callbacks on the listener
     * object as appropriate.
     *
     * Parsing stops early, without an exception, when a malformed end
     * tag is found or when the parse position fails to advance.
     */
    void parse()
    {
        int state = STATE_READY;
        int currentTagStart = -1;
        String currentTagName = null;

        _lineNumber = 1;
        _offset = 0;
        int lastOffset = _offset - 1;
        while (_offset < _seq.length())
        {
            // Sanity check; each pass through this loop must increase the offset.
            // Failure to do this means a hang situation has occurred, so give
            // up rather than spin (and log) forever.
            if (_offset <= lastOffset)
            {
                log.error("Infinite loop detected in ReducedHTMLParser; parsing skipped."+
                    " Surroundings: '" + getTagSurroundings() +"'.");
                return;
            }
            lastOffset = _offset;

            if (state == STATE_READY)
            {
                // in this state, nothing but "<" has any significance
                consumeExcept("<");
                if (isFinished())
                {
                    break;
                }

                if (consumeMatch("<!--"))
                {
                    state = STATE_IN_COMMENT;
                }
                else if (consumeMatch("<!["))
                {
                    // Start of a "marked section", eg "<![CDATA". These
                    // always terminate with "]]>".
                    if (log.isDebugEnabled())
                    {
                        log.debug("Marked section found at line " + getCurrentLineNumber() + ". "
                            + "Surroundings: '" + getTagSurroundings() + "'.");
                    }
                    state = STATE_IN_MARKED_SECTION;
                }
                else if (consumeMatch("<!"))
                {
                    // Some other declaration, eg "<!DOCTYPE" in any case.
                    // Nothing needs to be done; the next "<" is taken as
                    // the start of the next construct.
                    if (log.isDebugEnabled())
                    {
                        log.debug("Declaration found at line " + getCurrentLineNumber());
                    }
                }
                else if (consumeMatch("<?"))
                {
                    // Processing instruction. Nothing needs to be done; the
                    // next "<" is taken as the start of the next construct.
                    if (log.isDebugEnabled())
                    {
                        log.debug("PI found at line " + getCurrentLineNumber());
                    }
                }
                else if (consumeMatch("</"))
                {
                    if (!processEndTag())
                    {
                        // message already logged
                        return;
                    }
                    // stay in STATE_READY
                }
                else if (consumeMatch("<"))
                {
                    // We can't tell the listener about this tag until its
                    // attributes have been processed and the real end of the
                    // tag has been found, so remember the details until then.
                    currentTagStart = _offset - 1;
                    currentTagName = consumeElementName();
                    if (currentTagName == null)
                    {
                        log.warn("Invalid HTML; bare lessthan sign found at line "
                            + getCurrentLineNumber() + ". "
                            + "Surroundings: '" + getTagSurroundings() + "'.");
                        // remain in STATE_READY; this isn't really the
                        // start of an element.
                    }
                    else
                    {
                        state = STATE_IN_TAG;
                    }
                }
                else
                {
                    // consumeExcept stopped on "<", so this can't happen
                    throw new IllegalStateException(
                        "Internal error at line " + getCurrentLineNumber());
                }
                continue;
            }

            if (state == STATE_IN_COMMENT)
            {
                // in this state, nothing but "-->" has any significance
                consumeExcept("-");
                if (isFinished())
                {
                    break;
                }
                if (consumeMatch("-->"))
                {
                    state = STATE_READY;
                }
                else
                {
                    // false call; hyphen is not end of comment
                    consumeMatch("-");
                }
                continue;
            }

            if (state == STATE_IN_TAG)
            {
                consumeWhitespace();
                if (consumeMatch("/>"))
                {
                    // ok, end of element
                    state = STATE_READY;
                    closedTag(currentTagStart, _offset, currentTagName);
                    // and reset vars just in case...
                    currentTagStart = -1;
                    currentTagName = null;
                }
                else if (consumeMatch(">"))
                {
                    if (currentTagName.equalsIgnoreCase("script")
                        || currentTagName.equalsIgnoreCase("style"))
                    {
                        // We've just started a special tag which can contain anything except
                        // the ETAGO marker ("</"). See
                        // http://www.w3.org/TR/REC-html40/appendix/notes.html#notes-specifying-data
                        state = STATE_EXPECTING_ETAGO;
                    }
                    else
                    {
                        state = STATE_READY;
                    }
                    // end of open tag, but not end of element
                    openedTag(currentTagStart, _offset, currentTagName);
                    // and reset vars just in case...
                    currentTagStart = -1;
                    currentTagName = null;
                }
                else
                {
                    // xml attribute
                    String attrName = consumeAttrName();
                    if (attrName == null)
                    {
                        // Oops, we found something quite unexpected in this tag.
                        // Give up on this tag: skip the offending char and then
                        // ignore everything up to the next "</" marker.
                        log.warn("Invalid tag found: unexpected input while looking for attr name or '/>'"
                            + " at line " + getCurrentLineNumber()+". "+
                            "Surroundings: '" + getTagSurroundings() +"'.");
                        state = STATE_EXPECTING_ETAGO;
                        // and consume one character
                        ++_offset;
                    }
                    else
                    {
                        consumeWhitespace();
                        // html can have "stand-alone" attributes with no following equals sign
                        if (consumeMatch("="))
                        {
                            consumeAttrValue();
                        }
                    }
                }
                continue;
            }

            if (state == STATE_IN_MARKED_SECTION)
            {
                // in this state, nothing but "]]>" has any significance
                consumeExcept("]");
                if (isFinished())
                {
                    break;
                }
                if (consumeMatch("]]>"))
                {
                    state = STATE_READY;
                }
                else
                {
                    // false call; ] is not end of cdata section
                    consumeMatch("]");
                }
                continue;
            }

            if (state == STATE_EXPECTING_ETAGO)
            {
                // The term "ETAGO" is the official spec term for "</".
                consumeExcept("<");
                if (isFinished())
                {
                    log.debug("Malformed input page; input terminated while tag not closed.");
                    break;
                }
                if (consumeMatch("</"))
                {
                    if (!processEndTag())
                    {
                        return;
                    }
                    state = STATE_READY;
                }
                else
                {
                    // false call; < does not start an ETAGO
                    consumeMatch("<");
                }
                continue;
            }
        }
    }

    /**
     * Get details about malformed HTML tag.
     *
     * @return Tag surroundings: at most MAX_SURROUNDINGS_LENGTH chars of
     * input starting at the current parse position.
     */
    private String getTagSurroundings()
    {
        int length = _seq.length();
        // the parse position may have been stepped past the end of input
        int start = Math.min(_offset, length);
        int end = length;
        if (end - start > MAX_SURROUNDINGS_LENGTH)
        {
            end = start + MAX_SURROUNDINGS_LENGTH;
        }
        return _seq.subSequence(start, end).toString();
    }

    /**
     * Invoked when "</" has been seen in the input, this method
     * handles the parsing of the end tag and the invocation of the
     * appropriate callback method.
     *
     * @return true if the tag was successfully parsed, and false
     * if there was a fatal parsing error.
     */
    private boolean processEndTag()
    {
        // the "</" marker has already been consumed
        int tagStart = _offset - 2;
        String tagName = consumeElementName();
        consumeWhitespace();
        if (!consumeMatch(">"))
        {
            // log details about malformed end tag
            log.error("Malformed end tag '" + tagName + "' at line " + getCurrentLineNumber()
                + "; skipping parsing. Surroundings: '" + getTagSurroundings() +"'.");
            return false;
        }
        // inform user that the tag has been closed
        closedTag(tagStart, _offset, tagName);
        // We can't verify that the tag names balance because this is HTML
        // we are processing, not XML.
        return true;
    }

    /**
     * Map an element name onto the tag-type code reported to the listener.
     *
     * @param tagName is the element name, compared ignoring case; may be null.
     * @return HEAD_TAG, BODY_TAG or SCRIPT_TAG, or UNTRACKED_TAG for
     * any other name.
     */
    private static int tagTypeOf(String tagName)
    {
        if ("head".equalsIgnoreCase(tagName))
        {
            return HEAD_TAG;
        }
        if ("body".equalsIgnoreCase(tagName))
        {
            return BODY_TAG;
        }
        if ("script".equalsIgnoreCase(tagName))
        {
            return SCRIPT_TAG;
        }
        return UNTRACKED_TAG;
    }

    /**
     * Invoke a callback method to inform the listener that we have found a start tag.
     * Only head, body and script tags are reported.
     *
     * @param startOffset is the offset passed to the listener's openedStartTag.
     * @param endOffset is the offset passed to the listener's closedStartTag.
     * @param tagName is the element name.
     */
    void openedTag(int startOffset, int endOffset, String tagName)
    {
        int tagType = tagTypeOf(tagName);
        if (tagType != UNTRACKED_TAG)
        {
            _listener.openedStartTag(startOffset, tagType);
            _listener.closedStartTag(endOffset, tagType);
        }
    }

    /**
     * Invoke a callback method to inform the listener that we have found an
     * end tag (or the end of a self-closed element).
     * Only head, body and script tags are reported.
     *
     * @param startOffset is the offset passed to the listener's openedEndTag.
     * @param endOffset is the offset passed to the listener's closedEndTag.
     * @param tagName is the element name; may be null.
     */
    void closedTag(int startOffset, int endOffset, String tagName)
    {
        int tagType = tagTypeOf(tagName);
        if (tagType != UNTRACKED_TAG)
        {
            _listener.openedEndTag(startOffset, tagType);
            _listener.closedEndTag(endOffset, tagType);
        }
    }
}