All downloads are free. The search and download functionality uses the official Maven repository.

org.apache.myfaces.renderkit.html.util.ReducedHTMLParser Maven / Gradle / Ivy

Go to download

JSF components and utilities that can be used with any JSF implementation. This library is compatible with both JSF1.1 and JSF1.2; however for JSF1.2 users there is an alternative build of Tomahawk available that takes advantage of JSF1.2 features to offer some additional benefits.

There is a newer version: 1.1.14
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.myfaces.renderkit.html.util;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * A class which detects the open/close tags in an HTML document and reports
 * them to a listener class.
 * 

* This is unfortunately necessary when using JSF with JSP, as tags in the body * of the document can need to output commands into the document at points * earlier than the tag occurred (particularly into the document HEAD section). * This can only be implemented by buffering the response and post-processing * it to find the relevant HTML tags and modifying the buffer as needed. *

* This class tries to do the parsing as quickly as possible; many of the * details of HTML are not relevant for the purposes this class is used for. * * @version $Revision: 673833 $ $Date: 2008-07-03 16:58:05 -0500 (Thu, 03 Jul 2008) $ */ public class ReducedHTMLParser { // IMPLEMENTATION NOTE: // // Many of the methods on this class are package-scope. This is intended // solely for the purpose of unit-testing. This class does not expect // other classes in this package to access its methods. private static final Log log = LogFactory.getLog(ReducedHTMLParser.class); public static final int BODY_TAG = 0; public static final int HEAD_TAG = 1; public static final int SCRIPT_TAG = 2; private static final int STATE_READY = 0; private static final int STATE_IN_COMMENT = 1; private static final int STATE_IN_TAG = 2; private static final int STATE_IN_MARKED_SECTION = 3; private static final int STATE_EXPECTING_ETAGO = 4; private int _offset; private int _lineNumber; private CharSequence _seq; private CallbackListener _listener; public static void parse(CharSequence seq, CallbackListener l) { new ReducedHTMLParser(seq, l).parse(); } /** * Constructor, package-scope for unit testing. * * @param s is the sequence of chars to parse. * @param l is the listener to invoke callbacks on. */ ReducedHTMLParser(CharSequence s, CallbackListener l) { _seq = s; _listener = l; } /** * Return true if there are no more characters to parse. */ boolean isFinished() { return _offset >= _seq.length(); } int getCurrentLineNumber() { return _lineNumber; } /** * Advance the current parse position over any whitespace characters. */ void consumeWhitespace() { boolean crSeen = false; while (_offset < _seq.length()) { char c = _seq.charAt(_offset); if (!Character.isWhitespace(c)) { break; } // Track line number for error messages. 
if (c == '\r') { ++_lineNumber; crSeen = true; } else if ((c == '\n') && !crSeen) { ++_lineNumber; } else { crSeen = false; } ++_offset; } } /** * Eat up a sequence of non-whitespace characters and return them. */ String consumeNonWhitespace() { int wordStart = _offset; while (_offset < _seq.length()) { char c = _seq.charAt(_offset); if (Character.isWhitespace(c)) { break; } ++_offset; } if (wordStart == _offset) { return null; } else { return _seq.subSequence(wordStart, _offset).toString(); } } /** * If the next chars in the input sequence exactly match the specified * string then skip over them and return true. *

* If there is not a match then leave the current parse position * unchanged and return false. * * @param s is the exact string to match. * @return true if the input contains exactly the param s */ boolean consumeMatch(String s) { if (_offset + s.length() > _seq.length()) { // seq isn't long enough to contain the specified string return false; } int i = 0; while (i < s.length()) { if (_seq.charAt(_offset+i) == s.charAt(i)) { ++i; } else { return false; } } _offset += i; return true; } /** * Eat up a sequence of chars which form a valid XML element name. *

* TODO: implement this properly in compliance with spec */ String consumeElementName() { consumeWhitespace(); int nameStart = _offset; while (!isFinished()) { boolean ok = false; char c = _seq.charAt(_offset); if (Character.isLetterOrDigit(_seq.charAt(_offset))) { ok = true; } else if (c == '_') { ok = true; } else if (c == '-') { ok = true; } else if (c == ':') { ok = true; } if (!ok) { break; } ++_offset; } if (nameStart == _offset) { return null; } else { return _seq.subSequence(nameStart, _offset).toString(); } } /** * Eat up a sequence of chars which form a valid XML attribute name. *

* TODO: implement this properly in compliance with spec */ String consumeAttrName() { // for now, assume elements and attributes have same rules return consumeElementName(); } /** * Eat up a string which is terminated with the specified quote * character. This means handling escaped quote chars within the * string. *

* This method assumes that the leading quote has already been * consumed. */ String consumeString(char quote) { // TODO: should we consider a string to be terminated by a newline? // that would help with runaway strings but I think that multiline // strings *are* allowed... // // TODO: detect newlines within strings and increment lineNumber. // This isn't so important, though; they aren't common and being a // few lines out in an error message isn't serious either. StringBuffer stringBuf = new StringBuffer(); boolean escaping = false; while (!isFinished()) { char c = _seq.charAt(_offset); ++_offset; if (c == quote) { if (!escaping) { break; } else { stringBuf.append(c); escaping = false; } } else if (c == '\\') { if (escaping) { // append a real backslash stringBuf.append(c); escaping = false; } else { escaping = true; } } else { if (escaping) { stringBuf.append('\\'); escaping = false; } stringBuf.append(c); } } return stringBuf.toString(); } /** * Assuming we have already encountered "attrname=", consume the * value part of the attribute definition. Note that unlike XML, * HTML doesn't have to quote its attribute values. * * @return the attribute value. If the attr-value was quoted, * the returned value will not include the quote chars. */ String consumeAttrValue() { consumeWhitespace(); if (consumeMatch("'")) { return consumeString('\''); } else if (consumeMatch("\"")) { return consumeString('"'); } else { return consumeNonWhitespace(); } } /** * Discard all characters in the input until one in the specified * string (character-set) is found. * * @param s is a set of characters that should not be discarded. */ void consumeExcept(String s) { boolean crSeen = false; while (_offset < _seq.length()) { char c = _seq.charAt(_offset); if (s.indexOf(c) >= 0) { // char is in the exception set return; } // Track line number for error messages. 
if (c == '\r') { ++_lineNumber; crSeen = true; } else if ((c == '\n') && !crSeen) { ++_lineNumber; } else { crSeen = false; } ++_offset; } } /** * Process the entire input buffer, invoking callbacks on the listener * object as appropriate. */ void parse() { int state = STATE_READY; int currentTagStart = -1; String currentTagName = null; _lineNumber = 1; _offset = 0; int lastOffset = _offset -1; while (_offset < _seq.length()) { // Sanity check; each pass through this loop must increase the offset. // Failure to do this means a hang situation has occurred. if (_offset <= lastOffset) { // throw new RuntimeException("Infinite loop detected in ReducedHTMLParser"); log.error("Infinite loop detected in ReducedHTMLParser; parsing skipped."+ " Surroundings: '" + getTagSurroundings() +"'."); //return; } lastOffset = _offset; if (state == STATE_READY) { // in this state, nothing but "<" has any significance consumeExcept("<"); if (isFinished()) { break; } if (consumeMatch("")) { state = STATE_READY; } else { // false call; hyphen is not end of comment consumeMatch("-"); } continue; } if (state == STATE_IN_TAG) { consumeWhitespace(); if (consumeMatch("/>")) { // ok, end of element state = STATE_READY; closedTag(currentTagStart, _offset, currentTagName); // and reset vars just in case... currentTagStart = -1; currentTagName = null; } else if (consumeMatch(">")) { if (currentTagName.equalsIgnoreCase("script") || currentTagName.equalsIgnoreCase("style")) { // We've just started a special tag which can contain anything except // the ETAGO marker ("", though that does risk us misinterpreting the // contents of an attribute's associated string value. log.warn("Invalid tag found: unexpected input while looking for attr name or '/>'" + " at line " + getCurrentLineNumber()+". 
"+ "Surroundings: '" + getTagSurroundings() +"'."); state = STATE_EXPECTING_ETAGO; // and consume one character ++_offset; } else { consumeWhitespace(); // html can have "stand-alone" attributes with no following equals sign if (consumeMatch("=")) { consumeAttrValue(); } } } continue; } if (state == STATE_IN_MARKED_SECTION) { // in this state, nothing but "]]>" has any significance consumeExcept("]"); if (isFinished()) { break; } if (consumeMatch("]]>")) { state = STATE_READY; } else { // false call; ] is not end of cdata section consumeMatch("]"); } continue; } if (state == STATE_EXPECTING_ETAGO) { // The term "ETAGO" is the official spec term for " maxLength) { end = _offset + maxLength; } return _seq.subSequence(_offset, end).toString(); } /** * Invoked when "</" has been seen in the input, this method * handles the parsing of the end tag and the invocation of the * appropriate callback method. * * @return true if the tag was successfully parsed, and false * if there was a fatal parsing error. */ private boolean processEndTag() { int tagStart = _offset - 2; String tagName = consumeElementName(); consumeWhitespace(); if (!consumeMatch(">")) { // log details about malformed end tag log.error("Malformed end tag '" + tagName + "' at line " + getCurrentLineNumber() + "; skipping parsing. Surroundings: '" + getTagSurroundings() +"'."); return false; } // inform user that the tag has been closed closedTag(tagStart, _offset, tagName); // We can't verify that the tag names balance because this is HTML // we are processing, not XML. return true; } /** * Invoke a callback method to inform the listener that we have found a start tag. 
* * @param startOffset * @param endOffset * @param tagName */ void openedTag(int startOffset, int endOffset, String tagName) { //log.debug("Found open tag at " + startOffset + ":" + endOffset + ":" + tagName); if ("head".equalsIgnoreCase(tagName)) { _listener.openedStartTag(startOffset, HEAD_TAG); _listener.closedStartTag(endOffset, HEAD_TAG); } else if ("body".equalsIgnoreCase(tagName)) { _listener.openedStartTag(startOffset, BODY_TAG); _listener.closedStartTag(endOffset, BODY_TAG); } else if ("script".equalsIgnoreCase(tagName)) { _listener.openedStartTag(startOffset, SCRIPT_TAG); _listener.closedStartTag(endOffset, SCRIPT_TAG); } } void closedTag(int startOffset, int endOffset, String tagName) { //log.debug("Found close tag at " + startOffset + ":" + endOffset + ":" + tagName); if ("head".equalsIgnoreCase(tagName)) { _listener.openedEndTag(startOffset, HEAD_TAG); _listener.closedEndTag(endOffset, HEAD_TAG); } else if ("body".equalsIgnoreCase(tagName)) { _listener.openedEndTag(startOffset, BODY_TAG); _listener.closedEndTag(endOffset, BODY_TAG); } else if ("script".equalsIgnoreCase(tagName)) { _listener.openedEndTag(startOffset, SCRIPT_TAG); _listener.closedEndTag(endOffset, SCRIPT_TAG); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy