All Downloads are FREE. Search and download functionalities are using the official Maven repository.

goog.caja.string.html.htmlparser.js Maven / Gradle / Ivy

// Copyright 2006-2008, The Google Caja project.
// Modifications Copyright 2009 The Closure Library Authors.
// All Rights Reserved

/**
 * @license Portions of this code are from the google-caja project, received by
 * Google under the Apache license (http://code.google.com/p/google-caja/).
 * All other code is Copyright 2009 Google, Inc. All Rights Reserved.

// Copyright (C) 2006 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

 */

/**
 * @fileoverview A Html SAX parser.
 *
 * Examples of usage of the {@code goog.string.html.HtmlParser}:
 * <pre>
 *   var handler = new MyCustomHtmlVisitorHandlerThatExtendsHtmlSaxHandler();
 *   var parser = new goog.string.html.HtmlParser();
 *   parser.parse(handler, '<html><a href="google.com">link found!</a></html>');
 * </pre>
 *
 * TODO(user, msamuel): validate sanitizer regex against the HTML5 grammar at
 * http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html
 * http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html
 * http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html
 * http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html
 *
 * @author [email protected] (Mike Samuel)
 * @supported IE6+, FF1.5+, Chrome 3.0+, Safari and Opera 10.
 */

goog.provide('goog.string.html');
goog.provide('goog.string.html.HtmlParser');
goog.provide('goog.string.html.HtmlParser.EFlags');
goog.provide('goog.string.html.HtmlParser.Elements');
goog.provide('goog.string.html.HtmlParser.Entities');
goog.provide('goog.string.html.HtmlSaxHandler');


/**
 * An Html parser: {@code parse} takes a string and calls methods on
 * {@code goog.string.html.HtmlSaxHandler} while it is visiting it.
 *
 * @constructor
 */
goog.string.html.HtmlParser = function() {
};


/**
 * HTML entities that are encoded/decoded. Keys are lower-case entity names
 * (without the leading '&' and trailing ';'); values are the decoded
 * characters.
 * TODO(user): use {@code goog.string.htmlEncode} instead.
 * @type {!Object}
 */
goog.string.html.HtmlParser.Entities = {
  'lt': '<',
  'gt': '>',
  'amp': '&',
  'nbsp': '\u00a0',
  'quot': '"',
  'apos': '\''
};


/**
 * The html eflags, used internally on the parser. Each value is a distinct
 * bit so that several flags can be OR-ed together in
 * {@code goog.string.html.HtmlParser.Elements}.
 * @enum {number}
 */
goog.string.html.HtmlParser.EFlags = {
  OPTIONAL_ENDTAG: 1,
  EMPTY: 2,
  CDATA: 4,
  RCDATA: 8,
  UNSAFE: 16,
  FOLDABLE: 32
};


/**
 * A map of element to a bitmap of flags it has, used internally on the parser.
 * An element that is not a key of this map is treated by {@code parse} as not
 * whitelisted: no startTag/endTag event is emitted for it.
 * @type {Object}
 */
goog.string.html.HtmlParser.Elements = {
  'a': 0,
  'abbr': 0,
  'acronym': 0,
  'address': 0,
  'applet': goog.string.html.HtmlParser.EFlags.UNSAFE,
  'area': goog.string.html.HtmlParser.EFlags.EMPTY,
  'b': 0,
  'base': goog.string.html.HtmlParser.EFlags.EMPTY |
      goog.string.html.HtmlParser.EFlags.UNSAFE,
  'basefont': goog.string.html.HtmlParser.EFlags.EMPTY |
      goog.string.html.HtmlParser.EFlags.UNSAFE,
  'bdo': 0,
  'big': 0,
  'blockquote': 0,
  'body': goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG |
      goog.string.html.HtmlParser.EFlags.UNSAFE |
      goog.string.html.HtmlParser.EFlags.FOLDABLE,
  'br': goog.string.html.HtmlParser.EFlags.EMPTY,
  'button': 0,
  'canvas': 0,
  'caption': 0,
  'center': 0,
  'cite': 0,
  'code': 0,
  'col': goog.string.html.HtmlParser.EFlags.EMPTY,
  'colgroup': goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG,
  'dd': goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG,
  'del': 0,
  'dfn': 0,
  'dir': 0,
  'div': 0,
  'dl': 0,
  'dt': goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG,
  'em': 0,
  'fieldset': 0,
  'font': 0,
  'form': 0,
  'frame': goog.string.html.HtmlParser.EFlags.EMPTY |
      goog.string.html.HtmlParser.EFlags.UNSAFE,
  'frameset': goog.string.html.HtmlParser.EFlags.UNSAFE,
  'h1': 0,
  'h2': 0,
  'h3': 0,
  'h4': 0,
  'h5': 0,
  'h6': 0,
  'head': goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG |
      goog.string.html.HtmlParser.EFlags.UNSAFE |
      goog.string.html.HtmlParser.EFlags.FOLDABLE,
  'hr': goog.string.html.HtmlParser.EFlags.EMPTY,
  'html': goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG |
      goog.string.html.HtmlParser.EFlags.UNSAFE |
      goog.string.html.HtmlParser.EFlags.FOLDABLE,
  'i': 0,
  'iframe': goog.string.html.HtmlParser.EFlags.UNSAFE |
      goog.string.html.HtmlParser.EFlags.CDATA,
  'img': goog.string.html.HtmlParser.EFlags.EMPTY,
  'input': goog.string.html.HtmlParser.EFlags.EMPTY,
  'ins': 0,
  'isindex': goog.string.html.HtmlParser.EFlags.EMPTY |
      goog.string.html.HtmlParser.EFlags.UNSAFE,
  'kbd': 0,
  'label': 0,
  'legend': 0,
  'li': goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG,
  'link': goog.string.html.HtmlParser.EFlags.EMPTY |
      goog.string.html.HtmlParser.EFlags.UNSAFE,
  'map': 0,
  'menu': 0,
  'meta': goog.string.html.HtmlParser.EFlags.EMPTY |
      goog.string.html.HtmlParser.EFlags.UNSAFE,
  'noframes': goog.string.html.HtmlParser.EFlags.UNSAFE |
      goog.string.html.HtmlParser.EFlags.CDATA,
  'noscript': goog.string.html.HtmlParser.EFlags.UNSAFE |
      goog.string.html.HtmlParser.EFlags.CDATA,
  'object': goog.string.html.HtmlParser.EFlags.UNSAFE,
  'ol': 0,
  'optgroup': 0,
  'option': goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG,
  'p': goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG,
  'param': goog.string.html.HtmlParser.EFlags.EMPTY |
      goog.string.html.HtmlParser.EFlags.UNSAFE,
  'pre': 0,
  'q': 0,
  's': 0,
  'samp': 0,
  'script': goog.string.html.HtmlParser.EFlags.UNSAFE |
      goog.string.html.HtmlParser.EFlags.CDATA,
  'select': 0,
  'small': 0,
  'span': 0,
  'strike': 0,
  'strong': 0,
  'style': goog.string.html.HtmlParser.EFlags.UNSAFE |
      goog.string.html.HtmlParser.EFlags.CDATA,
  'sub': 0,
  'sup': 0,
  'table': 0,
  'tbody': goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG,
  'td': goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG,
  'textarea': goog.string.html.HtmlParser.EFlags.RCDATA,
  'tfoot': goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG,
  'th': goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG,
  'thead': goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG,
  'title': goog.string.html.HtmlParser.EFlags.RCDATA |
      goog.string.html.HtmlParser.EFlags.UNSAFE,
  'tr': goog.string.html.HtmlParser.EFlags.OPTIONAL_ENDTAG,
  'tt': 0,
  'u': 0,
  'ul': 0,
  'var': 0
};


/**
 * Regular expression that matches &s.
 * @type {RegExp}
 * @package
 */
goog.string.html.HtmlParser.AMP_RE = /&/g;


/**
 * Regular expression that matches loose &s, i.e. an ampersand that does not
 * start a named, decimal or hexadecimal entity. Group 1 captures whatever
 * follows the ampersand so that it can be re-emitted by the replacement.
 * @type {RegExp}
 * @private
 */
goog.string.html.HtmlParser.LOOSE_AMP_RE_ =
    /&([^a-z#]|#(?:[^0-9x]|x(?:[^0-9a-f]|$)|$)|$)/gi;


/**
 * Regular expression that matches <.
 * @type {RegExp}
 * @package
 */
goog.string.html.HtmlParser.LT_RE = /</g;


/**
 * Regular expression that matches >.
 * @type {RegExp}
 * @package
 */
goog.string.html.HtmlParser.GT_RE = />/g;


/**
 * Regular expression that matches ".
 * @type {RegExp}
 * @package
 */
goog.string.html.HtmlParser.QUOTE_RE = /\"/g;


/**
 * Regular expression that matches =.
 * @type {RegExp}
 * @package
 */
goog.string.html.HtmlParser.EQUALS_RE = /=/g;


/**
 * Regular expression that matches null characters.
 * @type {RegExp}
 * @private
 */
goog.string.html.HtmlParser.NULL_RE_ = /\0/g;


/**
 * Regular expression that matches entities. Group 1 captures the text between
 * the '&' and the ';'.
 * @type {RegExp}
 * @private
 */
goog.string.html.HtmlParser.ENTITY_RE_ = /&(#\d+|#x[0-9A-Fa-f]+|\w+);/g;


/**
 * Regular expression that matches decimal numbers.
 * @type {RegExp}
 * @private
 */
goog.string.html.HtmlParser.DECIMAL_ESCAPE_RE_ = /^#(\d+)$/;


/**
 * Regular expression that matches hexadecimal numbers.
 * @type {RegExp}
 * @private
 */
goog.string.html.HtmlParser.HEX_ESCAPE_RE_ = /^#x([0-9A-Fa-f]+)$/;


/**
 * Regular expression that matches the next token to be processed when we are
 * inside a tag.
 * @type {RegExp}
 * @private
 */
goog.string.html.HtmlParser.INSIDE_TAG_TOKEN_ = new RegExp(
    // Don't capture space.
    '^\\s*(?:' +
    // Capture an attribute name in group 1, and value in group 3.
    // We capture the fact that there was an attribute in group 2, since
    // interpreters are inconsistent in whether a group that matches nothing
    // is null, undefined, or the empty string.
    ('(?:' +
        '([a-z][a-z-]*)' +  // attribute name
        ('(' +  // optionally followed
            '\\s*=\\s*' +
            ('(' +
                // A double quoted string.
                '\"[^\"]*\"' +
                // A single quoted string.
                '|\'[^\']*\'' +
                // The positive lookahead is used to make sure that in
                // <foo bar= baz=boo>, the value for bar is blank, not
                // "baz=boo".
                '|(?=[a-z][a-z-]*\\s*=)' +
                // An unquoted value that is not an attribute name.
                // We know it is not an attribute name because the previous
                // zero-width match would've eliminated that possibility.
                '|[^>\"\'\\s]*' +
                ')'
            ) +
            ')'
        ) + '?' +
        ')'
    ) +
    // End of tag captured in group 4.
    '|(/?>)' +
    // Don't capture cruft
    '|[^a-z\\s>]+)',
    'i');


/**
 * Regular expression that matches the next token to be processed when we are
 * outside a tag.
 * @type {RegExp}
 * @private
 */
goog.string.html.HtmlParser.OUTSIDE_TAG_TOKEN_ = new RegExp(
    '^(?:' +
    // Entity captured in group 1.
    '&(\\#[0-9]+|\\#[x][0-9a-f]+|\\w+);' +
    // Comment, doctypes, and processing instructions not captured.
    '|<[!]--[\\s\\S]*?-->|<!\\w[^>]*>|<\\?[^>*]*>' +
    // '/' captured in group 2 for close tags, and name captured in group 3.
    '|<(/)?([a-z][a-z0-9]*)' +
    // Text captured in group 4.
    '|([^<&>]+)' +
    // Cruft captured in group 5.
    '|([<&>]))',
    'i');


/**
 * Given a SAX-like {@code goog.string.html.HtmlSaxHandler} parses a
 * {@code htmlText} and lets the {@code handler} know the structure while
 * visiting the nodes.
 *
 * Events are emitted in document order, bracketed by {@code startDoc} and
 * {@code endDoc}. Attributes are handed to {@code startTag} as a flat array of
 * alternating lower-cased names and decoded values. That array is reused and
 * emptied after every tag, so a handler that needs to keep the attributes must
 * copy them. Tags whose name is not a key of
 * {@code goog.string.html.HtmlParser.Elements} produce no startTag/endTag
 * event.
 *
 * @param {goog.string.html.HtmlSaxHandler} handler The HtmlSaxHandler that will
 *     receive the events.
 * @param {string} htmlText The html text.
 */
goog.string.html.HtmlParser.prototype.parse = function(handler, htmlText) {
  var htmlLower = null;
  var inTag = false;  // True iff we're currently processing a tag.
  var attribs = [];  // Accumulates attribute names and values.
  var tagName;  // The name of the tag currently being processed.
  var eflags;  // The element flags for the current tag.
  var openTag;  // True if the current tag is an open tag.

  // Lets the handler know that we are starting to parse the document.
  handler.startDoc();

  // Consumes tokens from the htmlText and stops once all tokens are processed.
  while (htmlText) {
    var regex = inTag ?
        goog.string.html.HtmlParser.INSIDE_TAG_TOKEN_ :
        goog.string.html.HtmlParser.OUTSIDE_TAG_TOKEN_;
    // Gets the next token
    var m = htmlText.match(regex);
    if (!m) {
      // The outside-tag pattern matches any non-empty input, so this can only
      // happen inside an unterminated tag when nothing but whitespace is left
      // (e.g. '<a '). There is nothing more to report, so stop instead of
      // dereferencing null.
      break;
    }
    // And removes it from the string
    htmlText = htmlText.substring(m[0].length);

    // TODO(goto): cleanup this code breaking it into separate methods.
    if (inTag) {
      if (m[1]) {  // Attribute.
        // SetAttribute with uppercase names doesn't work on IE6.
        var attribName = goog.string.html.toLowerCase(m[1]);
        var decodedValue;
        if (m[2]) {
          var encodedValue = m[3];
          switch (encodedValue.charCodeAt(0)) {  // Strip quotes.
            case 34:  // Double quote.
            case 39:  // Single quote.
              encodedValue = encodedValue.substring(
                  1, encodedValue.length - 1);
              break;
          }
          decodedValue = this.unescapeEntities_(this.stripNULs_(encodedValue));
        } else {
          // Use name as value for valueless attribs, so
          //   <input type=checkbox checked>
          // gets attributes ['type', 'checkbox', 'checked', 'checked']
          decodedValue = attribName;
        }
        attribs.push(attribName, decodedValue);
      } else if (m[4]) {  // End of tag.
        if (eflags !== void 0) {  // False if not in whitelist.
          if (openTag) {
            if (handler.startTag) {
              handler.startTag(/** @type {string} */ (tagName), attribs);
            }
          } else {
            if (handler.endTag) {
              handler.endTag(/** @type {string} */ (tagName));
            }
          }
        }

        if (openTag && (eflags &
            (goog.string.html.HtmlParser.EFlags.CDATA |
             goog.string.html.HtmlParser.EFlags.RCDATA))) {
          // The element's content is raw text: everything up to the matching
          // close tag is reported in one piece instead of being tokenized.
          // The lower-cased copy is built once and then kept aligned with the
          // remaining input so the close tag is found case-insensitively.
          if (htmlLower === null) {
            htmlLower = goog.string.html.toLowerCase(htmlText);
          } else {
            htmlLower = htmlLower.substring(
                htmlLower.length - htmlText.length);
          }
          var dataEnd = htmlLower.indexOf('</' + tagName);
          if (dataEnd < 0) {
            // No close tag: the rest of the document is the content.
            dataEnd = htmlText.length;
          }
          if (eflags & goog.string.html.HtmlParser.EFlags.CDATA) {
            if (handler.cdata) {
              handler.cdata(htmlText.substring(0, dataEnd));
            }
          } else if (handler.rcdata) {
            handler.rcdata(
                this.normalizeRCData_(htmlText.substring(0, dataEnd)));
          }
          htmlText = htmlText.substring(dataEnd);
        }

        // Resets the per-tag state.
        tagName = eflags = openTag = void 0;
        attribs.length = 0;
        inTag = false;
      }
    } else {
      if (m[1]) {  // Entity.
        handler.pcdata(m[0]);
      } else if (m[3]) {  // Tag.
        openTag = !m[2];
        inTag = true;
        tagName = goog.string.html.toLowerCase(m[3]);
        eflags = goog.string.html.HtmlParser.Elements.hasOwnProperty(tagName) ?
            goog.string.html.HtmlParser.Elements[tagName] : void 0;
      } else if (m[4]) {  // Text.
        handler.pcdata(m[4]);
      } else if (m[5]) {  // Cruft.
        // A stray markup character is reported in its escaped form.
        switch (m[5]) {
          case '<':
            handler.pcdata('&lt;');
            break;
          case '>':
            handler.pcdata('&gt;');
            break;
          default:
            handler.pcdata('&amp;');
            break;
        }
      }
    }
  }

  // Lets the handler know that we are done parsing the document.
  handler.endDoc();
};


/**
 * Decodes an HTML entity.
 *
 * @param {string} name The content between the '&' and the ';'.
 * @return {string} A single unicode code-point as a string, or the empty
 *     string when the name is neither a known named entity nor a numeric
 *     escape.
 * @private
 */
goog.string.html.HtmlParser.prototype.lookupEntity_ = function(name) {
  // TODO(goto): use {goog.string.htmlDecode} instead ?
  // TODO(goto): &pi; is different from &Pi;
  name = goog.string.html.toLowerCase(name);
  if (goog.string.html.HtmlParser.Entities.hasOwnProperty(name)) {
    return goog.string.html.HtmlParser.Entities[name];
  }
  var m = name.match(goog.string.html.HtmlParser.DECIMAL_ESCAPE_RE_);
  if (m) {
    return String.fromCharCode(parseInt(m[1], 10));
  } else if (m = name.match(goog.string.html.HtmlParser.HEX_ESCAPE_RE_)) {
    return String.fromCharCode(parseInt(m[1], 16));
  }
  return '';
};


/**
 * Removes null characters on the string.
 * @param {string} s The string to have the null characters removed.
 * @return {string} A string without null characters.
 * @private
 */
goog.string.html.HtmlParser.prototype.stripNULs_ = function(s) {
  return s.replace(goog.string.html.HtmlParser.NULL_RE_, '');
};


/**
 * Returns the plain text of a chunk of HTML CDATA which possibly contains
 * entities, by replacing every entity with its decoded character.
 *
 * TODO(goto): use {@code goog.string.unescapeEntities} instead ?
 * @param {string} s A chunk of HTML CDATA. It must not start or end inside
 *     an HTML entity.
 * @return {string} The unescaped entities.
 * @private
 */
goog.string.html.HtmlParser.prototype.unescapeEntities_ = function(s) {
  return s.replace(
      goog.string.html.HtmlParser.ENTITY_RE_,
      goog.bind(
          function(fullEntity, name) {
            return this.lookupEntity_(name);
          },
          this));
};


/**
 * Escape entities in RCDATA that can be escaped without changing the meaning.
 * @param {string} rcdata The RCDATA string we want to normalize.
 * @return {string} A normalized version of RCDATA.
 * @private
 */
goog.string.html.HtmlParser.prototype.normalizeRCData_ = function(rcdata) {
  return rcdata.
      replace(goog.string.html.HtmlParser.LOOSE_AMP_RE_, '&amp;$1').
      replace(goog.string.html.HtmlParser.LT_RE, '&lt;').
      replace(goog.string.html.HtmlParser.GT_RE, '&gt;');
};


/**
 * TODO(goto): why isn't this in the string package ? does this solves any
 * real problem ? move it to the goog.string package if it does.
 *
 * @param {string} str The string to lower case.
 * @return {string} The str in lower case format.
 */
goog.string.html.toLowerCase = function(str) {
  // The below may not be true on browsers in the Turkish locale.
  if ('script' === 'SCRIPT'.toLowerCase()) {
    return str.toLowerCase();
  } else {
    // Fallback that lower-cases only A-Z by setting the 0x20 bit.
    return str.replace(/[A-Z]/g, function(ch) {
      return String.fromCharCode(ch.charCodeAt(0) | 32);
    });
  }
};


/**
 * An interface to the {@code goog.string.html.HtmlParser} visitor, that gets
 * called while the HTML is being parsed.
 *
 * @interface
 */
goog.string.html.HtmlSaxHandler = function() {
};


/**
 * Handler called when the parser found a new tag.
 * @param {string} name The name of the tag that is starting.
 * @param {Array} attributes The attributes of the tag.
 */
goog.string.html.HtmlSaxHandler.prototype.startTag = goog.abstractMethod;


/**
 * Handler called when the parser found a closing tag.
 * @param {string} name The name of the tag that is ending.
 */
goog.string.html.HtmlSaxHandler.prototype.endTag = goog.abstractMethod;


/**
 * Handler called when PCDATA is found.
 * @param {string} text The PCDATA text found.
 */
goog.string.html.HtmlSaxHandler.prototype.pcdata = goog.abstractMethod;


/**
 * Handler called when RCDATA is found.
 * @param {string} text The RCDATA text found.
 */
goog.string.html.HtmlSaxHandler.prototype.rcdata = goog.abstractMethod;


/**
 * Handler called when CDATA is found.
 * @param {string} text The CDATA text found.
 */
goog.string.html.HtmlSaxHandler.prototype.cdata = goog.abstractMethod;


/**
 * Handler called when the parser is starting to parse the document.
 */
goog.string.html.HtmlSaxHandler.prototype.startDoc = goog.abstractMethod;


/**
 * Handler called when the parsing is done.
 */
goog.string.html.HtmlSaxHandler.prototype.endDoc = goog.abstractMethod;




© 2015 - 2024 Weber Informatics LLC | Privacy Policy